In [1]:
import pandas as pd
import pickle
import numpy as np
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
import copy

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lalor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
class agression_dataset(Dataset):
    def __init__(self, split):
        super(Dataset, self).__init__()
        self.load_data(split)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        '''Método principal para cargar una observación del dataset.
           label: categoría a la que pertenece la observación.
           words_ids: lista de índices de las palabras en el vocabulario.
        '''
        label = self.data.iloc[index]['target']
        words, words_ids = self.preprocessed_text(index)
        return words_ids, label, words
    
    def preprocessed_text(self, index):
        '''Preprocess text and '''

        text = self.data.iloc[index]['text']
        words = nltk.word_tokenize(text)
        words_ids = [self.vocab[word] if word in self.vocab.keys() else self.emb_mat.shape[0]-1\
                        for word in words]
        return words, words_ids
    
    def load_data(self, split):
        '''Método para cargar los datos.
           El texto está en la columna "text" y las categorías en la columna "target".        
        '''
        self.data = pd.read_csv('%s.csv'%(split))

    def load_vocab_embeddings(self):
        '''Embeddings preentrenados en twitter.
           emb_mat: Matriz de embeddings. Un vector de tamaño 200 para cada palabra del vocabulario.
           vocab: Diccionario, asigna a cada palabra su renglón correspondiente en la matriz de embeddings.        
        '''
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}
        prefix = '../../data/'
        with open(prefix + 'word2vec_col.txt', 'r') as f:
            for i, line in enumerate(f):
                if i!=0:
                    values = line.split()
                    self.vocab_dict[i+1] = values[0]
                    vocab[values[0]] = i+1
                    vector = np.asarray(values[1:], 'float32')
                    embeddings_list.append(vector)
        embeddings_list.insert(0,np.mean(np.vstack(embeddings_list), axis=0))
        embeddings_list.insert(0,np.zeros(100))
        self.vocab_dict[0] = '[PAD]'
        self.vocab_dict[1] = '[UNK]'
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = 1
        emb_mat = np.vstack(embeddings_list)

        return vocab, emb_mat

    def get_weights(self):
        '''Devuelve pesos inversos para cada categoría. Mayor peso para la categoría con menos observaciones.'''
        
        cat_0 = len(self.data[self.data['target']==0])
        cat_1 = len(self.data[self.data['target']==1])
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])
    
    def collate_fn(self, batch):
        '''Función que ejecuta el dataloader para formar batches de datos.'''
        zipped_batch = list(zip(*batch))
        word_ids = [torch.tensor(t) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]
        return word_ids, lengths, labels, words

In [3]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1, 
                 bidirectional=False, emb_mat=None, dense_hidden_size=256):
        '''Constructor, aquí definimos las capas.
        input:
            input_size: Tamaño de los embeddings de las palabras.
            hidden_size: Tamaño de la capa oculta de la GRU. 
            num_layers: Número de capas de la GRU.
            bidirectional: True si se quiere una GRU bidireccional. 
            emb_mat: Matriz de embeddings del vocabulario.
            dense_hidden_size: Tamaño de la capa ocula del clasificador.
        '''
        super(SimpleRNN, self).__init__()
        # Matriz entrenable de embeddings, tamaño del vocab_size x 100
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        # Gated Recurrent Unit
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        # Número de direcciones de la GRU
        directions = 2 if bidirectional else 1
        # Clasificador MLP
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 2))
        
    def forward(self, input_seq, lengths):
        '''Función feed-forward de la red.
        input:
            input_seq: Lista de ids para cada palabra.
            lengths: Número de palabras en cada una de las observaciones del batch.
        output:
            x: vectores para clasificar.
            return None for consistency with the next model
        '''
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        # Forma las secuencias de palabras que entran a la GRU.
        x = x.split(lengths.tolist())
        # Añade pading y empaqueta las secuencias (mayor velocidad de cómputo).
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        hn = torch.cat([h for h in hn], dim=-1)
        x = self.classifier(hn)
        return x, None

In [4]:
def eval_model(model, dataloader, criterion, device):
    '''Función para evaluar el modelo.'''
    with torch.no_grad():
        model.eval()
        losses = []
        preds = torch.empty(0).long()
        targets = torch.empty(0).long()
        scores_list = []
        words_list = []
        pred_list = []
        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words = data 
            seq, labels = seq.to(device), labels.to(device)
            output, scores = model(seq, seq_len)
            output = F.log_softmax(output, dim=1)
            loss = criterion(output, labels)
            losses.append(loss.item())
            predictions = F.log_softmax(output, dim=1).argmax(1)
            preds = torch.cat([preds, predictions.cpu()],dim=0)
            targets = torch.cat([targets, labels.cpu()],dim=0)
            if scores is not None:
                pred_list += predictions.tolist()
                scores = scores.cpu().squeeze(2).tolist()
                scores_list += scores 
                words_list += words 

        model.train()
        preds = preds.numpy()
        targets = targets.numpy()
        f1 = f1_score(targets, preds, average='binary')

        return np.mean(losses), f1, scores_list, words_list, pred_list

In [5]:
batch_size=128

In [6]:
prefix = '../../data/agresividad/'
train_dataset = agression_dataset(prefix + 'train')
val_dataset = agression_dataset(prefix + 'val')
test_dataset = agression_dataset(prefix + 'test')
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn = train_dataset.collate_fn, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn = val_dataset.collate_fn, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn = test_dataset.collate_fn, shuffle=False)

In [7]:
lr = 0.001
epochs = 10
weight_decay=0.0001
beta1=0
beta2=0.999
device = torch.device('cuda')

In [8]:
model = SimpleRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight)

In [9]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        # Limpia basura de la memoria GPU
        torch.cuda.empty_cache()
        # Reiniciamos el cálculo del gradiente
        optimizer.zero_grad()
        # Desempaca los datos que salen del dataloader
        seq, seq_len, labels, _ = data 
        # Mueve los datos al mismo device en el que esté el modelo
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        # Calcula el gradiente de la pérdida
        loss.backward()
        # Realiza un paso de la optimización
        optimizer.step()

    # Evalúa los modelos en los conjuntos de entrenamiento y validación
    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1 > best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 0
train_loss: 0.493919 | val_loss: 0.553115 | train_f1: 0.706645 | val_f1: 0.650376


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.376624 | val_loss: 0.504548 | train_f1: 0.785042 | val_f1: 0.697769


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.298373 | val_loss: 0.503930 | train_f1: 0.852925 | val_f1: 0.675737


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.221822 | val_loss: 0.537863 | train_f1: 0.886999 | val_f1: 0.697892


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.202810 | val_loss: 0.527442 | train_f1: 0.875187 | val_f1: 0.714563


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.346746 | val_loss: 1.122172 | train_f1: 0.798799 | val_f1: 0.524691


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.095411 | val_loss: 0.763919 | train_f1: 0.957270 | val_f1: 0.654378


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.115390 | val_loss: 0.739708 | train_f1: 0.925342 | val_f1: 0.715953


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.064261 | val_loss: 1.125448 | train_f1: 0.976000 | val_f1: 0.620865


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.068213 | val_loss: 0.849183 | train_f1: 0.959783 | val_f1: 0.685950


In [10]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
test_loss, test_f1, _, _, _ = eval_model(model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f'%(train_loss, train_f1))
print('val_loss: %5f | val_f1: %5f'%(val_loss, val_f1))
print('test_loss: %5f | test_f1: %5f'%(test_loss, test_f1))

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

train_loss: 0.116644 | train_f1: 0.925342
val_loss: 0.739708 | val_f1: 0.715953
test_loss: 0.133533 | test_f1: 0.920502


# Attention

La sintaxis del modelo es similar al anterior pero se anade un modulo de atencion. El modulo de atencion toma los vectores de salida $h_t$ de la GRU y calcula una representacion $s$ como suma ponderada.

$$ s = \sum_t \alpha_t h_t $$

donde

$$ u_t = tanh(Wh_t + b), $$

$$ \alpha_t = \frac{exp(u_t^T)}{\sum_i exp(u_i^T u)}

In [11]:
class AttnModule(nn.Module):
    def __init__(self, input_size, attn_hidden_size=128):
        '''
        input:
            input_size: tamano de la capa oculta del GRU. 
            attn_hidden_size: tamaño de la capa oculta.
        '''
        super(AttnModule, self).__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, 1, bias=False)

    def forward(self, seq, lengths):
        '''
        input:
            seq: secuencia de vectores ocultos de la GRU.
            lengths: numero de palabras en cada observacion.
        '''
        x = pad_packed_sequence(seq)[0]
        seq_len, batch_size, nhid = x.size()
        u = self.fc1(x.view(batch_size*seq_len, nhid))
        u = torch.tanh(u)
        scores = self.fc2(u)
        scores = scores.view(seq_len, batch_size, 1)
        #Asigna -100 a las posiciones con padding para que no sean considerados en la atencion.
        scores = nn.utils.rnn.pack_padded_sequence(scores, lengths=lengths,enforce_sorted=False)
        scores = nn.utils.rnn.pad_packed_sequence(scores, padding_value=-100)[0]
        scores = F.softmax(scores, dim=0)
        scores = scores.transpose(0,1)
        x = x.transpose(0,1).transpose(1,2)
        x = torch.bmm(x, scores)
        return x.squeeze(2), scores

In [12]:
class AttnRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1, 
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=128):
        super(AttnRNN, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        self.attn = AttnModule(input_size=hidden_size*directions)
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 2))
        
    def forward(self, input_seq, lengths):
        x = self.embeddings(input_seq)
        x = x.split(lengths.tolist())
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        x, scores = self.attn(output, lengths)
        x = self.classifier(x)
        return x, scores.detach()

In [13]:
lr = 0.0001
epochs = 20
device = torch.device('cuda')
weight_decay=0.0001
beta1=0
beta2=0.999

In [14]:
model = AttnRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight) 

In [15]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        torch.cuda.empty_cache()
        optimizer.zero_grad()
        seq, seq_len, labels, _ = data 
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 0
train_loss: 0.608760 | val_loss: 0.623089 | train_f1: 0.627985 | val_f1: 0.617124


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.508509 | val_loss: 0.539510 | train_f1: 0.691095 | val_f1: 0.687732


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.467544 | val_loss: 0.503739 | train_f1: 0.722156 | val_f1: 0.707692


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.429059 | val_loss: 0.481098 | train_f1: 0.744877 | val_f1: 0.713427


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.405617 | val_loss: 0.465587 | train_f1: 0.765132 | val_f1: 0.724280


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.385841 | val_loss: 0.457205 | train_f1: 0.771518 | val_f1: 0.725000


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.374075 | val_loss: 0.456994 | train_f1: 0.790110 | val_f1: 0.700000


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.352552 | val_loss: 0.446619 | train_f1: 0.798958 | val_f1: 0.735967


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.344459 | val_loss: 0.454124 | train_f1: 0.807823 | val_f1: 0.712018


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.333452 | val_loss: 0.435228 | train_f1: 0.814721 | val_f1: 0.733607


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 10
train_loss: 0.308624 | val_loss: 0.441956 | train_f1: 0.835726 | val_f1: 0.740260


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 11
train_loss: 0.299380 | val_loss: 0.437195 | train_f1: 0.837317 | val_f1: 0.747390


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 12
train_loss: 0.286442 | val_loss: 0.439384 | train_f1: 0.843931 | val_f1: 0.742004


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 13
train_loss: 0.269822 | val_loss: 0.438794 | train_f1: 0.864807 | val_f1: 0.738661


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 14
train_loss: 0.260168 | val_loss: 0.438475 | train_f1: 0.867580 | val_f1: 0.737527


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 15
train_loss: 0.248912 | val_loss: 0.443226 | train_f1: 0.874576 | val_f1: 0.746331


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 16
train_loss: 0.232347 | val_loss: 0.448232 | train_f1: 0.887871 | val_f1: 0.734783


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 17
train_loss: 0.217619 | val_loss: 0.455885 | train_f1: 0.894652 | val_f1: 0.751092


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 18
train_loss: 0.219504 | val_loss: 0.484484 | train_f1: 0.903262 | val_f1: 0.716279


  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

epoch: 19
train_loss: 0.195338 | val_loss: 0.472059 | train_f1: 0.914982 | val_f1: 0.733032


In [16]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, train_scores, train_words, train_pred = eval_model(model, train_dataloader, criterion, device)
val_loss, val_f1, val_scores, val_words, val_pred = eval_model(model, val_dataloader, criterion, device)
test_loss, test_f1, test_scores, test_words, test_pred = eval_model(model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f'%(train_loss, train_f1))
print('val_loss: %5f | val_f1: %5f'%(val_loss, val_f1))
print('test_loss: %5f | test_f1: %5f'%(test_loss, test_f1))

  0%|          | 0/39 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

train_loss: 0.217306 | train_f1: 0.894652
val_loss: 0.455885 | val_f1: 0.751092
test_loss: 0.230557 | test_f1: 0.881579


## Visualizando la atencion

Uno de los beneficios de los mecanismos de atencion es que nos permiten identificar que elementos de las oraciones resultan mas importantes.

In [17]:
from IPython.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt

In [18]:
def colorize(words, color_array):
    '''
        Funcion para visualizar la atencion, tomada de https://gist.github.com/ihsgnef/f13c35cd46624c8f458a4d23589ac768,
    '''
    # words is a list of words
    # color_array is an array of numbers between 0 and 1 of length equal to words
    cmap = matplotlib.cm.get_cmap('Reds')
    template = '<span class="barcode"; style="color: black; background-color: {}">{}</span>'
    colored_string = ''
    for word, color in zip(words, color_array):
        color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        colored_string += template.format(color, '&nbsp' + word + '&nbsp')
    return colored_string

Las palabras con mas atencion se muestran en color azul y aquellas con menor atencion en color rojo. 

In [19]:
att = np.linspace(0,1,50)
p = [' ']*50
s = colorize(p, att)
# to display in ipython notebook
display(HTML(s))

In [20]:
max_attn = [np.max(scores) for scores in train_scores]
maxi = np.flip(np.argsort(max_attn))
for j in range(30):
    i = maxi[j]
    s = colorize(train_words[i], train_scores[i][:len(train_words[i])])
    # to display in ipython notebook
    category = 'Agresivo' if train_pred[maxi[j]]==1 else 'no agresivo'
    print('Categoría predicha: %s'%(category))
    display(HTML(s))

Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: no agresivo


Categoría predicha: Agresivo


Categoría predicha: no agresivo


Categoría predicha: Agresivo


Categoría predicha: no agresivo


Categoría predicha: no agresivo


Categoría predicha: Agresivo


Categoría predicha: no agresivo


Categoría predicha: no agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: no agresivo
