# David Gamaliel Arcos Bravo

GRU Implementation

In [25]:
import pandas as pd
import pickle
import numpy as np
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
import copy
import os

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to C:\Users\David
[nltk_data]     Arcos\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Preprocesar datasets

In [26]:
def txt_to_csv(path_text, path_labels, save_path, split=False, dist=0.00, save_path_split=None):
    with open(path_text, 'r', encoding='utf-8') as f:
        text = f.readlines()
        text = [t[:-1] for t in text]
    with open(path_labels, 'r', encoding='utf-8') as f:
        labels = f.readlines()
        labels = [l[:-1] for l in labels]
    if split:
        labels, text = np.array(labels), np.array(text)
        idx = np.arange(len(labels))
        np.random.shuffle(idx)
        labels, text = labels[idx], text[idx]
        n = int(dist*len(labels))
        train_labels, train_text = labels[:-n], text[:-n]
        test_labels, test_text = labels[-n:], text[-n:]
        df_train = pd.DataFrame({'text': train_text, 'target': train_labels})
        df_test = pd.DataFrame({'text': test_text, 'target': test_labels})
        df_train = df_train[df_train['target'] != '']
        df_test = df_test[df_test['target'] != '']
        df_train.to_csv(save_path, index=False)
        df_test.to_csv(save_path_split, index=False)
    else:
        df = pd.DataFrame({'text': text, 'target': labels})
        df = df[df['target'] != '']
        df.to_csv(save_path, index=False)

txt_to_csv('DatasetAgresividad/mex_train.txt', 'DatasetAgresividad/mex_train_labels.txt', 'train.csv', split=True, dist=0.25, save_path_split='test.csv')
txt_to_csv('DatasetAgresividad/mex_val.txt', 'DatasetAgresividad/mex_val_labels.txt', 'val.csv', split=False)

In [27]:
df = pd.read_csv('train.csv')
print(df['target'].unique())
df = pd.read_csv('val.csv')
print(df['target'].unique())
df = pd.read_csv('test.csv')
print(df['target'].unique())

[1 0]
[1 0]
[0 1]


In [28]:
class Agression_dataset(Dataset):

    def __init__(self, split):
        super(Dataset, self).__init__()
        self.load_data(split)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()

    def __len__(self):
        return len(self.data)    

    def __getitem__(self, index):
        label = self.data.iloc[index]['target']
        words, word_ids = self.preprocessed_text(index)
        return word_ids, label, words
    
    def preprocessed_text(self, index):
        text = self.data.iloc[index]['text']
        words = nltk.word_tokenize(text)
        word_ids = [self.vocab[word] if word in self.vocab.keys() else 
                    self.emb_mat.shape[0]-1
                    for word in words]
        return words, word_ids

    def load_data(self, split):
        """Texto en columna text y categorias en target"""
        self.data = pd.read_csv('%s.csv'%(split))

    def load_vocab_embeddings(self):
        """Carga vocabulario y matriz de embeddings entrenados en twitter"""
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}
        with open('word2vec_col.txt', 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i != 0:
                    values = line.split()
                    self.vocab_dict[i+1] = values[0]
                    vocab[values[0]] = i+1
                    vector = np.asarray(values[1:], "float32")
                    embeddings_list.append(vector)
        embeddings_list.insert(0, np.mean(np.vstack(embeddings_list), axis=0))
        embeddings_list.append(np.zeros(100))
        self.vocab_dict[0] = 'PAD'
        self.vocab_dict[1] = 'UNK'
        vocab['PAD'] = 0
        vocab['UNK'] = 1
        emb_mat = np.vstack(embeddings_list)
        return vocab, emb_mat
        
    def get_weights(self):
        '''Devuelve pesos inversos para cada categoría. Mayor peso para la categoría con menos observaciones.'''
        cat_0 = len(self.data[self.data['target']==0])
        cat_1 = len(self.data[self.data['target']==1])
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])

    def collate_fn(self, batch):
        '''Función que ejecuta el dataloader para formar batches de datos.'''
        zipped_batch = list(zip(*batch))
        word_ids = [torch.tensor(t) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]
        return word_ids, lengths, labels, words

In [29]:
class SimpleRNN(nn.Module):

    def __init__(self, input_size=100, hidden_size=128, num_layers=1, bidirectional=False, emb_mat=None, dense_hidden_size=256):
        super(SimpleRNN, self).__init__()
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(emb_mat), freeze=False)
        self.gru = nn.GRU(input_size, hidden_size, num_layers, bidirectional=bidirectional)
        self.dense = nn.Linear(hidden_size, dense_hidden_size)  
        directions = 2 if bidirectional else 1
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size*directions, dense_hidden_size),
            nn.BatchNorm1d(dense_hidden_size),
            nn.ReLU(),
            nn.Linear(dense_hidden_size, 2),
        )

    def forward(self, x, lengths):
        # Embedding por palabra
        x = self.embeddings(x)
        # Forma las secuencias de palabras 
        x = x.split(lengths.tolist())
        # Padding y secuencias (mayor velocidad de computo)
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths=lengths.tolist(), enforce_sorted=False)
        # GRU
        output, hn = self.gru(x)
        hn = torch.cat([h for h in hn], dim=-1)
        x = self.classifier(hn)
        return x, None

In [39]:
def eval_model(model, dataloader, criterion, device):
    model.eval()
    with torch.no_grad():
        losses = []
        preds = torch.empty(0).long()
        targets = torch.empty(0).long()
        scores_list, words_list, pred_list = [], [], []
        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words = data
            seq, labels = seq.to(device), labels.to(device)
            output, scores = model(seq, seq_len)
            output = F.log_softmax(output, dim=1)
            if labels.dtype != torch.long:
                labels = labels.type(torch.LongTensor)
            output, labels = output.to(device), labels.to(device)
            loss = criterion(output, labels)
            losses.append(loss.item())
            predictions = torch.argmax(output, dim=1)
            preds = torch.cat([preds, predictions.cpu()], dim=0)
            targets = torch.cat([targets, labels.cpu()], dim=0)
            if scores is not None:
                pred_list += predictions.tolist()
                scores_list += scores.cpu().squezee(2).tolist()
                words_list += words.cpu()

        model.train()
        torch.cuda.empty_cache()
        preds = preds.numpy()
        targets = targets.numpy()
        f1 = f1_score(targets, preds, average='binary')

    return np.mean(losses), f1, scores_list, words_list, pred_list

In [31]:
batch_size = 64

In [32]:
# load datasets
train_dataset = Agression_dataset('train')
val_dataset = Agression_dataset('val')
test_dataset = Agression_dataset('test')
# load dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=val_dataset.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=test_dataset.collate_fn)

In [36]:
lr = 0.001
epochs = 10
weight_decay = 0.001
beta_1 = 0
beta_2 = 0.999
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
device

device(type='cuda')

In [41]:
model = SimpleRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay, betas=(beta_1, beta_2))
weight=train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight=weight)

In [42]:
torch.cuda.empty_cache()
best_val_f1 = 0
model.train()
for epoch in range(epochs):
    for data in tqdm(train_loader):
        torch.cuda.empty_cache()
        seq, seq_len, labels, words = data
        seq, labels = seq.to(device), labels.to(device)
        optimizer.zero_grad()
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        if labels.dtype != torch.long:
            labels = labels.type(torch.LongTensor)
        output, labels = output.to(device), labels.to(device)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
    train_loss, train_f1, _, _, _ = eval_model(model, train_loader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_loader, criterion, device)
    print(f'Epoch {epoch+1}/{epochs} - train_loss: {train_loss:.4f} - train_f1: {train_f1:.4f} - val_loss: {val_loss:.4f} - val_f1: {val_f1:.4f}')
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_state_dict = copy.deepcopy(model.state_dict())

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10 - train_loss: 0.5598 - train_f1: 0.6614 - val_loss: 0.6581 - val_f1: 0.5323


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2/10 - train_loss: 0.4241 - train_f1: 0.7557 - val_loss: 0.6076 - val_f1: 0.5989


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3/10 - train_loss: 0.3346 - train_f1: 0.8222 - val_loss: 0.5932 - val_f1: 0.6157


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4/10 - train_loss: 0.2738 - train_f1: 0.8655 - val_loss: 0.6269 - val_f1: 0.5861


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5/10 - train_loss: 0.1744 - train_f1: 0.9202 - val_loss: 0.6757 - val_f1: 0.6258


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6/10 - train_loss: 0.1742 - train_f1: 0.8959 - val_loss: 0.8047 - val_f1: 0.6221


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7/10 - train_loss: 0.1029 - train_f1: 0.9559 - val_loss: 0.8878 - val_f1: 0.5791


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8/10 - train_loss: 0.0628 - train_f1: 0.9794 - val_loss: 0.9772 - val_f1: 0.5755


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9/10 - train_loss: 0.0696 - train_f1: 0.9641 - val_loss: 1.1071 - val_f1: 0.6061


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10/10 - train_loss: 0.0606 - train_f1: 0.9764 - val_loss: 1.3219 - val_f1: 0.5258


In [43]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")

memory_allocated = torch.cuda.memory_allocated()
print(f"Memoria asignada actualmente: {memory_allocated / 1024**2:.2f} MB")

allocated_memory = torch.cuda.memory_allocated(device=device)
max_allocated_memory = torch.cuda.max_memory_allocated(device=device)
free_memory = max_allocated_memory - allocated_memory
print(f"Memoria libre en {device}: {free_memory / 1024**2:.2f} MB")

PyTorch version: 1.13.1+cu116
CUDA version: 11.6
Memoria asignada actualmente: 2607.68 MB
Memoria libre en cuda: 1116.00 MB


In [44]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, _, _, _ = eval_model(model, train_loader, criterion, device)
val_loss, val_f1, _, _, _ = eval_model(model, val_loader, criterion, device)
test_loss, test_f1, _, _, _ = eval_model(model, test_loader, criterion, device)

print(f'Best val f1: {best_val_f1:.4f}')
print(f'Train loss: {train_loss:.4f} - Train f1: {train_f1:.4f}')
print(f'Val loss: {val_loss:.4f} - Val f1: {val_f1:.4f}')
print(f'Test loss: {test_loss:.4f} - Test f1: {test_f1:.4f}')   

  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

Best val f1: 0.6258
Train loss: 0.1746 - Train f1: 0.9202
Val loss: 0.6757 - Val f1: 0.6258
Test loss: 0.6941 - Test f1: 0.6357
