<a href="https://colab.research.google.com/github/Juan-Baldelomar/Agressiveness_Detection/blob/main/Tarea7_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tarea 7 - Juan Luis Baldelomar Cabrera

# Load Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Librerías y Archivos

In [None]:
import pandas as pd
import pickle
import numpy as np
import nltk
import re
nltk.download('punkt')
from tqdm.auto import tqdm
import copy

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from google_drive_downloader import GoogleDriveDownloader as gdd


# Tools
import os
import time
import random
import shutil
from argparse import Namespace
import matplotlib.pyplot as plt
from typing import Callable, Tuple

# Preprocessing
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# PyTorch
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# word embeddings
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors

# Import pre trained data 
import gensim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Seed every random number generator the notebook relies on so that a fresh
# "Restart & Run All" produces repeatable results.
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Seed all CUDA devices explicitly as well (silently ignored when CUDA is unavailable).
torch.cuda.manual_seed_all(seed)
# Disable cuDNN autotuning and request deterministic kernels; autotuning may pick
# different algorithms between runs and break reproducibility.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

<h2>  Load Data</h2>

In [None]:
# Read every text file into a single pandas column and keep column 0 as a Series.
# NOTE(review): presumably each file holds one tweet per line; `sep='\r\n'` is passed
# to the python engine as a separator that is not expected inside a line, so the
# whole line stays in one column - confirm against the data files.
X_train = pd.read_csv('data/mex20_train.txt', sep='\r\n', engine='python', header=None).loc[:,0]
# Labels are read from a separate file with no header row.
y_train = pd.read_csv('data/mex20_train_labels.txt', header=None).loc[:,0]
X_val   = pd.read_csv('data/mex20_val.txt', sep='\r\n', engine='python', header=None).loc[:,0]
y_val   = pd.read_csv('data/mex20_val_labels.txt', header=None).loc[:,0]
# The test split has texts only; no label file is loaded for it.
X_test = pd.read_csv('data/mex20_test_full.txt', sep='\r\n', engine='python', header=None).loc[:,0]

# Load Vocabulary and Embeddings

In [None]:
def get_vocab(corpus: pd.Series,
              tokenizer: Callable[[str], list],
              max_features: int) -> Tuple[set, dict]:
    """Build the vocabulary and the word-to-index mapping from a corpus.

    Args:
        corpus: iterable of raw text strings (each element is passed to `tokenizer`).
        tokenizer: callable that splits one string into a list of tokens.
        max_features: frequency cut-off; at most `max_features - 1` corpus words
            are kept, ordered from most to least frequent.

    Returns:
        A `(vocab, w2idx)` tuple where `vocab` is the set of kept (lower-cased)
        words plus the special tokens `'<pad>'` and `'<unk>'`, and `w2idx` maps
        every element of `vocab` to an integer: `'<pad>'` -> 0, `'<unk>'` -> 1 and
        the kept words -> 2, 3, ... in descending frequency.
    """
    freq_dist = FreqDist([w.lower() for sentence in corpus
                                    for w in tokenizer(sentence)])

    # Clamp at 0 so that max_features < 1 yields an empty word list instead of a
    # negative slice bound (which would silently drop only the rarest word).
    n_words = max(max_features - 1, 0)
    sorted_words = sortFreqDict(freq_dist)[:n_words]

    # Indices 0 and 1 are reserved for the special tokens, so real words start at 2.
    w2idx = {word: i+2 for i, word in enumerate(sorted_words)}

    # Register the padding and unknown-word tokens.
    sorted_words.append('<pad>')
    sorted_words.append('<unk>')
    w2idx['<pad>'] = 0
    w2idx['<unk>'] = 1

    return set(sorted_words), w2idx
        
def sortFreqDict(freq_dist: FreqDist) -> list:
    """Return the keys of `freq_dist` ordered from highest to lowest count.

    Keys with equal counts keep the order in which they appear in `freq_dist`.
    """
    counts = dict(freq_dist)
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked]

# Tokenizer instance shared by the vocabulary builder and the dataset objects.
tk = TweetTokenizer()
# Build the vocabulary from the training texts; 10000 is the `max_features` cut-off.
vocab, w2idx = get_vocab(X_train, tk.tokenize, 10000)

In [None]:
def get_embeddings_matrix(vocab, w2idx, word2vec):
    """Assemble the embedding matrix for the vocabulary.

    Row `w2idx[word]` holds the pre-trained vector of `word` when `word2vec`
    contains it, otherwise a uniform random vector drawn with `np.random.rand`.
    Afterwards the `'<unk>'` row is set to the mean of rows 2 and onwards, and
    the `'<pad>'` row is set to zeros.

    Args:
        vocab: collection of words to embed (its length is the number of rows).
        w2idx: mapping from word to row index.
        word2vec: object exposing `vector_size`, `in` membership and `[]` lookup.

    Returns:
        A NumPy array of shape `(len(vocab), word2vec.vector_size)`.
    """
    dim = word2vec.vector_size
    matrix = np.empty([len(vocab), dim])

    for token in vocab:
        # Out-of-vocabulary tokens (including the special ones) start out random.
        vector = word2vec[token] if token in word2vec else np.random.rand(dim)
        matrix[w2idx[token]] = vector

    # Special tokens are overwritten last: unknown = average word, padding = zeros.
    matrix[w2idx['<unk>']] = np.mean(matrix[2:], axis=0)
    matrix[w2idx['<pad>']] = np.zeros(dim)
    return matrix

word2vec_data = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Colab_Notebooks/NLP/word2vec_col.txt')

In [None]:
embeddings_matrix = get_embeddings_matrix(vocab, w2idx, word2vec_data)

# Data Augmentation

In [None]:
X_tra = pd.read_csv('data/extra.txt', sep='\r\n', engine='python', header=None).loc[:,0]
y_tra = pd.read_csv('data/extra_lab.txt', header=None).loc[:,0]

In [None]:
# Prepend the extra samples to the training split. After this cell `X_train` and
# `y_train` are plain Python lists (no longer pandas Series), so re-running the cell
# without reloading the data fails on `.tolist()`.
X_train = X_tra.tolist() + X_train.tolist()
y_train = y_tra.tolist() + y_train.tolist()

# Alternative kept for reference: train on the original data only.
#X_train = X_train.tolist()
#y_train = y_train.tolist()

# Dataset Class

 

In [None]:
class aggr_dataset(Dataset):
    """Dataset of raw texts that yields vocabulary indices, labels and tokens.

    Args:
        data: indexable collection of text strings.
        labels: indexable collection of integer labels, or None for unlabeled data.
        vocab: set of known (lower-cased) words.
        w2id: mapping from word to vocabulary index.
        emb_matrix: embedding matrix; only stored on the instance for later use.
        tk: tokenizer object exposing `tokenize(text) -> list`.
    """

    def __init__(self, data, labels, vocab, w2id, emb_matrix, tk):
        # Zero-argument super() resolves the correct class; the previous
        # super(Dataset, self) skipped Dataset in the MRO.
        super().__init__()
        self.data = data
        self.labels = labels
        self.vocab = vocab
        self.emb_matrix = emb_matrix
        self.tk = tk
        self.w2id = w2id
        # Index used for out-of-vocabulary words (1 when '<unk>' is not registered).
        self.unk_id = w2id.get('<unk>', 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        '''Load one observation.

        Returns:
            (word_ids, label, words): the vocabulary indices of the tokens, the
            label (-1 when the dataset has no labels) and the token list.
        '''
        label = self.labels[index] if self.labels is not None else -1
        words, word_ids = self.preprocessed_text(index)
        return word_ids, label, words

    def preprocessed_text(self, index):
        '''Lower-case and tokenize text `index`; map each token to its index.

        Returns:
            (words, word_ids): tokens and their vocabulary indices, with unknown
            tokens mapped to the `<unk>` index.
        '''
        text = self.data[index]
        #text = re.sub(r"http\S+", "http", text)
        #text = re.sub(r"@([a-z]|[A-Z]|[0-9]|_)+", "@usuario", text)
        text = text.lower()
        words = self.tk.tokenize(text)
        word_ids = [self.w2id[word] if word in self.vocab else self.unk_id for word in words]
        return words, word_ids

    def get_weights(self):
        '''Return inverse-frequency class weights as a tensor `[w_0, w_1]`.

        The majority class gets weight 1 and the minority class a larger weight.
        Labels are assumed to be 0/1. Raises ZeroDivisionError when one of the two
        classes has no observations.
        '''
        cat_1 = 0
        for l in self.labels:
            cat_1 += l

        cat_0 = len(self.labels) - cat_1
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])

    def collate_fn(self, batch):
        '''Collate function used by the DataLoader to build a batch.

        Returns:
            (word_ids, lengths, labels, words): all token indices of the batch
            concatenated into one 1-D tensor, the number of tokens per sample,
            the label tensor and the tuple of token lists.
        '''
        zipped_batch = list(zip(*batch))
        # Force an integer dtype: torch.tensor([]) for an empty text would otherwise
        # be float and change the dtype of the concatenated index tensor.
        word_ids = [torch.tensor(t, dtype=torch.long) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]
        return word_ids, lengths, labels, words

In [None]:
train_dataset = aggr_dataset(X_train, y_train, vocab, w2idx, embeddings_matrix, tk)
val_dataset = aggr_dataset(X_val, y_val, vocab, w2idx, embeddings_matrix, tk)
test_dataset = aggr_dataset(X_test, None, vocab, w2idx, embeddings_matrix, tk)

In [None]:
def eval_model(model, dataloader, criterion, device, use_acc=False, all_labels = None):
    '''Evaluate `model` on every batch of `dataloader` without tracking gradients.

    Args:
        model: module called as `model(seq, seq_len)` and returning `(logits, scores)`.
        dataloader: yields `(seq, seq_len, labels, words)` batches.
        criterion: loss applied to the log-softmax of the logits and the labels.
        device: device the inputs and labels are moved to.
        use_acc: report accuracy when True, binary F1 otherwise.
        all_labels: unused; kept for backward compatibility with existing calls.

    Returns:
        (mean_loss, metric, scores_list, words_list, pred_list). The three lists
        are only filled when the model returns attention scores. For unlabeled
        data (batches whose labels are negative, e.g. the -1 placeholder) the loss
        and the metric cannot be computed and are returned as NaN.
    '''
    # Remember the current mode so that the caller's train/eval state is restored
    # instead of unconditionally switching the model to training mode.
    was_training = model.training
    model.eval()

    losses = []
    preds, targets = [], []
    scores_list, words_list, pred_list = [], [], []

    try:
        with torch.no_grad():
            for seq, seq_len, labels, words in tqdm(dataloader):
                seq, labels = seq.to(device), labels.to(device)
                output, scores = model(seq, seq_len)
                log_probs = F.log_softmax(output, dim=1)
                predictions = log_probs.argmax(1)

                # Negative labels mark unlabeled samples; the loss would reject them.
                if bool((labels >= 0).all()):
                    losses.append(criterion(log_probs, labels).item())
                    preds.append(predictions.cpu())
                    targets.append(labels.cpu())

                if scores is not None:
                    pred_list += predictions.tolist()
                    scores_list += scores.cpu().squeeze(2).tolist()
                    words_list += words
    finally:
        model.train(was_training)

    if not targets:
        return float('nan'), float('nan'), scores_list, words_list, pred_list

    y_pred = torch.cat(preds, dim=0).numpy()
    y_true = torch.cat(targets, dim=0).numpy()
    metric = accuracy_score(y_true, y_pred) if use_acc else f1_score(y_true, y_pred, average='binary')

    return np.mean(losses), metric, scores_list, words_list, pred_list

# Utils

In [None]:
def print_predictions(preds):
    """Print predictions as CSV text: an `Id,Expected` header, then `index,value` rows."""
    print('Id,Expected')
    for row_id, value in enumerate(preds):
        print(row_id, value, sep=',')

# GRU con atención

In [None]:
class AttnModule(nn.Module):
    """Additive attention pooling over the hidden states of a recurrent layer.

    Each hidden state is scored with a two-layer network (linear + tanh, then a
    bias-free linear layer to one unit); the scores are soft-maxed along the
    sequence axis and used to build a weighted sum of the hidden states.
    """

    def __init__(self, input_size, attn_hidden_size=128):
        '''
        Args:
            input_size: size of each recurrent hidden-state vector.
            attn_hidden_size: size of the hidden layer of the scoring network.
        '''
        super().__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, 1, bias=False)

    def forward(self, seq, lengths):
        '''
        Args:
            seq: PackedSequence holding the recurrent hidden states.
            lengths: number of tokens of every observation in the batch.

        Returns:
            (context, weights): the attention-pooled vector per observation, shape
            (batch, hidden), and the attention weights, shape (batch, seq_len, 1).
        '''
        # Unpack to a padded (seq_len, batch, hidden) tensor.
        hidden, _ = pad_packed_sequence(seq)
        max_len, n_batch, n_hid = hidden.size()

        # Score every hidden state at once by flattening time and batch.
        flat_states = hidden.view(max_len * n_batch, n_hid)
        energies = self.fc2(torch.tanh(self.fc1(flat_states)))
        energies = energies.view(max_len, n_batch, 1)

        # Pack and unpack again so that padded positions get the value -100: the
        # bias of fc1 makes their raw score non-zero, and -100 gives them a
        # negligible weight after the softmax.
        repacked = pack_padded_sequence(energies, lengths=lengths, enforce_sorted=False)
        energies, _ = pad_packed_sequence(repacked, padding_value=-100)

        # Normalise along the sequence axis (dim 0), then put the batch first.
        weights = F.softmax(energies, dim=0).transpose(0, 1)

        # (batch, hidden, seq_len) x (batch, seq_len, 1) -> weighted sum of states.
        states = hidden.permute(1, 2, 0)
        context = torch.bmm(states, weights)
        return context.squeeze(2), weights

In [None]:
class AttnRNN(nn.Module):
    """Text classifier: pre-trained embeddings -> LSTM -> attention pooling -> MLP.

    Note that the recurrent layer is an `nn.LSTM` even though the attribute is
    called `gru`; the attribute name is part of the saved state-dict keys.

    Args:
        input_size: dimensionality of the embedding vectors fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        emb_mat: pre-trained embedding matrix (required despite the None default).
        dense_hidden_size: hidden size of the classification head.
        attn_hidden_size: hidden size of the attention scoring network.
        freeze_emb: when True the embeddings are not updated during training.
    """

    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=128, freeze_emb=False):
        super().__init__()
        pretrained = torch.FloatTensor(emb_mat)
        self.embeddings = nn.Embedding.from_pretrained(pretrained, freeze=freeze_emb)
        self.gru = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                           num_layers=num_layers, bidirectional=bidirectional)

        # A bidirectional LSTM concatenates both directions, doubling the feature size.
        feature_size = hidden_size * (2 if bidirectional else 1)
        self.attn = AttnModule(feature_size, attn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(feature_size, dense_hidden_size),
            nn.BatchNorm1d(dense_hidden_size),
            nn.ReLU(),
            nn.Linear(dense_hidden_size, 2),
        )

    def forward(self, input_seq, lengths):
        """Classify a batch given as one flat index tensor plus per-sample lengths.

        Returns:
            (logits, scores): the two-class logits and the detached attention weights.
        """
        embedded = self.embeddings(input_seq)
        # Cut the flat batch back into one tensor per sample, then pad and pack.
        per_sample = embedded.split(lengths.tolist())
        packed = pack_padded_sequence(pad_sequence(per_sample), lengths, enforce_sorted=False)
        states, _ = self.gru(packed)
        context, scores = self.attn(states, lengths)
        return self.classifier(context), scores.detach()

**Construimos el Modelo y el Optimizador a Utilizar**

In [None]:
batch_size = 8
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn = train_dataset.collate_fn, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn = val_dataset.collate_fn, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn = test_dataset.collate_fn, shuffle=False)

In [None]:
# Optimisation hyper-parameters for the attention RNN.
lr = 0.0001
epochs = 20
# Prefer the GPU but fall back to the CPU so the notebook also runs on a runtime
# without CUDA instead of failing on the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weight_decay=0.0001
# Adam moment coefficients (beta1 = 0 disables the first-moment running average).
beta1=0
beta2=0.999

In [None]:
# best model in drive is a GRU with n_lays = 2, attn_h_s, hidden_s, dense_hidden_s = 128, 128, 256. 
# bidirectional = TRUE

# best model in drive is a LSTM with n_lays = 2, attn_h_s, hidden_s, dense_hidden_s = 128, 128, 256. 
# bidirectional = TRUE

In [None]:
#model = AttnRNN(emb_mat=train_dataset.emb_matrix, bidirectional=True).to(device)
# Layer sizes: attention hidden size, recurrent hidden size, classifier hidden size.
attn_h_s, hidden_s, dense_hidden_s = 128, 128, 256
n_lays = 1
# Record of the chosen sizes; it is not read again in this cell.
parameters = {'attn_hidden_size':attn_h_s, 'hidden_size': hidden_s, 'dense_hidden_size': dense_hidden_s, 'layers': n_lays}
# Bidirectional attention RNN initialised with the pre-trained embedding matrix.
model = AttnRNN(emb_mat=train_dataset.emb_matrix, 
                bidirectional=True, 
                num_layers=n_lays, 
                attn_hidden_size=attn_h_s, 
                hidden_size=hidden_s, 
                dense_hidden_size=dense_hidden_s
                ).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
# Class weights from the training set compensate for class imbalance in the loss.
weight = train_dataset.get_weights().to(device)
# NLLLoss expects log-probabilities; the training loop applies F.log_softmax first.
criterion = nn.NLLLoss(weight = weight)

**Cargamos el Modelo del archivo correspondiente de pesos descargado al inicio** (Omitir este paso si se desea entrenar desde 0) 

**Entrenamos el Modelo**

In [None]:
# Train the attention RNN and keep a copy of the weights of the best epoch.
# The `*_f1` variables hold accuracy here because eval_model is called with
# use_acc=True; the names are kept because later cells read them.
best_val_f1 = 0
# Start from the current weights so `best_state_dict` is always defined, even if
# no epoch improves on the initial `best_val_f1`.
best_state_dict = copy.deepcopy(model.state_dict())
for epoch in range(epochs):
    model.train()
    for data in tqdm(train_dataloader):
        optimizer.zero_grad()
        seq, seq_len, labels, _ = data
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        # The criterion is an NLLLoss, so it needs log-probabilities.
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device, use_acc=True)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device, use_acc=True)
    model.train()
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_acc: %5f | val_acc: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        # Deep copy: state_dict() returns references that later updates would overwrite.
        best_state_dict=copy.deepcopy(model.state_dict())

NameError: ignored

**DESCOMENTAR SOLO SI DESEA GUARDAR LOS PESOS EN EL DRIVE MONTADO**

In [None]:
model.load_state_dict(best_state_dict)

<All keys matched successfully>

In [None]:
model.eval()
val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device, use_acc=True)

  0%|          | 0/74 [00:00<?, ?it/s]

In [None]:
val_f1

0.8739352640545145

### Test Predictions

In [None]:
def perform_predictions(test_data, model):
    """Return the class probabilities predicted by `model` for every test sample.

    Args:
        test_data: iterable of `(seq, seq_len, labels, words)` batches; only the
            first two elements are used.
        model: module called as `model(seq, seq_len)` and returning `(logits, scores)`.

    Returns:
        A list with one `[p_class_0, p_class_1, ...]` probability list per sample.
    """
    # Run on the device the model lives on (CPU for a parameter-free model).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Inference must use eval mode (BatchNorm running statistics) and no autograd
    # graph; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    predictions = []
    try:
        with torch.no_grad():
            for seq, seq_len, _, _ in test_data:
                output, _ = model(seq.to(target_device), seq_len)
                predictions += F.softmax(output, dim=1).tolist()
    finally:
        model.train(was_training)

    return predictions

In [None]:
preds = perform_predictions(test_dataloader, model)
preds = np.array(preds)

In [None]:
print_predictions(preds)

**Evaluamos el Modelo** 

In [None]:
# Restore the best weights found during training and evaluate every split,
# also collecting attention scores, tokens and predictions.
model.load_state_dict(best_state_dict)
train_loss, train_f1, train_scores, train_words, train_pred = eval_model(model, train_dataloader, criterion, device, use_acc=True)
val_loss, val_f1, val_scores, val_words, val_pred = eval_model(model, val_dataloader, criterion, device, use_acc=True)
# NOTE(review): test_dataset is built with labels=None, so its batches carry the
# placeholder label -1; the test loss/metric below are presumably not meaningful and
# the criterion may reject that target - confirm before relying on them.
test_loss, test_f1, test_scores, test_words, test_pred = eval_model(model, test_dataloader, criterion, device, use_acc=True)
# The `*_f1` variables hold accuracy because use_acc=True is passed above.
print('train_loss: %5f | train_acc: %5f'%(train_loss, train_f1)) 
print('val_loss: %5f | val_acc: %5f'%(val_loss, val_f1)) 
print('test_loss: %5f | test_acc: %5f'%(test_loss, test_f1)) 

# Transformer

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 33.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 65.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

# Build Dataset

In [None]:
# Import from the public package only; `transformers.utils.dummy_pt_objects` holds
# placeholder classes and was immediately shadowed by this import anyway.
from transformers import AutoTokenizer, RobertaTokenizer, AutoModelForSequenceClassification, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("pysentimiento/robertuito-base-uncased")
# `robertuito` is wrapped by RobertuitoClasificator later in the notebook, so it has
# to be defined here for a fresh "Restart & Run All" to work.
robertuito = AutoModelForSequenceClassification.from_pretrained("pysentimiento/robertuito-base-uncased")
robertuito = robertuito.to(device)

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/838k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# tokenize and encode sequences in the training set
tokens_train = tokenizer.batch_encode_plus(
    X_train,
    #X_train.tolist(),
    max_length = 100,
    pad_to_max_length=True,
    add_special_tokens=True,
    truncation = 'longest_first'
)

# tokenize and encode sequences in the validation set
tokens_val = tokenizer.batch_encode_plus(
    X_val.tolist(),
    max_length = 100,
    pad_to_max_length=True,
    add_special_tokens=True,
    truncation = 'longest_first'
)

# tokenize and encode sequences in the test set
tokens_test = tokenizer.batch_encode_plus(
    X_test.tolist(),
    max_length = 100,
    pad_to_max_length=True,
    add_special_tokens=True,
    truncation = 'longest_first'
)



In [None]:
class aggr_dataset(Dataset):
    """Dataset that serves both the RNN inputs and the transformer inputs.

    This definition replaces the earlier `aggr_dataset`: besides the vocabulary
    indices, label and tokens, each item also carries the pre-computed
    transformer token ids and attention mask.

    Args:
        data: indexable collection of text strings.
        labels: indexable collection of integer labels, or None for unlabeled data.
        vocab: set of known (lower-cased) words.
        w2id: mapping from word to vocabulary index.
        emb_matrix: embedding matrix; only stored on the instance for later use.
        tk: tokenizer object exposing `tokenize(text) -> list`.
        t_ids: tensor of transformer token ids, indexed by sample.
        t_masks: tensor of transformer attention masks, indexed by sample.
    """

    def __init__(self, data, labels, vocab, w2id, emb_matrix, tk, t_ids, t_masks):
        # Zero-argument super() resolves the correct class; the previous
        # super(Dataset, self) skipped Dataset in the MRO.
        super().__init__()
        self.data = data
        self.labels = labels
        self.vocab = vocab
        self.emb_matrix = emb_matrix
        self.tk = tk
        self.w2id = w2id
        self.t_ids = t_ids
        self.t_masks = t_masks
        # Index used for out-of-vocabulary words (1 when '<unk>' is not registered).
        self.unk_id = w2id.get('<unk>', 1)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        '''Load one observation.

        Returns:
            (word_ids, label, words, t_ids, t_mask): vocabulary indices, label
            (-1 when the dataset has no labels), tokens, and the transformer
            token ids and attention mask of the same sample.
        '''
        label = self.labels[index] if self.labels is not None else -1
        words, word_ids = self.preprocessed_text(index)
        return word_ids, label, words, self.t_ids[index], self.t_masks[index]

    def preprocessed_text(self, index):
        '''Lower-case and tokenize text `index`; map each token to its index.

        Returns:
            (words, word_ids): tokens and their vocabulary indices, with unknown
            tokens mapped to the `<unk>` index.
        '''
        text = self.data[index]
        #text = re.sub(r"http\S+", "http", text)
        #text = re.sub(r"@([a-z]|[A-Z]|[0-9]|_)+", "@usuario", text)
        text = text.lower()
        words = self.tk.tokenize(text)
        word_ids = [self.w2id[word] if word in self.vocab else self.unk_id for word in words]
        return words, word_ids

    def get_weights(self):
        '''Return inverse-frequency class weights as a tensor `[w_0, w_1]`.

        The majority class gets weight 1 and the minority class a larger weight.
        Labels are assumed to be 0/1. Raises ZeroDivisionError when one of the two
        classes has no observations.
        '''
        cat_1 = 0
        for l in self.labels:
            cat_1 += l

        cat_0 = len(self.labels) - cat_1
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])

    def collate_fn(self, batch):
        '''Collate function used by the DataLoader to build a batch.

        Returns:
            (word_ids, lengths, labels, words, t_ids, t_masks): the flat tensor of
            all vocabulary indices, tokens per sample, labels, token lists, and
            the stacked transformer ids and masks.
        '''
        zipped_batch = list(zip(*batch))
        # Force an integer dtype: torch.tensor([]) for an empty text would otherwise
        # be float and change the dtype of the concatenated index tensor.
        word_ids = [torch.tensor(t, dtype=torch.long) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]

        t_ids = torch.stack([sample[3] for sample in batch])
        t_masks = torch.stack([sample[4] for sample in batch])
        return word_ids, lengths, labels, words, t_ids, t_masks

In [None]:
# Convert the tokenizer output (token ids and attention masks) and the labels of
# each split into tensors.
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
# Labels are cast to int element by element before building the tensor.
train_y = torch.tensor(list(map(int, y_train)))

val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(list(map(int, y_val.tolist())))

# The test split has no labels, so only ids and masks are built.
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])

In [None]:
from sklearn.utils import class_weight

# Class weights computed by scikit-learn with the 'balanced' strategy from the
# training labels.
#cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train.tolist()), y=y_train.tolist())
cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Convert to a float tensor so it can be passed as the `weight` of the loss.
weights= torch.tensor(cw,dtype=torch.float)
# Move the weights to `device`, where the model outputs live.
weights = weights.to(device) # to GPU

In [None]:
# define the loss function
# Weighted negative log-likelihood loss. Despite the variable name this is an
# NLLLoss, so it must be fed log-probabilities (e.g. the output of F.log_softmax).
cross_entropy = nn.NLLLoss(weight=weights) 

# Number of training epochs (this overrides the `epochs` value set for the RNN).
epochs = 3

# Robertuito Model

In [None]:
# FOR MODEL Robertuito
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, ConcatDataset

#define a batch size
batch_size = 8

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)
# sampler for sampling the data during training
train_sampler = RandomSampler(train_data)
# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# wrap tensors
val_data = TensorDataset(val_seq, val_mask, val_y)
# sampler for sampling the data during training
val_sampler = SequentialSampler(val_data)
# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_seq, test_mask)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [None]:
class RobertuitoClasificator(nn.Module):
    """Thin wrapper around a pre-trained sequence-classification transformer.

    The wrapped model is called with `return_dict=False` and only the first
    element of the returned tuple is forwarded to the caller.
    """

    def __init__(self, transformer):
        super().__init__()
        # Pre-trained model; it is fine-tuned together with this wrapper.
        self.transformer = transformer

    def forward(self, sent_id, mask):
        """Run the transformer on token ids `sent_id` with attention mask `mask`.

        Returns the first output of the wrapped model (presumably the
        classification logits - the wrapped model is defined outside this class).
        """
        outputs = self.transformer(sent_id, attention_mask=mask, return_dict=False)
        return outputs[0]

In [None]:
# Use pre-trained model and upload to current device
model2 = RobertuitoClasificator(robertuito)
model2 = model2.to(device)

In [None]:
# MODEL 2 TEST
from transformers import AdamW # optimizer
optimizerM2 = AdamW(model2.parameters(), lr = 1e-5, correct_bias=False) 



In [None]:
# Training function
def train():
    
    model2.train()
    total_loss, total_accuracy = 0, 0
  
    # empty list to save model predictions
    total_preds=[]
  
    # iterate over batches
    for step,batch in enumerate(train_dataloader):
        
        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
        
        # push the batch to gpu
        batch = [r.to(device) for r in batch]
 
        sent_id, mask, labels = batch
        
        # clear previously calculated gradients 
        model2.zero_grad()        

        # get model predictions for the current batch
        preds = model2(sent_id, mask)
        preds = F.log_softmax(preds, dim=1)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model2.parameters(), 1.0)

        # update parameters
        optimizerM2.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

    # append the model predictions
    total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)
  
    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    
    print("\nEvaluating...")
  
    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0
    
    # empty list to save the model predictions
    total_preds = []
    targets = []
    predictions = []

    # iterate over batches
    for step,batch in enumerate(val_dataloader):
        
        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            
            # Calculate elapsed time in minutes.
            #elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():
            
            # model predictions
            preds = model2(sent_id, mask)
            lab = F.log_softmax(preds, dim=1).argmax(1)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()


            total_preds.append(preds)
        
        predictions += lab.cpu().tolist()
        targets += labels.cpu().tolist()

    
    metric = accuracy_score(targets, predictions)
    print(metric)
    

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Set the seed value all over the place to make this reproducible.

# Fine-tune the Robertuito-only model for `epochs` epochs, checkpointing every epoch.
best_valid_loss = float('inf')
# Epoch index with the lowest validation loss so far (None until the first epoch ends).
best_epoch = None

# Training and validation loss of each epoch.
train_losses=[]
valid_losses=[]

# Iterate over `epochs` (instead of a hard-coded 3) so the loop always agrees with
# the "Epoch i / epochs" message printed below.
for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    # Train for one epoch, then evaluate on the validation set.
    train_loss, _ = train()
    valid_loss, _ = evaluate()
    # Save a checkpoint after every epoch (saved_weights0.pt, saved_weights1.pt, ...);
    # later cells load one of these files by name.
    torch.save(model2.state_dict(), 'saved_weights'+str(epoch)+'.pt')
    # Track which epoch achieved the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch
    # Results
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 3
  Batch    50  of    785.
  Batch   100  of    785.
  Batch   150  of    785.
  Batch   200  of    785.
  Batch   250  of    785.
  Batch   300  of    785.
  Batch   350  of    785.
  Batch   400  of    785.
  Batch   450  of    785.
  Batch   500  of    785.
  Batch   550  of    785.
  Batch   600  of    785.
  Batch   650  of    785.
  Batch   700  of    785.
  Batch   750  of    785.

Evaluating...
  Batch    50  of     74.
0.9045996592844975

Training Loss: 0.374
Validation Loss: -1.922

 Epoch 2 / 3
  Batch    50  of    785.
  Batch   100  of    785.
  Batch   150  of    785.
  Batch   200  of    785.
  Batch   250  of    785.
  Batch   300  of    785.
  Batch   350  of    785.
  Batch   400  of    785.
  Batch   450  of    785.
  Batch   500  of    785.
  Batch   550  of    785.
  Batch   600  of    785.
  Batch   650  of    785.
  Batch   700  of    785.
  Batch   750  of    785.

Evaluating...
  Batch    50  of     74.
0.9063032367972743

Training Loss: 0.168
Vali

In [None]:
model2.load_state_dict(torch.load('saved_weights2.pt'))
model2.eval()
evaluate()


Evaluating...
  Batch    50  of     74.
0.9045996592844975


(-3.475469028224816, array([[ 1.8261124, -1.7547202],
        [ 4.868897 , -5.156266 ],
        [-4.467583 ,  4.572062 ],
        ...,
        [ 4.455827 , -5.1700225],
        [ 1.7008258, -2.2183805],
        [ 4.0854645, -4.737025 ]], dtype=float32))

In [None]:
!cp  '/content/saved_weights2.pt' '/content/drive/MyDrive/Colab_Notebooks/NLP/solo_robertuito.pt'

In [None]:
def perform_predictions(test_data, model):
    """Return the predicted class index for every sample of `test_data`.

    This definition replaces the earlier `perform_predictions`: batches here are
    `(sent_id, masks)` pairs and `model(sent_id, masks)` returns the logits.

    Args:
        test_data: iterable of `(sent_id, masks)` batches.
        model: module returning a (batch, classes) logit tensor.

    Returns:
        A flat list of integer class predictions, in dataset order.
    """
    # Run on the device the model lives on (CPU for a parameter-free model).
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Inference: eval mode (no dropout) and no autograd graph, which would
    # otherwise waste memory. The previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    predictions = []
    try:
        with torch.no_grad():
            for i, (sent_id, masks) in enumerate(test_data):
                if i % 10 == 0:
                    print('batch:', i)

                output = model(sent_id.to(target_device), masks.to(target_device))
                # Softmax is monotonic, so the arg-max of the logits is the predicted class.
                predictions += output.argmax(1).tolist()
    finally:
        model.train(was_training)
    return predictions

In [None]:
preds = perform_predictions(test_dataloader, model2)

batch: 0
batch: 10
batch: 20
batch: 30
batch: 40
batch: 50
batch: 60
batch: 70
batch: 80
batch: 90
batch: 100
batch: 110
batch: 120
batch: 130
batch: 140
batch: 150
batch: 160
batch: 170
batch: 180


In [None]:
print_predictions(preds)

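# Datasets and dataloaders for the combined GRU/LSTM + Robertuito model (reusing the vocabulary, embeddings and tokenized tensors built above)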

In [None]:
# Rebuild the datasets with the extended aggr_dataset, which additionally receives
# the transformer token ids and attention masks of each split.
train_dataset = aggr_dataset(X_train, y_train, vocab, w2idx, embeddings_matrix, tk, train_seq, train_mask)
val_dataset = aggr_dataset(X_val, y_val, vocab, w2idx, embeddings_matrix, tk, val_seq, val_mask)
# The test split has no labels, hence None.
test_dataset = aggr_dataset(X_test, None, vocab, w2idx, embeddings_matrix, tk, test_seq, test_mask)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, ConcatDataset
# Batch size shared by the three dataloaders.
batch_size = 8

# Training: random sampling; each dataset's own collate_fn assembles the batch.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, collate_fn=train_dataset.collate_fn)

# Validation: sequential sampling keeps the original sample order.
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler = val_sampler, batch_size=batch_size, collate_fn=val_dataset.collate_fn)

# Test: no sampler is given, so the DataLoader default ordering is used.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=test_dataset.collate_fn)

# GRU/LSTM and Robertuito

In [None]:
class BL(nn.Module):
    """Combines a transformer classifier and an RNN classifier.

    The first output of each sub-model is concatenated (RNN first, transformer
    second) and projected to two logits by a linear layer. The projection expects
    4 input features, i.e. it assumes each sub-model emits 2 values per sample.
    """

    def __init__(self, bert, gru):
        '''
        Args:
            bert: pre-trained transformer model, called with token ids and an
                attention mask.
            gru: recurrent model, called with flat word ids and sequence lengths.
        '''
        super().__init__()
        self.bert = bert
        self.gru = gru
        self.down = nn.Linear(4, 2, bias=True)

    def forward(self, gru_id, gru_len, sent_id, mask):
        """Return the combined logits for one batch.

        Args:
            gru_id, gru_len: inputs of the recurrent model.
            sent_id, mask: token ids and attention mask of the transformer.
        """
        # Both sub-models return a tuple; only the first element is used.
        transformer_out = self.bert(sent_id, attention_mask=mask, return_dict=False)[0]
        rnn_out = self.gru(gru_id, gru_len)[0]

        combined = torch.cat([rnn_out, transformer_out], dim=1)
        return self.down(combined)

In [None]:
# Load a fresh copy of the pre-trained transformer for the combined model.
robertuito2_bl = AutoModelForSequenceClassification.from_pretrained("pysentimiento/robertuito-base-uncased")
robertuito2_bl = robertuito2_bl.to(device)

# Combine it with `model`, the attention RNN built earlier in the notebook.
bl = BL(robertuito2_bl, model)
bl = bl.to(device)

Some weights of the model checkpoint at pysentimiento/robertuito-base-uncased were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pysentimiento/robertuito-base-uncased and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.

In [None]:
# Define the optimizer used to fine-tune `bl`.
# NOTE(review): this import sits mid-notebook rather than in the import cell, and
# `transformers.AdamW` is deprecated in recent releases; torch.optim.AdamW has no
# `correct_bias` argument, so a swap is not a drop-in replacement -- confirm before upgrading.
from transformers import AdamW # optimizer
# Optimises every parameter of `bl`: both sub-models and the final linear layer.
optimizer2 = AdamW(bl.parameters(), lr = 1e-5, correct_bias=False) 



In [None]:
# Training function
def train():
    """Run one training epoch of `bl` over `train_dataloader`.

    Relies on notebook globals: `bl`, `train_dataloader`, `device`,
    `cross_entropy` and `optimizer2`.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean batch
        loss and ``total_preds`` is a NumPy array holding the log-probabilities
        of every sample seen in the epoch, stacked along axis 0.
    """
    bl.train()
    total_loss = 0

    # per-batch log-probabilities, concatenated at the end of the epoch
    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # The 4th batch element is unused here; `grulen` is passed through as-is
        # (not moved to `device`).
        gruid, grulen, labels, _, sent_id, mask = batch
        gruid = gruid.to(device)
        labels = labels.to(device)
        sent_id = sent_id.to(device)
        mask = mask.to(device)

        # clear previously calculated gradients
        bl.zero_grad()

        # `cross_entropy` is fed log-probabilities, so normalise the raw scores first
        preds = bl(gruid, grulen, sent_id, mask)
        preds = F.log_softmax(preds, dim=1)

        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(bl.parameters(), 1.0)

        optimizer2.step()

        # Fix: collect the predictions of EVERY batch. The append used to sit
        # after the loop, so only the last batch was ever returned.
        total_preds.append(preds.detach().cpu().numpy())

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # (no. of batches, batch size, no. of classes) -> (no. of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Evaluate `bl` on `val_dataloader` and print the validation accuracy.

    Relies on notebook globals: `bl`, `val_dataloader`, `device` and
    `cross_entropy`.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean batch
        loss and ``total_preds`` is a NumPy array with the raw (un-normalised)
        scores of every validation sample, stacked along axis 0.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    bl.eval()

    total_loss = 0

    total_preds = []   # raw scores per batch
    targets = []       # gold labels
    predictions = []   # predicted class indices

    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        gruid, grulen, labels, _, sent_id, mask = batch
        gruid = gruid.to(device)
        labels = labels.to(device)
        sent_id = sent_id.to(device)
        mask = mask.to(device)

        # deactivate autograd
        with torch.no_grad():
            preds = bl(gruid, grulen, sent_id, mask)
            log_probs = F.log_softmax(preds, dim=1)
            lab = log_probs.argmax(1)

            # Fix: train() feeds log-probabilities to `cross_entropy`; passing the raw
            # scores here produced the negative validation losses in the recorded output.
            # NOTE(review): assumes `cross_entropy` is an NLL-style loss -- confirm.
            loss = cross_entropy(log_probs, labels)
            total_loss = total_loss + loss.item()

            # the returned scores stay un-normalised, as before
            total_preds.append(preds.detach().cpu().numpy())

        predictions += lab.cpu().tolist()
        targets += labels.cpu().tolist()

    metric = accuracy_score(targets, predictions)
    print(metric)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Fine-tune `bl` for 3 epochs, writing one checkpoint per epoch.
# NOTE(review): no random seed is set in this cell.

# NOTE(review): never updated or read below, so no "best" checkpoint is actually selected.
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]
#for each epoch
# NOTE(review): the loop count is hard-coded while the progress line prints the global
# `epochs` defined elsewhere; the two only agree while `epochs` is 3.
for epoch in range(3):     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    #train model
    train_loss, _ = train()
    #evaluate model
    valid_loss, _ = evaluate()
    # save a checkpoint after EVERY epoch (saved_weights0.pt, saved_weights1.pt, ...);
    # later cells load or copy these files by name
    torch.save(bl.state_dict(), 'saved_weights'+str(epoch)+'.pt')
    # Results
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 3
  Batch    50  of    785.
  Batch   100  of    785.
  Batch   150  of    785.
  Batch   200  of    785.
  Batch   250  of    785.
  Batch   300  of    785.
  Batch   350  of    785.
  Batch   400  of    785.
  Batch   450  of    785.
  Batch   500  of    785.
  Batch   550  of    785.
  Batch   600  of    785.
  Batch   650  of    785.
  Batch   700  of    785.
  Batch   750  of    785.

Evaluating...
  Batch    50  of     74.
0.9045996592844975

Training Loss: 0.380
Validation Loss: -1.870

 Epoch 2 / 3
  Batch    50  of    785.
  Batch   100  of    785.
  Batch   150  of    785.
  Batch   200  of    785.
  Batch   250  of    785.
  Batch   300  of    785.
  Batch   350  of    785.
  Batch   400  of    785.
  Batch   450  of    785.
  Batch   500  of    785.
  Batch   550  of    785.
  Batch   600  of    785.
  Batch   650  of    785.
  Batch   700  of    785.
  Batch   750  of    785.

Evaluating...
  Batch    50  of     74.
0.8977853492333902

Training Loss: 0.193
Vali

In [None]:
# Restore the checkpoint written after the first epoch (index 0) and re-score it on the
# validation set. evaluate() already switches `bl` to eval mode; the explicit call
# also leaves `bl` in eval mode for the prediction cells that follow.
bl.load_state_dict(torch.load('saved_weights0.pt'))
bl.eval()
evaluate()


Evaluating...
  Batch    50  of     74.
0.9045996592844975


(-1.8698840769561562, array([[-0.40309802,  0.55115736],
        [ 2.852524  , -1.2707568 ],
        [-2.4587746 ,  1.7125113 ],
        ...,
        [ 3.3337648 , -1.9552519 ],
        [ 1.5371989 , -0.6677473 ],
        [ 0.04977982,  0.40633646]], dtype=float32))

In [None]:
def perform_predictions(test_data):
    """Predict a class index for every sample in `test_data` using `bl`.

    Relies on notebook globals `bl` and `device`.

    Args:
        test_data: iterable of batches; each batch unpacks into six elements, of which
            the 1st, 2nd, 5th and 6th are passed to `bl` (the 3rd and 4th are ignored).

    Returns:
        list: predicted class indices, in iteration order.
    """
    # Fix: make inference independent of whatever mode an earlier cell left `bl` in,
    # and skip autograd bookkeeping, which is not needed for prediction.
    bl.eval()
    predictions = []
    with torch.no_grad():
        for i, data in enumerate(test_data):
            if i % 10 == 0:
                print('batch:', i)

            seq, seq_len, _, _, tids, masks = data

            seq = seq.to(device)
            tids = tids.to(device)
            masks = masks.to(device)

            output = bl(seq, seq_len, tids, masks)
            preds = F.log_softmax(output, dim=1).argmax(1)
            predictions += preds.tolist()
    return predictions

In [None]:
# Predict labels for the test set with the fine-tuned `bl` model.
preds = perform_predictions(test_dataloader)

batch: 0
batch: 10
batch: 20
batch: 30
batch: 40
batch: 50
batch: 60
batch: 70
batch: 80
batch: 90
batch: 100
batch: 110
batch: 120
batch: 130
batch: 140
batch: 150
batch: 160
batch: 170
batch: 180


In [None]:
# `print_predictions` is defined elsewhere in the notebook; presumably it prints the
# predictions in the submission format -- confirm.
print_predictions(preds)

In [None]:
# Back up the checkpoint written after the third epoch (index 2) to Google Drive.
# NOTE(review): the evaluation/prediction cells above reload saved_weights0.pt, so this is
# not the checkpoint that produced those predictions -- confirm which one is intended.
!cp  '/content/saved_weights2.pt' '/content/drive/MyDrive/Colab_Notebooks/NLP/'

# Voting Scheme

In [None]:
# function for evaluating the model
def evaluate_both():
    """Evaluate the averaged ensemble of `model2` and `bl` on `val_dataloader`.

    For each batch the log-probabilities of both models are averaged; the class
    with the highest averaged score is the prediction. The validation accuracy
    is printed.

    Relies on notebook globals: `model2`, `bl`, `val_dataloader`, `device` and
    `cross_entropy`.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean batch
        loss of the ensemble and ``total_preds`` is a NumPy array with the
        averaged log-probabilities of every validation sample.
    """
    print("\nEvaluating...")

    # Fix: put the two models that are actually used into eval mode. The original
    # called `model.eval()`, which touches neither `model2` nor `bl` directly.
    model2.eval()
    bl.eval()

    total_loss = 0

    total_preds = []   # averaged log-probabilities per batch
    targets = []       # gold labels
    predictions = []   # predicted class indices

    for step, batch in enumerate(val_dataloader):
        gruid, grulen, labels, _, sent_id, mask = batch
        gruid = gruid.to(device)
        labels = labels.to(device)
        sent_id = sent_id.to(device)
        mask = mask.to(device)

        # deactivate autograd
        with torch.no_grad():
            preds = model2(sent_id, mask)
            preds_bl = bl(gruid, grulen, sent_id, mask)

            # average the two models in log-probability space
            ensemble = (F.log_softmax(preds, dim=1) + F.log_softmax(preds_bl, dim=1)) / 2
            lab = ensemble.argmax(1)

            # Fix: score the ensemble itself. The loss used to be computed on the raw
            # scores of `model2` alone, which produced the negative loss in the recorded
            # output and did not correspond to the accuracy printed below.
            # NOTE(review): assumes `cross_entropy` expects log-probabilities -- confirm.
            loss = cross_entropy(ensemble, labels)
            total_loss = total_loss + loss.item()

            total_preds.append(ensemble.detach().cpu().numpy())

        predictions += lab.cpu().tolist()
        targets += labels.cpu().tolist()

    metric = accuracy_score(targets, predictions)
    print(metric)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Score the averaged ensemble on the validation set; the cell output is the returned
# (avg_loss, total_preds) tuple.
evaluate_both()


Evaluating...
0.9063032367972743


(-3.475469028224816, array([[ 1.8261124, -1.7547202],
        [ 4.868897 , -5.156266 ],
        [-4.467583 ,  4.572062 ],
        ...,
        [ 4.455827 , -5.1700225],
        [ 1.7008258, -2.2183805],
        [ 4.0854645, -4.737025 ]], dtype=float32))

In [None]:
def perform_predictions(test_data):
    """Predict a class index for every sample in `test_data` with the averaged ensemble.

    The log-probabilities of `model2` and `bl` are averaged and the arg-max is taken.
    Relies on notebook globals `model2`, `bl` and `device`. This definition replaces
    the earlier `perform_predictions` that used `bl` alone.

    Args:
        test_data: iterable of batches; each batch unpacks into six elements, of which
            the 1st, 2nd, 5th and 6th are used (the 3rd and 4th are ignored).

    Returns:
        list: predicted class indices, in iteration order.
    """
    # Fix: make inference independent of the mode earlier cells left the models in,
    # and skip autograd bookkeeping, which is not needed for prediction.
    model2.eval()
    bl.eval()
    predictions = []
    with torch.no_grad():
        for i, data in enumerate(test_data):
            if i % 10 == 0:
                print('batch:', i)

            gruid, grulen, _, _, sent_id, mask = data
            gruid = gruid.to(device)
            sent_id = sent_id.to(device)
            mask = mask.to(device)

            preds = model2(sent_id, mask)
            preds_bl = bl(gruid, grulen, sent_id, mask)

            # average the two models in log-probability space
            ensemble = (F.log_softmax(preds, dim=1) + F.log_softmax(preds_bl, dim=1)) / 2
            lab = ensemble.argmax(1)

            predictions += lab.cpu().tolist()
    return predictions

In [None]:
# Predict labels for the test set with the averaged `model2` + `bl` ensemble.
preds = perform_predictions(test_dataloader)

batch: 0
batch: 10
batch: 20
batch: 30
batch: 40
batch: 50
batch: 60
batch: 70
batch: 80
batch: 90
batch: 100
batch: 110
batch: 120
batch: 130
batch: 140
batch: 150
batch: 160
batch: 170
batch: 180


In [None]:
# `print_predictions` is defined elsewhere in the notebook; presumably it prints the
# predictions in the submission format -- confirm.
print_predictions(preds)

# Save Model Weights

In [None]:
# Persist the weights currently held in memory by `bl` (whatever was last loaded or trained).
torch.save(bl.state_dict(), 'BL_best_saved_weights2.pt')

In [None]:
# Persist the weights currently held in memory by `model2`.
torch.save(model2.state_dict(), 'model2_best_saved_weights2.pt')

In [None]:
# Copy the `bl` weights file to Google Drive (requires the Drive mount from the first cell).
!cp  '/content/BL_best_saved_weights2.pt' '/content/drive/MyDrive/Colab_Notebooks/NLP/'

In [None]:
# Copy the `model2` weights file to Google Drive (requires the Drive mount from the first cell).
!cp  '/content/model2_best_saved_weights2.pt' '/content/drive/MyDrive/Colab_Notebooks/NLP/'

# ADD Robertuito and Robertuito_GRU/LSTM

In [None]:
class BL_B(nn.Module):
    """Additive ensemble of two already-built classifiers.

    The forward pass averages the log-softmax outputs of both members, so the
    whole ensemble can be trained end to end.

    Args:
        bl: module called as ``bl(gruid, grulen, sent_id, mask)``.
        model2: module called as ``model2(sent_id, mask)``.
    """

    def __init__(self, bl, model2):
        super().__init__()
        # Both members are stored by reference (not copied); the attribute names
        # are part of the saved state_dict keys.
        self.bl = bl
        self.model2 = model2

    def forward(self, gruid, grulen, sent_id, mask):
        """Return the element-wise mean of the two members' log-probabilities."""
        scores_model2 = self.model2(sent_id, mask)
        scores_bl = self.bl(gruid, grulen, sent_id, mask)

        log_p_model2 = F.log_softmax(scores_model2, dim=1)
        log_p_bl = F.log_softmax(scores_bl, dim=1)

        return (log_p_model2 + log_p_bl) / 2

In [None]:
# Wrap the existing `bl` and `model2` objects; they are stored by reference, so training
# `bl_b` keeps updating those same two models.
bl_b = BL_B(bl, model2)

In [None]:
# NOTE(review): repeated mid-notebook import (AdamW is already imported in an earlier cell).
from transformers import AdamW # optimizer
# Optimises every parameter of the ensemble, i.e. those of both `bl` and `model2`.
optimizer_bl = AdamW(bl_b.parameters(), lr = 1e-5, correct_bias=False) 

In [None]:
# Training function
def train():
    """Run one training epoch of the ensemble `bl_b` over `train_dataloader`.

    This definition replaces the earlier `train()` that optimised `bl`.
    Relies on notebook globals: `bl_b`, `train_dataloader`, `device`,
    `cross_entropy` and `optimizer_bl`.

    Returns:
        tuple: ``(avg_loss, total_preds)`` where ``avg_loss`` is the mean batch
        loss and ``total_preds`` is a NumPy array holding the ensemble output
        for every sample seen in the epoch, stacked along axis 0.
    """
    bl_b.train()
    total_loss = 0

    # per-batch ensemble outputs, concatenated at the end of the epoch
    total_preds = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # The 4th batch element is unused here; `grulen` is passed through as-is
        # (not moved to `device`).
        gruid, grulen, labels, _, sent_id, mask = batch
        gruid = gruid.to(device)
        labels = labels.to(device)
        sent_id = sent_id.to(device)
        mask = mask.to(device)

        # clear previously calculated gradients
        bl_b.zero_grad()

        # `bl_b` already returns averaged log-probabilities, so no extra log_softmax here
        preds = bl_b(gruid, grulen, sent_id, mask)

        loss = cross_entropy(preds, labels)
        total_loss = total_loss + loss.item()

        loss.backward()

        # gradient clipping is intentionally left disabled for this model
        #torch.nn.utils.clip_grad_norm_(bl_b.parameters(), 1.0)

        optimizer_bl.step()

        # Fix: collect the predictions of EVERY batch. The append used to sit
        # after the loop, so only the last batch was ever returned.
        total_preds.append(preds.detach().cpu().numpy())

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # (no. of batches, batch size, no. of classes) -> (no. of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Evaluate the ensemble `bl_b` on `val_dataloader` and print the accuracy.

    This definition replaces the earlier `evaluate()` that scored `bl`.
    Relies on notebook globals: `bl_b`, `val_dataloader`, `device` and
    `cross_entropy`.

    Returns:
        tuple: ``(avg_loss, total_preds)`` -- the mean batch loss and a NumPy
        array with the ensemble output of every validation sample.
    """
    print("\nEvaluating...")

    # eval mode switches off dropout
    bl_b.eval()

    running_loss = 0
    batch_outputs, gold, predicted = [], [], []

    for idx, batch in enumerate(val_dataloader):

        # report progress every 50 batches (but not on the very first one)
        if idx % 50 == 0 and idx != 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(idx, len(val_dataloader)))

        gruid, grulen, labels, _, sent_id, mask = batch
        gruid, labels = gruid.to(device), labels.to(device)
        sent_id, mask = sent_id.to(device), mask.to(device)

        # no gradients are needed for validation
        with torch.no_grad():
            scores = bl_b(gruid, grulen, sent_id, mask)
            lab = scores.argmax(1)

            # `bl_b` outputs averaged log-probabilities, which is what the loss is fed
            batch_loss = cross_entropy(scores, labels)
            running_loss = running_loss + batch_loss.item()

            batch_outputs.append(scores.detach().cpu().numpy())

        predicted += lab.cpu().tolist()
        gold += labels.cpu().tolist()

    print(accuracy_score(gold, predicted))

    # mean loss per batch, and all outputs as one (samples, classes) array
    avg_loss = running_loss / len(val_dataloader)
    return avg_loss, np.concatenate(batch_outputs, axis=0)

In [None]:
# Fine-tune the additive ensemble `bl_b`, writing one checkpoint per epoch.
# NOTE(review): no random seed is set in this cell.

# Fix: single source of truth for the epoch count. The loop used to run 4 epochs while
# the progress line printed the unrelated global `epochs` (" / 3" in the recorded output).
n_epochs = 4

# lowest validation loss seen so far
best_valid_loss = float('inf')

# training and validation loss of each epoch
train_losses=[]
valid_losses=[]
for epoch in range(n_epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, n_epochs))
    #train model
    train_loss, _ = train()
    #evaluate model
    valid_loss, _ = evaluate()
    # save a checkpoint after EVERY epoch (bl_b_saved_weights0.pt, ...); later cells
    # load or copy a specific epoch's file by name
    torch.save(bl_b.state_dict(), 'bl_b_saved_weights'+str(epoch)+'.pt')
    best_valid_loss = min(best_valid_loss, valid_loss)
    # Results
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 3
  Batch    50  of  1,035.
  Batch   100  of  1,035.
  Batch   150  of  1,035.
  Batch   200  of  1,035.
  Batch   250  of  1,035.
  Batch   300  of  1,035.
  Batch   350  of  1,035.
  Batch   400  of  1,035.
  Batch   450  of  1,035.
  Batch   500  of  1,035.
  Batch   550  of  1,035.
  Batch   600  of  1,035.
  Batch   650  of  1,035.
  Batch   700  of  1,035.
  Batch   750  of  1,035.
  Batch   800  of  1,035.
  Batch   850  of  1,035.
  Batch   900  of  1,035.
  Batch   950  of  1,035.
  Batch 1,000  of  1,035.

Evaluating...
  Batch    50  of     74.
0.8739352640545145

Training Loss: 0.263
Validation Loss: 0.271

 Epoch 2 / 3
  Batch    50  of  1,035.
  Batch   100  of  1,035.
  Batch   150  of  1,035.
  Batch   200  of  1,035.
  Batch   250  of  1,035.
  Batch   300  of  1,035.
  Batch   350  of  1,035.
  Batch   400  of  1,035.
  Batch   450  of  1,035.
  Batch   500  of  1,035.
  Batch   550  of  1,035.
  Batch   600  of  1,035.
  Batch   650  of  1,035.
  Batch  

In [None]:
# Alternative source: the copy stored on Google Drive (disabled).
#bl_b.load_state_dict(torch.load('/content/drive/MyDrive/Colab_Notebooks/NLP/best_add_rob_lstmandrob_model.pt'))
# Restore the checkpoint written after the second epoch (index 1) and re-score it on the
# validation set. evaluate() already switches `bl_b` to eval mode; the explicit call also
# leaves it in eval mode for the prediction cells that follow.
bl_b.load_state_dict(torch.load('/content/bl_b_saved_weights1.pt'))
bl_b.eval()
evaluate()


Evaluating...
  Batch    50  of     74.
0.909710391822828


(0.3386003353909866, array([[-5.3767794e-01, -8.8077027e-01],
        [-5.1078359e-03, -5.3812809e+00],
        [-5.2420754e+00, -6.8170829e-03],
        ...,
        [-5.0165886e-03, -5.4053106e+00],
        [-4.8928417e-02, -3.6009932e+00],
        [-5.3122920e-01, -9.4185311e-01]], dtype=float32))

In [None]:
def perform_predictions(test_data):
    """Predict a class index for every sample in `test_data` using the ensemble `bl_b`.

    Relies on notebook globals `bl_b` and `device`. This definition replaces the
    earlier `perform_predictions` variants.

    Args:
        test_data: iterable of batches; each batch unpacks into six elements, of which
            the 1st, 2nd, 5th and 6th are passed to `bl_b` (the 3rd and 4th are ignored).

    Returns:
        list: predicted class indices, in iteration order.
    """
    # Fix: make inference independent of the mode an earlier cell left `bl_b` in,
    # and skip autograd bookkeeping, which is not needed for prediction.
    bl_b.eval()
    predictions = []
    with torch.no_grad():
        for i, data in enumerate(test_data):
            if i % 10 == 0:
                print('batch:', i)

            gruid, grulen, _, _, sent_id, mask = data
            gruid = gruid.to(device)
            sent_id = sent_id.to(device)
            mask = mask.to(device)

            # `bl_b` returns averaged log-probabilities; the arg-max is the predicted class
            preds = bl_b(gruid, grulen, sent_id, mask)
            lab = preds.argmax(1)

            predictions += lab.cpu().tolist()
    return predictions

In [None]:
# Predict labels for the test set with the fine-tuned ensemble `bl_b`.
preds = perform_predictions(test_dataloader)

batch: 0
batch: 10
batch: 20
batch: 30
batch: 40
batch: 50
batch: 60
batch: 70
batch: 80
batch: 90
batch: 100
batch: 110
batch: 120
batch: 130
batch: 140
batch: 150
batch: 160
batch: 170
batch: 180


In [None]:
# `print_predictions` is defined elsewhere in the notebook; the output below shows it
# emitting an "Id,Expected" header followed by one "index,label" line per prediction.
print_predictions(preds)

Id,Expected
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,1
9,0
10,0
11,0
12,0
13,0
14,1
15,0
16,0
17,0
18,0
19,0
20,1
21,0
22,0
23,1
24,0
25,1
26,0
27,0
28,0
29,0
30,1
31,0
32,0
33,0
34,1
35,1
36,1
37,1
38,0
39,0
40,0
41,0
42,1
43,0
44,0
45,1
46,1
47,0
48,0
49,0
50,1
51,0
52,0
53,1
54,0
55,0
56,0
57,1
58,0
59,1
60,1
61,1
62,0
63,0
64,0
65,1
66,0
67,0
68,0
69,0
70,1
71,0
72,1
73,0
74,0
75,1
76,0
77,0
78,1
79,0
80,0
81,0
82,1
83,0
84,0
85,0
86,0
87,1
88,0
89,1
90,0
91,0
92,1
93,1
94,1
95,0
96,1
97,1
98,1
99,0
100,0
101,0
102,0
103,1
104,1
105,1
106,1
107,0
108,0
109,0
110,0
111,0
112,0
113,0
114,0
115,0
116,0
117,1
118,1
119,0
120,0
121,0
122,1
123,0
124,0
125,0
126,0
127,1
128,0
129,1
130,0
131,0
132,0
133,0
134,1
135,0
136,0
137,0
138,0
139,0
140,1
141,1
142,0
143,0
144,0
145,0
146,1
147,0
148,1
149,0
150,0
151,0
152,0
153,0
154,0
155,0
156,1
157,0
158,0
159,1
160,0
161,0
162,0
163,0
164,0
165,1
166,1
167,1
168,0
169,0
170,0
171,0
172,0
173,0
174,0
175,0
176,0
177,0
178,0
179,1
180,1
181,0
182,0


In [None]:
# Store the checkpoint written after the fourth epoch (index 3) on Google Drive as the "best" model.
# NOTE(review): the evaluation/prediction cells above reload bl_b_saved_weights1.pt, so this is
# not the checkpoint that produced those predictions -- confirm which one is intended.
!cp  '/content/bl_b_saved_weights3.pt' '/content/drive/MyDrive/Colab_Notebooks/NLP/best_add_rob_lstmandrob_model.pt'

**<h2> Tabla de Resultados </h2>**

A continuación mostramos los resultados de la métrica de accuracy resumidos en una tabla para cada modelo.

**No.** | **Model** | **Validation** | **Test**
 -------- |----|  ---- | ----
1 |`Attention GRU` |  0.87563 | 0.80430
2 | `Robertuito` |  0.90460 | 0.86987
3 | `Robertuito_Gru` |  0.90460 | 0.88356
4 | `Robertuito and Robertuito_Gru Voting Scheme`  | 0.90630 | 0.88827
5 | `Add Robertuito and Robertuito_Gru Voting Scheme No Data Augmentation`  | 0.91141 | 0.88437
6 | `Add Robertuito and Robertuito_Gru Voting Scheme Data Augmentation`  | 0.91482 | 0.89022
7 | `Add Robertuito and Robertuito_LSTM Voting Scheme Data Augmentation`  | 0.90971 | 0.89247


# Extra

In [None]:
class BL_B(nn.Module):
    """Self-contained additive ensemble that builds its own sub-models.

    This redefinition shadows the earlier ``BL_B(bl, model2)`` class. Instead of
    receiving ready-made members it constructs, in this order, an ``AttnRNN``,
    two separately loaded RoBERTuito sequence classifiers, a ``BL`` fusion model
    and a ``RobertuitoClasificator``.

    Args:
        attn_h_s: forwarded to ``AttnRNN`` as ``attn_hidden_size``.
        hidden_s: forwarded to ``AttnRNN`` as ``hidden_size``.
        dense_hidden_s: forwarded to ``AttnRNN`` as ``dense_hidden_size``.
        bidir: forwarded to ``AttnRNN`` as ``bidirectional``.
        n_lays: forwarded to ``AttnRNN`` as ``num_layers``.

    NOTE(review): the embedding matrix is read from the notebook global
    ``train_dataset`` rather than passed in; ``AttnRNN``, ``BL`` and
    ``RobertuitoClasificator`` must already be defined.
    """

    def __init__(self, attn_h_s, hidden_s, dense_hidden_s, bidir, n_lays):
        super().__init__()

        # Construction order is kept as-is so parameter initialisation draws from
        # the random generator in the same sequence.
        recurrent = AttnRNN(
            emb_mat=train_dataset.emb_matrix,
            bidirectional=bidir,
            num_layers=n_lays,
            attn_hidden_size=attn_h_s,
            hidden_size=hidden_s,
            dense_hidden_size=dense_hidden_s,
        )

        # Two separate from_pretrained loads: one for the fusion model, one for the
        # stand-alone classifier.
        bert_for_fusion = AutoModelForSequenceClassification.from_pretrained("pysentimiento/robertuito-base-uncased")
        bert_standalone = AutoModelForSequenceClassification.from_pretrained("pysentimiento/robertuito-base-uncased")

        fusion = BL(bert_for_fusion, recurrent)

        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.bl = fusion
        self.model2 = RobertuitoClasificator(bert_standalone)

    def forward(self, gruid, grulen, sent_id, mask):
        """Return the element-wise mean of the two members' log-probabilities."""
        scores_model2 = self.model2(sent_id, mask)
        scores_bl = self.bl(gruid, grulen, sent_id, mask)

        log_p_model2 = F.log_softmax(scores_model2, dim=1)
        log_p_bl = F.log_softmax(scores_bl, dim=1)

        return (log_p_model2 + log_p_bl) / 2

In [None]:
class voting_sc(nn.Module):
    """Learned voting layer on top of two frozen classifiers.

    Both sub-models are frozen (``requires_grad = False`` on all their
    parameters); only the two bias-free linear layers are trainable. Each layer
    combines one output column of the two sub-models into a single score, and
    the two scores are normalised with a softmax.

    Args:
        bl1: first module, called as ``bl1(gruid, grulen, sent_id, mask)``.
        bl2: second module, called the same way.

    NOTE(review): the sub-models are assumed to return a (batch, 2) tensor --
    confirm against the modules actually passed in.
    """

    def __init__(self, bl1, bl2):
        super(voting_sc, self).__init__()
        self.bl1 = bl1
        self.bl2 = bl2

        # freeze both sub-models; only the voting layers below are trained
        for param in self.bl1.parameters():
            param.requires_grad = False

        for param in self.bl2.parameters():
            param.requires_grad = False

        # one weight per sub-model for column 0 ("yes") and for column 1 ("no")
        self.yes_layer = nn.Linear(2, 1, bias=False)
        self.no_layer = nn.Linear(2, 1, bias=False)

    def forward(self, gruid, grulen, sent_id, mask):
        """Return softmax-normalised (batch, 2) scores from the weighted vote."""
        preds_bl1 = self.bl1(gruid, grulen, sent_id, mask)
        # Fix: the second vote must come from `bl2`; it used to call `bl1` again,
        # so `bl2` was never consulted.
        preds_bl2 = self.bl2(gruid, grulen, sent_id, mask)

        # column 0 of both sub-models -> one combined score
        yes = torch.cat([preds_bl1[:, 0:1], preds_bl2[:, 0:1]], dim=1)
        yes = self.yes_layer(yes)

        # column 1 (and any further columns) of both sub-models -> one combined score
        no = torch.cat([preds_bl1[:, 1:], preds_bl2[:, 1:]], dim=1)
        no = self.no_layer(no)

        out = torch.cat([yes, no], dim=1)
        out = F.softmax(out, dim=1)

        return out