<a href="https://colab.research.google.com/github/LuigiSigillo/nlp2021-hw/blob/master/hw1/stud/nlp_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial setup for both methodologies

##  Pre-Setup

In [None]:
#@title Setup of parameters{ run: "auto" }
# Global configuration shared by the embedding loader, the dataset class and both approaches.

# Dimensionality of the GloVe vectors. Kept as a string because it is spliced
# into the embedding file name; converted with int() wherever a size is needed.
WE_LENGTH = "50" #@param [50,100,200,300]
# Aggregation method forwarded to the sentence vectorization function.
METHOD_FIRST_APPROACH = "avg" #@param ["avg","sum"]
# When True, the two sentences of a sample are joined with an explicit "SEP" token.
USE_SEP = False #@param ["True", "False"] {type:"raw"}
# Maximum number of lines (words) read from the GloVe file.
WORDS_LIMIT = 400000 #@param {type:"slider", min:20000, max:400000, step:20000}
# Passed to SentencesDataset as `remove_stop_words`.
REMOVE_STOPWORDS = True #@param ["True", "False"] {type:"raw"}
# Passed to SentencesDataset as `lemmatization`.
LEMMATIZATION = True #@param ["True", "False"] {type:"raw"}
# When True, sentences, keywords and lemma are lower-cased before vectorization.
LOWERED = False #@param ["True", "False"] {type:"raw"}


## Imports

In [None]:
from google.colab import drive
# general
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *
import string
import json

# torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')


# SKLEARN
from sklearn.metrics import accuracy_score, f1_score, recall_score



Code to mount Google Drive, set the random seeds and (optionally) download the GloVe embeddings and move them into the right folder.


In [None]:
# Mount Google Drive so that the dataset and the embeddings are reachable from Colab.
drive.mount('/content/drive')
# Project root on Drive; the code below reads "<root>/model" (embeddings, checkpoints)
# and "<root>/data" (train/dev JSONL files).
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw1'
dataset_folder = os.path.join(root_folder,'data')

# Fix the seeds of torch and numpy for reproducibility.
torch.manual_seed(42)
np.random.seed(42)

# One-off commands used to download and unpack the GloVe vectors (kept for reference).
#! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
#! unzip -d data/glove.6B
#! cd '/content/drive/My Drive/NLP/nlp2021-hw1'
#!unzip '/content/drive/My Drive/NLP/nlp2021-hw1/glove.6B.zip'
# !mv glove.6B.200d.txt '/content/drive/My Drive/NLP/nlp2021-hw1/model'
# !ls "{root_folder}/model/"
# Show the GPU assigned to this runtime (if any).
!nvidia-smi

## GloVe word embeddings

The "UNK" and (optionally) "SEP" tokens are also added to the dictionary, each mapped to a random vector.

In [None]:
class GloVEEmbedding():
    """Loader for pre-trained GloVe vectors stored as a plain-text file.

    The file is expected at ``<root_folder>/model/glove.6B.<WE_LENGTH>d.txt`` with one
    word per line followed by its space-separated vector components.
    """

    def __init__(self):
        # word -> 1-D torch tensor with the embedding of that word
        self.word_vectors = dict()


    def get_word_vectors(self):
        """Read up to ``WORDS_LIMIT`` vectors from disk and add the special tokens.

        Returns:
            The ``word -> tensor`` dictionary (also stored in ``self.word_vectors``).
            It always contains a random "UNK" vector and, when ``USE_SEP`` is set,
            a random "SEP" vector, both of size ``int(WE_LENGTH)``.
        """
        # GloVe files are UTF-8 encoded; being explicit avoids decoding errors on
        # platforms whose default encoding is different.
        with open(root_folder+'/model/glove.6B.'+WE_LENGTH+'d.txt', encoding='utf-8') as f:
            for i, line in tqdm(enumerate(f), total=WORDS_LIMIT):
                if i == WORDS_LIMIT:
                    break
                word, *vector = line.strip().split(' ')
                self.word_vectors[word] = torch.tensor([float(c) for c in vector])

        # Special tokens get a random vector.
        self.word_vectors["UNK"] = torch.rand(int(WE_LENGTH))

        if USE_SEP:
            self.word_vectors["SEP"] = torch.rand(int(WE_LENGTH))
        return self.word_vectors

## Dataset class and interface

In [None]:
class SentencesDataset(torch.utils.data.Dataset):
    """In-memory dataset of sentence pairs read from a JSON-lines file.

    Every line of the file is a JSON object with the keys ``sentence1``, ``sentence2``,
    ``start1``/``end1``, ``start2``/``end2`` (character span of the target word in each
    sentence), ``lemma`` and ``label``. Each sample is stored as the tuple
    ``(vector, label, indices)`` where ``vector`` is whatever ``sentence2vector``
    returns for the pre-processed, concatenated sentences and ``indices`` is the
    triple ``[keyword position in sentence 1, separator slot, keyword position in
    sentence 2]`` expressed as token positions of the concatenated sentence.

    The pre-processing is driven by the constructor flags and by the notebook-level
    settings ``USE_SEP``, ``LOWERED`` and ``METHOD_FIRST_APPROACH``.
    """

    # Value stored in the separator slot of the index triple when no "SEP" token is used.
    NO_SEP_PLACEHOLDER = 42

    def __init__(self, dataset_path: str, sentence2vector: Callable, remove_stop_words: bool, lemmatization: bool):
        self.data_store = []
        self.lemmatization = lemmatization
        self.remove_stop_words = remove_stop_words
        self.init_structure(dataset_path, sentence2vector)


    def init_structure(self, dataset_path: str, sentence2vector) -> None:
        """Parse ``dataset_path`` and fill ``self.data_store``.

        Samples for which ``sentence2vector`` returns ``None`` or for which the
        keyword cannot be located in both sentences are printed and skipped.
        """
        with open(dataset_path, encoding='utf-8') as f:
            for json_string in f:
                if not json_string.strip():
                    continue  # tolerate blank lines (e.g. a trailing newline)
                single_json = json.loads(json_string)

                keyword = single_json['sentence1'][int(single_json['start1']):int(single_json['end1'])]
                keyword2 = single_json['sentence2'][int(single_json['start2']):int(single_json['end2'])]
                lemma = single_json['lemma']

                sep = " SEP " if USE_SEP else " "

                if self.lemmatization:
                    first = self.use_only_lemma(single_json['sentence1'], lemma, keyword)
                    second = self.use_only_lemma(single_json['sentence2'], lemma, keyword2)
                else:
                    first = single_json['sentence1']
                    second = single_json['sentence2']

                if LOWERED:
                    first = first.lower()
                    second = second.lower()
                    keyword = keyword.lower()
                    keyword2 = keyword2.lower()
                    lemma = lemma.lower()

                if self.remove_stop_words:
                    first = self.remove_stopwords(first, lemma)
                    second = self.remove_stopwords(second, lemma)
                    # substitute non-alphabetic tokens (e.g. digits) with "number"
                    sentence = self.handle_digits(first + sep + second)
                else:
                    sentence = first + sep + second

                targets = [keyword, keyword2, lemma]
                if USE_SEP:
                    indices = self.get_kwd_indices(sentence, targets)
                else:
                    # `first` / `second` are the sentences actually concatenated above, so
                    # this also works when stop-word removal is disabled (the previous code
                    # referenced variables that only existed in the stop-word branch).
                    first_idx = self.get_kwd_indices(first, targets)
                    second_idx = self.get_kwd_indices(second, targets)
                    # Positions are consumed as offsets into the concatenated token
                    # sequence, so the index found in the second sentence must be shifted
                    # by the number of tokens of the first one.
                    shift = len(first.split(" "))
                    indices = first_idx + [self.NO_SEP_PLACEHOLDER] + [shift + i for i in second_idx]

                ground_t = np.float32(1) if single_json['label'] =='True' else np.float32(0)

                vector = sentence2vector(sentence,METHOD_FIRST_APPROACH,keyword)

                if vector is None or len(indices)!=3:
                    print(sentence,indices, keyword)
                    continue

                self.data_store.append((vector,ground_t,indices))


    def handle_digits(self,sent: str) -> str:
        """Replace every space-separated token that is not purely alphabetic with "number"."""
        filtered_sentence = [w if w.isalpha() else "number" for w in sent.split(" ") ]
        return " ".join(filtered_sentence)

    def remove_stopwords(self,sent: str,lemma: str) -> str:
        """Strip punctuation and English stop words from ``sent``.

        The lemma of the target word is never treated as a stop word, since the
        keyword itself may belong to the stop-word list.
        """
        # The NLTK stop-word list is loaded once per dataset instead of once per sentence.
        base_stop_words = getattr(self, "_english_stop_words", None)
        if base_stop_words is None:
            base_stop_words = frozenset(stopwords.words('english'))
            self._english_stop_words = base_stop_words
        stop_words = base_stop_words - {lemma}

        # remove punctuation; these extra characters are not in string.punctuation
        others = "–" +"—" + "−" + "’" + "”" + "“"
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt))
        word_tokens = word_tokenize(sent.translate(translator))

        return " ".join(w for w in word_tokens if w.lower() not in stop_words)

    def use_only_lemma(self,sent:str ,lemma: str, keyword: str) -> str:
        """Replace every space-separated token equal to ``keyword`` with ``lemma``."""
        filtered_sentence = [lemma if w == keyword else w for w in sent.split(" ") ]
        return " ".join(filtered_sentence)

    def get_kwd_indices(self,sentence: str,keywords: Sequence[str]) -> Sequence[int]:
        """Locate the target word inside ``sentence`` (tokens are split on single spaces).

        With ``USE_SEP`` the sentence is the concatenation of both sentences and the
        result is ``[first keyword occurrence, index of "SEP", first keyword occurrence
        after "SEP"]``. Without it the sentence is a single sentence and the result is
        a one-element list with the first keyword occurrence. The returned list is
        shorter when something is not found; callers are expected to check its length.
        """
        j_list = []
        tokens = sentence.split(" ")
        if USE_SEP:
            after_sep = False
            for i, token in enumerate(tokens):
                if token == "SEP":
                    after_sep = True
                    j_list.append(i)
                if token in keywords:
                    if j_list == []:
                        j_list.append(i)
                    elif after_sep:
                        j_list.append(i)
                        return j_list
        else: # single sentence
            for i, token in enumerate(tokens):
                if token in keywords:
                    j_list.append(i)
                    return j_list
        return j_list

    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> Tuple[Any, np.float32, List[int]]:
        """Return the ``(vector, label, indices)`` tuple stored at position ``idx``."""
        return self.data_store[idx]

Versatile data module that can be used for both approaches, since the only things that change are the `sentence2vector` function and the collate function.

In [None]:
class SentencesDataModule(nn.Module):
    """Bundle the train/dev file paths with the settings needed to build DataLoaders.

    Each call to ``train_dataloader`` / ``val_dataloader`` parses the corresponding
    file into a fresh ``SentencesDataset`` (using the notebook-level
    ``REMOVE_STOPWORDS`` and ``LEMMATIZATION`` flags), remembers it on the instance
    and wraps it in a ``DataLoader``.
    """

    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        vectorize_function: Callable,
        collate_fn=None
    ) -> None:
        super().__init__()
        # Datasets are created lazily by the *_dataloader methods.
        self.train_dataset = None
        self.validation_dataset = None

        # How samples are vectorized and batched.
        self.vectorize_function = vectorize_function
        self.collate_fn = collate_fn
        self.batch_size = batch_size

        # Where the data lives.
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path

    def _load(self, path: str) -> "SentencesDataset":
        """Parse the JSONL file at ``path`` with the configured vectorize function."""
        return SentencesDataset(path, self.vectorize_function, REMOVE_STOPWORDS, LEMMATIZATION)

    def _wrap(self, dataset) -> DataLoader:
        """Wrap ``dataset`` in a DataLoader using the configured batch size and collate."""
        return DataLoader(dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Build the training dataset and return a DataLoader over it."""
        self.train_dataset = self._load(self.data_train_path)
        return self._wrap(self.train_dataset)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """Build the validation dataset and return a DataLoader over it."""
        self.validation_dataset = self._load(self.data_dev_path)
        return self._wrap(self.validation_dataset)


# First approach **(word-level)**


Word-embedding-powered function. Converts sentences to a vector by averaging the embeddings corresponding to each word in it

In [None]:
def sentence2vector(sentence: str, method: str, keyword: str) -> Optional[torch.Tensor]:
    """Collapse a sentence into a single vector by aggregating its word embeddings.

    Words are obtained by splitting ``sentence`` on single spaces and looked up in the
    notebook-level ``word_vectors`` dictionary; unknown words use the "UNK" vector.

    Args:
        sentence: the (already pre-processed) text to encode.
        method: ``"avg"`` (mean of the vectors), ``"sum"`` (sum of the vectors) or
            ``"weigthed_avg"`` / ``"weighted_avg"`` (mean where the vector of
            ``keyword`` is scaled by 1.5). The misspelled name is kept for backward
            compatibility.
        keyword: the target word, only used by the weighted average.

    Returns:
        A 1-D tensor with the aggregated embedding, or ``None`` if no vector was collected.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    weighted_names = ("weigthed_avg", "weighted_avg")
    if method not in ("avg", "sum") + weighted_names:
        raise ValueError("Unsupported aggregation method: {!r}".format(method))

    keyword_weight = 1.5 if method in weighted_names else 1
    unk_vector = word_vectors['UNK']

    # Previously the vectors were only collected for "avg" and "weigthed_avg", so
    # method == "sum" always produced an empty list and returned None.
    sentences_word_vector = []
    for w in sentence.split(' '):
        if w not in word_vectors:
            sentences_word_vector.append(unk_vector)
        elif w == keyword and keyword_weight != 1:
            sentences_word_vector.append(word_vectors[w] * keyword_weight)
        else:
            sentences_word_vector.append(word_vectors[w])

    if len(sentences_word_vector) == 0:
        return None

    stacked = torch.stack(sentences_word_vector)  # tensor shape: (#words X #features)
    if method == "sum":
        return torch.sum(stacked, dim=0)
    return torch.mean(stacked, dim=0)

Loading and testing of the dataset

In [None]:
BATCH_SIZE = 40 #@param {type:"slider", min:8, max:64, step:8}

# Load the GloVe vectors; `word_vectors` is read as a global by sentence2vector.
glove_embed = GloVEEmbedding()
word_vectors = glove_embed.get_word_vectors()

# No custom collate function: samples are already fixed-size sentence vectors.
sentences_dm = SentencesDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=BATCH_SIZE,
    vectorize_function = sentence2vector
)

val_dataloader = sentences_dm.val_dataloader()

# Sanity check on the first validation batch only:
# X = sentence vectors, y = labels, z = keyword indices.
for batch in val_dataloader:
    X, y, z  = batch
    print(batch)
    print(f"batch X shape: {X.shape}")
    # the label used to say "z" although the value printed is the shape of y
    print(f"batch y shape: {y.shape}")
    break

Create the MLP classifier class 

In [None]:
class SentencesClassifier(nn.Module):
    """Binary classifier: one hidden linear layer with ReLU, then a single sigmoid output."""

    def __init__(self, n_features: int, n_hidden: int):
        super().__init__()
        # input -> hidden projection
        self.lin1 = torch.nn.Linear(n_features, n_hidden)
        # hidden -> single score
        self.output_layer = torch.nn.Linear(n_hidden, 1)
        # binary cross entropy computed on the sigmoid output
        self.loss_fn = torch.nn.BCELoss()

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Run the network on a batch.

        Returns a dict with ``'logits'`` (raw scores, dimension 1 squeezed away) and
        ``'pred'`` (their sigmoid); when ``y`` is given the dict also holds ``'loss'``.
        """
        hidden = torch.relu(self.lin1(x))
        logits = self.output_layer(hidden).squeeze(1)
        probabilities = torch.sigmoid(logits)

        if y is None:
            return {'logits': logits, 'pred': probabilities}

        return {
            'logits': logits,
            'pred': probabilities,
            'loss': self.loss(probabilities, y),
        }

    def loss(self, pred, y):
        """Binary cross entropy between the predictions ``pred`` and the targets ``y``."""
        return self.loss_fn(pred, y)

### Training

Defining a trainer class to better separate our work.

In [None]:
class Trainer():
    """Training / evaluation loop for the word-level (first approach) classifier.

    The model is expected to return a dict with a ``'pred'`` entry from its forward
    pass and to expose ``loss(pred, targets)``. Per-epoch metrics are written to
    TensorBoard through a ``SummaryWriter``.
    """

    def __init__(self, model, optimizer, device):

        self.device = device

        self.model = model
        self.optimizer = optimizer
        self.writer = SummaryWriter(comment="_first approach")
        self.model.train()
        self.model.to(self.device)

    def train(self, train_dataset, eval_dataset, epochs=1):
        """Train for ``epochs`` epochs, evaluating on ``eval_dataset`` after each one.

        Args:
            train_dataset: iterable of batches; element 0 holds the inputs, element 1 the targets.
            eval_dataset: iterable of batches with the same layout, used for validation.
            epochs: number of passes over ``train_dataset`` (must be at least 1).

        Returns:
            The training loss averaged over all epochs.

        Raises:
            ValueError: if ``epochs`` is smaller than 1 or ``train_dataset`` yields no batch.

        Side effects:
            Saves the final model state to ``<root_folder>/model/state_<last epoch>.pt``.
        """
        if epochs < 1:
            raise ValueError("epochs must be at least 1, got {}".format(epochs))

        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0

            self.model.train()  # eval_metrics switches the model to eval mode
            for step, sample in enumerate(train_dataset):
                # inputs
                inputs = sample[0].to(self.device)
                # outputs
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss
                loss.backward()  # backpropagate the loss

                self.optimizer.step()
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1

            if len_train == 0:
                raise ValueError("train_dataset did not yield any batch")

            avg_epoch_loss = epoch_loss / len_train
            avg_eval_loss,avg_accuracy_loss,avg_f1_score = self.eval_metrics(eval_dataset)


            self.writer.add_scalar("Train/loss", avg_epoch_loss, epoch)
            self.writer.add_scalar("Eval/loss", avg_eval_loss, epoch)
            self.writer.add_scalar("Eval/accuracy", avg_accuracy_loss, epoch)
            self.writer.add_scalar("Eval/F1_score", avg_f1_score, epoch)

            print('Epoch: {} avg loss = {:0.4f} avg_eval_loss = {:0.4f} avg_eval_acc = {:0.4f} avg_eval_f1 = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy_loss, avg_f1_score))

            train_loss += avg_epoch_loss

        # Make sure the checkpoint folder exists before saving the model state.
        model_dir = root_folder +"/model"
        os.makedirs(model_dir, exist_ok=True)
        torch.save(self.model.state_dict(),os.path.join(model_dir, 'state_{}.pt'.format(epoch)))  # save the model state
        self.writer.flush()
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

    def eval_metrics(self,eval_dataset):
        """Evaluate the model on ``eval_dataset``.

        Returns:
            ``(loss, accuracy, f1)``, each one computed per batch and then averaged
            over the number of batches.

        Raises:
            ValueError: if ``eval_dataset`` yields no batch.
        """
        self.model.eval()
        epoch_val_loss = 0.0
        len_val_train = 0
        accuracy = 0
        f1 = 0
        # No gradients are needed for evaluation: skip building the autograd graph.
        with torch.no_grad():
            for step, sample in enumerate(eval_dataset):
                # inputs in the batch
                inputs = sample[0].to(self.device)
                # outputs in the batch
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss
                # predictions above 0.5 count as the positive class
                y_pred = (output_distribution['pred']>0.5).float().cpu()
                y_true = targets.cpu()
                accuracy += accuracy_score(y_true, y_pred)
                f1 += f1_score(y_true,y_pred)
                epoch_val_loss += loss.item()
                len_val_train += 1

        if len_val_train == 0:
            raise ValueError("eval_dataset did not yield any batch")

        avg_eval_loss = epoch_val_loss / len_val_train
        avg_accuracy_loss = accuracy / len_val_train
        avg_f1_score = f1/len_val_train
        return avg_eval_loss,avg_accuracy_loss,avg_f1_score

Instantiating the classifier and tuning some hyperparameters, such as the size of the hidden layer, the learning rate and the number of epochs.

In [None]:
# The input size of the classifier is the dimensionality of the GloVe vectors.
sent_classifier = SentencesClassifier(
    n_features=int(WE_LENGTH), 
    n_hidden=200 #@param {type:"slider", min:50, max:300, step:50}
)
# Optimizer learning rate and number of training epochs used by the cell that starts the training.
learning_rate = 0.0391 #@param {type:"slider", min:0.0001, max:0.1, step:0.001}
epochs = 100 #@param {type:"slider", min:50, max:300, step:10}

Defining the optimizer (we will use stochastic gradient descent) and instantiating the trainer to start the training.

In [None]:
# Train on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.SGD(sent_classifier.parameters(), lr=learning_rate)

trainer = Trainer(sent_classifier, optimizer, device)

# `val_dataloader` is the one created in the dataset loading cell above.
train_dataloader = sentences_dm.train_dataloader()
# Start TensorBoard on the "runs" folder to follow the metrics logged by the trainer.
%load_ext tensorboard
%tensorboard --logdir=runs
avg_loss = trainer.train(train_dataloader,val_dataloader, epochs=epochs)
# Training loss averaged over all epochs.
print(avg_loss)

# Second approach **(sequence encoding with RNN)**


Let's start by indexing each word in our vocabulary

In [None]:
def create_vocabulary():
    """Build the word -> row lookup and the matching embedding matrix.

    Rows 0 and 1 are reserved for the padding token (random vector) and the unknown
    token; row 2 holds the "SEP" vector when ``USE_SEP`` is set. Every entry of the
    notebook-level ``word_vectors`` dictionary follows, in iteration order.

    Returns:
        ``(word_index, vectors_store)``: a defaultdict that answers 1 (the unknown
        token row) for words it does not contain, and the stacked embedding tensor.
    """
    # reserved rows: pad (index 0), unk (index 1) and, optionally, sep (index 2)
    rows = [torch.rand(int(WE_LENGTH)), word_vectors["UNK"]]
    if USE_SEP:
        rows.append(word_vectors["SEP"])

    # unknown words fall back to index 1 (unk token)
    lookup = defaultdict(lambda: 1)
    for position, (token, embedding) in enumerate(word_vectors.items(), start=len(rows)):
        lookup[token] = position
        rows.append(embedding)

    return lookup, torch.stack(rows)

Checking the shape of the built vocabulary

In [None]:
# Reload the GloVe vectors and build the vocabulary; `word_vectors` and `word_index`
# are read as globals by create_vocabulary and sentence2indices.
glove_embed = GloVEEmbedding()
word_vectors = glove_embed.get_word_vectors()
word_index,vectors_store = create_vocabulary()

# Rows of the embedding matrix = vocabulary size, columns = embedding dimensionality.
vocabulary_size, hidden_features = vectors_store.shape
print(f"Vocabulary size: {vocabulary_size}\nHidden features: {hidden_features}")


Similar to sentence2vector in this function we map each word with the corresponding index of the built vocabulary


In [None]:
def sentence2indices(sentence: str,method: str,keyword: str) -> torch.Tensor:
    """Map each space-separated word of ``sentence`` to its index in ``word_index``.

    ``method`` and ``keyword`` are accepted only to share the call signature of
    ``sentence2vector``; they are not used.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(word_index[token])
    return torch.tensor(token_ids, dtype=torch.long)

Customized collate function that pads the sentences in each batch and returns a tuple with the padded inputs X, the indices that tell where the target words are, and the targets.

In [None]:
def rnn_collate_fn(
data_elements: List[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]] # list of (x, y, z) triples
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate dataset samples into a padded batch.

    Each sample holds the token indices in slot 0, the label in slot 1 and the
    keyword positions in slot 2.

    Returns:
        ``(X, keyword_position, y)``: the index sequences padded with 0 to the length
        of the longest one (batch first), the keyword positions and the labels.
    """
    sequences, labels, positions = [], [], []
    for element in data_elements:
        sequences.append(element[0])   # index tensor of the sentence pair
        labels.append(element[1])      # ground truth
        positions.append(element[2])   # where the keyword is in each sentence

    # shape (batch_size x max_seq_len)
    padded = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)

    return padded, torch.tensor(positions), torch.tensor(labels)

##### Training 

Recurrent classifier definition with a customized forward pass

In [None]:
class SentencesRecurrentClassifier(nn.Module):
    """LSTM encoder followed by a small feed-forward binary classifier.

    The token indices are embedded with the pre-trained vectors, encoded by an LSTM,
    and the LSTM outputs found at the position of the target word in the first and in
    the second sentence are multiplied element-wise. The product goes through a
    linear layer with leaky ReLU and a final linear layer with sigmoid.
    """

    def __init__(
        self,
        vectors_store: torch.Tensor,
        n_hidden: int,
        drop_prob: float,
        bidir: bool,
        n_layer_lstm: int
    ) -> None:
        """
        Args:
            vectors_store: pre-trained embedding matrix (vocabulary size x features).
            n_hidden: hidden size of the LSTM (per direction).
            drop_prob: accepted for interface compatibility; no dropout layer is
                created from it in this class.
            bidir: whether the LSTM is bidirectional.
            n_layer_lstm: number of stacked LSTM layers.
        """
        super().__init__()


        # embedding layer initialised with the pre-trained vectors
        # (`from_pretrained` keeps them frozen unless `freeze=False` is passed)
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store)
        self.n_hidden = n_hidden
        # recurrent layer
        self.rnn = torch.nn.LSTM(input_size=vectors_store.size(1), hidden_size=n_hidden, num_layers=n_layer_lstm, batch_first=True, bidirectional=bidir)

        # classification: a bidirectional LSTM concatenates both directions
        if bidir:
           n_hidden = n_hidden*2
        self.lin1 = torch.nn.Linear(n_hidden, n_hidden)
        self.linear_output = torch.nn.Linear(n_hidden, 1)

        self.loss_fn = torch.nn.BCELoss()
        # Kept for backward compatibility; the forward pass uses the device of the
        # tensors it processes instead of this value.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'


    def forward(
        self, 
        X: torch.Tensor, 
        #X_length: torch.Tensor,
        indices_keyword: torch.Tensor, 
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Classify a batch of padded index sequences.

        Args:
            X: token indices, indexed as (batch, sequence position).
            indices_keyword: per sample, the triple whose element 0 is the keyword
                position in the first sentence and element 2 the keyword position in
                the second sentence.
            y: optional targets; when given the loss is computed as well.

        Returns:
            A dict with ``'logits'``, ``'pred'`` (sigmoid of the logits) and, if ``y``
            is given, ``'loss'``.
        """
        embedding_out = self.embedding(X)
        # recurrent encoding
        recurrent_out = self.rnn(embedding_out)[0]

        batch_size, seq_len, hidden_size = recurrent_out.shape

        # flatten the recurrent output into a long sequence of batch x seq_len vectors
        flattened_out = recurrent_out.reshape(-1, hidden_size)

        # start offset of each batch element inside the flattened output; created on
        # the device of the LSTM output so it works wherever the model actually lives
        sequences_offsets = torch.arange(batch_size, device=recurrent_out.device) * seq_len

        summary_vectors_indices_sent1 = self.get_indices_keyword(indices_keyword, sequences_offsets,0)
        summary_vectors_indices_sent2 = self.get_indices_keyword(indices_keyword, sequences_offsets,2)

        # LSTM outputs at the keyword position of each sentence
        summary_vectors_sent1 = flattened_out[summary_vectors_indices_sent1]
        summary_vectors_sent2 = flattened_out[summary_vectors_indices_sent2]

        # element-wise product of the two keyword representations
        summary_vectors = summary_vectors_sent1 * summary_vectors_sent2

        # feedforward pass on the summary
        out = self.lin1(summary_vectors)
        out = F.leaky_relu(out)

        logits = self.linear_output(out).squeeze(1)

        pred = torch.sigmoid(logits)

        result = {'logits': logits, 'pred': pred} 

        # compute loss
        if y is not None:
            loss = self.loss(pred, y)
            result['loss'] = loss

        return result


    def loss(self, pred, y):
        """Binary cross entropy between the predictions ``pred`` and the targets ``y``."""
        return self.loss_fn(pred, y)

    def get_indices_keyword(self,indices_keywords: Union[torch.Tensor, Sequence[Sequence[int]]], summary: Union[torch.Tensor, Sequence[int]] ,sent_num: int) -> torch.Tensor:
        """Return the flattened position of the keyword for every batch element.

        Args:
            indices_keywords: per-sample index triples, e.g. [[6, 10, 21], [4, 9, 22], ...].
            summary: start offset of every batch element in the flattened LSTM
                output, e.g. [0, 57, 114, 171, 228].
            sent_num: which entry of each triple to use (0 for the first sentence,
                2 for the second one).

        Returns:
            ``summary`` plus the selected keyword position, on the device of ``summary``.
        """
        if torch.is_tensor(indices_keywords):
            # slice the column directly instead of rebuilding a tensor element by element
            positions = indices_keywords[:, sent_num]
        else:
            positions = torch.tensor([item[sent_num] for item in indices_keywords])
        summary = torch.as_tensor(summary)
        return positions.to(summary.device) + summary

Trainer class that will handle the training phase for the RNN classifier

In [None]:
class TrainerRNN():
    """Training / evaluation loop for the recurrent (second approach) classifier.

    Batches are expected as ``(inputs, keyword indices, targets)``. The model must
    return a dict with ``'pred'`` (and ``'loss'`` when the targets are passed) and
    expose ``loss(pred, targets)``. Metrics are logged to TensorBoard.
    """

    def __init__(self, model, optimizer, device, exp_details):
        """
        Args:
            model: the classifier to train.
            optimizer: optimizer built on the model parameters.
            device: device the model and the batches are moved to.
            exp_details: string identifying the experiment; used as TensorBoard
                comment and as prefix of the saved checkpoints.
        """
        self.device = device
        self.model = model
        self.optimizer = optimizer
        # Stored so that `train` does not depend on a global variable of the same name.
        self.exp_details = exp_details
        self.writer = SummaryWriter(comment="_"+exp_details)
        self.model.train()  # we are using this model for training
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, eval_dataset, epochs: int = 1, early_stopping: bool = False, early_stopping_patience:int = 3, to_be_saved: bool =False) -> List[float]:
        """Train the model, evaluating on ``eval_dataset`` after every epoch.

        Args:
            train_dataset: iterable of training batches.
            eval_dataset: iterable of validation batches.
            epochs: maximum number of epochs (must be at least 1).
            early_stopping: when True, stop once the validation accuracy has dropped
                with respect to the previous epoch more than
                ``early_stopping_patience`` times (the counter is never reset).
            early_stopping_patience: number of tolerated drops before stopping.
            to_be_saved: when True, save the model state whenever the validation
                accuracy beats the best one so far (initial threshold 0.679) after epoch 3.

        Returns:
            ``[train loss, eval loss, eval accuracy, eval F1]`` averaged over the
            epochs whose metrics were actually accumulated.

        Raises:
            ValueError: if ``epochs`` is smaller than 1.

        Note:
            The gradient clipping threshold is read from the notebook-level ``CLIP_GRAD``.
        """
        if epochs < 1:
            raise ValueError("epochs must be at least 1, got {}".format(epochs))

        train_loss = 0.0
        eval_loss = 0.0
        eval_acc = 0.0
        eval_f1 = 0.0
        valid_history = []
        patience_counter = 0
        completed_epochs = 0
        best_avg_acc = 0.679
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0

            self.model.train()

            # each element (sample) in train_dataset is a batch
            for sample in train_dataset:
                # inputs in the batch
                inputs = sample[0].to(self.device)
                # indices of keywords
                idx_start = sample[1].to(self.device)
                # outputs in the batch
                targets = sample[2].to(self.device)

                output_distribution = self.model(inputs, idx_start, targets)

                loss = output_distribution['loss']

                loss.backward()  #  backpropagate the loss
                # clip the gradient norm before updating the parameters
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), CLIP_GRAD)
                self.optimizer.step()
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1

            avg_epoch_loss = epoch_loss / len_train

            avg_eval_loss,avg_accuracy_loss,avg_f1_score, avg_recall_score = self.eval_metrics(eval_dataset)
            valid_history.append(avg_accuracy_loss)

            if early_stopping and epoch > 0:
                if valid_history[-1] < valid_history[-2]:
                    if patience_counter >= early_stopping_patience:
                        print('Early stop.')
                        break
                    else:
                        print('-- Patience.\n')
                        patience_counter += 1

            self.writer.add_scalar("Train/loss", avg_epoch_loss, epoch)
            self.writer.add_scalar("Eval/loss", avg_eval_loss, epoch)
            self.writer.add_scalar("Eval/accuracy", avg_accuracy_loss, epoch)
            self.writer.add_scalar("Eval/F1_score", avg_f1_score, epoch)
            self.writer.add_scalar("Eval/recall_score", avg_recall_score, epoch)
            print('Epoch: {} avg loss = {:0.4f} eval loss = {:0.4f} ACC = {:0.4f} F1 = {:0.4f} RECALL = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy_loss,avg_f1_score,avg_recall_score))

            train_loss += avg_epoch_loss
            eval_loss += avg_eval_loss
            eval_acc += avg_accuracy_loss
            eval_f1 += avg_f1_score
            completed_epochs += 1

            self.writer.flush()

            if to_be_saved:
                if avg_accuracy_loss > best_avg_acc and epoch>3:
                    model_dir = root_folder+'/model/'
                    os.makedirs(model_dir, exist_ok=True)
                    torch.save(self.model.state_dict(), model_dir+self.exp_details+'_epoch_{}_acc_{:0.4f}.pt'.format(epoch, avg_accuracy_loss)) # save the model state
                    best_avg_acc = avg_accuracy_loss

        # Average only over the epochs that were accumulated: when early stopping
        # breaks out of the loop, the metrics of the interrupted epoch are not summed,
        # so dividing by `epoch + 1` would under-estimate every average.
        return [train_loss / completed_epochs, eval_loss / completed_epochs, eval_acc / completed_epochs, eval_f1 / completed_epochs]

    def eval_metrics(self,eval_dataset):
        """Return the metrics of the current epoch on ``eval_dataset``.

        Returns:
            ``(loss, accuracy, f1, recall)``, each one computed per batch and then
            averaged over the number of batches.
        """
        self.model.eval()
        epoch_val_loss = 0.0
        len_val_train = 0
        accuracy = 0.0
        recall = 0.0
        f1 = 0.0
        # No gradients are needed for evaluation: skip building the autograd graph.
        with torch.no_grad():
            for sample in eval_dataset:
                # inputs in the batch
                inputs = sample[0].to(self.device)
                idx_start = sample[1].to(self.device)

                # outputs in the batch
                targets = sample[2].to(self.device)

                output_distribution = self.model(inputs, idx_start)
                loss = self.model.loss(output_distribution['pred'], targets)  # compute loss
                # predictions above 0.5 count as the positive class
                y_pred = (output_distribution['pred']>0.5).float().cpu()
                y_true = targets.cpu()

                accuracy += accuracy_score(y_true, y_pred)
                f1 += f1_score(y_true,y_pred)
                recall += recall_score(y_true, y_pred)
                epoch_val_loss += loss.item()

                len_val_train += 1

        avg_eval_loss = epoch_val_loss / len_val_train
        avg_accuracy_loss = accuracy / len_val_train
        avg_f1_score = f1/len_val_train
        avg_recall_score = recall/len_val_train

        return avg_eval_loss, avg_accuracy_loss, avg_f1_score, avg_recall_score

Creating the data module that handles the dataset and choosing the batch size.

In [None]:
BATCH_SIZE = 40 #@param {type:"slider", min:8, max:64, step:4}

# Same data module as the first approach, but sentences are turned into vocabulary
# indices and batches are padded by the custom collate function.
sentences_rnn_dm = SentencesDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=BATCH_SIZE,
    collate_fn = rnn_collate_fn,
    vectorize_function = sentence2indices
)

Hyperparameter setup

In [None]:
#@title Setup of Hyper-parameters{ run: "auto" }
# Values consumed by the training cell of the recurrent classifier.

# Hidden size of the LSTM (per direction).
n_hidden=82 #@param {type:"slider", min:50, max:300, step:16}
# Forwarded to the classifier constructor and recorded in the experiment name.
drop_prob=0.5 #@param {type:"slider", min:0, max:1, step:0.05}
# Whether the LSTM is bidirectional.
bidir = True #@param ["True", "False"] {type:"raw"}
# Learning rate of the optimizer.
learning_rate = 0.0001 #@param {type:"slider", min:0.00001, max:0.001, step:0.00001}
# Number of training epochs.
epochs = 25 #@param {type:"slider", min:10, max:100, step:10}
# Number of stacked LSTM layers.
n_layer_lstm = 2 #@param {type:"slider", min:1, max:4, step:1}
# Maximum gradient norm; read as a global by TrainerRNN.train when clipping gradients.
CLIP_GRAD = 1 #@param {type:"slider", min:1, max:10, step:1}


Start the training

In [None]:
# cuDNN settings chosen for reproducible runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

sentences_recurrent_classifier = SentencesRecurrentClassifier(vectors_store, n_hidden=n_hidden,drop_prob=drop_prob, bidir = bidir, n_layer_lstm = n_layer_lstm)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# define the optimizer
optimizer = torch.optim.Adam(sentences_recurrent_classifier.parameters(), lr=learning_rate)

# string to identify the model once saved or in the graphs
exp_details="diff_leakyrelu_" + str(drop_prob) + "drop_"+str(n_hidden) +"hidden_"+str(learning_rate) +"lr_" + str(BATCH_SIZE) +"batch_" + str(n_layer_lstm) +"lstmLayer_" + str(CLIP_GRAD) +"clipGrad"

trainer = TrainerRNN(sentences_recurrent_classifier, optimizer, device, exp_details, )

# loading of the datasets
train_dataloader = sentences_rnn_dm.train_dataloader()
val_dataloader = sentences_rnn_dm.val_dataloader()

# train() returns the train loss, eval loss, eval accuracy and eval F1 averaged over the epochs
avg_train_loss,  avg_eval_loss, avg_accuracy_loss, avg_f1_score = trainer.train(train_dataloader, val_dataloader, epochs=epochs, early_stopping=False, early_stopping_patience=6, to_be_saved = True)
print(" avg_train_loss={}\n avg_eval_loss={}\n avg_acc_loss={}\n avg_f1_loss={}\n".format(avg_train_loss , avg_eval_loss, avg_accuracy_loss, avg_f1_score))



See some nice graphs of the finished training

In [None]:
# Reload the TensorBoard extension; if reloading fails, fall back to loading it.
try:
    %reload_ext tensorboard
except:
    %load_ext tensorboard
# Show the metrics logged under the "runs" folder.
%tensorboard --logdir=runs