<a href="https://colab.research.google.com/github/LuigiSigillo/nlp2021-hw/blob/master/hw1/stud/nlp_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial setup for both methodologies

## Imports

In [33]:
from google.colab import drive
# general
import matplotlib.pyplot as plt
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *
import string
import json

# torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter

# NLTK
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')


# SKLEARN
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Mount Google Drive: the dataset, the GloVe file and the saved checkpoints
# are all resolved relative to `root_folder` in the cells below.
drive.mount('/content/drive')
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw1'
# Folder holding the train/dev .jsonl files read by the data modules.
dataset_folder = os.path.join(root_folder,'data')

''' code to download and move the glove embeddings in the right folder '''
#! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
#! unzip -d data/glove.6B
#! cd '/content/drive/My Drive/NLP/nlp2021-hw1'
#!unzip '/content/drive/My Drive/NLP/nlp2021-hw1/glove.6B.zip'
# !mv glove.6B.200d.txt '/content/drive/My Drive/NLP/nlp2021-hw1/model'
# !ls "{root_folder}/model/"
!nvidia-smi

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fri Apr 23 15:05:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P0    32W /  70W |   1202MiB / 151

In [34]:
#@title Setup of parameters{ run: "auto" }
# The `#@param` annotations below are Colab form fields; keep them on the same
# line as the assignment they describe.

# Embedding size, kept as a string because it is concatenated into the GloVe
# file name; it is converted with int() wherever a number is needed.
WE_LENGTH = "100" #@param [50,100,200,300]
# Pooling strategy forwarded to the vectorize function by SentencesDataset.
METHOD = "avg" #@param ["avg","sum","weigthed_avg"]
# When True the two sentences are joined with a " SEP " token and a random
# "SEP" vector is added to the embeddings dictionary.
USE_SEP = True #@param ["True", "False"] {type:"raw"}
# Maximum number of embedding lines read from the GloVe file.
WORDS_LIMIT = 140000 #@param {type:"slider", min:20000, max:200000, step:20000}


# Seed both torch and numpy: the random UNK/SEP/pad vectors come from both.
torch.manual_seed(42)
np.random.seed(42)


## GloVe word embeddings

The special "UNK" and "SEP" tokens are also added to the dictionary, each with its own random vector.

In [35]:
class GloVEEmbedding():
    """Loader for pre-trained GloVe vectors stored as a plain-text file.

    Each line of the file is expected to be ``<word> <v1> <v2> ... <vN>``.
    The file location and the limits are taken from the notebook-level
    settings ``root_folder``, ``WE_LENGTH``, ``WORDS_LIMIT`` and ``USE_SEP``.
    """

    def __init__(self):
        # word -> 1-D float tensor with the embedding of that word
        self.word_vectors = dict()


    def get_word_vectors(self):
        """Read up to ``WORDS_LIMIT`` embeddings and add the special tokens.

        Returns:
            The ``word -> tensor`` dictionary (also stored in
            ``self.word_vectors``). It always contains a random ``"UNK"``
            vector and, when ``USE_SEP`` is set, a random ``"SEP"`` vector.
        """
        embeddings_path = root_folder+'/model/glove.6B.'+WE_LENGTH+'d.txt'
        # Explicit encoding so the result does not depend on the platform default.
        with open(embeddings_path, encoding='utf-8') as f:
            loaded = 0
            for line_no, line in tqdm(enumerate(f), total=WORDS_LIMIT):
                if loaded == WORDS_LIMIT:
                    break
                word, *vector = line.strip().split(' ')
                if line_no == 0 and len(vector) == 1:
                    # word2vec-style "<vocab_size> <dim>" header. Plain GloVe
                    # files have no header, so the first line is only skipped
                    # when it really looks like one; skipping it blindly would
                    # silently drop the most frequent word.
                    continue
                self.word_vectors[word] = torch.tensor([float(c) for c in vector])
                loaded += 1

        # Random vector used for every out-of-vocabulary word.
        self.word_vectors["UNK"] = torch.tensor(np.random.random(int(WE_LENGTH)),dtype=torch.float)

        if USE_SEP:
            # Random vector for the token that separates the two sentences.
            self.word_vectors["SEP"] = torch.tensor(np.random.random(int(WE_LENGTH)),dtype=torch.float)
        return self.word_vectors

## Dataset class and interfaces

In [36]:
class SentencesDataset(torch.utils.data.Dataset):
    """In-memory dataset of sentence pairs read from a JSON-lines file.

    Every kept sample is a tuple ``(vector, label, indices)`` where ``vector``
    is whatever the supplied vectorize function returns for the joined
    sentence, ``label`` is ``np.float32(1)``/``np.float32(0)`` and ``indices``
    holds three token positions: target word in the first sentence, the
    ``SEP`` token, target word in the second sentence.
    """

    # Translation table mapping every punctuation character to a space;
    # built once instead of once per sentence.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    # English stop words, loaded lazily on first use (needs the NLTK corpus).
    _stop_words_cache = None

    def __init__(self, dataset_path: str, phrase2vector):
        """
        Args:
            dataset_path: path of the ``.jsonl`` file to load.
            phrase2vector: callable ``(sentence, method, keyword)`` returning
                the representation to store, or ``None`` to drop the sample.
        """
        self.data_store = []
        self.init_structures(dataset_path, phrase2vector)

    def init_structures(self, dataset_path: str, phrase2vector) -> None:
        """Parse the file and fill ``self.data_store``.

        Samples are skipped when the three positions cannot be located or when
        the vectorize function returns ``None``.
        """
        sep = " " if not USE_SEP else " SEP "

        with open(dataset_path, encoding='utf-8') as f:
            for json_string in f:
                if not json_string.strip():
                    # tolerate blank lines (e.g. a trailing newline at EOF)
                    continue
                single_json = json.loads(json_string)

                # surface forms of the target word, cut out by character offsets
                keyword = single_json['sentence1'][int(single_json['start1']):int(single_json['end1'])]
                keyword2 = single_json['sentence2'][int(single_json['start2']):int(single_json['end2'])]
                lemma = single_json['lemma']

                lemmatized1 = self.use_only_lemma(single_json['sentence1'],lemma,keyword)
                lemmatized2 = self.use_only_lemma(single_json['sentence2'],lemma,keyword2)

                sentence =  self.remove_stopwords(lemmatized1) + sep + self.remove_stopwords(lemmatized2)

                # Cheap check first: no need to embed a sample that is dropped.
                indices = self.get_kwd_indices(sentence,[keyword,keyword2,lemma])
                if len(indices)!=3:
                    continue

                # NOTE(review): the surface form `keyword` is forwarded although
                # the sentence has been rewritten with `lemma`; confirm this is
                # what the "weigthed_avg" method is meant to match against.
                vector = phrase2vector(sentence,METHOD,keyword)
                if vector is None:
                    continue

                ground_t = np.float32(1) if single_json['label'] =='True' else np.float32(0)
                self.data_store.append((vector,ground_t,indices))

    @classmethod
    def _stop_words(cls):
        """Return the cached English stop-word set, loading it on first use."""
        if cls._stop_words_cache is None:
            cls._stop_words_cache = frozenset(stopwords.words('english'))
        return cls._stop_words_cache

    def remove_stopwords(self,sent):
        """Replace punctuation with spaces, tokenize and drop stop words.

        The stop-word comparison is case-insensitive; kept tokens preserve
        their original casing.
        """
        stop_words = SentencesDataset._stop_words()
        word_tokens = word_tokenize(sent.translate(SentencesDataset._PUNCT_TO_SPACE))

        filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]
        return " ".join(filtered_sentence)

    def use_only_lemma(self,sent,lemma, keyword):
        """Replace every space-delimited token equal to ``keyword`` by ``lemma``.

        Tokens with punctuation attached (e.g. ``"mouse."``) are not equal to
        the keyword and are left unchanged.
        """
        return " ".join(lemma if w == keyword else w for w in sent.split(" "))


    def get_kwd_indices(self,sentence,keywords):
        """Locate the target word on both sides of the ``SEP`` token.

        Returns:
            ``[first_keyword_pos, sep_pos, second_keyword_pos]`` when all three
            are found in that order; otherwise a shorter (or differently
            composed) list, which the caller treats as "drop this sample".
        """
        positions = []
        seen_sep = False
        for i, token in enumerate(sentence.split(" ")):
            if token == "SEP":
                seen_sep = True
                positions.append(i)
            if token in keywords:
                if positions == []:
                    # first occurrence, before the separator
                    positions.append(i)
                elif seen_sep:
                    # first occurrence after the separator: we are done
                    positions.append(i)
                    return positions
        return positions

    def __len__(self) -> int:
        return len(self.data_store)

    def __getitem__(self, idx: int) -> Tuple[Any, np.float32, List[int]]:
        return self.data_store[idx]

In [37]:
class SentencesDataModule(nn.Module):
    """Builds the train/validation datasets and wraps them in DataLoaders."""

    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        vectorize_function: Callable,
        collate_fn=None
    ) -> None:
        """
        Args:
            data_train_path: path of the training ``.jsonl`` file.
            data_dev_path: path of the validation ``.jsonl`` file.
            batch_size: batch size used by both loaders.
            vectorize_function: callable handed to ``SentencesDataset`` to turn
                a sentence into the stored representation.
            collate_fn: optional collate function for the loaders.
        """
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn
        self.vectorize_function = vectorize_function

        # Built lazily on the first *_dataloader() call and then reused.
        self.train_dataset = None
        self.validation_dataset = None

    def _loader(self, dataset, kwargs) -> DataLoader:
        """Create a DataLoader; caller keyword arguments override the defaults."""
        options = dict(batch_size=self.batch_size, collate_fn=self.collate_fn)
        options.update(kwargs)
        return DataLoader(dataset, **options)

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        """Return a loader over the training set.

        The dataset is parsed only once; later calls reuse it. Extra keyword
        arguments (e.g. ``shuffle=True``) are forwarded to ``DataLoader``.
        Positional arguments are accepted for compatibility and ignored.
        """
        if self.train_dataset is None:
            self.train_dataset = SentencesDataset(self.data_train_path, self.vectorize_function)
        return self._loader(self.train_dataset, kwargs)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        """Return a loader over the validation set (same rules as the train one)."""
        if self.validation_dataset is None:
            self.validation_dataset = SentencesDataset(self.data_dev_path, self.vectorize_function)
        return self._loader(self.validation_dataset, kwargs)


# First approach **(word-level)**


Word-embedding-powered function $\phi$. Converts sentences to a vector by averaging the embeddings corresponding to each word in it

In [None]:
def phrase2vector(
    phrase: str,
    method: str,
    keyword: str,
    vectors: Optional[Dict[str, torch.Tensor]] = None
) -> Optional[torch.Tensor]:
    """Pool the word embeddings of a phrase into a single vector.

    Args:
        phrase: text whose tokens are separated by single spaces.
        method: ``"avg"`` (mean of the word vectors), ``"sum"`` (their sum) or
            ``"weigthed_avg"`` (mean where the vector of ``keyword`` is scaled
            by 1.5).
        keyword: token that gets the extra weight with ``"weigthed_avg"``.
        vectors: ``word -> tensor`` mapping containing an ``"UNK"`` entry.
            Defaults to the notebook-level ``word_vectors``.

    Returns:
        A 1-D tensor, or ``None`` when ``method`` is not one of the supported
        values (callers treat ``None`` as "skip this sample").
    """
    if vectors is None:
        vectors = word_vectors
    if method not in ("avg", "sum", "weigthed_avg"):
        return None

    unk_vector = vectors['UNK']
    phrases_word_vector = []
    for w in phrase.split(' '):
        if w not in vectors:
            # out-of-vocabulary words share the UNK vector (never re-weighted)
            phrases_word_vector.append(unk_vector)
        elif method == "weigthed_avg" and w == keyword:
            phrases_word_vector.append(vectors[w]*1.5)
        else:
            phrases_word_vector.append(vectors[w])

    phrases_word_vector = torch.stack(phrases_word_vector)  # tensor shape: (#words X #features)
    if method=="sum":
        # "sum" previously never collected any vector and always returned None
        return torch.sum(phrases_word_vector, dim=0)
    return torch.mean(phrases_word_vector, dim=0)

Loading and testing of the dataset

In [None]:
BATCH_SIZE = 16 #@param {type:"slider", min:8, max:64, step:8}

# Load the embeddings once; `word_vectors` is read by phrase2vector and by
# create_vocabulary further down.
glove_embed = GloVEEmbedding()
word_vectors = glove_embed.get_word_vectors()

sentences_dm = SentencesDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=BATCH_SIZE,
    vectorize_function = phrase2vector
)

val_dataloader = sentences_dm.val_dataloader()

# Smoke test: inspect only the first validation batch.
for batch in val_dataloader:
    X, y, z  = batch
    print(batch)
    print(f"batch X shape: {X.shape}")
    # the label used to say "z" while printing the shape of y
    print(f"batch y shape: {y.shape}")
    break

Create the classifier class

In [None]:
class SentencesClassifier(nn.Module):
    """One-hidden-layer binary classifier over a fixed-size sentence vector."""

    def __init__(self, n_features: int, n_hidden: int):
        """
        Args:
            n_features: size of each input vector.
            n_hidden: number of units in the hidden layer.
        """
        super().__init__()
        # hidden projection followed by a single-output layer
        self.lin1 = nn.Linear(n_features, n_hidden)
        self.output_layer = nn.Linear(n_hidden, 1)

        # binary cross-entropy computed on probabilities (not on logits)
        self.loss_fn = nn.BCELoss()

    def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Run the network on a batch.

        Returns:
            A dict with ``'logits'`` (raw scores), ``'pred'`` (sigmoid of the
            logits) and, only when ``y`` is given, ``'loss'``.
        """
        hidden = self.lin1(x).relu()
        # drop the trailing singleton dimension: one score per batch element
        logits = self.output_layer(hidden).squeeze(1)
        pred = logits.sigmoid()

        result = dict(logits=logits, pred=pred)
        if y is None:
            return result

        # the loss is evaluated on the probabilities
        result['loss'] = self.loss(pred, y)
        return result

    def loss(self, pred, y):
        """Binary cross-entropy between predicted probabilities and targets."""
        return self.loss_fn(pred, y)

### Training

Defining a trainer class to better separate our work.

In [None]:
class Trainer():
    """Training/evaluation loop for the word-level classifier.

    Batches are expected to expose the inputs at position 0 and the targets at
    position 1. Metrics are logged to TensorBoard through ``SummaryWriter``.
    """

    def __init__(self, model, optimizer, device):
        """
        Args:
            model: module returning a dict with a ``'pred'`` entry and exposing
                ``loss(pred, targets)``.
            optimizer: optimizer already bound to the model parameters.
            device: device the model and the batches are moved to.
        """
        self.device = device

        self.model = model
        self.optimizer = optimizer
        self.writer = SummaryWriter()
        self.model.train()  # some layers behave differently in train and eval mode
        self.model.to(self.device)  # move model to GPU if available

    def _evaluate(self, eval_dataset):
        """Return ``(mean batch loss, accuracy, F1)`` on ``eval_dataset``.

        Accuracy and F1 are computed once over all predictions: averaging
        per-batch F1 scores does not give the F1 of the whole set.
        """
        self.model.eval()
        total_loss = 0.0
        n_batches = 0
        predictions, references = [], []
        # no gradients are needed here; avoids building autograd graphs
        with torch.no_grad():
            for sample in eval_dataset:
                inputs = sample[0].to(self.device)
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)
                total_loss += loss.item()
                n_batches += 1
                predictions.append((output_distribution['pred']>0.5).float().cpu())
                references.append(targets.cpu())

        if n_batches == 0:
            return 0.0, 0.0, 0.0
        y_pred = torch.cat(predictions).numpy()
        y_true = torch.cat(references).numpy()
        return total_loss / n_batches, accuracy_score(y_true, y_pred), f1_score(y_true, y_pred)

    def train(self, train_dataset, eval_dataset, epochs=1):
        """Train for ``epochs`` epochs, evaluating after each one.

        Args:
            train_dataset: iterable of training batches.
            eval_dataset: iterable of validation batches.
            epochs: number of passes over ``train_dataset`` (at least 1).

        Returns:
            The training loss averaged over the epochs.

        Raises:
            ValueError: if ``epochs`` is smaller than 1.
        """
        if epochs < 1:
            raise ValueError("epochs must be >= 1, got {}".format(epochs))

        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0
            self.model.train()
            # each element (sample) in train_dataset is a batch
            for sample in train_dataset:
                inputs = sample[0].to(self.device)
                targets = sample[1].to(self.device)
                output_distribution = self.model(inputs)
                loss = self.model.loss(output_distribution['pred'], targets)
                loss.backward()
                self.optimizer.step()
                # reset the gradients so they do not accumulate across batches
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1

            avg_eval_loss, avg_accuracy, avg_f1_score = self._evaluate(eval_dataset)
            # max(..., 1) keeps an empty loader from dividing by zero
            avg_epoch_loss = epoch_loss / max(len_train, 1)

            self.writer.add_scalar("Train/loss", avg_epoch_loss, epoch)
            self.writer.add_scalar("Eval/loss", avg_eval_loss, epoch)
            self.writer.add_scalar("Eval/accuracy", avg_accuracy, epoch)
            self.writer.add_scalar("Eval/F1_score", avg_f1_score, epoch)

            print('Epoch: {} avg loss = {:0.4f} avg_eval_loss = {:0.4f} avg_eval_acc = {:0.4f} avg_eval_f1 = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy, avg_f1_score))

            train_loss += avg_epoch_loss

        # the checkpoint is named after the index of the last epoch
        torch.save(self.model.state_dict(),os.path.join(root_folder +"/model", 'state_{}.pt'.format(epochs - 1)))  # save the model state
        self.writer.flush()
        return train_loss / epochs

Instantiating the classifier and tuning some hyperparameters: the number of hidden units, the learning rate and the number of epochs

In [None]:
# Input size must match the embedding size, since the classifier consumes the
# pooled GloVe vectors.
sent_classifier = SentencesClassifier(
    n_features=int(WE_LENGTH), 
    n_hidden=200 #@param {type:"slider", min:50, max:300, step:50}
)
# NOTE: `learning_rate` and `epochs` are reassigned in the RNN hyperparameter
# cell further down; re-run this cell before re-training this classifier.
learning_rate = 0.0391 #@param {type:"slider", min:0.0001, max:0.1, step:0.001}
epochs = 100 #@param {type:"slider", min:50, max:300, step:10}

Defining the optimizer (stochastic gradient descent) and instantiating the trainer to start the training.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.SGD(sent_classifier.parameters(), lr=learning_rate)

trainer = Trainer(sent_classifier, optimizer, device)

# `val_dataloader` comes from the dataset smoke-test cell above.
train_dataloader = sentences_dm.train_dataloader()
# Start TensorBoard before training so the curves can be followed live.
%load_ext tensorboard
%tensorboard --logdir=runs
avg_loss = trainer.train(train_dataloader,val_dataloader, epochs=epochs)
print(avg_loss)

Function to run a prediction on a custom test phrase and print the probability that the keyword is used with the same meaning in both sentences

In [None]:
def predict(model, phrase2vector, phrase: str, keyword: str):
    """Print the probability predicted by ``model`` for a single phrase.

    Args:
        model: trained classifier returning a dict with a ``'pred'`` entry.
        phrase2vector: callable ``(phrase, method, keyword)`` producing the
            input vector (or ``None`` if the phrase cannot be vectorized).
        phrase: the two sentences, already joined in a single string.
        keyword: the target word.

    Raises:
        ValueError: if ``phrase2vector`` returns ``None``.
    """
    phrase_vector = phrase2vector(phrase,METHOD,keyword)
    if phrase_vector is None:
        raise ValueError("could not build a vector for phrase: {!r}".format(phrase))

    # Run on whatever device the model lives on, rather than guessing it.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # inference only
            # add a dimension to create a one-item batch
            forward_out = model(phrase_vector.to(device).unsqueeze(0))
    finally:
        # leave the model in the mode we found it in
        model.train(was_training)

    print(f"# Sentences: {phrase}")
    for prob in forward_out["pred"]:
        print("\n {}".format( prob) )
# Two hand-written pairs for the word "mouse": device vs. animal, then animal vs. animal.
predict(sent_classifier, phrase2vector, "The cat eats the mouse SEP Use the mouse to click on the button", "mouse")
predict(sent_classifier, phrase2vector, "The cat eats the mouse SEP The mouse escaped from the predator", "mouse")



# Second approach **(sequence encoding with RNN)**


Let's start by indexing each word in our vocabulary

In [38]:
def create_vocabulary():
    """Index every known word and stack the embeddings in one matrix.

    Row 0 is a random vector for the padding token and row 1 a random vector
    for unknown words; the words of ``word_vectors`` follow in iteration order.

    Returns:
        ``(word_index, vectors_store)``: a ``defaultdict`` mapping words to row
        numbers (unknown words map to 1) and the stacked embedding tensor.
    """
    embedding_size = int(WE_LENGTH)

    # index 0 -> pad token, index 1 -> unk token (both randomly initialised)
    rows = [torch.rand(embedding_size), torch.rand(embedding_size)]

    known_words = dict()
    for position, (word, vector) in enumerate(word_vectors.items(), start=len(rows)):
        known_words[word] = position
        rows.append(vector)

    # any word missing from the vocabulary resolves to the unk index
    word_index = defaultdict(lambda: 1, known_words)
    return word_index, torch.stack(rows)

Checking the shape of the built vocabulary

In [39]:
word_index,vectors_store = create_vocabulary()

# Rows = pad + unk + every entry of word_vectors; columns = embedding size.
vocabulary_size, hidden_features = vectors_store.shape
print(f"Vocabulary size: {vocabulary_size}\nHidden features: {hidden_features}")
# Look up a single word; `word_index` is a defaultdict, so a word that is not
# in the vocabulary yields 1 (and is inserted by the lookup).
word_index['unk'] 


Vocabulary size: 140004
Hidden features: 100


1

Similar to `phrase2vector`, this function maps each word of the sentence to the corresponding index in the vocabulary built above


In [40]:
def sentence2indices(sentence: str,method: str,keyword: str) -> torch.Tensor:
    """Map a space-separated sentence to a tensor of vocabulary indices.

    ``method`` and ``keyword`` are not used; they are accepted so the function
    can be called the same way as the other vectorize function.
    """
    indices = []
    for token in sentence.split(' '):
        # word_index falls back to the unk index for unseen tokens
        indices.append(word_index[token])
    return torch.tensor(indices, dtype=torch.long)

Custom collate function that pads the sentences in each batch and returns a tuple with X, the original lengths X_length (kept so that offsets can be computed later), the targets, and the indices that tell where the target words are

In [41]:
def rnn_collate_fn(
    data_elements: List[Tuple[torch.Tensor, Any, Sequence[int]]] # list of (x, y, keyword positions) triples
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Collate variable-length index sequences into one padded batch.

    Args:
        data_elements: samples as ``(indices, label, positions)`` where
            ``indices`` is a 1-D tensor of word indices and ``positions`` lists
            the token positions of interest for that sample. Every sample must
            carry the same number of positions.

    Returns:
        ``(X, X_lengths, y, keyword_position)``:
        ``X`` is the batch padded with 0 to the longest sequence,
        ``X_lengths`` the original length of each sequence,
        ``y`` the labels and ``keyword_position`` the stacked positions.
    """
    X = [de[0] for de in data_elements]  # list of index tensors
    # original (unpadded) length of every sequence
    X_lengths = torch.tensor([x.size(0) for x in X], dtype=torch.long)

    # 0 is the index reserved for the pad token
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)  #  shape (batch_size x max_seq_len)

    y = torch.tensor([de[1] for de in data_elements])
    keyword_position = torch.tensor([de[2] for de in data_elements])

    return X, X_lengths, y, keyword_position

#### Training 

Recurrent classifier definition with a customized forward pass

In [42]:
class SentencesRecurrentClassifier(nn.Module):
    """LSTM encoder that classifies a sentence pair from two token positions.

    The joined sentence is encoded by an (optionally bidirectional) LSTM; the
    hidden states at the two target-word positions are multiplied
    element-wise and fed to a small feed-forward classifier.
    """

    def __init__(
        self,
        vectors_store: torch.Tensor,
        n_hidden: int,
        drop_prob: float,
        bidir: bool
    ) -> None:
        """
        Args:
            vectors_store: pre-trained embedding matrix (one row per index).
            n_hidden: hidden size of the LSTM (per direction).
            drop_prob: probability for the dropout layer.
            bidir: whether the LSTM is bidirectional.
        """
        super().__init__()

        # embedding layer initialised from the pre-trained vectors
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store)
        self.n_hidden = n_hidden
        # recurrent layer
        self.rnn = torch.nn.LSTM(input_size=vectors_store.size(1), hidden_size=n_hidden, num_layers=1, batch_first=True, bidirectional=bidir)
        # NOTE(review): this layer is created but never applied in forward().
        self.dropout = nn.Dropout(drop_prob)

        # classification: a bidirectional LSTM outputs 2 * n_hidden features
        if bidir:
            n_hidden = n_hidden*2
        self.lin1 = torch.nn.Linear(n_hidden, n_hidden)
        self.linear_output = torch.nn.Linear(n_hidden, 1)

        # criterion
        self.loss_fn = torch.nn.BCELoss()
        # Kept for backward compatibility only: forward() now derives the
        # device from its inputs so the model also runs without CUDA.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def forward(
        self, 
        X: torch.Tensor, 
        X_length: torch.Tensor,
        indices_keyword: torch.Tensor, 
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:
        """Classify a batch of padded index sequences.

        Args:
            X: ``(batch, seq_len)`` tensor of word indices.
            X_length: original lengths of the sequences (currently unused).
            indices_keyword: per-sample positions; column 0 is the target word
                in the first sentence, column 2 the one in the second sentence.
            y: optional targets; when given the loss is added to the result.

        Returns:
            A dict with ``'logits'``, ``'pred'`` (sigmoid of the logits) and,
            if ``y`` is given, ``'loss'``.
        """
        embedding_out = self.embedding(X)
        # recurrent encoding
        # NOTE(review): the padded batch is fed as-is (no packing), so with a
        # bidirectional LSTM the backward direction also reads the padding.
        recurrent_out = self.rnn(embedding_out)[0]

        batch_size, seq_len, hidden_size = recurrent_out.shape

        # flatten to (batch * seq_len, hidden) so one flat index selects a token
        flattened_out = recurrent_out.reshape(-1, hidden_size)

        # start offset of each batch element in the flattened tensor,
        # e.g. seq_len = 5 -> (0, 5, 10, 15, ...); built on the same device as
        # the activations instead of a hard-coded 'cuda'
        sequences_offsets = torch.arange(batch_size, device=recurrent_out.device) * seq_len

        summary_vectors_indices_sent1 = self.get_indices_keyword(indices_keyword, sequences_offsets,0)
        summary_vectors_indices_sent2 = self.get_indices_keyword(indices_keyword, sequences_offsets,2)

        # hidden states at the target word of each sentence
        summary_vectors_sent1 = flattened_out[summary_vectors_indices_sent1]
        summary_vectors_sent2 = flattened_out[summary_vectors_indices_sent2]

        # combine the two representations with an element-wise product
        summary_vectors = summary_vectors_sent1 * summary_vectors_sent2

        # feed-forward classification on the combined vector
        out = self.lin1(summary_vectors)
        out = F.leaky_relu(out)

        logits = self.linear_output(out).squeeze(1)
        pred = torch.sigmoid(logits)

        result = {'logits': logits, 'pred': pred}

        # compute loss
        if y is not None:
            loss = self.loss(pred, y)
            result['loss'] = loss

        return result


    def loss(self, pred, y):
        """Binary cross-entropy between predicted probabilities and targets."""
        return self.loss_fn(pred, y)

    def get_indices_keyword(self,indices_keywords,summary,sent_num):
        """Turn per-sample token positions into indices of the flattened output.

        Args:
            indices_keywords: ``(batch, k)`` positions (tensor or nested list).
            summary: start offset of every batch element, e.g. [0, 57, 114].
            sent_num: which column of ``indices_keywords`` to use.

        Returns:
            ``indices_keywords[:, sent_num] + summary`` on the device of
            ``summary``.
        """
        positions = torch.as_tensor(indices_keywords, device=summary.device)
        return positions[:, sent_num] + summary

Trainer class that will handle the training phase for the RNN classifier

In [47]:
class TrainerRNN():
    """Training/evaluation loop for the recurrent classifier.

    Batches are expected as ``(inputs, lengths, targets, keyword positions)``.
    Metrics are logged to TensorBoard through ``SummaryWriter``.
    """

    def __init__(self, model, optimizer, device, exp_details):
        """
        Args:
            model: module called as ``model(inputs, lengths, positions[, targets])``
                and returning a dict with ``'pred'`` (and ``'loss'`` when the
                targets are passed).
            optimizer: optimizer already bound to the model parameters.
            device: device the model and the batches are moved to.
            exp_details: experiment tag used in the TensorBoard run name and
                in the checkpoint file name.
        """
        self.device = device

        self.model = model
        self.optimizer = optimizer
        # stored so train() does not depend on a notebook-level variable
        self.exp_details = exp_details
        self.writer = SummaryWriter(comment="_"+exp_details)

        self.model.train()  # we are using this model for training
        self.model.to(self.device)  # move model to GPU if available

    def _to_device(self, sample):
        """Move the four components of a batch to ``self.device``."""
        inputs = sample[0].to(self.device)
        x_lengths = sample[1].to(self.device)
        targets = sample[2].to(self.device)
        # as_tensor: no copy (and no warning) when this is already a tensor
        idx_start = torch.as_tensor(sample[3]).to(self.device)
        return inputs, x_lengths, targets, idx_start

    def _evaluate(self, eval_dataset):
        """Return ``(mean batch loss, accuracy, F1)`` on ``eval_dataset``.

        Accuracy and F1 are computed once over all predictions: averaging
        per-batch F1 scores does not give the F1 of the whole set.
        """
        self.model.eval()
        total_loss = 0.0
        n_batches = 0
        predictions, references = [], []
        # no gradients are needed here; avoids building autograd graphs
        with torch.no_grad():
            for sample in eval_dataset:
                inputs, x_lengths, targets, idx_start = self._to_device(sample)
                output_distribution = self.model(inputs, x_lengths, idx_start)
                loss = self.model.loss(output_distribution['pred'], targets)
                total_loss += loss.item()
                n_batches += 1
                predictions.append((output_distribution['pred']>0.5).float().cpu())
                references.append(targets.cpu())

        if n_batches == 0:
            return 0.0, 0.0, 0.0
        y_pred = torch.cat(predictions).numpy()
        y_true = torch.cat(references).numpy()
        return total_loss / n_batches, accuracy_score(y_true, y_pred), f1_score(y_true, y_pred)

    def train(self, train_dataset, eval_dataset, epochs=1):
        """Train for ``epochs`` epochs, evaluating after each one.

        Args:
            train_dataset: iterable of training batches.
            eval_dataset: iterable of validation batches.
            epochs: number of passes over ``train_dataset`` (at least 1).

        Returns:
            The training loss averaged over the epochs.

        Raises:
            ValueError: if ``epochs`` is smaller than 1.
        """
        if epochs < 1:
            raise ValueError("epochs must be >= 1, got {}".format(epochs))

        train_loss = 0.0
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0
            self.model.train()

            # each element (sample) in train_dataset is a batch
            for sample in train_dataset:
                inputs, x_lengths, targets, idx_start = self._to_device(sample)

                # passing the targets makes the model return the loss as well
                output_distribution = self.model(inputs, x_lengths, idx_start, targets)
                loss = output_distribution['loss']

                loss.backward()
                self.optimizer.step()
                # reset the gradients so they do not accumulate across batches
                self.optimizer.zero_grad()

                epoch_loss += loss.item()
                len_train += 1

            avg_eval_loss, avg_accuracy, avg_f1_score = self._evaluate(eval_dataset)
            # max(..., 1) keeps an empty loader from dividing by zero
            avg_epoch_loss = epoch_loss / max(len_train, 1)

            self.writer.add_scalar("Train/loss", avg_epoch_loss, epoch)
            self.writer.add_scalar("Eval/loss", avg_eval_loss, epoch)
            self.writer.add_scalar("Eval/accuracy", avg_accuracy, epoch)
            self.writer.add_scalar("Eval/F1_score", avg_f1_score, epoch)
            print('Epoch: {} avg loss = {:0.4f} eval loss = {:0.4f} ACC = {:0.4f}'.format(epoch, avg_epoch_loss, avg_eval_loss, avg_accuracy))
            train_loss += avg_epoch_loss
            self.writer.flush()

        # the checkpoint is named after the experiment tag and the last epoch index
        torch.save(self.model.state_dict(),root_folder+'/model/'+self.exp_details+'_epoch_{}.pt'.format(epochs - 1)) # save the model state
        return train_loss / epochs

Creating the data module that handles the dataset and choosing the batch size

In [44]:
# NOTE: reuses (and overwrites) the BATCH_SIZE name of the word-level approach.
BATCH_SIZE = 16 #@param {type:"slider", min:8, max:64, step:8}

# Same data files as before, but sentences are stored as index sequences and
# batches are padded by the custom collate function.
sentences_rnn_dm = SentencesDataModule(
    data_train_path=dataset_folder+'/train.jsonl',
    data_dev_path=dataset_folder+'/dev.jsonl',
    batch_size=BATCH_SIZE,
    collate_fn = rnn_collate_fn,
    vectorize_function = sentence2indices
)

Hyperparameter setup

In [45]:
# Hidden size of the LSTM (per direction).
n_hidden=128 #@param {type:"slider", min:50, max:300, step:16}
# Probability passed to the model's dropout layer.
drop_prob=0.3 #@param {type:"slider", min:0, max:1, step:0.1}
bidir = True #@param ["True", "False"] {type:"raw"}
# NOTE(review): the slider step (0.001) is larger than its whole range
# (0.0001-0.001), so the form can hardly select intermediate values.
# `learning_rate` and `epochs` overwrite the values set for the first approach.
learning_rate = 0.0001 #@param {type:"slider", min:0.0001, max:0.001, step:0.001}
epochs = 25 #@param {type:"slider", min:10, max:100, step:10}

Start the training

In [48]:
sentences_recurrent_classifier = SentencesRecurrentClassifier(vectors_store, n_hidden=n_hidden,drop_prob=drop_prob, bidir = bidir)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

optimizer = torch.optim.Adam(sentences_recurrent_classifier.parameters(), lr=learning_rate)

# Tag used for the TensorBoard run and the checkpoint file name.
# NOTE(review): the tag says "difference" while the classifier combines the two
# keyword vectors with an element-wise product; confirm the intended label.
exp_details="difference_leakyrelu_"

trainer = TrainerRNN(sentences_recurrent_classifier, optimizer, device, exp_details)


# These reassign the loader names used by the word-level approach.
train_dataloader = sentences_rnn_dm.train_dataloader()
val_dataloader = sentences_rnn_dm.val_dataloader()

avg_loss = trainer.train(train_dataloader, val_dataloader, epochs=epochs)
print(avg_loss)

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))



Epoch: 0 avg loss = 0.6933 eval loss = 0.6923 ACC = 0.5625
Epoch: 1 avg loss = 0.6908 eval loss = 0.6864 ACC = 0.5675
Epoch: 2 avg loss = 0.6781 eval loss = 0.6664 ACC = 0.6121
Epoch: 3 avg loss = 0.6551 eval loss = 0.6467 ACC = 0.6409
Epoch: 4 avg loss = 0.6305 eval loss = 0.6343 ACC = 0.6498
Epoch: 5 avg loss = 0.6055 eval loss = 0.6280 ACC = 0.6498
Epoch: 6 avg loss = 0.5792 eval loss = 0.6266 ACC = 0.6498
Epoch: 7 avg loss = 0.5510 eval loss = 0.6307 ACC = 0.6558
Epoch: 8 avg loss = 0.5204 eval loss = 0.6407 ACC = 0.6567
Epoch: 9 avg loss = 0.4874 eval loss = 0.6580 ACC = 0.6488
Epoch: 10 avg loss = 0.4522 eval loss = 0.6829 ACC = 0.6389
Epoch: 11 avg loss = 0.4152 eval loss = 0.7161 ACC = 0.6339
Epoch: 12 avg loss = 0.3769 eval loss = 0.7584 ACC = 0.6379
Epoch: 13 avg loss = 0.3378 eval loss = 0.8102 ACC = 0.6389
Epoch: 14 avg loss = 0.2979 eval loss = 0.8711 ACC = 0.6329
Epoch: 15 avg loss = 0.2578 eval loss = 0.9416 ACC = 0.6310
Epoch: 16 avg loss = 0.2183 eval loss = 1.0254 ACC

Inspect the TensorBoard graphs of the finished training

In [None]:
a = torch.tensor([ [ 6, 21],[ 4, 22],[ 6, 21],[ 4, 22] ], dtype=float)
b = torch.tensor([ [ 1, 1],[ 4, 2],[ 1, 21],[ 2, 2] ], dtype=float)
c = torch.mean(torch.stack((a,b)), dim = 0)
print(c)
print(torch.stack((a,b)), c.shape)
try:
    %reload_ext tensorboard
except:
    %load_ext tensorboard
%tensorboard --logdir=runs