In [None]:
# Install the notebook's dependencies into the kernel's environment (%pip targets
# the running kernel, unlike !pip).
# torchtext is pinned to 0.9.1, the version recorded in this cell's last output:
# the notebook uses the Counter-based `Vocab(counter, specials=...)` constructor,
# which later torchtext releases changed, so `--upgrade` would break it.
# NOTE(review): pytorch_lightning is left unpinned because the version used for the
# recorded run is not visible in the notebook - pin it once known.
%pip install torchtext==0.9.1
%pip install pytorch_lightning


Requirement already up-to-date: torchtext in /usr/local/lib/python3.7/dist-packages (0.9.1)


Then download the data we need: the GloVe embeddings and the homework repository with the datasets.

In [None]:
# Fetch the GloVe 840B/300d archive (about 2 GB) and the homework repository.
# The cell is safe to re-run:
#   wget -c   resumes a partial download and does nothing if the file is complete
#   unzip -n  never overwrites files that were already extracted
#   the [ -d ... ] test skips the clone when the repository is already there
!wget -c http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip -n glove.840B.300d.zip
![ -d nlp2021-hw2 ] || git clone https://github.com/SapienzaNLP/nlp2021-hw2

--2021-06-09 07:01:41--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2021-06-09 07:01:41--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2021-06-09 07:01:42--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [None]:
import os

# Folder, inside the cloned homework repository, that holds the JSON splits.
data_folder = os.path.join("nlp2021-hw2", "data")

# Each split is a list of two files: restaurants first, laptops second.
training_file = [
    os.path.join(data_folder, file_name)
    for file_name in ("restaurants_train.json", "laptops_train.json")
]
dev_file = [
    os.path.join(data_folder, file_name)
    for file_name in ("restaurants_dev.json", "laptops_dev.json")
]

In [None]:
# here go all the imports
import torch
from torch import nn
from torch.utils.data import Dataset
from torchtext import data
from torchtext.vocab import Vectors

from pprint import pprint
from tqdm import tqdm
from torchtext.vocab import Vocab
from collections import Counter
import random
import numpy as np

from typing import *

import json
import re

import nltk

from nltk import word_tokenize
nltk.download('punkt')  

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torch.optim as optim

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Set the random seeds and enable deterministic cuDNN algorithms so that the results are reproducible.

In [None]:
# Single seed shared by every random number generator used in the notebook.
SEED = 96

# Seed PyTorch, NumPy and Python's own generator with the same value.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Ask cuDNN to select deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#### Load the data

In [None]:
class MyDataset(Dataset):
    """Token-level aspect-term dataset read from one or more JSON files.

    Every file holds a list of entries. The code reads two keys from an entry:
    ``"text"`` (the sentence) and ``"targets"``, a list whose items start with
    a ``[start, end]`` character span followed by the aspect-term string.

    The constructor only tokenizes; ``index_dataset`` must be called before
    items can be fetched with ``dataset[idx]``.
    """

    def __init__(self, 
                 input_file:List, 
                 test=False,
                 device="cpu"):
        """
        Args:
            input_file (list of strings): each element is a path to a dataset to be loaded.
                Any number of files is accepted; they are concatenated in order.
            test (bool): True if the dataset is used for testing purposes
            device (string): device where to put tensors (cpu or cuda).
        """

        self.test = test
        self.input_file = input_file

        # load every JSON file and concatenate the entries in the given order
        self.data = []
        for path in input_file:
            with open(path) as fin:
                self.data.extend(json.load(fin))

        # tokenize every sentence
        for entry in self.data:
            text = entry["text"]

            # In test mode the tokenization must not depend on the position of
            # the aspect terms, so the whole sentence is tokenized at once.
            if self.test:
                entry["splitted_text"] = word_tokenize(text)
                entry["labels_index"] = None
                continue

            # Otherwise the sentence is cut at the aspect-term boundaries and
            # each piece is tokenized on its own, so that an aspect term always
            # maps to whole tokens (e.g. dashes are split inconsistently by the
            # tokenizer across sentences).
            spans = sorted((target[0] for target in entry["targets"]),
                           key=lambda span: span[0])

            begin_indexes, internal_indexes, splitted_text = [], [], []
            last_target_end = 0
            for span in spans:
                start, end = span[0], span[1]

                # text between the previous aspect term and this one
                splitted_text += word_tokenize(text[last_target_end:start])

                # the aspect term itself
                target_tokens = word_tokenize(text[start:end])

                # first token is the "begin" position, the others are "internal"
                first = len(splitted_text)
                begin_indexes.append(first)
                internal_indexes.extend(range(first + 1, first + len(target_tokens)))

                splitted_text += target_tokens
                last_target_end = end

            # whatever follows the last aspect term (the whole sentence when
            # there are no aspect terms)
            splitted_text += word_tokenize(text[last_target_end:])

            # save the tokenized text and the indices of the targets
            entry["splitted_text"] = splitted_text
            entry["begin_indexes"] = begin_indexes
            entry["internal_indexes"] = internal_indexes

        self.device = device
        self.encoded_data = None

    @staticmethod
    def _normalise_terms(terms):
        """Return the terms sorted and with all whitespace removed.

        Used to compare token-rebuilt aspect terms with the annotated ones
        without being sensitive to order or to the spaces inserted between
        tokens.
        """
        return sorted("".join(term.split()) for term in terms)

    def index_dataset(self, vocabulary):
        """Encode every entry and store the result in ``self.encoded_data``.

        Args:
            vocabulary: mapping from a lower-cased token to its index; it is
                only used as ``vocabulary[token]``.

        Each encoded item is a dict with the keys ``"inputs"`` (LongTensor of
        token indices), ``"outputs"`` (float tensor of labels, 0 = outside,
        1 = first token of an aspect term, 2 = following token; ``None`` in
        test mode), ``"true_targets"`` (annotated aspect terms, in file order)
        and ``"splitted_texts"`` (the tokens).

        Outside test mode the aspect terms rebuilt from the labels are compared
        with the annotated ones. Differences do not stop indexing; a single
        ``UserWarning`` with the number of affected sentences is emitted.
        """
        # local import so the class does not depend on the notebook's import cell
        import warnings

        self.encoded_data = list()
        mismatched = 0

        for entry in self.data:
            # get the tokenized text
            splitted_text = entry["splitted_text"]

            # indices of the (lower-cased) tokens inside the vocabulary
            index_list = torch.LongTensor(
                [vocabulary[term.lower()] for term in splitted_text]
            ).to(self.device)

            # annotated aspect terms of this sentence
            true_targets = [true_target[1] for true_target in entry["targets"]]

            if self.test:
                self.encoded_data.append({"inputs":index_list, 
                                          "outputs":None,
                                          "true_targets":true_targets,
                                          "splitted_texts": splitted_text
                                          })
                continue

            # build the labels: 0 everywhere, 1 on begin tokens, 2 on internal ones
            labels = [0]*len(splitted_text)
            for begin in entry["begin_indexes"]:
                labels[begin] = 1
            for internal in entry["internal_indexes"]:
                labels[internal] = 2

            # rebuild the aspect terms from the labels
            computed_targets = []
            for idx, label in enumerate(labels):
                if label == 1:
                    # a Begin (1) opens a new aspect term
                    computed_targets.append(splitted_text[idx])
                elif label == 2:
                    # an Internal (2) extends the previous aspect term
                    computed_targets[-1] += " " + splitted_text[idx]

            # Compare sorted copies: list.sort() returns None, so comparing the
            # results of two .sort() calls would always succeed and would also
            # reorder both lists in place.
            if self._normalise_terms(computed_targets) != self._normalise_terms(true_targets):
                mismatched += 1

            labels = torch.Tensor(labels).float().to(self.device)
            self.encoded_data.append({"inputs":index_list, 
                                      "outputs":labels,
                                      "true_targets":true_targets,
                                      "splitted_texts": splitted_text
                                      })

        if mismatched:
            warnings.warn(
                f"{mismatched} of {len(self.data)} sentences have aspect terms that "
                "could not be rebuilt exactly from the token labels"
            )

    def __len__(self):
        """Number of sentences loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded item at ``idx``; requires a prior ``index_dataset`` call."""
        if self.encoded_data is None:
            raise RuntimeError("You have to call the index_dataset func")
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        """Return the raw (tokenized but not encoded) entry at ``idx``."""
        return self.data[idx]


In [None]:
# Load the GloVe vectors from the unzipped text file; the current directory is
# passed as torchtext's cache location.
vectors = Vectors(name="glove.840B.300d.txt", cache="./")

In [None]:
def build_vocab(vectors, min_freq=1):
    """Build a vocabulary with every token of the embeddings plus two specials.

    Args:
        vectors: pretrained embeddings; only its ``stoi`` mapping is read.
        min_freq: minimum count forwarded to ``Vocab``. Every embedding token
            is counted exactly once.

    Returns:
        A ``Vocab`` whose specials are ``<pad>`` and ``<unk>``.
    """
    term_counts = Counter()
    # iter() makes Counter count the keys instead of copying the stoi values
    term_counts.update(iter(vectors.stoi))

    vocab = Vocab(term_counts, specials=['<pad>', '<unk>'], min_freq=min_freq)
    return vocab


# Build the vocabulary from the embeddings, then load the training files and
# encode them with it.
# NOTE(review): `dataset` is not referenced again in the visible cells (the data
# module builds its own datasets) - confirm whether it is still needed.
vocabulary = build_vocab(vectors, min_freq=1)
dataset = MyDataset(training_file)
dataset.index_dataset(vocabulary)

#### Define the torch Model

In [None]:
class MyModel(nn.Module):
    """Token classifier: embedding -> (Bi)LSTM -> dropout -> linear layer.

    hparams has to contain
        vocab_size: the number of tokens in the vocabulary
        embedding_dim: the size of the embeddings
        bidirectional: if the LSTM has to be BiLSTM or not
        num_layers: the number of layers in the LSTM
        dropout: the dropout value
        hidden_dim: the output dimension for a direction of the LSTM
        num_classes: the number of classes that we have to classify

    The values are read as attributes (``hparams.vocab_size`` ...).
    """
    def __init__(self, hparams, embeddings = None):
        """
        Args:
            hparams: object exposing the attributes listed in the class docstring.
            embeddings: optional pretrained embedding matrix; when given it is
                loaded with ``Embedding.from_pretrained`` (default arguments),
                otherwise a randomly initialised embedding layer of size
                ``vocab_size x embedding_dim`` is created.
        """
        super(MyModel, self).__init__()
        
        # load the pretrained embedding or build a random one
        if embeddings is not None:
            print("initializing embeddings from pretrained")
            self.word_embedding = torch.nn.Embedding.from_pretrained(embeddings)
        else:
            self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)

        # nn.LSTM applies its own dropout only between stacked layers, so it
        # is switched off when there is a single layer
        self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim, 
                            bidirectional=hparams.bidirectional,
                            num_layers=hparams.num_layers, 
                            dropout = hparams.dropout if hparams.num_layers > 1 else 0,
                            batch_first = True)

        # a bidirectional LSTM concatenates both directions; truthiness is
        # tested so that the size always agrees with what nn.LSTM received
        lstm_output_dim = hparams.hidden_dim * (2 if hparams.bidirectional else 1)

        # initialize dropout and a FC layer
        self.dropout = nn.Dropout(hparams.dropout)
        self.lin = nn.Linear(lstm_output_dim, hparams.num_classes)

    
    def forward(self, x:torch.Tensor, lengths:Union[List[int], torch.Tensor]):
        """Return the per-token logits.

        Args:
            x: padded batch of vocabulary indices, batch first.
            lengths: true length of every sequence in ``x``; a list of ints or
                a tensor on any device.

        Returns:
            Logits with ``num_classes`` scores for every position of ``x``
            (padded positions included). No activation is applied here.
        """
        # get the embeddings from the vocabulary indices
        embeddings = self.word_embedding(x)

        # pack_padded_sequence wants the lengths on the CPU; as_tensor also
        # accepts a plain list
        lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embeddings, lengths=lengths, batch_first=True, enforce_sorted=False)
        
        # get the lstm output
        packed_output, _ = self.lstm(packed)
        
        # unpack; padded positions of the LSTM output are filled with 0
        o, _ = torch.nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, padding_value=0)

        # dropout and FC
        o = self.dropout(o)
        output = self.lin(o)
        
        # the activation function is applied in the caller
        return output

        

#### Model Training (Pytorch Lightning)

In [None]:

class MyLightningModule(pl.LightningModule):
    """Lightning wrapper that trains and evaluates ``MyModel``.

    Token labels use three classes: 0 = not part of an aspect term,
    1 = first token of an aspect term, 2 = following token of an aspect term.

    hparams has to contain
        vocab_size: the number of tokens in the vocabulary
        embedding_dim: the size of the embeddings
        bidirectional: if the LSTM has to be BiLSTM or not
        num_layers: the number of layers in the LSTM
        dropout: the dropout value
        hidden_dim: the output dimension for a direction of the LSTM
        num_classes: the number of classes that we have to classify

    """

    # tokens that are glued to their neighbour without a space when an aspect
    # term is rebuilt from consecutive tokens
    _NO_SPACE_TOKENS = ("-", "(", ")", "/")

    def __init__(self, hparams, embeddings = None, *args, **kwargs):
        """
        Args:
            hparams: hyperparameters (see the class docstring); they are stored
                with ``save_hyperparameters``.
            embeddings: optional pretrained embedding matrix forwarded to ``MyModel``.
        """
        super(MyLightningModule, self).__init__(*args, **kwargs)
        
        self.save_hyperparameters(hparams)

        # initialize the CrossEntropy Loss
        self.loss_function = torch.nn.CrossEntropyLoss()
        
        # initialize the model
        self.model = MyModel(self.hparams, embeddings)

    @staticmethod
    def _valid_logits(logits, mask):
        """Flatten the logits to (positions, classes) and drop padded positions.

        ``mask`` holds one boolean per position of the padded batch, row by row.
        """
        flat = logits.reshape(-1, logits.shape[-1])
        keep = torch.as_tensor(mask, dtype=torch.bool, device=flat.device)
        return flat[keep]

    @classmethod
    def _decode_aspect_terms(cls, labels, tokens):
        """Rebuild aspect-term strings from per-token labels.

        Args:
            labels: list of ints (0, 1 or 2), one per token.
            tokens: list of token strings of the same sentence.

        Returns:
            List of aspect-term strings. Consecutive non-zero tokens are merged
            into one term unless a label 1 (begin) starts a new one.
        """
        terms, last_index = [], None
        for index, label in enumerate(labels):
            if label == 0:
                continue

            token = tokens[index]
            # a new term starts on a Begin label, on the first tagged token or
            # when the previous tagged token is not adjacent
            if label == 1 or last_index is None or last_index != index - 1:
                terms.append(token)
            # no space around the special characters
            elif terms[-1][-1] in cls._NO_SPACE_TOKENS or token in cls._NO_SPACE_TOKENS:
                terms[-1] += token
            else:
                terms[-1] += " " + token

            last_index = index
        return terms

    def forward(self, x:torch.Tensor, lengths:List):
        """Forward pass of the model; returns the logits.

        x is the sequence of vocabulary indices of each sentence
        lengths holds the length of each sentence
        """
        logits = self.model(x, lengths)
        
        return logits

    def training_step(self, batch, batch_nb):
        """Compute and log the training loss of one batch; the loss is returned."""
        inputs = batch['inputs'] # terms indices in the vocabulary
        labels = batch['outputs'] # flat tensor of 0/1/2 labels of the real tokens
        lengths = batch['lengths'] # length of every sentence
        mask = batch['mask'] # marks the non-padded positions

        # forward pass
        logits = self.forward(inputs, lengths)

        # keep only the rows that belong to real tokens
        logits = self._valid_logits(logits, mask)
        labels = labels.view(-1).long()

        # compute loss
        loss = self.loss_function(logits, labels)

        self.log('train_loss', loss, prog_bar=True, on_epoch=True)

        # return the loss in order to update the weights
        return loss
    

    def validation_step(self, batch, batch_nb):
        """Log the loss and the token-level macro F1 of one validation batch."""
        inputs = batch['inputs'] # terms indices in the vocabulary
        labels = batch['outputs'] # flat tensor of 0/1/2 labels of the real tokens
        lengths = batch['lengths'] # length of every sentence
        mask = batch['mask'] # marks the non-padded positions
        
        # forward pass
        logits = self.forward(inputs, lengths)
        
        # keep only the rows that belong to real tokens
        logits = self._valid_logits(logits, mask)
        labels = labels.view(-1).long()
        
        # predicted class of every token (softmax is monotonic, so the argmax
        # of the logits is the predicted class)
        rounded = torch.argmax(logits, dim=-1)
        
        # compute loss and f1 score (to notice, this f1 score is not the final one
        # since this f1 score does not take into account the sequences)
        sample_loss = self.loss_function(logits, labels)
        sample_f1 = f1_score(labels.detach().cpu(), rounded.detach().cpu(), average="macro")

        self.log('valid_loss', sample_loss, prog_bar=True,on_epoch=True)
        self.log('valid_f1', sample_f1, prog_bar=True, on_epoch=True)


    def test_step(self, batch, batch_nb):
        """Log precision, recall and F1 computed on whole aspect terms.

        The predicted labels of every sentence are decoded into aspect-term
        strings and compared, as sets, with the annotated terms of the sentence.
        """
        inputs = batch['inputs'] # terms indices in the vocabulary
        lengths = batch['lengths'] # length of every sentence

        true_targets = batch['true_targets'] # annotated aspect terms
        splitted_texts = batch['splitted_texts'] # tokens of every sentence

        # forward pass        
        logits = self.forward(inputs, lengths)

        # predicted class of every position
        rounded = torch.argmax(logits, dim=-1)
        tp, fp, fn = 0, 0, 0
        
        for i, row in enumerate(rounded):
            # predictions of the real tokens of this sentence as plain ints, so
            # that sentences of any length (one token included) are handled alike
            sentence_labels = row[:int(lengths[i])].tolist()
            computed_targets = self._decode_aspect_terms(sentence_labels, splitted_texts[i])

            true_out = true_targets[i]
            
            # compute true positive, false positive and false negatives
            tp += len(set(true_out) & set(computed_targets))
            fp += len(set(computed_targets) - set(true_out))
            fn += len(set(true_out) - set(computed_targets))

        # precision, recall and f1; each is 0 when its denominator is 0 (no
        # predicted terms, no annotated terms, or no correct prediction)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if precision + recall > 0:
            avg_f1 = 2* (precision*recall) / (precision + recall)
        else:
            avg_f1 = 0.0
        print(avg_f1, precision, recall, tp, fp, fn)
        
        self.log('test_f1', avg_f1, prog_bar=True, on_epoch=True)
        self.log('test_recall', recall, prog_bar=True, on_epoch=True)
        self.log('test_precision', precision, prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        """Adam over all parameters with its default settings."""
        return optim.Adam(self.parameters())

In [None]:
def collate_fn(data: Dict):
    """Merge a list of encoded sentences into one padded batch.

    data is a list of dictionaries with the following structure

        {"inputs": tensor of indices in the vocab of the terms,
         "outputs": tensor of labels or None (None in case of test),
         "true_targets": list of the true aspect terms,
         "splitted_texts": list of tokens of the sentence
        }

    Returns a dictionary with:
        "inputs": the index tensors padded with 0 (the index of the PAD token)
            and stacked batch first
        "outputs": all label tensors concatenated into one flat tensor, or
            None when the first entry has no labels
        "lengths": int64 tensor with the length of every sentence
        "mask": flat list of booleans, one per position of the padded batch
            (row by row), True on real tokens and False on padding
        "true_targets" / "splitted_texts": the per-sentence lists, unchanged
    """
    token_ids = [entry["inputs"] for entry in data]
    lengths = torch.tensor([ids.size(0) for ids in token_ids], dtype=torch.int64)

    # pad the sentences; 0 is the index of the PAD token in the vocabulary
    padded = torch.nn.utils.rnn.pad_sequence(token_ids, batch_first=True, padding_value=0)

    # labels are only available outside test mode
    if data[0]["outputs"] is not None:
        flat_labels = torch.hstack([entry["outputs"] for entry in data])
    else:
        flat_labels = None

    # position p of sentence s is a real token when p < lengths[s]
    positions = torch.arange(int(lengths.max()))
    mask = (positions.unsqueeze(0) < lengths.unsqueeze(1)).flatten().tolist()

    return {
        "inputs": padded,
        "outputs": flat_labels,
        "lengths": lengths,
        "mask": mask,
        "true_targets": [entry["true_targets"] for entry in data],
        "splitted_texts": [entry["splitted_texts"] for entry in data],
    }

In [None]:
class MyLightningDataModule(pl.LightningDataModule):
    """Data module that builds the training, dev and test datasets and loaders.

    The test dataset is built from the dev files in test mode, i.e. tokenized
    without using the positions of the aspect terms.
    """

    def __init__(self, training_file, dev_file, vocabulary, collate_fn, device="cpu"):
        """
        Args:
            training_file: list of paths of the training files.
            dev_file: list of paths of the dev files (also used for the test set).
            vocabulary: token -> index mapping used to encode the datasets.
            collate_fn: function that turns a list of items into a batch.
            device: device on which the datasets place their tensors.
        """
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        self.vocabulary = vocabulary
        self.collate_fn = collate_fn
        self.device = device

    def setup(self, stage=None):
        """Load and encode the three datasets; ``stage`` is not used."""
        # define datasets
        self.trainingset = MyDataset(self.training_file, device=self.device)
        self.devset = MyDataset(self.dev_file, device=self.device)
        self.testset = MyDataset(self.dev_file, device=self.device, test=True)

        # index data
        self.trainingset.index_dataset(self.vocabulary)
        self.devset.index_dataset(self.vocabulary)
        self.testset.index_dataset(self.vocabulary)

    def train_dataloader(self):
        # The training set is the concatenation of the input files, so without
        # shuffling every epoch would present all sentences of the first file
        # and then all sentences of the second, always in the same order.
        return DataLoader(self.trainingset, batch_size=128, shuffle=True,
                          collate_fn=self.collate_fn)

    def val_dataloader(self):
        return DataLoader(self.devset, batch_size=128, collate_fn=self.collate_fn)

    def test_dataloader(self):
        # one batch with the whole test set, so that the metrics computed in
        # a single test step cover every sentence
        return DataLoader(self.testset, batch_size=len(self.testset),
                          collate_fn=self.collate_fn)
    

#### Define Hyperparams and start Training

In [None]:

# Hyperparameters of the model; embedding_dim and vocab_size are set again once
# the pretrained embeddings are loaded.
hparams = dict(
    vocab_size=len(vocabulary),
    hidden_dim=256,
    embedding_dim=300,
    num_classes=3,
    bidirectional=True,
    num_layers=2,
    dropout=0.3,
)

In [None]:
# Start from a random matrix with one row per vocabulary entry ...
pretrained_embeddings = torch.randn(len(vocabulary), vectors.dim)

# ... and overwrite the row of every token that has a pretrained vector.
initialised = 0
for row, token in enumerate(vocabulary.itos):
    if token not in vectors.stoi:
        continue
    pretrained_embeddings[row] = vectors.get_vecs_by_tokens(token)
    initialised += 1

# the PAD token is represented by a vector of zeros
pad_row = vocabulary["<pad>"]
pretrained_embeddings[pad_row] = torch.zeros(vectors.dim)

# keep the hyperparameters in sync with the embedding matrix
hparams["embedding_dim"] = vectors.dim
hparams["vocab_size"] = len(vocabulary)

print("initialised embeddings {}".format(initialised))
print("random initialised embeddings {} ".format(len(vocabulary) - initialised))

# drop the reference to the raw vectors to save memory; cells that read
# `vectors` need it to be loaded again before they can be re-run
vectors = None

initialised embeddings 2196016
random initialised embeddings 2 


In [None]:
# Stop training when the validation loss has not improved for 3 validation runs.
early_stop_callback = EarlyStopping(monitor="valid_loss", min_delta=0.00, patience=3,
                                    verbose=False, mode="min")

# Keep the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints", filename="best_model",
                                      monitor="valid_loss", mode="min")

# initialize the datamodule and the trainer
data_module = MyLightningDataModule(training_file, dev_file, vocabulary, collate_fn, device=device)
trainer = pl.Trainer(
    val_check_interval=1.0,
    max_epochs=15,
    # one GPU unless the selected device is the CPU
    gpus=None if device == torch.device('cpu') else 1,
    callbacks=[early_stop_callback, checkpoint_callback],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [None]:
# Wrap the network and the pretrained embedding matrix in the Lightning module,
# then train it on the loaders provided by the data module.
model = MyLightningModule(hparams=hparams, embeddings=pretrained_embeddings)
trainer.fit(model=model, datamodule=data_module)


  | Name          | Type             | Params
---------------------------------------------------
0 | loss_function | CrossEntropyLoss | 0     
1 | model         | MyModel          | 661 M 
---------------------------------------------------
2.7 M     Trainable params
658 M     Non-trainable params
661 M     Total params
2,646.107 Total estimated model params size (MB)


initializing embeddings from pretrained


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




### Model Evaluation

In [None]:
# Reload the best checkpoint of the run that just finished. The path is taken
# from the checkpoint callback rather than hard-coded: when
# checkpoints/best_model.ckpt already exists from an earlier run, Lightning saves
# under a versioned name, and the fixed path would load the stale model.
# The fixed path is only a fallback for when the callback has saved nothing.
chk_path = checkpoint_callback.best_model_path or "checkpoints/best_model.ckpt"
model = MyLightningModule.load_from_checkpoint(chk_path)

In [None]:
# Evaluate the reloaded model on the test loader (a single batch with the whole set).
trainer.test(model=model, test_dataloaders=data_module.test_dataloader())

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

0.7342491710090004 0.7531584062196307 0.7162661737523105 775 254 307

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': 0.7342491710090004,
 'test_precision': 0.7531584062196307,
 'test_recall': 0.7162661737523105}
--------------------------------------------------------------------------------


[{'test_f1': 0.7342491710090004,
  'test_precision': 0.7531584062196307,
  'test_recall': 0.7162661737523105}]

In [None]:
# List the checkpoint files written during training.
!ls -la checkpoints/

In [None]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

In [None]:
# Open TensorBoard on the lightning_logs/ directory.
%tensorboard --logdir lightning_logs/