In [None]:
# Install the third-party libraries this notebook depends on.
# The recorded output of this cell shows torchtext 0.9.1 and transformers 4.6.1.
# NOTE(review): none of the packages is pinned, so a fresh run may resolve
# different versions than the ones this notebook was executed with.
!pip install torchtext --upgrade
!pip install transformers
!pip install pytorch_lightning

Requirement already up-to-date: torchtext in /usr/local/lib/python3.7/dist-packages (0.9.1)
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 7.3MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 47.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d6

Then download the data that we need.

In [None]:
!git clone https://github.com/SapienzaNLP/nlp2021-hw2

Cloning into 'nlp2021-hw2'...
remote: Enumerating objects: 32, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 32 (delta 10), reused 18 (delta 3), pack-reused 0[K
Unpacking objects: 100% (32/32), done.


In [None]:
import os

# Location of the datasets inside the cloned repository.
data_folder = os.path.join("nlp2021-hw2", "data")

# Each split is made of two domains: restaurants first, laptops second.
training_file = [
    os.path.join(data_folder, f"{domain}_train.json")
    for domain in ("restaurants", "laptops")
]
dev_file = [
    os.path.join(data_folder, f"{domain}_dev.json")
    for domain in ("restaurants", "laptops")
]

In [None]:
# here go all the imports
import torch
from torch import nn
from torch.utils.data import Dataset
from torchtext import data
from torchtext.vocab import Vectors
from transformers import BertTokenizer, BertModel


from pprint import pprint
from tqdm import tqdm
from torchtext.vocab import Vocab
from collections import Counter
import random
import numpy as np

from typing import *

import json

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torch.optim as optim

from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

from sklearn.metrics import f1_score

Set the random seeds and enable deterministic cuDNN algorithms so that the results are reproducible.

In [None]:
SEED = 96

# Seed every random number generator used in this notebook.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner: with
# benchmarking enabled cuDNN may pick a different algorithm on each run,
# which defeats the purpose of fixing the seed.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#### Load Data

In [None]:
class MyDataset(Dataset):
    """Aspect-term dataset that turns raw sentences into BERT word-piece ids.

    Every raw sample is read through ``sample["text"]`` and ``sample["targets"]``;
    each target is accessed as ``target[0][0]`` (start offset), ``target[0][1]``
    (end offset) and ``target[1]`` (the aspect term string).

    Every encoded sample is a dict with the keys ``inputs``, ``outputs``,
    ``rebuild_phrase``, ``word_pieces_to_merge``, ``lengths`` and ``true_targets``.
    """

    def __init__(self, 
                 input_file:List,
                 tokenizer,  
                 test=False,
                 device="cpu"):
        """
        Args:
            input_file (list of strings or paths): each element is a path to a
                JSON dataset to be loaded; any number of files is accepted and
                a single string is treated as a one-element list.
            tokenizer (BertTokenizer): the tokenizer to be used. It must expose
                ``tokenize``, ``convert_tokens_to_ids``, ``cls_token_id`` and
                ``sep_token_id``.
            test (bool): True if the dataset is used for testing purposes, i.e.
                the text is tokenized without looking at the aspect terms.
            device (string): stored on the instance; the dataset itself does
                not move any tensor to it.
        """

        self.test = test
        self.input_file = input_file

        # load every JSON file and concatenate the samples in the given order
        paths = [input_file] if isinstance(input_file, str) else input_file
        self.data = []
        for path in paths:
            with open(path) as fin:
                self.data.extend(json.load(fin))

        self.tokenizer = tokenizer
        self.device = device
        self.encoded_data = None
        # initialize the data
        self.init_data()

    def get_indexes_to_merge(self, word_pieces:List, start_idx:int):
        """Find the groups of word pieces that together form a single term.

        Args:
            word_pieces: word pieces of a text fragment.
            start_idx: position of ``word_pieces[0]`` in the full id sequence.

        Returns:
            A pair ``(groups, n_continuations)``: ``groups`` is a list of index
            lists (first piece of the term followed by its ``##`` pieces) and
            ``n_continuations`` is the number of pieces starting with ``##``.
        """
        word_pieces_to_merge = []

        # True while we are inside a term that already has a "##" piece
        started = False

        # number of word pieces starting with ##
        tot_to_subtract = 0

        for i, word_piece in enumerate(word_pieces):
            idx = start_idx + i

            # a piece starting with ## continues the previous piece
            if (len(word_piece)>2 and word_piece[0:2] == "##"):
                if (started):
                    # the term already has a group: extend it
                    word_pieces_to_merge[-1].append(idx)
                else:
                    # open a new group made of the previous piece and this one
                    word_pieces_to_merge.append([idx-1, idx])
                started = True
                tot_to_subtract += 1
            else:
                started = False

        return word_pieces_to_merge, tot_to_subtract

    def word_pieces2string(self, word_pieces:List):
        """Turn a list of word pieces into a string.

        ``##`` pieces are glued to the previous piece; every other piece is
        preceded by a space (so a non-empty result starts with a space unless
        the first piece is a ``##`` continuation).
        """
        s = ""
        for word_piece in word_pieces:
            if (len(word_piece) > 2 and word_piece[:2] == "##"):
                s += word_piece[2:]
            else:
                s += " " + word_piece
        return s

    def init_data(self):
        """Encode every raw sample and store the result in ``self.encoded_data``."""
        self.encoded_data = list()

        for sample in self.data:
            text = sample["text"]

            # sort the targets w.r.t. the starting index
            targets = sorted(sample["targets"], key=lambda x: x[0][0])

            # At test time the text is split without knowing where the aspect
            # terms are; during train/dev it is split around the aspect terms,
            # since some characters (e.g. dashes) are treated differently in
            # the different aspect terms.
            if (self.test):
                encoded = self._encode_test_sample(text, targets)
            else:
                encoded = self._encode_train_sample(text, targets)

            self.encoded_data.append(encoded)

    def _encode_test_sample(self, text, targets):
        """Encode a sentence without using the aspect-term offsets."""
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id

        # the true aspect terms, in order of appearance
        true_targets = [true_target[1] for true_target in targets]

        word_pieces = self.tokenizer.tokenize(text)

        # indices of the word pieces to merge; offset 1 accounts for the
        # leading CLS id
        word_pieces_to_merge, _ = self.get_indexes_to_merge(word_pieces, 1)

        # rebuild the phrase (word pieces of the same term concatenated) and
        # split it over the spaces; note that no "[end]" marker is appended here
        rebuild_phrase = ["[start]"] + self.word_pieces2string(word_pieces).split()

        # Convert the pieces directly instead of calling encode() on the list:
        # this also works when the text produces no word pieces at all.
        tokenized = [cls_id] + self.tokenizer.convert_tokens_to_ids(word_pieces) + [sep_id]

        assert tokenized.count(cls_id) == 1 and tokenized.count(sep_id) == 1
        assert tokenized[0] == cls_id and tokenized[-1] == sep_id

        return {
            "inputs": torch.tensor(tokenized),
            "outputs": None,
            "rebuild_phrase": rebuild_phrase,
            "word_pieces_to_merge": word_pieces_to_merge,
            # number of rebuilt tokens ("[start]" included)
            "lengths": len(rebuild_phrase),
            "true_targets": true_targets
        }

    def _encode_train_sample(self, text, targets):
        """Encode a sentence by splitting it around the known aspect terms."""
        cls_id = self.tokenizer.cls_token_id
        sep_id = self.tokenizer.sep_token_id

        tokenized = [cls_id]
        rebuild_phrase = ["[start]"]
        label_indexes, word_pieces_to_merge, true_targets = [], [], []

        # number of "##" pieces seen so far, used to check that
        # n_tokens + n_word_pieces_with_## = n_word_pieces
        to_subtract = 0
        last_idx = 0

        for true_target in targets:
            # start and end offsets of the aspect term
            start_idx, end_idx = true_target[0][0], true_target[0][1]

            # word pieces of the text between the previous aspect term and
            # this one, and of the aspect term itself
            word_pieces_pre = self.tokenizer.tokenize(text[last_idx:start_idx])
            word_pieces_target = self.tokenizer.tokenize(text[start_idx:end_idx])
            word_pieces = word_pieces_pre + word_pieces_target

            tmp_word_pieces_to_merge, tmp_to_subtract = self.get_indexes_to_merge(word_pieces, len(tokenized))
            word_pieces_to_merge += tmp_word_pieces_to_merge
            to_subtract += tmp_to_subtract

            tokenized += self.tokenizer.convert_tokens_to_ids(word_pieces)

            # rebuilt tokens before the aspect term
            rebuild_phrase += self.word_pieces2string(word_pieces_pre).split()
            start_label_idx = len(rebuild_phrase)

            # rebuilt tokens of the aspect term
            rebuild_target = self.word_pieces2string(word_pieces_target).split()
            rebuild_phrase += rebuild_target

            # positions (in rebuild_phrase) that belong to the aspect term
            label_indexes += list(range(start_label_idx, start_label_idx + len(rebuild_target)))

            true_targets.append(true_target[1])
            last_idx = end_idx

        # text after the last aspect term (the whole text if there is none)
        if (len(text) > last_idx):
            word_pieces = self.tokenizer.tokenize(text[last_idx:])
            tmp_word_pieces_to_merge, tmp_to_subtract = self.get_indexes_to_merge(word_pieces, len(tokenized))
            word_pieces_to_merge += tmp_word_pieces_to_merge
            to_subtract += tmp_to_subtract

            # may be an empty list, e.g. when only whitespace is left
            tokenized += self.tokenizer.convert_tokens_to_ids(word_pieces)
            rebuild_phrase += self.word_pieces2string(word_pieces).split()

        # close the sequence with the SEP id
        tokenized.append(sep_id)
        rebuild_phrase.append("[end]")

        assert tokenized.count(cls_id) == 1 and tokenized.count(sep_id) == 1
        assert tokenized[0] == cls_id and tokenized[-1] == sep_id

        # the pieces listed in the merge groups (minus the first of each group)
        # must be exactly the "##" pieces counted above
        compute_diff = sum(len(wp) - 1 for wp in word_pieces_to_merge)
        assert compute_diff == to_subtract

        # one binary label per rebuilt token: 1.0 inside an aspect term
        labels = [0.0]*len(rebuild_phrase)
        for index in label_indexes:
            labels[index] = 1.0

        # n_tokens + n_word_pieces_with_## = n_word_pieces
        assert len(tokenized) == len(rebuild_phrase) + to_subtract

        return {
            "inputs": torch.tensor(tokenized),
            "outputs": torch.tensor(labels),
            "rebuild_phrase": rebuild_phrase,
            "word_pieces_to_merge": word_pieces_to_merge,
            # number of rebuilt tokens ("[start]" and "[end]" included)
            "lengths": len(rebuild_phrase),
            "true_targets": true_targets
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.encoded_data[idx]

    def get_raw_element(self, idx):
        """Return the raw (not encoded) sample at position ``idx``."""
        return self.data[idx]


#### Define the torch Model

In [None]:
class MyModel(nn.Module):
    """BERT encoder followed by dropout and a per-token linear classifier.

    Every token is represented by the concatenation of the last four hidden
    layers of BERT taken at the position of its first word piece.
    """

    def __init__(self, hparams):
        """
        Args:
            hparams: object exposing the dropout probability as ``hparams.dropout``.
        """
        super(MyModel, self).__init__()
        # load the pretrained bert large cased model
        self.bert = BertModel.from_pretrained('bert-large-cased', output_hidden_states=True)

        # define dropout and the FC
        self.dropout = nn.Dropout(hparams.dropout)

        # Input: the last 4 hidden layers concatenated (4 * hidden size, i.e.
        # 4 * 1024 for bert-large). Output: 1 logit, since this is a binary
        # classification.
        self.lin1 = torch.nn.Linear(4 * self.bert.config.hidden_size, 1)

    def forward(self, x, word_pieces, lenghts):
        """Compute one logit per token (not per word piece).

        Args:
            x: dict with
                input_ids: the padded sequence of ids of word pieces
                attention_mask: the matching attention mask
            word_pieces: for each sentence, the groups of word-piece indices
                that have to be merged in order to get the result for a token;
                only the first index of each group is kept.
            lenghts: for each sentence, the number of positions of
                ``input_ids`` to consider (the callers in this notebook pass
                the number of word pieces of the sentence).

        Returns:
            Tensor with one row per sentence and one entry per kept token,
            zero-padded (before dropout and the linear layer) to the longest
            sentence of the batch; the last dimension is the output of ``lin1``.
        """
        hidden_states = self.bert(input_ids=x["input_ids"], attention_mask=x["attention_mask"]).hidden_states

        # concatenate the last 4 hidden layers of bert once for the whole batch
        layers = torch.cat(
            (hidden_states[-1], hidden_states[-2], hidden_states[-3], hidden_states[-4]),
            dim=-1,
        )

        token_embs = []
        for i in range(layers.shape[0]):
            # every index of a merge group except the first one is dropped
            dropped = {idx for group in word_pieces[i] for idx in group[1:]}
            keep = [idx for idx in range(lenghts[i]) if idx not in dropped]
            keep = torch.tensor(keep, dtype=torch.long, device=layers.device)

            # embeddings of the tokens of the sentence (first word piece only)
            token_embs.append(layers[i].index_select(0, keep))

        # pad the sequences with zeros
        padded_embs = torch.nn.utils.rnn.pad_sequence(token_embs, batch_first=True, padding_value=0)

        padded_embs = self.dropout(padded_embs)

        out = self.lin1(padded_embs)

        return out

        

#### Pytorch Lightning Module (Train, Dev and Test code)

In [None]:
class MyLightningModule(pl.LightningModule):
    """Lightning module that trains, validates and tests ``MyModel``.

    Training and validation work on the binary per-token labels; the test
    loop rebuilds the predicted aspect terms as strings and compares them with
    the true ones, accumulating TP/FP/FN over the whole test set.
    """

    # Characters that are attached to their neighbour without a space when
    # consecutive predicted tokens are merged into one aspect term.
    _NO_SPACE_CHARS = ("-", "(", ")", "/")

    def __init__(self, hparams, *args, **kwargs):
        """
        Args:
            hparams: hyperparameters; contains the dropout value.
        """
        super(MyLightningModule, self).__init__(*args, **kwargs)
        self.save_hyperparameters(hparams)

        # Binary cross entropy computed directly on the logits: numerically
        # more stable than applying a sigmoid followed by BCELoss.
        self.loss_function = torch.nn.BCEWithLogitsLoss()

        # initialize the model
        self.model = MyModel(self.hparams)

        # counters used by the test loop
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def forward(self, x, word_pieces, lenghts):
        """Forward pass of the wrapped model.

        Args:
            x: dictionary containing the input_ids and the attention mask
               (the parameters of bert).
            word_pieces: indices of the word pieces to merge, per sentence.
            lenghts: list containing the number of word pieces for each
               sentence (not the number of tokens).

        Returns:
            The logits produced by the model, one per token.
        """
        logits = self.model(x, word_pieces, lenghts)

        return logits

    def _flat_logits(self, batch):
        """Run the model on ``batch`` and return the logits of the real tokens.

        The model output is flattened and the positions marked as padding by
        ``batch['mask']`` are removed, so the result lines up with the
        flattened labels.
        """
        logits = self.forward(batch['inputs'], batch["word_pieces"], batch["input_lengths"])
        mask = torch.as_tensor(batch['mask'], dtype=torch.bool, device=logits.device)
        return logits.reshape(-1)[mask]

    @classmethod
    def _merge_predicted_terms(cls, tokens, indices):
        """Rebuild the aspect terms from the positions predicted as positive.

        Consecutive positions are merged into a single term; a space is put
        between two merged tokens unless one of them is a special character.

        Args:
            tokens: the rebuilt tokens of the sentence.
            indices: increasing list of positions predicted as aspect term.

        Returns:
            List of aspect-term strings.
        """
        terms, previous = [], None
        for index in indices:
            token = tokens[index]
            if previous is None or index != previous + 1:
                # not adjacent to the previous position: new aspect term
                terms.append(token)
            elif terms[-1][-1] in cls._NO_SPACE_CHARS or token in cls._NO_SPACE_CHARS:
                terms[-1] += token
            else:
                terms[-1] += " " + token
            previous = index
        return terms

    def training_step(self, batch, batch_nb):
        """Single training step: returns the loss used to update the weights."""
        labels = batch['outputs'].view(-1)  # correct binary labels
        logits = self._flat_logits(batch)

        loss = self.loss_function(logits, labels)

        self.log('train_loss', loss, prog_bar=True, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_nb):
        """Single validation step: logs the loss and the macro F1 of the batch."""
        labels = batch['outputs'].view(-1)  # correct binary labels
        logits = self._flat_logits(batch)

        # apply sigmoid and round the predictions
        rounded = torch.round(torch.sigmoid(logits))

        sample_loss = self.loss_function(logits, labels)
        sample_f1 = f1_score(labels.detach().cpu(), rounded.detach().cpu(), average="macro")

        self.log('valid_loss', sample_loss, prog_bar=True,on_epoch=True)
        self.log('valid_f1', sample_f1, prog_bar=True, on_epoch=True)

    def on_test_epoch_start(self):
        """Reset the counters so that repeated test runs do not accumulate."""
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def test_step(self, batch, batch_nb):
        """Single test step: updates TP/FP/FN with the sentences of the batch."""
        lengths = batch['lengths']  # number of tokens per sentence
        true_targets = batch['true_targets']  # correct aspect terms
        splitted_texts = batch['splitted_texts']  # splitted text

        logits = self.forward(batch['inputs'], batch["word_pieces"], batch["input_lengths"])

        # apply sigmoid and round the predictions; drop only the trailing
        # dimension so that single-token sentences keep their token axis
        rounded = torch.round(torch.sigmoid(logits)).squeeze(-1)

        for i, row in enumerate(rounded):
            # ignore the padding and get the positions predicted as 1
            indices = (row[:lengths[i]] == 1).nonzero(as_tuple=True)[0].tolist()

            computed_targets = self._merge_predicted_terms(splitted_texts[i], indices)
            true_out = true_targets[i]

            print("preds:", computed_targets)
            print("true:", true_out)

            # update the values of TP, FP and FN
            self.tp += len(set(true_out) & set(computed_targets))
            self.fp += len(set(computed_targets) - set(true_out))
            self.fn += len(set(true_out) - set(computed_targets))

            print("tp:", self.tp, "fp:", self.fp, "fn:",self.fn)

    def on_test_epoch_end(self):
        """Compute precision, recall and F1 over the whole test set and log them.

        The metrics are computed once from the final counters: logging the
        running values at every step would make Lightning average partial
        scores. Empty denominators yield 0.0 instead of raising.
        """
        predicted = self.tp + self.fp
        expected = self.tp + self.fn

        precision = self.tp / predicted if predicted else 0.0
        recall = self.tp / expected if expected else 0.0

        if precision + recall:
            avg_f1 = 2* (precision*recall) / (precision + recall)
        else:
            avg_f1 = 0.0
        print(avg_f1, precision, recall, self.tp, self.fp, self.fn)

        self.log('test_f1', avg_f1, prog_bar=True, on_step=False, on_epoch=True)
        self.log('test_recall', recall, prog_bar=True, on_step=False, on_epoch=True)
        self.log('test_precision', precision, prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        """Adam over all the parameters (BERT included) with a fixed learning rate."""
        return optim.Adam(self.parameters(), lr=0.000_05)

In [None]:
"""
data is a list of dictionaries                
                {
                    "inputs": torch.tensor of the word piece ids of the sentence
                    "outputs": torch.tensor of the labels for each term,
                    "rebuild_phrase": the rebuilded phrase,
                    "word_pieces_to_merge": the indices of the word peices that for a single token,
                    "lengths": the number of terms in the sentence
                    "true_targets": the true aspect terms
                }
"""

def collate_fn(data):
    """Collate a list of encoded samples into one batch.

    Args:
        data: list of dictionaries with the keys "inputs", "outputs",
            "rebuild_phrase", "word_pieces_to_merge", "lengths" and
            "true_targets". "outputs" is None for every sample when the
            first sample has no labels.

    Returns:
        dict with
            inputs: {"input_ids", "attention_mask"}, the input of bert
            outputs: the labels of all the sentences concatenated, or None
            lengths: number of tokens of each sentence
            mask: flat list of booleans, False on the padding positions
            true_targets: the true aspect terms of each sentence
            splitted_texts: the rebuilt tokens of each sentence
            word_pieces: indices of the word pieces to merge, per sentence
            input_lengths: number of word pieces of each sentence
    """
    # ids of the word pieces and their number for each sentence
    X = [entry["inputs"] for entry in data]
    input_lengths = [ids.shape[0] for ids in X]

    # number of tokens for each sentence
    lengths = [entry["lengths"] for entry in data]

    # pad the ids with 0 -> shape (batch_size x max_seq_len)
    # NOTE(review): 0 is assumed to be the id of the tokenizer's padding token
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)

    # `is not None` instead of `!= None`: the latter is a tensor comparison
    has_labels = data[0]["outputs"] is not None
    y = torch.hstack([entry["outputs"] for entry in data]) if has_labels else None

    # mask used to ignore the padding values after flattening the output of
    # the model: one block of max(lengths) entries per sentence
    max_length = max(lengths)
    mask = [position < n_tokens for n_tokens in lengths for position in range(max_length)]

    # indices of the word pieces that form a single term
    word_pieces = [entry["word_pieces_to_merge"] for entry in data]

    # true aspect terms and splitted text
    true_targets = [entry["true_targets"] for entry in data]
    splitted_texts = [entry["rebuild_phrase"] for entry in data]

    # attention mask with 1 everywhere but over the padding word pieces
    attention_masks = torch.tensor(
        [[1]*n_pieces + [0]*(X.shape[1] - n_pieces) for n_pieces in input_lengths]
    )

    # the bert input
    bert_input = {"input_ids": X, "attention_mask": attention_masks}

    return {"inputs":bert_input, "outputs":y, "lengths":lengths, "mask":mask, "true_targets":true_targets, "splitted_texts": splitted_texts, "word_pieces": word_pieces, "input_lengths":input_lengths}

In [None]:
class MyLightningDataModule(pl.LightningDataModule):
    """Data module that builds the train, dev and test datasets and their loaders."""

    def __init__(self, training_file, dev_file, tokenizer, collate_fn, device="cpu", batch_size=32):
        """
        Args:
            training_file: list of paths of the training files.
            dev_file: list of paths of the dev files (also used for testing).
            tokenizer: tokenizer handed to every ``MyDataset``.
            collate_fn: function used by the dataloaders to build a batch.
            device: value forwarded to every ``MyDataset``.
            batch_size: batch size of the train and dev dataloaders.
        """
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        self.tokenizer = tokenizer
        self.collate_fn = collate_fn
        self.device = device
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Build the three datasets; ``stage`` is ignored, all of them are always built."""
        self.trainingset = MyDataset(self.training_file, device=self.device, tokenizer=self.tokenizer)
        self.devset = MyDataset(self.dev_file, device=self.device, tokenizer=self.tokenizer)

        # same files as the dev set, but in this case the tokenization is done
        # without knowing where the aspect terms are
        self.testset = MyDataset(self.dev_file, device=self.device, tokenizer=self.tokenizer, test=True)

    def train_dataloader(self):
        return DataLoader(self.trainingset, batch_size=self.batch_size, collate_fn=self.collate_fn, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.devset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def test_dataloader(self):
        # a fifth of the test set per batch; at least 1, since the integer
        # division gives 0 when the set has fewer than 5 samples
        test_batch_size = max(1, len(self.testset)//5)
        return DataLoader(self.testset, batch_size=test_batch_size, collate_fn=self.collate_fn)
    

#### Define Hyperparams and start Training

In [None]:
# Hyperparameters handed to the Lightning module.
hparams = dict(dropout=0.5)

In [None]:
# Early stopping: give up once `valid_loss` has not improved for `patience`
# validation runs in a row.
early_stop_callback = EarlyStopping(
    monitor="valid_loss",
    mode="min",
    patience=3,
    min_delta=0.00,
    verbose=False,
)

# Checkpointing: save the model with the lowest `valid_loss`.
checkpoint_callback = ModelCheckpoint(
    monitor="valid_loss",
    mode="min",
    dirpath="checkpoints",
    filename="best_model",
)

tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

# Data module and trainer; one GPU is requested unless we are running on the CPU.
data_module = MyLightningDataModule(training_file, dev_file, tokenizer, collate_fn, device=device)
trainer = pl.Trainer(
    max_epochs=15,
    val_check_interval=1.0,
    gpus=None if device == torch.device('cpu') else 1,
    callbacks=[early_stop_callback, checkpoint_callback],
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…

GPU available: True, used: True
TPU available: False, using: 0 TPU cores





In [None]:
# Build the Lightning module from the hyperparameters and start the training;
# the data module provides the train and validation dataloaders.
model = MyLightningModule(hparams)
trainer.fit(model, datamodule=data_module)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1338740706.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type    | Params
------------------------------------------
0 | loss_function | BCELoss | 0  

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




### Model Evaluation

In [None]:
# List the checkpoint directory to check which files the training produced.
ls -la checkpoints/

total 3901412
drwxr-xr-x 2 root root       4096 Jun  9 09:26 [0m[01;34m.[0m/
drwxr-xr-x 1 root root       4096 Jun  9 09:26 [01;34m..[0m/
-rw-r--r-- 1 root root 3995029721 Jun  9 09:28 best_model.ckpt


In [None]:
# Load the checkpoint the callback actually wrote: its file name is not always
# "best_model.ckpt" (another cell of this notebook looks for "best_model-v1.ckpt").
# `best_model_path` is empty when no training ran in this session, in which
# case the default location is used.
chk_path = checkpoint_callback.best_model_path or "checkpoints/best_model.ckpt"
model = MyLightningModule.load_from_checkpoint(chk_path)


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Evaluate the loaded model on the test dataloader of the data module.
# NOTE(review): the `test_dataloaders` keyword depends on the installed
# pytorch_lightning version — confirm it against the version in use.
trainer.test(model, test_dataloaders=data_module.test_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…

preds: []
true: []
tp: 0 fp: 0 fn: 0
preds: ['wines', 'the glass']
true: ['wines by the glass']
tp: 0 fp: 2 fn: 1
preds: ['staff']
true: ['staff']
tp: 1 fp: 2 fn: 1
preds: []
true: []
tp: 1 fp: 2 fn: 1
preds: ['Appetizers', 'main dishes']
true: ['Appetizers', 'main dishes']
tp: 3 fp: 2 fn: 1
preds: []
true: ['view']
tp: 3 fp: 2 fn: 2
preds: ['reservation']
true: ['reservation']
tp: 4 fp: 2 fn: 2
preds: ['half price sushi deal']
true: ['half price sushi deal']
tp: 5 fp: 2 fn: 2
preds: []
true: []
tp: 5 fp: 2 fn: 2
preds: []
true: []
tp: 5 fp: 2 fn: 2
preds: []
true: []
tp: 5 fp: 2 fn: 2
preds: []
true: []
tp: 5 fp: 2 fn: 2
preds: []
true: []
tp: 5 fp: 2 fn: 2
preds: ['food']
true: ['food']
tp: 6 fp: 2 fn: 2
preds: []
true: []
tp: 6 fp: 2 fn: 2
preds: ['oil']
true: ['oil']
tp: 7 fp: 2 fn: 2
preds: ['food']
true: ['food']
tp: 8 fp: 2 fn: 2
preds: ['staff', 'food', 'place']
true: ['staff', 'food', 'place']
tp: 11 fp: 2 fn: 2
preds: []
true: []
tp: 11 fp: 2 fn: 2
preds: ['dinner']
true: ['d

[{'test_f1': 0.8291758894920349,
  'test_precision': 0.8205362558364868,
  'test_recall': 0.8380933403968811}]

- F1: 0.8091743119266054 
- prec: 0.8032786885245902 
- rec: 0.8151571164510166

In [None]:
# List the checkpoint directory again before moving the best model.
!ls -la checkpoints/

In [None]:
!mv checkpoints/best_model-v1.ckpt ./best_model_A-to_try.ckpt

mv: cannot stat 'checkpoints/best_model-v1.ckpt': No such file or directory


In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [None]:
# Inspect the training curves in TensorBoard. This is optional: the progress
# bar of PyTorch Lightning already shows the logged metrics during training.
%tensorboard --logdir lightning_logs/