## Imports and setup

In [2]:
from google.colab import drive
# general
import numpy as np
import os
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
from typing import *
import string
import json
import pandas as pd

# torch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader,SequentialSampler
from torch.nn import functional as F
from torch.utils.tensorboard import SummaryWriter

# torch text
!pip install torchtext --upgrade

from torchtext import data
from torchtext.vocab import Vectors

from pprint import pprint
from torchtext.vocab import *
from collections import Counter
import random

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

!pip install pytorch_lightning

import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torch.optim as optim
from pytorch_lightning.callbacks import ModelCheckpoint

# SKLEARN
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.metrics import precision_score

# transformers
!pip install transformers
from transformers import DistilBertTokenizerFast, BertTokenizerFast,BertForTokenClassification,BertTokenizerFast
from transformers import DistilBertForSequenceClassification, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from transformers import DistilBertForTokenClassification
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertModel,DistilBertModel

drive.mount('/content/drive')

Requirement already up-to-date: torchtext in /usr/local/lib/python3.7/dist-packages (0.9.1)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting pytorch_lightning
[?25l  Downloading https://files.pythonhosted.org/packages/b6/6a/20d0bf3b967ab62333efea36fe922aaa252d1762555b4a7afb2be5bbdcbf/pytorch_lightning-1.3.5-py3-none-any.whl (808kB)
[K     |████████████████████████████████| 808kB 38.7MB/s 
Collecting future>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 45.0MB/s 
[?25hCollecting tensorboard!=2.5.0,>=2.2.0
[?25l  Downloading https://files.p

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |▏                               | 10kB 22.0MB/s eta 0:00:01[K     |▎                               | 20kB 27.8MB/s eta 0:00:01[K     |▍                               | 30kB 30.7MB/s eta 0:00:01[K     |▋                               | 40kB 32.3MB/s eta 0:00:01[K     |▊                               | 51kB 34.3MB/s eta 0:00:01[K     |▉                               | 61kB 35.8MB/s eta 0:00:01[K     |█                               | 71kB 36.2MB/s eta 0:00:01[K     |█▏                              | 81kB 35.0MB/s eta 0:00:01[K     |█▎                              | 92kB 36.0MB/s eta 0:00:01[K     |█▌                              | 102kB 37.3MB/s eta 0:00:01[K     |█▋                              | 112kB 37.3MB/s eta 0:00:01[K     |█▊                              | 

Set up of seeds and  folders


In [3]:
root_folder = '/content/drive/My Drive/NLP/nlp2021-hw2'
dataset_folder = os.path.join(root_folder,'data')

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  
device = "cuda" if torch.cuda.is_available() else "cpu"

# !nvidia-smi

In [4]:
#@title Setup of parameters{ run: "auto" }
WE_LENGTH = "50" #@param [50,100,200,300]
WE = "glove" #@param ["glove", "chargram", "fasttext"]
REMOVE_STOPWORDS = True #@param ["True", "False"] {type:"raw"}
LEMMATIZATION = True #@param ["True", "False"] {type:"raw"}
LOWERED = True #@param ["True", "False"] {type:"raw"}
model_b = False #@param ["True", "False"] {type:"raw"}
model_AB = False #@param ["True", "False"] {type:"raw"}
bert_model = "bert-base-uncased" if LOWERED else "bert-base-cased"

needed for all models evaluation

In [5]:
def evaluate_sentiment(samples, predictions_b, mode="Aspect Sentiment"):
    scores = {}
    if mode == 'Category Extraction':
        sentiment_types = ["anecdotes/miscellaneous", "price", "food", "ambience"]
    else:
        sentiment_types = ["positive", "negative", "neutral", "conflict"]
    scores = {sent: {"tp": 0, "fp": 0, "fn": 0} for sent in sentiment_types + ["ALL"]}
    for label, pred in zip(samples, predictions_b):
        for sentiment in sentiment_types:
            if mode == "Aspect Sentiment":
                pred_sent = {(term_pred[0], term_pred[1]) for term_pred in pred["targets"] if
                                    term_pred[1] == sentiment}
                gt_sent = {(term_pred[1], term_pred[2]) for term_pred in label["targets"] if
                                    term_pred[2] == sentiment}
            elif mode == 'Category Extraction' and "categories" in label:
                pred_sent = {(term_pred[0]) for term_pred in pred["categories"] if
                                term_pred[0] == sentiment}
                gt_sent = {(term_pred[0]) for term_pred in label["categories"] if
                                term_pred[0] == sentiment}
            elif "categories" in label:
                pred_sent = {(term_pred[0], term_pred[1]) for term_pred in pred["categories"] if
                                term_pred[1] == sentiment}
                gt_sent = {(term_pred[0], term_pred[1]) for term_pred in label["categories"] if
                                term_pred[1] == sentiment}
            else:
                continue

            scores[sentiment]["tp"] += len(pred_sent & gt_sent)
            scores[sentiment]["fp"] += len(pred_sent - gt_sent)
            scores[sentiment]["fn"] += len(gt_sent - pred_sent)

    # Compute per sentiment Precision / Recall / F1
    for sent_type in scores.keys():
        if scores[sent_type]["tp"]:
            scores[sent_type]["p"] = 100 * scores[sent_type]["tp"] / (scores[sent_type]["fp"] + scores[sent_type]["tp"])
            scores[sent_type]["r"] = 100 * scores[sent_type]["tp"] / (scores[sent_type]["fn"] + scores[sent_type]["tp"])
        else:
            scores[sent_type]["p"], scores[sent_type]["r"] = 0, 0

        if not scores[sent_type]["p"] + scores[sent_type]["r"] == 0:
            scores[sent_type]["f1"] = 2 * scores[sent_type]["p"] * scores[sent_type]["r"] / (
                    scores[sent_type]["p"] + scores[sent_type]["r"])
        else:
            scores[sent_type]["f1"] = 0

    # Compute micro F1 Scores
    tp = sum([scores[sent_type]["tp"] for sent_type in sentiment_types])
    fp = sum([scores[sent_type]["fp"] for sent_type in sentiment_types])
    fn = sum([scores[sent_type]["fn"] for sent_type in sentiment_types])

    if tp:
        precision = 100 * tp / (tp + fp)
        recall = 100 * tp / (tp + fn)
        f1 = 2 * precision * recall / (precision + recall)

    else:
        precision, recall, f1 = 0, 0, 0

    scores["ALL"]["p"] = precision
    scores["ALL"]["r"] = recall
    scores["ALL"]["f1"] = f1
    scores["ALL"]["tp"] = tp
    scores["ALL"]["fp"] = fp
    scores["ALL"]["fn"] = fn

    # Compute Macro F1 Scores
    scores["ALL"]["Macro_f1"] = sum([scores[ent_type]["f1"] for ent_type in sentiment_types])/len(sentiment_types)
    scores["ALL"]["Macro_p"] = sum([scores[ent_type]["p"] for ent_type in sentiment_types])/len(sentiment_types)
    scores["ALL"]["Macro_r"] = sum([scores[ent_type]["r"] for ent_type in sentiment_types])/len(sentiment_types)

    print(f"{mode} Evaluation\n")

    print(
        "\tALL\t TP: {};\tFP: {};\tFN: {}".format(
            scores["ALL"]["tp"],
            scores["ALL"]["fp"],
            scores["ALL"]["fn"]))
    print(
        "\t\t(m avg): precision: {:.2f};\trecall: {:.2f};\tf1: {:.2f} (micro)".format(
            precision,
            recall,
            f1))
    print(
        "\t\t(M avg): precision: {:.2f};\trecall: {:.2f};\tf1: {:.2f} (Macro)\n".format(
            scores["ALL"]["Macro_p"],
            scores["ALL"]["Macro_r"],
            scores["ALL"]["Macro_f1"]))

    for sent_type in sentiment_types:
        print("\t{}: \tTP: {};\tFP: {};\tFN: {};\tprecision: {:.2f};\trecall: {:.2f};\tf1: {:.2f};\t{}".format(
            sent_type,
            scores[sent_type]["tp"],
            scores[sent_type]["fp"],
            scores[sent_type]["fn"],
            scores[sent_type]["p"],
            scores[sent_type]["r"],
            scores[sent_type]["f1"],
            scores[sent_type]["tp"] +
            scores[sent_type][
                "fp"]))

def evaluate_extraction(samples, predictions_b):
    scores = {"tp": 0, "fp": 0, "fn": 0}
    for label, pred in zip (samples, predictions_b):
        pred_terms = {term_pred[0] for term_pred in pred["targets"]}
        gt_terms = {term_gt[1] for term_gt in label["targets"]}

        scores["tp"] += len(pred_terms & gt_terms)
        scores["fp"] += len(pred_terms - gt_terms)
        scores["fn"] += len(gt_terms - pred_terms)

    precision = 100 * scores["tp"] / (scores["tp"] + scores["fp"])
    recall = 100 * scores["tp"] / (scores["tp"] + scores["fn"])
    f1 = 2 * precision * recall / (precision + recall)

    print(f"Aspect Extraction Evaluation")

    print(
        "\tAspects\t TP: {};\tFP: {};\tFN: {}".format(
            scores["tp"],
            scores["fp"],
            scores["fn"]))
    print(
        "\t\tprecision: {:.2f};\trecall: {:.2f};\tf1: {:.2f}".format(
            precision,
            recall,
            f1))


def load_data(data_path):
    with open(data_path) as json_file:
        list_of_sentences = json.load(json_file)
    return list_of_sentences

# TASK A-B (Bi-LSTM)

## Dataset

In [None]:
class BiLSTMDataset(Dataset):

    def __init__(self, 
                 data_path:str,
                 window_size:int, 
                 model_b:bool=False,
                 device="cuda"):
        """
        Args:
            data_path (string): The path containing the data
            window_size (integer): The maximum length of a sentence in terms of number of tokens.
            model_b:bool=False,
        """

        self.window_size = window_size
        self.model_b = model_b

        sentences = self.load_data(data_path)
        self.device = device
        self.data = self.create_windows(sentences)
        self.encoded_data = None
    

    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))
        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,data_path):
        sentences = []
        with open(data_path) as json_file:
            list_of_sentences = json.load(json_file)
            for obj in list_of_sentences:
                _sentence = []
                lemmatized = obj['text'].split(" ")
                if LEMMATIZATION:
                    lemmatized = [lemmatizer.lemmatize(w)  for w in obj['text'].split(" ")]
                if REMOVE_STOPWORDS:
                    if LEMMATIZATION:    
                        lemmatized = self.remove_stopwords(" ".join(lemmatized))
                    else:
                        lemmatized = self.remove_stopwords(obj['text'])
                if LOWERED:
                    lemmatized = [x.lower() for x in lemmatized]
                for t in lemmatized:
                    ne_label = "O"
                    sentiment = ""
                    for i in range(len(obj['targets'])):
                        lemmatized_target = obj['targets'][i][1]
                        if LEMMATIZATION:
                            lemmatized_target = lemmatizer.lemmatize(obj['targets'][i][1])
                        if LOWERED:
                            lemmatized_target = lemmatized_target.lower()
                        
                        if t in lemmatized_target:
                            ne_label = "B" if t == lemmatized_target.split(" ")[0] else "I"
                            try:
                                if ne_label == "I" and _sentence[-1]["ne_label"] == "O":
                                    ne_label = "O"
                            except:
                                ne_label = "O"
                                
                            sentiment = obj['targets'][i][2]
                            #embed sentiment directly into the position of word
                            if self.model_b and ne_label != "O":
                                ne_label = ne_label +"-"+sentiment
                    token = {"token": t, "ne_label": ne_label , "sentiment" :sentiment}
                    _sentence.append(token)
                sentences.append(_sentence)
            return sentences

    def create_windows(self, sentences): 
        data = []
        for sentence in sentences:
            for i in range(0, len(sentence), self.window_size):
                window = sentence[i:i+self.window_size]
                if len(window) < self.window_size:
                    window = window + [None]*(self.window_size - len(window))  # to match the same length of sentences
                assert len(window) == self.window_size
                data.append(window)
        return data


    def index_dataset(self, l_vocabulary, l_label_vocabulary):
        self.encoded_data = list()
        for i in range(len(self.data)):
            sentence = self.data[i]
            encoded_sentence = torch.LongTensor(self.encode_text(sentence, l_vocabulary)).to(self.device)
            encoded_labels = torch.LongTensor([l_label_vocabulary[d["ne_label"]] if d is not None else l_label_vocabulary["<pad>"] for d in sentence]).to(self.device)
            
            self.encoded_data.append({"inputs":encoded_sentence, "outputs":encoded_labels})


    @staticmethod
    def encode_text(sentence:list, l_vocabulary:Vocab):
        """
        Args:
            sentences (list): list of OrderedDict, each carrying the information about one token.
            l_vocabulary (Vocab): vocabulary with mappings from words to indices and viceversa.
        Return:
            The method returns a list of indices corresponding to the input tokens.
        """
        indices = list()
        for w in sentence:
            if w is None:
                indices.append(l_vocabulary["<pad>"])
            elif w["token"] in l_vocabulary.stoi: # vocabulary string to integer (necessary to search faster)
                indices.append(l_vocabulary[w["token"]])
            else:
                indices.append(l_vocabulary["<unk>"])
        return indices

    
    @staticmethod
    def decode_output(outputs:torch.Tensor,label_vocabulary: Vocab):
        max_indices = torch.argmax(outputs, -1).tolist() # shape = (batch_size, max_len)
        predictions = list()
        for indices in max_indices:
            # vocabulary integer to string is used to obtain the corresponding word from the max index
            predictions.append([label_vocabulary.itos[i] for i in indices])
        return predictions

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        return self.data[idx]

Data module definition

In [None]:
class DataModuleBiLSTM(pl.LightningDataModule):
    def __init__(self, training_file, dev_file, window_size, vocabulary, label_vocabulary, model_b=None):
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        self.window_size = window_size
        self.vocabulary = vocabulary
        self.label_vocabulary = label_vocabulary
        self.model_b = model_b

    def setup(self, stage=None):
      self.trainingset = BiLSTMDataset(self.training_file, self.window_size,  self.model_b)
      self.devset = BiLSTMDataset(self.dev_file, self.window_size,  self.model_b)

      self.trainingset.index_dataset(self.vocabulary, self.label_vocabulary)
      self.devset.index_dataset(self.vocabulary, self.label_vocabulary)
          
    def train_dataloader(self):
      return DataLoader(self.trainingset, batch_size=128)
    
    def val_dataloader(self):
        return DataLoader(self.devset, batch_size=128)

### Building vocabulary

In [None]:
def build_vocab(dataset, min_freq=1):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["token"]]+=1
    #Counter({'the': 1046, 'and': 671, 'to': 604, etc....}

    return Vocab(counter, specials=['<pad>', '<unk>'], min_freq=min_freq)

def build_label_vocab(dataset):
    counter = Counter()
    for i in tqdm(range(len(dataset))):
        for token in dataset.get_raw_element(i):
            if token is not None:
                counter[token["ne_label"]]+=1
    # No <unk> token for labels. Counter({'O': 19179, 'B': 1548, 'I': 718})
    print(counter)
    return Vocab(counter, specials=['<pad>'])

In [None]:
training_file = dataset_folder+"/laptops_train.json"
dev_file = dataset_folder+"/laptops_dev.json"

window_size = 20
dataset = BiLSTMDataset(training_file, window_size, model_b)

vocabulary = build_vocab(dataset)
label_vocabulary = build_label_vocab(dataset)
dataset.index_dataset(vocabulary, label_vocabulary)

## Usage of pretrained embeddings

In [None]:
if WE =="glove":
    vocabulary.load_vectors("glove.6B."+WE_LENGTH+"d")
elif WE == "charngram":
    vocabulary.load_vectors("charngram.100d")
else:
    vocabulary.load_vectors("fasttext.en.300d")



In [None]:
print(vocabulary.vectors[1165])
#print(vocabulary.vectors[0].get_vecs_by_tokens("pain"))

## Model

In [None]:
class ABSAModel(pl.LightningModule):
    def __init__(self, hparams, embeddings = None, comments="without_pretrained_embeddings",*args, **kwargs):
        super(ABSAModel, self).__init__(*args, **kwargs)

        self.save_hyperparameters(hparams)
        self.loss_function = nn.CrossEntropyLoss(ignore_index=label_vocabulary['<pad>'])
        self.word_embedding = nn.Embedding(self.hparams.vocab_size, self.hparams.embedding_dim) # hparams.vocab_size words in vocab, hparams.embedding_dim dimensional embeddings

        if embeddings is not None:
            print("embeddings from pretrained")
            self.word_embedding = nn.Embedding.from_pretrained(embeddings)

        self.lstm = nn.LSTM(self.hparams.embedding_dim, self.hparams.hidden_dim, 
                            bidirectional=self.hparams.bidirectional,
                            num_layers=self.hparams.num_layers, 
                            dropout = self.hparams.dropout if self.hparams.num_layers > 1 else 0)
        
        lstm_output_dim = self.hparams.hidden_dim if self.hparams.bidirectional is False else self.hparams.hidden_dim * 2

        self.dropout = nn.Dropout(self.hparams.dropout)
        self.classifier = nn.Linear(lstm_output_dim, self.hparams.num_classes)

        self.writer = SummaryWriter(comment=comments+"_modelb="+str(model_b))
        self.epoch_t, self.epoch_ev = -1,-1

    def forward(self, x):
        embeddings = self.word_embedding(x)
        embeddings = self.dropout(embeddings)
        o, (h, c) = self.lstm(embeddings)
        o = self.dropout(o)
        logits = self.classifier(o)
        
        predictions = torch.argmax(logits, -1)
        
        return logits, predictions

    def training_step(self, batch, batch_nb):
        '''
        {
        'inputs': tensor([  5, 121,  34,   6, 834,  68, 307,   4, 370, 684, 663,  40,  42, 748, 0,   0,   0,   0,   0], device='cuda:0'), 
        'outputs': tensor([1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
        }
        '''
        inputs = batch['inputs']
        labels = batch['outputs']
        logits, _ = self.forward(inputs)

        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)

        loss = self.loss_function(logits, labels)
        
        # Log it:
        self.log('train_loss', loss, prog_bar=True)
        if self.epoch_t != self.current_epoch:
            self.epoch_t = self.current_epoch
            self.writer.add_scalar("train/loss", loss, self.current_epoch)
            self.log_f1(logits,labels,"train")

        return loss

    def validation_step(self, batch, batch_nb):
        inputs = batch['inputs']
        labels = batch['outputs']

        logits, _ = self.forward(inputs)
        logits = logits.view(-1, logits.shape[-1])
        labels = labels.view(-1)
        sample_loss = self.loss_function(logits, labels)
        self.log('valid_loss', sample_loss, prog_bar=True)
        
        if self.epoch_ev != self.current_epoch:
            self.epoch_ev = self.current_epoch
            self.writer.add_scalar("eval/loss", sample_loss, self.current_epoch)
            self.log_f1(logits,labels,"eval")


    def configure_optimizers(self):
        return optim.Adam(self.parameters())


    def log_f1(self,logits,indexed_labels,mode="train"):
        predictions = torch.argmax(logits, -1).view(-1)
        labels = indexed_labels.view(-1)
        valid_indices = labels != 0
        
        valid_predictions = predictions[valid_indices]
        valid_labels = labels[valid_indices]
        macro_f1 = f1_score(valid_labels.tolist(), valid_predictions.tolist(), average="macro", zero_division=0)
        self.writer.add_scalar(mode+"/f1", macro_f1, self.current_epoch)


## Training

In [None]:
to_be_saved = False
try:
    embedding_dim = vocabulary.vectors.shape[1]
except:
    embedding_dim = 100
hparams = {'vocab_size': len(vocabulary),
            'hidden_dim': 80,
            'embedding_dim': embedding_dim,
            'num_classes': len(label_vocabulary), # number of different universal POS tags
            'bidirectional': True,
            'num_layers': 2,
            'dropout': 0.2}
window_size = 25
data_module = DataModuleBiLSTM(training_file, dev_file, window_size, vocabulary, label_vocabulary, model_b)
trainer = pl.Trainer(gpus=1, val_check_interval=1.0, max_epochs=20)

model = ABSAModel(hparams,comments="lem="+str(LEMMATIZATION)+"_stopwords_remove="+str(REMOVE_STOPWORDS) + "_low="+str(LOWERED))
#model = ABSAModel(hparams,vocabulary.vectors,comments="with_charngram") #if pretrained embeddings in use uncomment this line
trainer.fit(model, datamodule=data_module)


## Evaluation

In [None]:
try:
    %reload_ext tensorboard
except:
    %load_ext tensorboard
%tensorboard --logdir=runs/

overall scores without separating from target or sentiment classification

In [None]:
def compute_scores(model:pl.LightningModule, l_dataset:DataLoader, l_label_vocab:Vocab):
    model.freeze()
    model.cuda()
    all_predictions = list()
    all_labels = list()
    for indexed_elem in l_dataset:
        indexed_in = indexed_elem["inputs"]
        indexed_labels = indexed_elem["outputs"]
        predictions, _ = model(indexed_in)
        predictions = torch.argmax(predictions, -1).view(-1)
        labels = indexed_labels.view(-1)
        valid_indices = labels != 0
        
        valid_predictions = predictions[valid_indices]
        valid_labels = labels[valid_indices]
        
        all_predictions.extend(valid_predictions.tolist())
        all_labels.extend(valid_labels.tolist())

    macro_accuracy = accuracy_score(all_labels, all_predictions)
    macro_f1 = f1_score(all_labels, all_predictions, average="macro", zero_division=0)
    per_class_precision = precision_score(all_labels, all_predictions, labels = list(range(len(l_label_vocab))), average=None, zero_division=0)
    model.unfreeze()
    return {"macro_accuracy":macro_accuracy,
            "f1_macro":macro_f1, 
            "per_class_precision":per_class_precision}

scores = compute_scores(model, data_module.val_dataloader(), label_vocabulary)
per_class_precision = scores["per_class_precision"]
print("Accuracy: {}\nMacro F1: {}".format(scores["macro_accuracy"], scores["f1_macro"]))
print("Per class Precision:")
for idx_class, precision in sorted(enumerate(per_class_precision), key=lambda elem: -elem[1]):
    label = label_vocabulary.itos[idx_class]
    print(label, precision)

if to_be_saved:
    torch.save(model.state_dict(), root_folder+'/model/model_b={}_f1_{:0.4f}.pt'.format(str(model_b), scores["f1_macro"])) # save the model state


Classes to evaluate like in the implementation.py file.

In [None]:
class PreprocessAB():
    def __init__(self,sentences_list):
        self.window_size = 40
        self.sentences = self.load_data(sentences_list)
        self.data = self.create_windows(self.sentences)
        self.encoded_data = None
        self.vocabulary = self.build_vocab()
        self.index_dataset(self.vocabulary)

    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))

        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,list_of_sentences):
        sentences = []
        for obj in list_of_sentences:
            _sentence = []
            # lemmatized = [lemmatizer.lemmatize(w)  for w in obj['text'].split(" ")]
            #lemmatized = self.remove_stopwords(" ".join(lemmatized))
            lemmatized = self.remove_stopwords(obj['text'])

            for t in lemmatized:
                token = {"token": t}
                _sentence.append(token)
            sentences.append(_sentence)
        return sentences

    def create_windows(self, sentences):

        data = []
        for sentence in sentences:
            for i in range(0, len(sentence), self.window_size):
                window = sentence[i:i+self.window_size]
                if len(window) < self.window_size:
                    window = window + [None]*(self.window_size - len(window))  # to match the same length of sentences
                assert len(window) == self.window_size
                data.append(window)
        return data


    def index_dataset(self, l_vocabulary):
        self.encoded_data = list()
        for i in range(len(self.data)):
            # for each window
            sentence = self.data[i]
            encoded_sentence = torch.LongTensor(self.encode_text(sentence, l_vocabulary)).cpu()
            
           
            self.encoded_data.append({"inputs":encoded_sentence})


    @staticmethod
    def encode_text(sentence:list, l_vocabulary:Vocab):
        indices = list()
        for w in sentence:
            if w is None:
                indices.append(l_vocabulary["<pad>"])
            elif w["token"] in l_vocabulary.stoi: # vocabulary string to integer (necessary to search faster)
                indices.append(l_vocabulary[w["token"]])
            else:
                indices.append(l_vocabulary["<unk>"])
        return indices

    
    @staticmethod
    def decode_output(outputs:torch.Tensor, l_label_vocabulary: Vocab):
        max_indices = torch.argmax(outputs, -1).tolist() # shape = (batch_size, max_len)
        predictions = list()
        for indices in max_indices:
            predictions.append([l_label_vocabulary.itos[i] for i in indices])
        return predictions

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.encoded_data[idx]
    
    def get_raw_element(self, idx):
        return self.data[idx]

    def build_vocab(self, min_freq=1):
        counter = Counter()
        for i in tqdm(range(len(self.data))):
            # for each token in the sentence viewed as a dictionary of items from the line
            for token in self.get_raw_element(i):
                if token is not None:
                    counter[token["token"]]+=1
        #Counter({'the': 1046, 'and': 671, 'to': 604, etc....}
        # we add special tokens for handling padding and unknown words at testing time.

        return Vocab(counter, specials=['<pad>', '<unk>'], min_freq=min_freq)


def predict_together_AB(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessAB(samples)
    for i,test_elem in tqdm(enumerate(prep.encoded_data)):
        json_pred = {"targets":[]}
        lst,pred = [],[]
        test_x = test_elem["inputs"].to("cpu")
        logits, predictions = model(test_x.unsqueeze(0))
        decoded_labels = prep.decode_output(logits, label_vocabulary)[0]
        idxs = test_x.tolist()
        for j,word in enumerate(decoded_labels):
            if word.startswith("B"):
                json_pred["targets"].append((prep.vocabulary.itos[idxs[j]],word.split("-")[1]))
            elif (word.startswith("I")) and (decoded_labels[j-1].startswith("B")):
                try:
                    last_tuple = json_pred['targets'][-1]
                    words_tagged = last_tuple[0] + " " + prep.vocabulary.itos[idxs[j]]
                    sent_tagged = last_tuple[1]
                    json_pred['targets'][-1] = (words_tagged, sent_tagged)
                except:
                    words_tagged = prep.vocabulary.itos[idxs[j]]
                    sent_tagged = word.split("-")[1]
                    json_pred['targets'].append((words_tagged, sent_tagged))
            elif word.startswith("I"):
                json_pred["targets"].append((prep.vocabulary.itos[idxs[j]],word.split("-")[1]))

        targets.append(json_pred)
    #print(targets)
    return targets



def predict_a_then_b(samples):
    targets = []
    prep = PreprocessAB(samples)
    for i,test_elem in tqdm(enumerate(prep.encoded_data)):
        json_pred = {"targets":[]}
        lst,pred = [],[]
        test_x = test_elem["inputs"].to("cpu")
        logits, predictions = model(test_x.unsqueeze(0))
        decoded_labels = prep.decode_output(logits, label_vocabulary)[0]
        idxs = test_x.tolist()
        print(prep.sentences[i],test_x.tolist(),decoded_labels)
        # for a,b in zip(decoded_labels,idxs):
        #     print(a,"\t",prep.vocabulary.itos[b])
        for j,word in enumerate(decoded_labels):
            if word == "B":
                json_pred["targets"].append((prep.vocabulary.itos[idxs[j]],"positive"))
            elif (word == "I") and (decoded_labels[j-1] == "B"):
                try:
                    last_tuple = json_pred['targets'][-1]
                    words_tagged = last_tuple[0] + " " + prep.vocabulary.itos[idxs[j]]
                    sent_tagged = last_tuple[1]
                    json_pred['targets'][-1] = (words_tagged, sent_tagged)
                except:
                    words_tagged = prep.vocabulary.itos[idxs[j]]
                    sent_tagged = "positive"
                    json_pred['targets'].append((words_tagged, sent_tagged))
            elif word == "I":
                json_pred["targets"].append((prep.vocabulary.itos[idxs[j]],"positive"))

        targets.append(json_pred)

    return targets
def load_data(data_path):
    with open(data_path) as json_file:
        list_of_sentences = json.load(json_file)
    return list_of_sentences


a = load_data(dataset_folder+"/laptops_dev.json")
#a = a + load_data(dataset_folder+"/restaurants_dev.json")
random.shuffle(a)
model.cpu()
#t = predictB(a[:30])
t = predict_together_AB(a)
for gt,pred in zip(a[:10],t[:10]):
    print(gt['targets'],"SEP",pred['targets'])
evaluate_extraction(a,t)
evaluate_sentiment(a,t)

# TASK B (Bi-LSTM (WiC))

##Dataset

In [None]:
class TaskBDataset(torch.utils.data.Dataset):

    def __init__(self, dataset_path: str):
        self.data_store,self.sentences = self.load_data(dataset_path)
        self.vocabulary = self.build_vocab(self.sentences)
        self.convert_all_2_indices()
        self.mlb = MultiLabelBinarizer()
        self.encode_tags()


    def encode_tags(self):
        self.tags = self.mlb.fit_transform([c for (a,b,c) in self.data_store])
        self.data_store = [(a,b, torch.tensor(self.tags[i]) ) for i,(a,b,c) in enumerate(self.data_store)]
    
    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))

        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,data_path):
        data_store,sentences = [],[]
        with open(data_path) as json_file:
            list_of_sentences = json.load(json_file)
            for obj in list_of_sentences:
                _sentence = []
                #order the targets in order of apperance in the sentence and not alphabetical
                obj['targets'] = sorted(obj['targets'], key=lambda x: x[0][0]) 

                sentiments = [obj['targets'][j][2] for j in range(len(obj['targets']))]
                #for each target word construct a sentence putting start and end token inside the sentence
                for i,targ_obj in enumerate(obj['targets']):
                    #print(targ_obj)
                    new_sent = obj['text'][:targ_obj[0][0]-1]+" <START> " + obj['text'][targ_obj[0][0]:targ_obj[0][1]] + " <END>" + obj['text'][targ_obj[0][1]:]
                    new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                    new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))    
                    index = self.find_indices(new_sent)
                        
                    sentences.append(new_sent)
                    sentiments_converted = [sentiments[i]]
                    data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))
                #if no target then construct with a FAKE sentiment classified as NONE so no sentiment
                if len(obj['targets'])==0:
                    sentiments_converted= ["NONE"]
                    new_sent = obj['text']
                    new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                    new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))
                    index = [0,0]
                    sentences.append(new_sent)
                    data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))
            return data_store,sentences


    def convert_all_2_indices(self):
        self.data_store = [(self.sentence2indices(a),b,c) for (a,b,c) in self.data_store]
    
    def find_indices(self,new_sent):
        splitted = new_sent.split(" ")
        indices = [i+1 for i,w in enumerate(splitted) if (w=="START") or (w=="END")]
        indices[1] = indices[1]-1
        return indices
    
    def build_vocab(self, dataset, min_freq=1):
        counter = Counter()
        for a in dataset:
            for token in a.split(" "):
                if token is not None:
                    counter[token]+=1
        #Counter({'the': 1046, 'and': 671, 'to': 604, etc....}
        vocabulary = Vocab(counter, specials=['<pad>', '<unk>'], min_freq=min_freq)
        vocabulary.load_vectors("glove.6B.50d")
        return vocabulary

    def sentence2indices(self,sentence: str) -> torch.Tensor:
        return torch.tensor([self.vocabulary[word] for word in sentence.split(' ') if word != ''], dtype=torch.long)

    def decode_tag(self,tag):
        return self.mlb.inverse_transform(tag.reshape(1,-1))
    
    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data_store[idx]

In [None]:
class TaskBDataModule(nn.Module):
    def __init__(
        self, 
        data_train_path: str,
        data_dev_path: str,
        batch_size: int,
        collate_fn=None
    ) -> None:
        super().__init__()
        self.data_train_path = data_train_path
        self.data_dev_path = data_dev_path
        self.batch_size = batch_size
        self.collate_fn = collate_fn

        self.train_dataset = None
        self.validation_dataset = None

    def train_dataloader(self, *args, **kwargs) -> DataLoader:
        self.train_dataset = TaskBDataset(self.data_train_path)
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=self.collate_fn)

    def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]:
        self.validation_dataset = TaskBDataset(self.data_dev_path)
        return DataLoader(self.validation_dataset, batch_size=self.batch_size,collate_fn=self.collate_fn)



def rnn_collate_fn(data_elements: List[Tuple[torch.Tensor, list]] 
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    X = [de[0] for de in data_elements]  # list of index tensors
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)  #  shape (batch_size x max_seq_len)
    
    keyword_position = [de[1] for de in data_elements] # list of tuples indices where keyword is [[start keyword,end_keyword],[...]]

    keyword_position = torch.nn.utils.rnn.pad_sequence(keyword_position, batch_first=True, padding_value=0) 

    y = [de[2] for de in data_elements]
    y = torch.nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=-1) 

    return X, keyword_position, y

## Model

Recurrent classifier definition with a customized forward pass

In [None]:
class TaskBRecurrentClassifier(nn.Module):

    def __init__(
        self,
        vectors_store: torch.Tensor,
        n_hidden: int,
        drop_prob: float,
        bidir: bool,
        n_layer_lstm: int,
        vocab_size:int, 
        embedding_dim: int =100,
    ) -> None:
        super().__init__()


        # embedding layer
        
        self.embedding = torch.nn.Embedding.from_pretrained(vectors_store)
        #self.embedding = nn.Embedding(vocab_size, embedding_dim) # hparams.vocab_size words in vocab, hparams.embedding_dim dimensional embeddings

        self.n_hidden = n_hidden
        # recurrent layer
        self.lstm = torch.nn.LSTM(input_size=vectors_store.shape[1],
                                  hidden_size=n_hidden,
                                  num_layers=n_layer_lstm, 
                                  batch_first=True,
                                  bidirectional=bidir)

        # classification 
        if bidir:
           n_hidden = n_hidden*2
        self.lin1 = torch.nn.Linear(n_hidden, n_hidden)
        self.classifier = torch.nn.Linear(n_hidden, 5)

        self.loss_fn = torch.nn.BCEWithLogitsLoss()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'


    def forward(
        self, 
        X: torch.Tensor, 
        indices_keyword: torch.Tensor, 
        y: Optional[torch.Tensor] = None
    ) -> Dict[str, torch.Tensor]:

        embedding_out = self.embedding(X)
        # recurrent encoding
        lstm_output = self.lstm(embedding_out)[0]
        
        batch_size, seq_len, hidden_size = lstm_output.shape

        #sequence of batch x seq_len vectors 
        flat_output = lstm_output.reshape(-1, hidden_size)
        
        # start offsets of each element in the batch
        sequences_offsets = torch.arange(batch_size, device=self.device) * seq_len
        
        summary_vectors_indices_sent1 = self.get_indices_keyword(indices_keyword, sequences_offsets,0)
        summary_vectors_indices_sent2 = self.get_indices_keyword(indices_keyword, sequences_offsets,1)
        
        # we retrieve the vector of the corrseponding states for the keyword given for each sentence.
          
        summary_vectors_sent1 = flat_output[summary_vectors_indices_sent1]
        summary_vectors_sent2 = flat_output[summary_vectors_indices_sent2]
        
        # do the multiplication of these two vectors retrieved
        summary_vectors = summary_vectors_sent1 * summary_vectors_sent2
        
        # feedforward MLP
        out = self.lin1(summary_vectors)
        out = F.leaky_relu(out)
        
        logits = self.classifier(out)
        
        pred = torch.argmax(logits, -1)
        
        result = {'logits': logits, 'pred': pred} 

        # compute loss
        if y is not None:
            #logits = logits.view(-1, logits.shape[-1])
            #y = y.view(-1)
            try:
                loss = self.loss(logits, torch.tensor(y, dtype=torch.float))
            except:
                print(logits.shape,y.shape)
            result['loss'] = loss
        return result
        
       
    def loss(self, pred, y):
        return self.loss_fn(pred, y)
   
    '''
    return the corresponding position of the indices of the keywords, for the sent_num passed, so the first if 0 is passed and the second if 1 is passed
    summary  = [   0,   57,  114,  171,  228, ...] 
    indices_keywords = [ [ 6, 21],[ 4, 22],[ 6, 21],[ 4, 22], ...]
    '''
    def get_indices_keyword(self,indices_keywords: Sequence[tuple], summary: Sequence[int] ,sent_num: int) -> torch.Tensor:
        tens_idx = torch.tensor([item[sent_num] for item in indices_keywords]).to(self.device)
        return tens_idx + summary

## Training 

Trainer class that will handle the training phase for the RNN classifier

In [None]:
class TaskBTrainer():
    def __init__(self, model, optimizer, device, exp_details):

        self.device = device
        self.model = model
        self.optimizer = optimizer
        self.writer = SummaryWriter(comment="_"+exp_details)
        self.model.train()  # we are using this model for training
        self.model.to(self.device)  # move model to GPU if available

    def train(self, train_dataset, eval_dataset, epochs: int = 1, early_stopping: bool = False, early_stopping_patience:int = 3, to_be_saved: bool =False) -> float:

        train_loss = 0.0
       
        for epoch in tqdm(range(epochs)):
            epoch_loss = 0.0
            len_train = 0
            
            self.model.train()

            # each element (sample) in train_dataset is a batch
            for sample in train_dataset:
                # inputs in the batch
                inputs = sample[0].to(self.device)
                # indices of keywords
                idx_start = sample[1].to(self.device)
                # outputs in the batch
                targets = sample[2].to(self.device)

                forward_result = self.model(inputs, idx_start, targets)

                loss = forward_result['loss']
                
                loss.backward()  #  backpropagate the loss
                # updates the parameters
                #Clips gradient norm of an iterable of parameters.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), CLIP_GRAD)
                self.optimizer.step()
                self.optimizer.zero_grad()
                
                epoch_loss += loss.item()
                len_train += 1
            avg_epoch_loss = epoch_loss / len_train

            
            train_loss += avg_epoch_loss


            print(avg_epoch_loss)
            self.writer.flush()
    
        return train_loss/(epoch+1)

Loading of the handler for the dataset and choose of the batch size

In [None]:
BATCH_SIZE = 40 #@param {type:"slider", min:8, max:64, step:4}

sentences_rnn_dm = TaskBDataModule(
    data_train_path=dataset_folder+'/laptops_train.json',
    data_dev_path=dataset_folder+'/laptops_dev.json',
    batch_size=BATCH_SIZE,
    collate_fn = rnn_collate_fn,
)

Hyperparameter setup

In [None]:
#@title Setup of Hyper-parameters{ run: "auto" }

n_hidden=82 #@param {type:"slider", min:50, max:300, step:16}
drop_prob=0.15 #@param {type:"slider", min:0, max:1, step:0.05}
bidir = True #@param ["True", "False"] {type:"raw"}
learning_rate = 0.0001 #@param {type:"slider", min:0.00001, max:0.001, step:0.00001}
epochs = 25 #@param {type:"slider", min:10, max:100, step:10}
n_layer_lstm = 2 #@param {type:"slider", min:1, max:4, step:1}
CLIP_GRAD = 2 #@param {type:"slider", min:1, max:10, step:1}

Start the training

In [None]:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#loading of the datasets
train_dataloader = sentences_rnn_dm.train_dataloader()
val_dataloader = sentences_rnn_dm.val_dataloader()


task_b_classifier = TaskBRecurrentClassifier(sentences_rnn_dm.train_dataset.vocabulary.vectors, n_hidden=n_hidden,drop_prob=drop_prob, bidir = bidir, n_layer_lstm = n_layer_lstm, vocab_size=len(sentences_rnn_dm.train_dataset.vocabulary),embedding_dim=100)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#define the optimizer
optimizer = torch.optim.Adam(task_b_classifier.parameters(), lr=learning_rate)

# string to indetify the model once saved or in the graphs
exp_details="mul_leakyrelu_" + str(drop_prob) + "drop_"+str(n_hidden) +"hidden_"+str(learning_rate) +"lr_" + str(BATCH_SIZE) +"batch_" + str(n_layer_lstm) +"lstmLayer_" + str(CLIP_GRAD) +"clipGrad"

trainer = TaskBTrainer(task_b_classifier, optimizer, device, exp_details)

avg_train_loss= trainer.train(train_dataloader, val_dataloader, epochs=epochs, early_stopping=False, early_stopping_patience=6, to_be_saved = True)
print(" avg_train_loss={}\n ".format(avg_train_loss))



## Evaluation

general evaluation taking in consideration also the NONE tag and also the fact that there are sentences with multiple tags that now are splitted

In [None]:
 def eval_metrics(eval_dataset):
    task_b_classifier.eval()
    task_b_classifier.cuda()
    pred_outs, true_labels = [],[]
    for sample in eval_dataset:
        # inputs in the batch
        inputs = sample[0].to(device)
        idx_start = sample[1].to(device)

        # outputs in the batch
        targets = sample[2].to(device)
        with torch.no_grad():

            forward_result = task_b_classifier(inputs, idx_start,targets)
            loss = forward_result['loss']
            pred_out = torch.sigmoid(forward_result['logits'])
        
            pred_out = pred_out.detach().cpu().numpy()
            y_true = targets.cpu().numpy()

        pred_outs.append(pred_out)
        true_labels.append(y_true)

    flat_pred_outs = np.concatenate(pred_outs, axis=0)

    # Combine the correct labels for each batch into a single list.
    flat_true_labels = np.concatenate(true_labels, axis=0)

    return flat_pred_outs,flat_true_labels


#find the higher probability of the returned logits and set it to 1
def classify(pred_prob):
    y_pred = []
    for tag_label_row in pred_prob:
        temp=[]
        max_prob = max(tag_label_row)
        for tag_label in tag_label_row:
            if tag_label == max_prob:
                temp.append(1)
            else:
                temp.append(0) 
        y_pred.append(temp)
    return y_pred


In [None]:
flat_pred_outs,flat_true_labels= eval_metrics(val_dataloader)
y_true = flat_true_labels.ravel()

y_pred_labels = classify(flat_pred_outs)

y_pred = np.array(y_pred_labels).ravel() # Flatten

metr = {}
metr['f1'] = f1_score(y_true,y_pred, average="macro")

print("F1 = ",metr['f1'])

y_pred = sentences_rnn_dm.validation_dataset.mlb.inverse_transform(np.array(y_pred_labels))
y_act = sentences_rnn_dm.validation_dataset.mlb.inverse_transform(flat_true_labels)

df = pd.DataFrame({'Sentence':sentences_rnn_dm.validation_dataset.sentences,'Actual Tags':y_act,'Predicted Tags':y_pred})
pd.set_option('display.max_rows', 30)
df.head(30)

### Evaluation for implementation.py

In [None]:
class PreprocessBWiC():
    def __init__(self, sentences):
        self.data_store,self.sentences,self.targets = self.load_data(sentences)
        self.vocabulary = self.build_vocab(self.sentences)
        self.convert_all_2_indices()
        # self.tags = [["neutral","negative", "conflict", "positive", "NONE"]]
        # self.mlb = MultiLabelBinarizer()
        # self.mlb.fit_transform(self.tags)
        self.mlb = sentences_rnn_dm.validation_dataset.mlb
    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))

        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,list_of_sentences):
        data_store,sentences,targets = [],[],[]
        for obj in list_of_sentences:
            _sentence = []
            obj['targets'] = sorted(obj['targets'], key=lambda x: x[0][0])
            sentiments = [obj['targets'][j][2] for j in range(len(obj['targets']))]

            for i,targ_obj in enumerate(obj['targets']):
                #print(targ_obj)
                new_sent = obj['text'][:targ_obj[0][0]-1]+" <START> " + obj['text'][targ_obj[0][0]:targ_obj[0][1]] + " <END>" + obj['text'][targ_obj[0][1]:]
                new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))    
                index = self.find_indices(new_sent)
                    
                sentences.append(new_sent)
                sentiments_converted = [sentiments[i]]
                targets.append([(targ[1], "") for j,targ in enumerate(obj['targets'])])

                data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))

            if len(obj['targets'])==0:
                sentiments_converted= ["NONE"]
                targets.append([(targ[1], "") for j,targ in enumerate(obj['targets'])])

                new_sent = obj['text']
                # new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                # new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))
                index = [0,0]
                sentences.append(new_sent)
                data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))
        
        return data_store,sentences,targets


    def convert_all_2_indices(self):
        self.data_store = [(self.sentence2indices(a),b,c) for (a,b,c) in self.data_store]
    
    def find_indices(self,new_sent):
        splitted = new_sent.split(" ")
        indices = [i+1 for i,w in enumerate(splitted) if (w=="START") or (w=="END")]
        indices[1] = indices[1]-1
        return indices
                
    
    def build_vocab(self, dataset, min_freq=1):
        counter = Counter()
        for a in dataset:
            # for each token in the sentence viewed as a dictionary of items from the line
            for token in a.split(" "):
                if token is not None:
                    counter[token]+=1
        #Counter({'the': 1046, 'and': 671, 'to': 604, etc....}
        # we add special tokens for handling padding and unknown words at testing time.
        vocabulary = Vocab(counter, specials=['<pad>', '<unk>'], min_freq=min_freq)
        #vocabulary.load_vectors("glove.6B.50d")
        return vocabulary


    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.data_store[idx]

    def sentence2indices(self,sentence: str) -> torch.Tensor:
        return torch.tensor([self.vocabulary[word] for word in sentence.split(' ') if word != ''], dtype=torch.long)

    def decode_tag(self,tag):
        return self.mlb.inverse_transform(tag.reshape(1,-1))
    
    
def rnn_collate_fn(data_elements: List[Tuple[torch.Tensor, list]] # list of (x, y,z) pairs
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    X = [de[0] for de in data_elements]  # list of index tensors
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=0)  #  shape (batch_size x max_seq_len)
    
    keyword_position = [de[1] for de in data_elements] # list of tuples indices where keyword is [[1st sent, 2nd sent]]

    keyword_position = torch.nn.utils.rnn.pad_sequence(keyword_position, batch_first=True, padding_value=0) 



    return X, keyword_position

def classify(pred_prob):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        max_prob = max(tag_label_row)
        for tag_label in tag_label_row:
            if tag_label == max_prob:
                temp.append(1) 
            else:
                temp.append(0) 
        y_pred.append(temp)

    return y_pred
    
def predictBBNEW(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessBWiC(samples)
    task_b_classifier.eval()
    i = 0
    task_b_classifier.cpu()
    print(len(prep.data_store))
    while i < len(prep.data_store):
        cont = len(prep.targets[i])
        json_pred = {"targets":prep.targets[i]}
        if cont==0:
            i+=1
        sentiments = []
        while cont > 0:
            inputs,idx_start = rnn_collate_fn([prep.data_store[i]]) # inputs in the batch
            
            with torch.no_grad():
                forward_result = task_b_classifier(inputs.cpu(), idx_start.cpu())
                pred_out = torch.sigmoid(forward_result['logits'])
                pred_out = pred_out.detach().cpu().numpy()

            y_pred_labels = classify(pred_out)
            y_pred = prep.mlb.inverse_transform(np.array(y_pred_labels))
            for pred in y_pred:
                sentiments += list(pred)
            cont-=1
            i+=1
        for k,targ in enumerate(json_pred['targets']):
            try:
                if sentiments[k] != "NONE":
                    json_pred["targets"][k] = (json_pred["targets"][k][0],sentiments[k])
            except:
                json_pred["targets"][k] = (json_pred["targets"][k][0],"conflict")
        targets.append(json_pred)
    return targets

a = load_data(dataset_folder+"/laptops_dev.json")
random.shuffle(a)
t = predictBBNEW(a)
for x,y in zip(a[:30],t[:30]):
    print(x['targets'],"SEP",y['targets'])

In [None]:
evaluate_sentiment(a,t)

# TASK A-B (BERT for token classification fine tuning)

## Dataset

Define of the transfomers dataset class, this class will load raw data and then encode the labels into tag and using the Bert tokenizer, tokenize the sentences.

In [39]:
class TaskABDataset(Dataset):
    def __init__(self, path,path2, model_b):
        self.model_b = model_b
        self.texts, self.tags = self.load_data(path)
        texts2,tags2 = self.load_data(path2)
        self.texts = self.texts + texts2
        self.tags = self.tags + tags2
        self.tokenizer = BertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
        self.tag2id = {'O': 0, 'neutral': 1, 'positive': 2, 'negative': 3, 'conflict': 4}
        self.id2tag = {0: 'O', 1: 'neutral', 2: 'positive', 3: 'negative', 4: 'conflict'}
        if model_AB:
            self.tag2id = {'B-conflict': 1, 'B-negative': 5, 'B-neutral': 2, 'B-positive': 4, 'I-conflict': 6, 'I-negative': 7, 'I-neutral': 8, 'I-positive': 3, 'O': 0}
            self.id2tag = {0: 'O', 1: 'B-conflict', 2: 'B-neutral', 3: 'I-positive', 4: 'B-positive', 5: 'B-negative', 6: 'I-conflict', 7: 'I-negative', 8: 'I-neutral'}
        if not model_b:
            self.tag2id = {'O': 0, 'B': 1, 'I': 2}
            self.id2tag = {0: 'O', 1: 'B', 2: 'I'}
        self.labels = self.encode_tags(self.tags, self.encodings)
        

    def load_data(self,data_path):
        sentences,texts,tags = [], [], []
        with open(data_path) as json_file:
            list_of_sentences = json.load(json_file)
            for obj in list_of_sentences:
                _sentence = []
                sent = obj['text']
                if LOWERED:
                    sent = obj['text'].lower()
                for t in tokenizer.tokenize(sent):
                    ne_label = "O"
                    sentiment = ""
                    for i in range(len(obj['targets'])):
                        targ = obj['targets'][i][1]
                        if LOWERED:
                            targ = obj['targets'][i][1].lower()
                        if t in targ and len(t)>2:
                            #if its first word i.e. "battery life" (battery)
                            ne_label = "B" if t == targ.split(" ")[0] else "I"
                            #if for any reason the I tag is assigned but the word before its not a B
                            try:
                                if ne_label == "I" and _sentence[-1]["ne_label"] == "O":
                                    ne_label = "O"
                            except:
                                pass
                            sentiment = obj['targets'][i][2]
                            if self.model_b and ne_label != "O":
                                ne_label = sentiment #ne_label +"-"+sentiment
                    token = {"token": t, "ne_label": ne_label , "sentiment" :sentiment}
                    _sentence.append(token)
                sentences.append(_sentence)

            for elem in sentences:
                texts.append([tok['token'] for tok in elem])
                tags.append([tag['ne_label'] for tag in elem])
            return texts, tags
    
    def encode_tags(self,tags, encodings):
        labels = [[self.tag2id[tag] for tag in doc] for doc in tags]
        encoded_labels = []
        for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
            # create an empty array of -100
            doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
            arr_offset = np.array(doc_offset)

            try:
                # set labels whose first offset position is 0 and the second is not 0
                doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
                encoded_labels.append(doc_enc_labels.tolist())
            except:
                print(doc_labels, doc_offset)

        return encoded_labels


    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        #item['text'] = self.texts[idx] + ["<end>"] *(100-len(self.texts[idx]))
        return item

    def __len__(self):
        return len(self.labels)


In [31]:
class DataModuleTaskAB(pl.LightningDataModule):
    def __init__(self, training_file, training_file2, dev_file, dev_file2, model_b=None):
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        self.training_file2 = training_file2
        self.dev_file2 = dev_file2
        self.model_b = model_b

    def setup(self, stage=None):
      self.trainingset = TaskABDataset(self.training_file, self.training_file2, self.model_b)
      self.devset = TaskABDataset(self.dev_file, self.dev_file2, self.model_b)
          
    def train_dataloader(self):
      return DataLoader(self.trainingset, batch_size=64)
    
    def val_dataloader(self):
        return DataLoader(self.devset, batch_size=64)

## Model

Lightning module wrapping around pretrained bert

In [32]:
class TaskABModel(pl.LightningModule):
    def __init__(self, device,  comments="",*args, **kwargs):
        super(TaskABModel, self).__init__(*args, **kwargs)      
        self.model = BertForTokenClassification.from_pretrained(bert_model, num_labels=5 if model_b else 3, output_hidden_states = True)
        
        self.writer = SummaryWriter(comment=comments+"_modelb="+str(model_b))
        self.epoch_t, self.epoch_ev = -1,-1
        self.model.to(device)
        self.save_hyperparameters()
    
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)        
        return outputs

    def training_step(self, batch, batch_nb):
        self.model.train()
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        labels = batch['labels'].to(self.device)

        outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        
        loss = outputs[0]

        # Log it:
        self.log('train_loss', loss, prog_bar=True)
        if self.epoch_t != self.current_epoch:
            self.epoch_t = self.current_epoch
            self.writer.add_scalar("train/loss", loss, self.current_epoch)

        return loss

    def validation_step(self, batch, batch_nb):
        self.model.eval()
        with torch.no_grad():
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
            sample_loss = outputs[0]
        self.log('valid_loss', sample_loss, prog_bar=True)
        
        if self.epoch_ev != self.current_epoch:
            self.epoch_ev = self.current_epoch
            self.writer.add_scalar("eval/loss", sample_loss, self.current_epoch)


    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=2e-5)

## Training

Declaration of the datasets, dataloaders and start of the training

In [None]:
data_module = DataModuleTaskAB(dataset_folder+"/laptops_train.json",dataset_folder+"/restaurants_train.json",dataset_folder+"/laptops_dev.json",dataset_folder+"/restaurants_dev.json",model_b)
trainer = pl.Trainer(gpus=1, val_check_interval=1.0, max_epochs=5)
model = TaskABModel(device,"testAB_distilbert")
trainer.fit(model, datamodule=data_module)

Save the model and the id2tag correspondant


In [None]:
scores = {"f1_macro": 71.4}
#torch.save(model.state_dict(), root_folder+'/model/model_A={}_f1_{:0.4f}.pt'.format(str(model_b), scores["f1_macro"])) # save the model state
# with open(root_folder+'/model/model_AB_together={}_f1_{:0.4f}.json'.format(str(model_b), scores["f1_macro"]), 'w') as outfile:
#     json.dump(val_dataset.id2tag, outfile)
trainer.save_checkpoint(root_folder+"/model/taskA_"+str(scores["f1_macro"])+".ckpt")

## Evaluation

In [37]:
def reconstruct_original_logits(text,logits,tokenizer):
    offset = tokenizer(text, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)['offset_mapping']
    new_logits = list()
    for i, tup in enumerate(offset):
        if tup[0] == 0:
            new_logits.append(logits[i])
    return new_logits


In [None]:
# modelB = TaskBModel.load_from_checkpoint(root_folder+"/model/taskB.ckpt",device=device)
# modelB.eval()

# modelA = TaskABModel.load_from_checkpoint(root_folder+"/model/taskA_71.4.ckpt",device = device)
# modelA.eval()

#### Model B

In [None]:
class PreprocessB():
    def __init__(self, sentences):
        self.texts, self.tags = self.load_data(sentences)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
        self.tag2id = {'O': 0, 'neutral': 1, 'positive': 2, 'negative': 3, 'conflict': 4}
        self.id2tag = {0: 'O', 1: 'neutral', 2: 'positive', 3: 'negative', 4: 'conflict'}
                

    def load_data(self,list_of_sentences):
        sentences,texts,tags = [], [], []
        for obj in list_of_sentences:
            _sentence = []
            for t in tokenizer.tokenize(obj['text']):
                ne_label ="O"
                for i in range(len(obj['targets'])):
                    if t in obj['targets'][i][1]: #if target word
                        ne_label = obj['targets'][i][1]
                        break
                        
                token = {"token": t, "ne_label": ne_label}
                _sentence.append(token)

            sentences.append(_sentence)

        for i,elem in enumerate(sentences):
            texts.append([tok['token'] for tok in elem])
            #new_lst = list()
            tags.append([(targ[1], "") for j,targ in enumerate(list_of_sentences[i]['targets'])])
            #print(tags)
            # for tag in elem:
            #     if tag['ne_label'] != "O" and tag['ne_label'] not in new_lst:
            #         new_lst.append(tag['ne_label'])
            # tags.append(new_lst)
        return texts, tags
    

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def predictB(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessB(samples)
    for i,encode in enumerate(prep.encodings['input_ids']):
        json_pred = {"targets":prep.tags[i]}
        lst,pred = [],[]
        ids = torch.unsqueeze(torch.tensor(encode),0)
        attention_mask = torch.unsqueeze(torch.tensor(prep.encodings['attention_mask'][i]),0)

        logits = model(torch.tensor(ids).to("cuda"), torch.tensor(attention_mask).cuda())["logits"]
        for id in logits[0].argmax(1):
            lst.append(id.item())
        p = reconstruct_original_logits(prep.texts[i], lst,prep.tokenizer)[1:-1]
        idtotag = [prep.id2tag[raw_pred] for raw_pred in p]
        sentiments = []
        # for j,sentiment in enumerate(idtotag):
        #     if (sentiment != "O"):
        #         try:
        #             if (idtotag[j+1] == "O"):
        #                 sentiments.append(sentiment)
        #                 #print(prep.texts[i][j],sentiment)
        #         except:
        #             sentiments.append(sentiment)
        #             #print(prep.texts[i][j],sentiment)

                
        # for k,targ in enumerate(json_pred['targets']):
        #     try:
        #         json_pred["targets"][k] = (json_pred["targets"][k][0],sentiments[k])
        #     except:
        #         print(prep.texts[i],idtotag)
        #         json_pred["targets"][k] = (json_pred["targets"][k][0],"conflict")
        idd = 0
        if (len(json_pred['targets'])>0):
            for word, sentiment in zip(prep.texts[i],idtotag):
                if word in json_pred["targets"][idd][0]:
                    if sentiment !="O": 
                        json_pred["targets"][idd] = (json_pred["targets"][idd][0],sentiment)
                    else:
                        json_pred["targets"][idd] = (json_pred["targets"][idd][0],"positive") 
                    idd+=1
                    if idd==len(json_pred["targets"]):
                        break
        targets.append(json_pred)
    return targets

####Model A-->B

In [42]:
class PreprocessAB():
    def __init__(self, sentences):
        self.texts = self.load_data(sentences)
        self.tokenizer = BertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
        
        self.tag2id = {'O': 0, 'B': 1, 'I': 2}
        self.id2tag = {0: 'O', 1: 'B', 2: 'I'}
       
                

    def load_data(self,list_of_sentences):
        sentences,texts = [], []
        for obj in list_of_sentences:
            _sentence = []
            for t in tokenizer.tokenize(obj['text']):
                token = {"token": t}
                _sentence.append(token)
            sentences.append(_sentence)

        for elem in sentences:
            texts.append([tok['token'] for tok in elem])
        return texts
    
    

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def predict_A_then_B(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessAB(samples)
    bt_tokenizer = DistilBertTokenizerFast.from_pretrained(bert_model)

    for i,encode in tqdm(enumerate(prep.encodings['input_ids'])):
        json_pred = {"targets":[]}
        lst,pred = [],[]

        ids = torch.unsqueeze(torch.tensor(encode),0)
        attention_mask = torch.unsqueeze(torch.tensor(prep.encodings['attention_mask'][i]),0)
        with torch.no_grad():
            try:
                logits = modelA(torch.tensor(ids).to(device), torch.tensor(attention_mask).to(device))["logits"]
            except Exception as e:
                #print(encode,prep.texts[i])
                print(e)
                targets.append(json_pred)
                continue
            for id in logits[0].argmax(1):
                lst.append(id.item())
            p = reconstruct_original_logits(prep.texts[i], lst,prep.tokenizer)[1:-1]
            idtotag = [prep.id2tag[raw_pred] for raw_pred in p]
            idx_tgt_list = []
        for j,word in enumerate(idtotag):
            if word == "B":
                json_pred["targets"].append((prep.texts[i][j],"positive"))
                start = return_indices(prep.texts[i],j)
                idx_tgt_list.append([start, start + len(prep.texts[i][j])])
            elif (word == "I") and (idtotag[j-1] == "B"):
                try:
                    last_tuple = json_pred['targets'][-1]
                    words_tagged = last_tuple[0] + " " + prep.texts[i][j]
                    sent_tagged = last_tuple[1]
                    json_pred['targets'][-1] = (words_tagged, sent_tagged)
                    
                    idx_tgt_list[-1][1] = idx_tgt_list[-1][1] + len(prep.texts[i][j]) +1
                except:
                    words_tagged = prep.texts[i][j]
                    sent_tagged = "positive"
                    json_pred['targets'].append((words_tagged, sent_tagged))
                    start = return_indices(prep.texts[i],j)
                    idx_tgt_list.append([start, start + len(prep.texts[i][j])])

            elif word == "I":
                json_pred["targets"].append((prep.texts[i][j],"positive"))
                start = return_indices(prep.texts[i],j)
                idx_tgt_list.append([start, start + len(prep.texts[i][j])])
        batches = create_B_batches(prep.texts[i],json_pred,idx_tgt_list,bt_tokenizer)
        json_pred_sent = predict_B_after_A_2(json_pred,batches)
        targets.append(json_pred_sent)
    return targets



def predict_B_after_A_2(json_pred,batches):
    cont = len(json_pred['targets'])

    id2tag = {0: 'NONE', 1: 'conflict', 2: 'negative', 3: 'neutral', 4: 'positive'}

    sentiments = []
    i = 0
    while cont > 0:
        #inputs,idx_start = rnn_collate_fn([prep.data_store[i]]) # inputs in the batch
        batch = batches[i]
        input_ids = torch.unsqueeze(batch['input_ids'],0).to(device)
        attention_mask = torch.unsqueeze(batch['attention_mask'],0).to(device)
        indices_keyword = torch.unsqueeze(batch['indices'],0).to(device)

        with torch.no_grad():
            out = modelB.forward(input_ids, attention_mask=attention_mask, indices_keyword=indices_keyword)
            logits = out['logits']

        y_pred_labels = torch.argmax(logits, axis=-1)
        pred_label = y_pred_labels.tolist()[0]
        y_pred = id2tag[pred_label]
        sentiments.append(y_pred)
        cont-=1
        i+=1
    for k,targ in enumerate(json_pred['targets']):
        try:
            if sentiments[k] != "NONE":
                json_pred["targets"][k] = (json_pred["targets"][k][0],sentiments[k])
        except:
            json_pred["targets"][k] = (json_pred["targets"][k][0],"conflict")

    return json_pred


def return_indices(frase_splitt,word_stop):
    c = 0
    for i,w in enumerate(frase_splitt):
        if word_stop == i:
            return c
        c+=len(" "+w)

def create_B_batches(text,targ_list,idx_list,bt_tokenizer):
    #id2tag = {0: 'NONE', 1: 'conflict', 2: 'negative', 3: 'neutral', 4: 'positive'}
    #tokenizer = BertTokenizerFast.from_pretrained(bert_model)
    #encodings = self.tokenizer(self.sentences, is_split_into_words=False, return_offsets_mapping=True, padding=True, truncation=True)
    data_store = []
    text =  " ".join(text)
    sentences = []
    lst = []
    for i,(targ,_) in enumerate(targ_list['targets']):
        new_sent = text[:idx_list[i][0]]+" <START> " + text[idx_list[i][0]:idx_list[i][1]] + " <END>" + text[idx_list[i][1]:]
        new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
        new_sent = " ".join(remove_stopwords(" ".join(new_sent)))    
        index = find_indices(new_sent)
            
        sentences.append(new_sent)
        data_store.append((new_sent,torch.tensor(index,dtype=torch.long)))

    if len(targ_list['targets']) == 0:
        new_sent = " ".join(text)
        # new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
        # new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))
        index = [0,0]
        sentences.append(new_sent)
    
        data_store.append((new_sent,torch.tensor(index,dtype=torch.long)))
    
    encodings = bt_tokenizer(sentences, is_split_into_words=False, return_offsets_mapping=True, padding=True, truncation=True)
    for idx,batch in enumerate(data_store):
        item = {key: torch.tensor(val[idx]) for key, val in encodings.items()}
        item['indices'] = batch[1]
        lst.append(item)
    
    return lst

def find_indices(new_sent):
    splitted = new_sent.split(" ")
    indices = [i+1 for i,w in enumerate(splitted) if (w=="START") or (w=="END")]
    indices[1] = indices[1]-1
    return indices


def get_batch(idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    item['indices'] = self.data_store[idx][1]
    return item

def remove_stopwords(sent: str) -> str:
    stop_words = set(stopwords.words('english'))

    # remove punkt
    others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
    str_punkt = string.punctuation+ others
    translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
    word_tokens = word_tokenize(sent.translate(translator)) 
    
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return filtered_sentence

#### Model AB

In [None]:
class PreprocessAB():
    def __init__(self, sentences):
        self.texts = self.load_data(sentences)
        self.tokenizer = BertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
        

        self.tag2id = {'B-conflict': 1, 'B-negative': 5, 'B-neutral': 2, 'B-positive': 4, 'I-conflict': 6, 'I-negative': 7, 'I-neutral': 8, 'I-positive': 3, 'O': 0}
        self.id2tag = {0: 'O', 1: 'B-conflict', 2: 'B-neutral', 3: 'I-positive', 4: 'B-positive', 5: 'B-negative', 6: 'I-conflict', 7: 'I-negative', 8: 'I-neutral'}
       
                

    def load_data(self,list_of_sentences):
        sentences,texts = [], []
        for obj in list_of_sentences:
            _sentence = []
            for t in tokenizer.tokenize(obj['text']):
                token = {"token": t}
                _sentence.append(token)
            sentences.append(_sentence)

        for elem in sentences:
            texts.append([tok['token'] for tok in elem])
        return texts
    
    

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def predict_together_AB(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessAB(samples)
    for i,encode in tqdm(enumerate(prep.encodings['input_ids'])):
        json_pred = {"targets":[]}
        lst,pred = [],[]
        ids = torch.unsqueeze(torch.tensor(encode),0)
        attention_mask = torch.unsqueeze(torch.tensor(prep.encodings['attention_mask'][i]),0)
        logits = model(torch.tensor(ids).to("cuda"), torch.tensor(attention_mask).cuda())["logits"]
        
        for id in logits[0].argmax(1):
            lst.append(id.item())
        p = reconstruct_original_logits(prep.texts[i], lst,prep.tokenizer)[1:-1]
        idtotag = [val_dataset.id2tag[raw_pred] for raw_pred in p]
        #idtotag = [label_vocabulary.itos[raw_pred] for raw_pred in p]

        for j,word in enumerate(idtotag):
            if word.startswith("B"):
                json_pred["targets"].append((prep.texts[i][j],word.split("-")[1]))
            elif (word.startswith("I")) and (idtotag[j-1].startswith("B")):
                try:
                    last_tuple = json_pred['targets'][-1]
                    words_tagged = last_tuple[0] + " " + prep.texts[i][j]
                    sent_tagged = last_tuple[1]
                    json_pred['targets'][-1] = (words_tagged, sent_tagged)
                except:
                    words_tagged = prep.texts[i][j]
                    sent_tagged = word.split("-")[1]
                    json_pred['targets'].append((words_tagged, sent_tagged))
            elif word.startswith("I"):
                json_pred["targets"].append((prep.texts[i][j],word.split("-")[1]))

        targets.append(json_pred)
    return targets


#### Actual evaluation

In [None]:
a = load_data(dataset_folder+"/laptops_dev.json")
a = a + load_data(dataset_folder+"/restaurants_dev.json")
random.shuffle(a)
#t = predictB(a)
# modelA.to(device)
# modelB.to(device)
model.cuda()
modelB = model
t = predict_A_then_B(a)

In [44]:
evaluate_sentiment(a,t)
evaluate_extraction(a,t)

Aspect Sentiment Evaluation

	ALL	 TP: 527;	FP: 554;	FN: 559
		(m avg): precision: 48.75;	recall: 48.53;	f1: 48.64 (micro)
		(M avg): precision: 33.10;	recall: 33.61;	f1: 33.35 (Macro)

	positive: 	TP: 321;	FP: 248;	FN: 222;	precision: 56.41;	recall: 59.12;	f1: 57.73;	569
	negative: 	TP: 152;	FP: 151;	FN: 150;	precision: 50.17;	recall: 50.33;	f1: 50.25;	303
	neutral: 	TP: 54;	FP: 155;	FN: 162;	precision: 25.84;	recall: 25.00;	f1: 25.41;	209
	conflict: 	TP: 0;	FP: 0;	FN: 25;	precision: 0.00;	recall: 0.00;	f1: 0.00;	0
Aspect Extraction Evaluation
	Aspects	 TP: 771;	FP: 303;	FN: 311
		precision: 71.79;	recall: 71.26;	f1: 71.52


In [None]:
for gt,pred in zip(a[:50],t[:50]):
    print(gt['targets'],"SEP",pred['targets'])

# TASK B (BERT (WiC))

## Dataset

In [23]:
class TaskBDataset(Dataset):
    def __init__(self, path,path2, model_b):
        self.model_b = model_b
        self.texts, self.tags,self.data_store = self.load_data(path)
        texts2,tags2,data_store2 = self.load_data(path2)
        self.texts = self.texts + texts2
        self.tags = self.tags + tags2
        self.data_store = self.data_store + data_store2


        self.tokenizer = BertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.texts, is_split_into_words=False, return_offsets_mapping=True, padding=True, truncation=True)
        self.tag2id = {'NONE': 0, 'conflict': 1, 'negative': 2, 'neutral': 3, 'positive': 4}
        self.id2tag = {0: 'NONE', 1: 'conflict', 2: 'negative', 3: 'neutral', 4: 'positive'}
        self.labels = self.encode_tags(self.tags, self.encodings)
        

    def encode_tags(self,tags, encodings):
        labels = [[self.tag2id[tag] for tag in doc] for doc in tags]
        encoded_labels = []
        for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
            # create an empty array of -100
            doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
            arr_offset = np.array(doc_offset)

            try:
                # set labels whose first offset position is 0 and the second is not 0
                doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
                encoded_labels.append(doc_enc_labels.tolist())
            except:
                print(doc_labels, doc_offset)

        return encoded_labels

    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))

        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,data_path):
        data_store,sentences = [],[]
        with open(data_path) as json_file:
            list_of_sentences = json.load(json_file)
            for obj in list_of_sentences:
                _sentence = []
                obj['targets'] = sorted(obj['targets'], key=lambda x: x[0][0])
                sentiments = [obj['targets'][j][2] for j in range(len(obj['targets']))]
                if LOWERED:
                    obj['text'] = obj['text'].lower()
                for i,targ_obj in enumerate(obj['targets']):
                    #print(targ_obj)
                    new_sent = obj['text'][:targ_obj[0][0]-1]+" <START> " + obj['text'][targ_obj[0][0]:targ_obj[0][1]] + " <END>" + obj['text'][targ_obj[0][1]:]
                    if LEMMATIZATION:
                        new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                    if REMOVE_STOPWORDS:
                        new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))    
                    index = self.find_indices(new_sent)
                    if LOWERED:
                        new_sent = new_sent.lower()    
                    sentences.append(new_sent)
                    sentiments_converted = [sentiments[i]]
                    data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))

                if len(obj['targets'])==0:
                    sentiments_converted= ["NONE"]
                    new_sent = obj['text']
                    if LEMMATIZATION:
                        new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                    if REMOVE_STOPWORDS:
                        new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))
                    index = [0,0]
                    sentences.append(new_sent)
                    data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))
            tags = []
            for tupl in data_store:
                tags.append(tupl[2])
        return sentences, tags, data_store


    
    def find_indices(self,new_sent):
        splitted = new_sent.split(" ")
        indices = [i+1 for i,w in enumerate(splitted) if (w=="START") or (w=="END")]
        indices[1] = indices[1]-1
        return indices


    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        #item['text'] = self.texts[idx] + ["<end>"] *(100-len(self.texts[idx]))
        item['indices'] = self.data_store[idx][1]
        return item

    def __len__(self):
        return len(self.labels)

data module

In [24]:
class DataModuleTaskB(pl.LightningDataModule):
    def __init__(self, training_file,training_file2,dev_file, dev_file2, model_b=None):
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        self.training_file2 = training_file2
        self.dev_file2 = dev_file2
        self.model_b = model_b

    def setup(self, stage=None):
      self.trainingset = TaskBDataset(self.training_file, self.training_file2, self.model_b)
      self.devset = TaskBDataset(self.dev_file, self.dev_file2, self.model_b)
          
    def train_dataloader(self):
      return DataLoader(self.trainingset, batch_size=64)
    
    def val_dataloader(self):
        return DataLoader(self.devset, batch_size=64)

## Model

In [25]:
class TaskBModel(pl.LightningModule):
    def __init__(self, device,  comments="",*args, **kwargs):
        super(TaskBModel, self).__init__(*args, **kwargs)      
        self.model = BertModel.from_pretrained(bert_model, num_labels=5 if model_b else 3, output_hidden_states = True)
        
        self.writer = SummaryWriter(comment=comments+"_modelb="+str(model_b))
        self.epoch_t, self.epoch_ev = -1,-1
        self.lin1 = torch.nn.Linear(768, 768)
        self.classifier = torch.nn.Linear(768, 5)
        self.loss_fn = torch.nn.CrossEntropyLoss()
        self.model.to(device)
        self.save_hyperparameters()
    
    def forward(self, input_ids, attention_mask, indices_keyword, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)# labels=labels)
        # hidden_states = outputs['hidden_states']
        # hidden_states = self.tuple_of_tensors_to_tensor(hidden_states)
        hidden_states = outputs['last_hidden_state']
        batch_size, seq_len, hidden_size = hidden_states.shape

        #sequence of batch x seq_len vectors 
        flat_output = hidden_states.reshape(-1, hidden_size)
        
        # start offsets of each element in the batch
        sequences_offsets = torch.arange(batch_size, device=self.device) * seq_len
        
        summary_vectors_indices_sent1 = self.get_indices_keyword(indices_keyword, sequences_offsets,0)
        summary_vectors_indices_sent2 = self.get_indices_keyword(indices_keyword, sequences_offsets,1)
        
        # we retrieve the vector of the corrseponding states for the keyword given for each sentence.
        
        summary_vectors_sent1 = flat_output[summary_vectors_indices_sent1]
        summary_vectors_sent2 = flat_output[summary_vectors_indices_sent2]
        
        # do the multiplication of these two vectors retrieved
        summary_vectors = summary_vectors_sent1 * summary_vectors_sent2
        out = self.lin1(summary_vectors)
        out = F.leaky_relu(out)
        
        logits = self.classifier(out)
        res = {}
        res['logits'] = logits
        if labels is not None:
            labels = torch.stack([labels[i][1] for i in range(labels.shape[0])])
            pred = torch.argmax(logits, -1)
            loss = self.loss_fn(logits, torch.tensor(labels) )
            res['loss'] = loss
        return res

    def training_step(self, batch, batch_nb):
        self.model.train()
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)
        labels = batch['labels'].to(self.device)
        indices_keyword = batch['indices'].to(self.device)

        out = self.forward(input_ids, attention_mask=attention_mask, indices_keyword=indices_keyword, labels=labels)
        logits = out['logits']
        loss = out['loss']
        # Log it:
        self.log('train_loss', loss, prog_bar=True)
        if self.epoch_t != self.current_epoch:
            self.epoch_t = self.current_epoch
            res = self.compute_F1(logits,labels)
            self.log('train_f1', res['f1'], prog_bar = True)
            self.writer.add_scalar("train/loss", loss, self.current_epoch)

        return loss

    def validation_step(self, batch, batch_nb):
        self.model.eval()
        with torch.no_grad():
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            indices_keyword = batch['indices'].to(self.device)

            out = self.forward(input_ids, attention_mask=attention_mask, indices_keyword=indices_keyword,labels=labels)
            logits = out['logits']
            sample_loss = out['loss']

        self.log('valid_loss', sample_loss, prog_bar=True)
        
        if self.epoch_ev != self.current_epoch:
            self.epoch_ev = self.current_epoch
            self.writer.add_scalar("eval/loss", sample_loss, self.current_epoch)
            res = self.compute_F1(logits,labels)
            self.log('val_f1', res['f1'], prog_bar = True)


    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=5e-5)
    
    def tuple_of_tensors_to_tensor(self, tuple_of_tensors):
        return torch.stack(list(tuple_of_tensors[6]), dim=0).to(self.device)
        # vec = torch.stack(list(tuple_of_tensors),dim=0)
        # vec = torch.mean(vec,dim=0)
        # return vec.to(self.device)
    '''
    return the corresponding position of the indices of the keywords, for the sent_num passed, so the first if 0 is passed and the second if 1 is passed
    summary  = [   0,   57,  114,  171,  228, ...] 
    indices_keywords = [ [ 6, 21],[ 4, 22],[ 6, 21],[ 4, 22], ...]
    '''
    def get_indices_keyword(self,indices_keywords: Sequence[tuple], summary: Sequence[int] ,sent_num: int) -> torch.Tensor:
        tens_idx = torch.tensor([item[sent_num] for item in indices_keywords]).to(self.device)
        return tens_idx + summary

    def compute_F1(self, logits, labels):

        valid_predictions = torch.argmax(logits, axis=-1)
        valid_labels = torch.stack([labels[i][1] for i in range(labels.shape[0])])

        all_predictions = valid_predictions.tolist()
        all_labels = valid_labels.tolist() 

        precision = precision_score(all_labels, all_predictions, average="macro", zero_division=0)
        recall = recall_score(all_labels, all_predictions, average="macro", zero_division=0)
        f1 = f1_score(all_labels, all_predictions, average="macro", zero_division=0)
        return {"precision": precision, "recall": recall, "f1": f1}

## Training

In [26]:
data_module = DataModuleTaskB(dataset_folder+"/laptops_train.json",dataset_folder+"/restaurants_train.json",dataset_folder+"/laptops_dev.json",dataset_folder+"/restaurants_dev.json",model_b)
trainer = pl.Trainer(gpus=1, val_check_interval=1.0, max_epochs=5)
model = TaskBModel(device,"test_bert")
trainer.fit(model, datamodule=data_module)
#trainer.save_checkpoint(root_folder+"/model/taskB.ckpt")

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | model      | BertModel        | 109 M 
1 | lin1       | Linear           | 590 K 
2 | classifier | Linear           | 3.8 K 
3 | loss_fn    | CrossEntropyLoss | 0     
------------------------------------------------
110 M     Trainable params
0         Non-trainable params
110 M     Total params
440.307   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…





HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




## Evaluation

In [27]:
metr = {"f1":48.7}
#torch.save(model.state_dict(), root_folder+'/model/TASKB_model_b={}_f1_{:0.4f}.pt'.format(str(model_b), metr["f1"])) # save the model state

In [30]:
class PreprocessBWiC():
    def __init__(self, sentences):
        self.data_store,self.sentences,self.targets = self.load_data(sentences)
        #self.texts, self.tags,self.data_store = self.load_data(sentences)

        self.id2tag = {0: 'NONE', 1: 'conflict', 2: 'negative', 3: 'neutral', 4: 'positive'}


        self.tokenizer = DistilBertTokenizerFast.from_pretrained(bert_model)
        self.encodings = self.tokenizer(self.sentences, is_split_into_words=False, return_offsets_mapping=True, padding=True, truncation=True)
        
    
    def remove_stopwords(self,sent: str) -> str:
        stop_words = set(stopwords.words('english'))

        # remove punkt
        others = "–" +"—" + "−" + "’" + "”" + "“" #These chars arent inside the standard punctuation
        str_punkt = string.punctuation+ others
        translator = str.maketrans(str_punkt, ' '*len(str_punkt)) 
        word_tokens = word_tokenize(sent.translate(translator)) 
        
        filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
        return filtered_sentence

    def load_data(self,list_of_sentences):
        data_store,sentences,targets = [],[],[]
        for obj in list_of_sentences:
            _sentence = []
            obj['targets'] = sorted(obj['targets'], key=lambda x: x[0][0])
            sentiments = [obj['targets'][j][2] for j in range(len(obj['targets']))]

            for i,targ_obj in enumerate(obj['targets']):
                #print(targ_obj)
                new_sent = obj['text'][:targ_obj[0][0]-1]+" <START> " + obj['text'][targ_obj[0][0]:targ_obj[0][1]] + " <END>" + obj['text'][targ_obj[0][1]:]
                new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))    
                index = self.find_indices(new_sent)
                    
                sentences.append(new_sent)
                sentiments_converted = [sentiments[i]]
                targets.append([(targ[1], "") for j,targ in enumerate(obj['targets'])])

                data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))

            if len(obj['targets'])==0:
                sentiments_converted= ["NONE"]
                targets.append([(targ[1], "") for j,targ in enumerate(obj['targets'])])

                new_sent = obj['text']
                # new_sent = [lemmatizer.lemmatize(w)  for w in new_sent.split(" ")]
                # new_sent = " ".join(self.remove_stopwords(" ".join(new_sent)))
                index = [0,0]
                sentences.append(new_sent)
                data_store.append((new_sent,torch.tensor(index,dtype=torch.long), sentiments_converted))
        
        return data_store,sentences,targets

    
    def find_indices(self,new_sent):
        splitted = new_sent.split(" ")
        indices = [i+1 for i,w in enumerate(splitted) if (w=="START") or (w=="END")]
        indices[1] = indices[1]-1
        return indices


    def get_batch(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['indices'] = self.data_store[idx][1]
        return item
    

    def __len__(self) -> int:
        return len(self.sentences)

    
def predictBBNEW(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessBWiC(samples)
    model.eval()
    model.cuda()
    i = 0
    print(len(prep.data_store))
    while i < len(prep.data_store):
        cont = len(prep.targets[i])
        json_pred = {"targets":prep.targets[i]}
        if cont==0:
            i+=1
        sentiments = []
        while cont > 0:
            #inputs,idx_start = rnn_collate_fn([prep.data_store[i]]) # inputs in the batch
            batch = prep.get_batch(i)
            input_ids = torch.unsqueeze(batch['input_ids'],0).to(device)
            attention_mask = torch.unsqueeze(batch['attention_mask'],0).to(device)
            indices_keyword = torch.unsqueeze(batch['indices'],0).to(device)

            with torch.no_grad():
                out = model.forward(input_ids, attention_mask=attention_mask, indices_keyword=indices_keyword)
                logits = out['logits']
                #forward_result = task_b_classifier(inputs.cpu(), idx_start.cpu())

            y_pred_labels = torch.argmax(logits, axis=-1)
            y_pred_labels.tolist()[0]
            y_pred = prep.id2tag[y_pred_labels.tolist()[0]]
            sentiments += [y_pred]
            cont-=1
            i+=1
        for k,targ in enumerate(json_pred['targets']):
            try:
                if sentiments[k] != "NONE":
                    json_pred["targets"][k] = (json_pred["targets"][k][0],sentiments[k])
            except:
                json_pred["targets"][k] = (json_pred["targets"][k][0],"conflict")
        targets.append(json_pred)
    return targets

a = load_data(dataset_folder+"/laptops_dev.json")
a = a + load_data(dataset_folder+"/restaurants_dev.json")
random.shuffle(a)
t = predictBBNEW(a)
for x,y in zip(a[:30],t[:30]):
    print(x['targets'],"SEP",y['targets'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


1546
[[[74, 77], 'use', 'positive']] SEP [('use', 'positive')]
[[[30, 34], 'cost', 'positive']] SEP [('cost', 'negative')]
[[[89, 94], 'staff', 'conflict']] SEP [('staff', 'positive')]
[] SEP []
[] SEP []
[] SEP []
[[[0, 7], 'Service', 'positive'], [[18, 25], 'takeout', 'positive']] SEP [('Service', 'positive'), ('takeout', 'positive')]
[[[13, 30], 'pastrami sandwich', 'positive']] SEP [('pastrami sandwich', 'positive')]
[[[87, 97], 'hard drive', 'negative']] SEP [('hard drive', 'negative')]
[[[54, 58], 'meal', 'positive']] SEP [('meal', 'positive')]
[[[21, 35], 'standard os cd', 'negative'], [[47, 75], 'proprietary hardware drivers', 'negative']] SEP [('standard os cd', 'neutral'), ('proprietary hardware drivers', 'neutral')]
[] SEP []
[[[4, 9], 'staff', 'positive']] SEP [('staff', 'positive')]
[[[4, 10], 'people', 'positive']] SEP [('people', 'positive')]
[] SEP []
[[[12, 29], 'seltzer with lime', 'neutral']] SEP [('seltzer with lime', 'neutral')]
[[[2, 25], 'glass of Leaping Lizard'

In [31]:
evaluate_sentiment(a,t)

Aspect Sentiment Evaluation

	ALL	 TP: 730;	FP: 356;	FN: 356
		(m avg): precision: 67.22;	recall: 67.22;	f1: 67.22 (micro)
		(M avg): precision: 46.61;	recall: 47.84;	f1: 47.20 (Macro)

	positive: 	TP: 427;	FP: 122;	FN: 116;	precision: 77.78;	recall: 78.64;	f1: 78.21;	549
	negative: 	TP: 209;	FP: 117;	FN: 93;	precision: 64.11;	recall: 69.21;	f1: 66.56;	326
	neutral: 	TP: 94;	FP: 117;	FN: 122;	precision: 44.55;	recall: 43.52;	f1: 44.03;	211
	conflict: 	TP: 0;	FP: 0;	FN: 25;	precision: 0.00;	recall: 0.00;	f1: 0.00;	0


# TASK C-D (DistilBERT for sequence classification fine tuning)

## Dataset

In [6]:
class TaskCDDataset (Dataset):
    def __init__(self,path, tokenizer, max_len,model_b):
        self.tokenizer = tokenizer
        self.model_b = model_b
        self.texts, self.tags = self.load_data(path)
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit_transform([['ambience-conflict', 'ambience-negative', 'ambience-neutral',
       'ambience-positive', 'anecdotes/miscellaneous-conflict',
       'anecdotes/miscellaneous-negative',
       'anecdotes/miscellaneous-neutral',
       'anecdotes/miscellaneous-positive', 'food-conflict',
       'food-negative', 'food-neutral', 'food-positive', 'price-conflict',
       'price-negative', 'price-neutral', 'price-positive',
       'service-conflict', 'service-negative', 'service-neutral',
       'service-positive']])
        self.encode_tags()
        self.max_len = max_len

    def encode_tags(self):
        self.tags = self.mlb.fit_transform(self.tags)

    def decode_tag(self,tag):
        return self.mlb.inverse_transform(tag.reshape(1,-1))

    def load_data(self,datapath):
        sentences = []     
        texts = []
        tags = []
        with open(datapath) as json_file:
            list_of_sentences = json.load(json_file)
            for obj in list_of_sentences:
                texts.append(obj['text'])
                tags.append([cat[0]+"-"+cat[1] for cat in obj['categories']])

        return texts, tags  
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item_idx):
        text = self.texts[item_idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True, # Add [CLS] [SEP]
            max_length= self.max_len,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True, 
            truncation=True,
            return_tensors = 'pt'
          )
        
        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()
        
        return {
            'input_ids': input_ids ,
            'attention_mask': attn_mask,
            'label': torch.tensor(self.tags[item_idx], dtype=torch.float)
        }
        

In [8]:
class TaskCDDataModule (pl.LightningDataModule):
    
    def __init__(self,training_file, dev_file, tokenizer,batch_size=16,max_token_len=200):
        super().__init__()
        self.training_file = training_file
        self.dev_file = dev_file
        
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self):
        self.train_dataset = TaskCDDataset(self.training_file, tokenizer=self.tokenizer,max_len = self.max_token_len,model_b=model_b)
        self.val_dataset  = TaskCDDataset(self.dev_file, tokenizer=self.tokenizer,max_len = self.max_token_len,model_b=model_b)
           
    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size = self.batch_size,shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,batch_size= 16)

## Model

In [9]:
class TaskCDClassifier(pl.LightningModule):
    def __init__(self, n_classes=5, steps_per_epoch=None, n_epochs=3, lr=2e-5,device="cuda" ):
        super().__init__()
        self.bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased',num_labels=n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()
        self.to(device)
    
    def forward(self,input_ids, attn_mask):
        output = self.bert(input_ids = input_ids ,attention_mask = attn_mask)
        output = output['logits']        
        return output
    
    
    def training_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('train_loss',loss , prog_bar=True,logger=True)
        
        return {"loss" :loss, "predictions":outputs, "labels": labels }


    def validation_step(self,batch,batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        
        outputs = self(input_ids,attention_mask)
        loss = self.criterion(outputs,labels)
        self.log('val_loss',loss , prog_bar=True,logger=True)
        
        return loss
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters() , lr=self.lr)
        return optimizer
    

## Training

Set up hyper-parameters

In [11]:
bert_tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-cased') 
N_EPOCHS = 5 #@param {type:"slider", min:1, max:10, step:1}
BATCH_SIZE = 32 #@param {type:"slider", min:8, max:64, step:2}
MAX_LEN = 20 #@param {type:"slider", min:20, max:50, step:1}
LR = 0.00002 #@param {type:"slider", min:0.00002, max:0.002, step:0.00001}

Set up the data module and instantiate the classifier model


In [12]:
TaskCD_data_module = TaskCDDataModule(
    dataset_folder+"/restaurants_train.json",
    dataset_folder+"/restaurants_dev.json",
    bert_tokenizer,
    BATCH_SIZE,
    MAX_LEN
    )
TaskCD_data_module.setup()

train_d = TaskCD_data_module.train_dataset
steps_per_epoch = len(train_d.texts)//BATCH_SIZE
model = TaskCDClassifier(n_classes=20, steps_per_epoch=steps_per_epoch,n_epochs=N_EPOCHS,lr=LR)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weigh

Instantiate the Model Trainer


In [13]:
trainer = pl.Trainer(max_epochs = N_EPOCHS , gpus = 1,progress_bar_refresh_rate = 10)
trainer.fit(model, TaskCD_data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                                | Params
------------------------------------------------------------------
0 | bert      | DistilBertForSequenceClassification | 65.8 M
1 | criterion | BCEWithLogitsLoss                   | 0     
------------------------------------------------------------------
65.8 M    Trainable params
0         Non-trainable params
65.8 M    Total params
263.188   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [14]:
trainer.save_checkpoint(root_folder+"/model/taskCD.ckpt")

## Evaluation

In [15]:
def evaluate_CD():
    pred_outs, true_labels = [], []
    model.to(device)
    for batch in TaskCD_data_module.val_dataloader():
        b_input_ids = batch["input_ids"].to(device)
        b_attn_mask = batch["attention_mask"].to(device)
        b_labels = batch["label"].to(device)
        
        with torch.no_grad():
            pred_out = model(b_input_ids,b_attn_mask)
            pred_out = torch.sigmoid(pred_out)        
            pred_out = pred_out.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

        pred_outs.append(pred_out)
        true_labels.append(label_ids)
    return pred_outs,true_labels

def classify(pred_prob,thresh):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1) 
            else:
                temp.append(0)
        y_pred.append(temp)

    return y_pred


def find_best_threshold(flat_pred_outs,y_true):
    scores=[]
    threshold  = np.arange(0.2,0.51,0.01)

    for thresh in threshold:
        #classes for each threshold
        pred_bin_label = classify(flat_pred_outs,thresh) 
        #convert to 1D array
        y_pred = np.array(pred_bin_label).ravel()

        scores.append(f1_score(y_true,y_pred))

    return threshold[scores.index(max(scores))]

In [16]:
pred_outs,true_labels = evaluate_CD()

flat_pred_outs = np.concatenate(pred_outs, axis=0)
flat_true_labels = np.concatenate(true_labels, axis=0).ravel()
y_true = np.concatenate(true_labels, axis=0)
opt_thresh = find_best_threshold(flat_pred_outs,flat_true_labels)
print(opt_thresh)

0.2


Convert probabilities into 0 or 1 based on a threshold value, then calculate the F1 score. I decided the range between 0.2 and 0.51 to find the threshold that maximize the F1


In [17]:
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel() 

f1_macro = f1_score(flat_true_labels,y_pred, average="macro")

print("F1 =",f1_macro)

F1 = 0.752859207010842


See some prediction example

In [18]:
y_pred = TaskCD_data_module.val_dataset.mlb.inverse_transform(np.array(y_pred_labels))
y_act = TaskCD_data_module.val_dataset.mlb.inverse_transform(y_true)
df = pd.DataFrame({'Sentence':TaskCD_data_module.val_dataset.texts,'Actual Tags':y_act,'Predicted Tags':y_pred})

pd.set_option('display.max_rows', 30)
df.sample(30)

Unnamed: 0,Sentence,Actual Tags,Predicted Tags
229,"I asked for seltzer with lime, no ice.","(food-neutral,)","(food-negative,)"
73,Won't or Can't is not in the service directory.,"(service-positive,)",()
352,"While the ambiance and atmosphere were great, ...","(ambience-positive, food-negative, service-neg...","(ambience-positive, food-positive, service-pos..."
86,"They're rude at times, and not very friendly.","(service-negative,)","(service-negative,)"
470,Both times we waited well over a half hour for...,"(service-negative,)","(service-negative,)"
77,Frites were delicious if a bit on the thick side.,"(food-positive,)","(food-positive,)"
297,"If your visiting, you'll enjoy the ambiance an...","(ambience-positive,)","(ambience-positive,)"
468,"I have never eaten in the restaurant, however,...","(anecdotes/miscellaneous-neutral,)","(anecdotes/miscellaneous-neutral,)"
342,Much more reasonably priced too!,"(price-positive,)","(food-positive, price-negative, price-positive)"
384,I highly recommend the Sophia pizza.,"(anecdotes/miscellaneous-positive,)","(anecdotes/miscellaneous-positive, food-positive)"


In [19]:
#torch.save(model.state_dict(), root_folder+'/model/TASKCD_model_b={}_f1_{:0.4f}.pt'.format(str(model_b), metr["f1"])) # save the model state

### Code for implementation.py

In [21]:
class PreprocessCD():
    def __init__(self, sentences):
        self.texts, self.tags = self.load_data(sentences)
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
        self.max_len = 20
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit_transform([['ambience-conflict', 'ambience-negative', 'ambience-neutral',
       'ambience-positive', 'anecdotes/miscellaneous-conflict',
       'anecdotes/miscellaneous-negative',
       'anecdotes/miscellaneous-neutral',
       'anecdotes/miscellaneous-positive', 'food-conflict',
       'food-negative', 'food-neutral', 'food-positive', 'price-conflict',
       'price-negative', 'price-neutral', 'price-positive',
       'service-conflict', 'service-negative', 'service-neutral',
       'service-positive']])
        self.encodings = self.encode_texts()
        
    def encode_texts(self):
        encodings = []
        for item_idx,text in enumerate(self.texts):
            inputs = self.tokenizer.encode_plus(
                text,
                None,
                add_special_tokens=True, # Add [CLS] [SEP]
                max_length= self.max_len,
                padding = 'max_length',
                return_token_type_ids= False,
                return_attention_mask= True, # Differentiates padded vs normal token
                truncation=True, # Truncate data beyond max length
                return_tensors = 'pt' # PyTorch Tensor format
            )
            
            input_ids = inputs['input_ids'].flatten()
            attn_mask = inputs['attention_mask'].flatten()
            
            encodings.append({
                'input_ids': input_ids ,
                'attention_mask': attn_mask,
            })
        return encodings
        

    def load_data(self,list_of_sentences):
        texts = []
        tags = []

        for obj in list_of_sentences:
            texts.append(obj['text'])            
        return texts, tags  
    

    def __len__(self):
        return len(self.labels)


def classify(pred_prob,thresh):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1) # Infer tag value as 1 (present)
            else:
                temp.append(0) # Infer tag value as 0 (absent)
        y_pred.append(temp)

    return y_pred

def predictCD(samples: List[Dict]) -> List[Dict]:
    targets = []
    prep = PreprocessCD(samples)
    model.eval()
    model.cpu()
    for i,batch in enumerate(prep.encodings):
        with torch.no_grad():
            json_pred = {"targets":[], "categories": []}
            b_input_ids = batch["input_ids"]
            b_attn_mask = batch["attention_mask"]
            
            ids = torch.unsqueeze(torch.tensor(b_input_ids),0).cpu()
            attmasks = torch.unsqueeze(torch.tensor(b_attn_mask),0).cpu()
            # Forward pass, calculate logit predictions
            pred_out = model(ids,attmasks)
            pred_out = torch.sigmoid(pred_out)
            
            pred_out = pred_out.detach().cpu().numpy()

            y_pred_labels = classify(pred_out,0.2)
            
            y_pred = prep.mlb.inverse_transform(np.array(y_pred_labels))
        for pred in y_pred:
            try:
                asd = pred[0].split("-")
                json_pred["categories"].append((asd[0],asd[1]))
            except:
                continue

        targets.append(json_pred)
    
    return targets

            
a = load_data(dataset_folder+"/restaurants_dev.json")
random.shuffle(a)
t = predictCD(a)

for obj,o in zip(a[:30],t):
    print(obj["categories"],"SEP",o["categories"])
# print(t['categories'])



[['food', 'positive']] SEP [('food', 'positive')]
[['food', 'positive']] SEP [('food', 'positive')]
[['service', 'positive']] SEP [('service', 'positive')]
[['food', 'neutral']] SEP [('food', 'negative')]
[['food', 'positive'], ['service', 'positive'], ['ambience', 'positive']] SEP [('ambience', 'positive')]
[['food', 'positive']] SEP [('ambience', 'positive')]
[['ambience', 'negative']] SEP [('anecdotes/miscellaneous', 'neutral')]
[['food', 'positive']] SEP [('food', 'positive')]
[['food', 'positive'], ['service', 'positive']] SEP [('service', 'negative')]
[['food', 'negative']] SEP [('food', 'negative')]
[['price', 'negative'], ['ambience', 'negative']] SEP []
[['anecdotes/miscellaneous', 'positive']] SEP []
[['food', 'positive'], ['service', 'positive']] SEP [('ambience', 'positive')]
[['service', 'positive'], ['anecdotes/miscellaneous', 'positive']] SEP [('food', 'positive')]
[['food', 'positive'], ['anecdotes/miscellaneous', 'positive']] SEP [('anecdotes/miscellaneous', 'neutral')

In [22]:
evaluate_sentiment(a,t,mode="Category Extraction")
evaluate_sentiment(a,t,mode="Category Sentiment")

Category Extraction Evaluation

	ALL	 TP: 357;	FP: 79;	FN: 187
		(m avg): precision: 81.88;	recall: 65.62;	f1: 72.86 (micro)
		(M avg): precision: 82.31;	recall: 53.39;	f1: 60.13 (Macro)

	anecdotes/miscellaneous: 	TP: 140;	FP: 27;	FN: 51;	precision: 83.83;	recall: 73.30;	f1: 78.21;	167
	price: 	TP: 9;	FP: 0;	FN: 44;	precision: 100.00;	recall: 16.98;	f1: 29.03;	9
	food: 	TP: 173;	FP: 28;	FN: 51;	precision: 86.07;	recall: 77.23;	f1: 81.41;	201
	ambience: 	TP: 35;	FP: 24;	FN: 41;	precision: 59.32;	recall: 46.05;	f1: 51.85;	59
Category Sentiment Evaluation

	ALL	 TP: 268;	FP: 203;	FN: 395
		(m avg): precision: 56.90;	recall: 40.42;	f1: 47.27 (micro)
		(M avg): precision: 39.25;	recall: 30.02;	f1: 32.69 (Macro)

	positive: 	TP: 186;	FP: 96;	FN: 190;	precision: 65.96;	recall: 49.47;	f1: 56.53;	282
	negative: 	TP: 41;	FP: 33;	FN: 126;	precision: 55.41;	recall: 24.55;	f1: 34.02;	74
	neutral: 	TP: 41;	FP: 74;	FN: 48;	precision: 35.65;	recall: 46.07;	f1: 40.20;	115
	conflict: 	TP: 0;	FP: 0;	FN: