# **Table-filling approach**

## Import Libraries

Run this section to import and/or download the libraries needed for the proper functioning of the notebook and to download the dataset from source.

In [None]:
# Clone the homework repository; the dataset is later read from nlp2023-hw3/data (see DATASET_DIR).
!git clone https://github.com/SapienzaNLP/nlp2023-hw3

Cloning into 'nlp2023-hw3'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 23 (delta 0), reused 20 (delta 0), pack-reused 0[K
Receiving objects: 100% (23/23), 8.01 MiB | 11.34 MiB/s, done.


In [None]:
# Install the third-party packages that the import cell below relies on.
!pip install sentencepiece # used by huggingface
!pip install transformers # install huggingface transformers (AutoTokenizer / AutoModel)
!pip install -q pytorch-lightning # imported below as `pl`

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m778.1/778.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m840.2/840.2 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
#import libraries here
import os
import copy
import math
import gc
import random
import numpy as np

import torch

import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning import Trainer

import matplotlib.pyplot as plt
from typing import List, Tuple, Any, Dict, Union, Set, Callable

from transformers import AutoTokenizer, AutoModel, AlbertTokenizer, AlbertModel

import json

from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive/')

# Seed every random number generator used in the notebook with the same value.
SEED:int = 1234

random.seed(SEED)  # Python's built-in RNG
np.random.seed(SEED)  # NumPy's global RNG
torch.manual_seed(SEED)  # PyTorch RNG
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN kernels
pl.seed_everything(SEED)  # Lightning helper; its return value (the seed) is what the cell displays


INFO:lightning_fabric.utilities.seed:Seed set to 1234


Mounted at /content/drive/


1234

In [None]:
# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
language_model = "distilbert-base-uncased"
# language_model = 'bert-base-cased'
# language_model = 'roberta-base'
# language_model = 'albert-base-v2'

# Only RoBERTa needs an extra tokenizer argument (add_prefix_space=True, required because the
# notebook always tokenizes pre-split words); every other checkpoint uses the defaults.
# Building the tokenizer unconditionally also guarantees `tokenizer` is defined for any model name.
tokenizer_kwargs = {'add_prefix_space': True} if language_model == 'roberta-base' else {}
tokenizer = AutoTokenizer.from_pretrained(language_model, **tokenizer_kwargs)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
#some global parameters & constants
DATASET_DIR = "nlp2023-hw3/data"  # data folder inside the cloned homework repository
PRINT_BAR = '-' * 10  # ten dashes
file_type_dir = "jsonl"  #or "xml"; appended as the file extension by downloadDataset

#Not used
BIO_TYPES = {'LOCATION', 'ORGANIZATION', 'PERSON'}
# BIO tag -> integer id; 'PAD' maps to -100, the same value used as ignore_index in the losses below
BIO_DICT = {
    'O': 0,
    'B-LOCATION': 1,
    'I-LOCATION': 2,
    'B-ORGANIZATION': 3,
    'I-ORGANIZATION': 4,
    'B-PERSON': 5,
    'I-PERSON': 6,
    'PAD': -100
}

# inverse mapping: integer id -> BIO tag
idx2bio = {v: k for k, v in BIO_DICT.items()}

## Dataset Downloader


In [None]:
def downloadDataset(dataset_prefix: str) -> List[Dict]:
  '''
  Load one dataset split from disk (nothing is actually downloaded here).

  The file `<DATASET_DIR>/<dataset_prefix>.<file_type_dir>` is read as JSON Lines:
  every non-blank line is parsed as one JSON document.

  Args:
    dataset_prefix: file name without extension, e.g. "train", "dev" or "test".

  Returns:
    The parsed samples, in file order.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  '''
  data_path = os.path.join(DATASET_DIR, dataset_prefix) + '.' + file_type_dir

  # Iterate over the file instead of using str.splitlines(): splitlines() also breaks on
  # characters such as U+2028, which may legally appear inside a JSON string and would
  # cut a record in half. Blank lines (e.g. a trailing newline) are skipped.
  with open(data_path, encoding='utf-8') as f:
    return [json.loads(line) for line in f if line.strip()]

In [None]:
# Load the three splits: t_set = train, te_set = test, d_set = dev.
t_set = downloadDataset("train")
te_set = downloadDataset("test")
d_set = downloadDataset("dev")

In [None]:
# print(len(t_set))
# print(len(te_set))
# print(len(d_set))
# print(t_set[0])
# print(t_set[0]['relations'][0]['subject'])
# print(t_set[0]['relations'][0]['relation'])
# print(t_set[0]['relations'][0]['object'])
# print(t_set[0]['relations'][0].keys())
# print(t_set[0].keys())


## From dataset to dictionary

In [None]:
def downloadRelations(DATASET_DIR: str, dataset_prefix: str, file_type_dir: str) -> Tuple[Dict, Dict]:
  '''
  Read the relation vocabulary stored as a JSON object in
  `<DATASET_DIR>/<dataset_prefix>.<file_type_dir>`.

  Returns:
    (rel2idx, idx2rel): the mapping exactly as stored in the file, and its inverse
    (value -> key).
  '''
  mapping_file = os.path.join(DATASET_DIR, dataset_prefix) + '.' + file_type_dir

  with open(mapping_file) as handle:
    rel2idx = json.loads(handle.read())

  # invert the mapping; if two relations shared an id, the last one would win
  idx2rel = {index: name for name, index in rel2idx.items()}

  return rel2idx, idx2rel

In [None]:
# Relation vocabulary: rel2idx maps relation name -> id, idx2rel is the inverse (read from relations2id.json).
rel2idx, idx2rel = downloadRelations(DATASET_DIR, 'relations2id', 'json')

In [None]:
#DEV
# Split every dev sample into its token list and its list of gold relations.
tokens_s = [sample["tokens"] for sample in d_set]
relations_s = [sample["relations"] for sample in d_set]

In [None]:
#TEST
# Split every test sample into its token list and its list of gold relations.
test_tokens_s = [sample["tokens"] for sample in te_set]
test_relations_s = [sample["relations"] for sample in te_set]

In [None]:
#TRAIN
# Split every train sample into its token list and its list of gold relations.
train_tokens_s = [sample["tokens"] for sample in t_set]
train_relations_s = [sample["relations"] for sample in t_set]

In [None]:
# Sanity check: show the first dev sentence and its gold relations.
print(tokens_s[0])
print(relations_s[0])

['In', 'Queens', ',', 'North', 'Shore', 'Towers', ',', 'near', 'the', 'Nassau', 'border', ',', 'supplanted', 'a', 'golf', 'course', ',', 'and', 'housing', 'replaced', 'a', 'gravel', 'quarry', 'in', 'Douglaston', '.']
[{'subject': {'start_idx': 24, 'end_idx': 25, 'entity_type': 'LOCATION', 'text': 'Douglaston'}, 'relation': '/location/neighborhood/neighborhood_of', 'object': {'start_idx': 1, 'end_idx': 2, 'entity_type': 'LOCATION', 'text': 'Queens'}}, {'subject': {'start_idx': 1, 'end_idx': 2, 'entity_type': 'LOCATION', 'text': 'Queens'}, 'relation': '/location/location/contains', 'object': {'start_idx': 24, 'end_idx': 25, 'entity_type': 'LOCATION', 'text': 'Douglaston'}}]


## Aux Functions

In [None]:
def test(model, dataloader):
  '''
  Evaluate `model` on `dataloader` and return the mean of the per-batch losses.

  The model is switched to eval mode and run under `torch.no_grad()`. No optimizer is
  involved, so the function no longer touches the global `optimizer`.

  Args:
    model: a module whose forward pass accepts the tokenizer outputs plus `labels`,
      `compute_loss` and `compute_predictions`, and returns a dict containing "loss".
    dataloader: yields batches shaped like the output of `collate_fn`
      (keys 'sentences' and 'table_re').

  Returns:
    The average loss over all batches (NaN if the dataloader yields no batch).
  '''
  all_test_losses = []
  model.eval()

  for i, batch in tqdm(enumerate(dataloader), total = len(dataloader), position = 0, leave = True):
    sentences = batch['sentences']
    re_labels = batch['table_re']

    # re-tokenize the raw words; the label tables were built with the same padding in collate_fn
    tokens = tokenizer(sentences,
                      return_tensors="pt",
                      padding=True,
                      is_split_into_words=True)

    tokens['labels'] = re_labels

    model_inputs = {k: v.to(hypers.device) for k, v in tokens.items()}
    model_inputs['compute_loss'] = True
    model_inputs['compute_predictions'] = True

    with torch.no_grad():
      outputs = model(**model_inputs)
    loss = outputs["loss"]
    all_test_losses.append(loss.item())
    if i % 100 == 0:
      print(f'iteration: {i}, loss: {loss.item()}')

  return np.array(all_test_losses).mean()



In [None]:
def split_answer_in_samples(indices: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], batch_size: int = 8) -> Dict:
  '''
  Group the (sample, subject_token, object_token) index triples by sample.

  Args:
    indices: three 1-D tensors of equal length, as returned by `torch.where` on a
      (batch, seq, seq) table: sample index, row (subject token) and column (object token).
    batch_size: number of per-sample slots to allocate. Defaults to 8, the batch size
      used by the dataloaders in this notebook.

  Returns:
    A dict with the keys 'batch_idxs_sample', 'subjects_sample', 'objects_sample'
    (in this order); each value is a list with `batch_size` inner lists, one per sample.
    Samples without any index get an empty list.

    batch_idxs_sample will look like:
    [[0, 0, 0, 0],
    [1, 1],
    [2, 2],
    [3, 3, 3, 3, 3, 3],
    [4, 4, 4, 4],
    [5, 5, 5, 5],
    [6, 6, 6, 6, 6, 6],
    [7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]]

    While subjects_sample looks like :
    [[5, 6, 7, 8],
    [10, 11],
    [6, 6],
    [78, 87, 87, 87, 87, 87],
    [12, 20, 37, 37],
    [14, 14, 14, 14],
    [2, 2, 2, 23, 24, 25],
    [27, 27, 28, 28, 29, 29, 52, 52, 52, 53, 53, 53]]

  Raises:
    IndexError: if a sample index is not smaller than `batch_size`.
  '''
  sample_ids = indices[0].tolist()
  subject_ids = indices[1].tolist()
  object_ids = indices[2].tolist()

  batch_idxs_sample = [[] for _ in range(batch_size)]
  subjects_sample = [[] for _ in range(batch_size)]
  objects_sample = [[] for _ in range(batch_size)]

  for sample, subject, obj in zip(sample_ids, subject_ids, object_ids):
    if sample >= batch_size:
      # fail loudly instead of growing the lists: a variable output length would
      # silently misalign gold and predicted relations downstream
      raise IndexError(
          f'sample index {sample} does not fit in batch_size={batch_size}; '
          'pass the real batch size to split_answer_in_samples'
      )
    batch_idxs_sample[sample].append(sample)
    subjects_sample[sample].append(subject)
    objects_sample[sample].append(obj)

  rets = {
      'batch_idxs_sample': batch_idxs_sample,
      'subjects_sample': subjects_sample,
      'objects_sample': objects_sample,
  }

  return rets

def getTuples(l_subjects_batches: List[List[int]], l_objects_batches: List[List[int]]) -> Tuple[List[List[List[int]]], List[List[List[int]]]]:
  '''
  Cut the flat subject/object token indices of every sample into runs, one run per
  candidate relation.

  Walking through a sample position by position, a new run starts whenever either the
  subject index or the object index decreases or jumps forward by more than one with
  respect to the previous position. Subject and object runs are always cut at the same
  positions, so the k-th subject run pairs with the k-th object run. A sample without
  indices yields a single empty run.

  Example for subject in output:
  [[[5, 6, 7, 8]],
  [[10, 11]],
  [[6, 6]],
  [[78], [87, 87, 87, 87], [87]],
  [[12], [20], [37], [37]],
  [[14, 14, 14, 14]],
  [[2, 2, 2], [23, 24, 25]],
  [[27, 27], [28, 28], [29, 29], [52, 52, 52], [53, 53, 53]]]

  Example for object in output:
  [[[12, 12, 12, 12]],
  [[13, 13]],
  [[15, 16]],
  [[87], [6, 7, 8, 9], [78]],
  [[37], [37], [12], [20]],
  [[8, 9, 10, 11]],
  [[23, 24, 25], [2, 2, 2]],
  [[52, 53], [52, 53], [52, 53], [27, 28, 29], [27, 28, 29]]]

  In the first sample there is a single relation whose subject tokens range over [5, 8].
  '''

  def starts_new_run(previous: int, current: int) -> bool:
    # a backwards step or a gap larger than one token ends the current run
    step = current - previous
    return step > 1 or step < 0

  all_tups_subj = []
  all_tups_obj = []

  for sample_num, subjs in enumerate(l_subjects_batches):
    objs = l_objects_batches[sample_num]

    subj_runs = []
    obj_runs = []
    current_subj = []
    current_obj = []

    for pos in range(len(subjs)):
      boundary = pos > 0 and (
          starts_new_run(subjs[pos - 1], subjs[pos])
          or starts_new_run(objs[pos - 1], objs[pos])
      )
      if boundary:
        # close the run collected so far and open a fresh one
        subj_runs.append(current_subj)
        obj_runs.append(current_obj)
        current_subj = []
        current_obj = []

      current_subj.append(subjs[pos])
      current_obj.append(objs[pos])

    # the last (possibly empty) run is always emitted
    subj_runs.append(current_subj)
    obj_runs.append(current_obj)

    all_tups_subj.append(subj_runs)
    all_tups_obj.append(obj_runs)

  return all_tups_subj, all_tups_obj

def getOriginalRelations(all_tups_subjs: List[List[List[int]]], all_tups_objs: List[List[List[int]]], answers: torch.Tensor, words_ids: List[List[int]]) -> List[List[Dict]]:
  '''
  Turn token-level subject/object runs back into word-level relation dicts.

  Args:
    all_tups_subjs: per sample, the subject token runs (as produced by `getTuples`).
    all_tups_objs: per sample, the matching object token runs.
    answers: (batch, seq, seq) table of relation ids; the id of a relation is read at
      [sample, first subject token, first object token] and mapped through `idx2rel`.
    words_ids: per sample, the word index of every token (None for tokens that do not
      belong to a word).

  Returns:
    One list of relation dicts per sentence in `words_ids`. Each dict has the keys
    'subject' and 'object' (with 'start_idx' and an exclusive 'end_idx', in words) and
    'relation' (the relation name). Duplicates inside a sentence are dropped.

  Only the samples that have an entry in `words_ids` are decoded. The run lists can be
  longer than the real batch (they are allocated for a fixed batch size), and indexing
  `words_ids` for those padding slots used to raise IndexError on a smaller last batch.
  '''

  all_relations = []
  #Loop over each sample in the batch
  for i, (tup_subjs, tups_objs, word_ids) in enumerate(zip(all_tups_subjs, all_tups_objs, words_ids)):
    relations = []

    #Loop over a single relation, where subj and obj are the token runs, i.e. subj = [5,6,7,8]
    for subj, obj in zip(tup_subjs, tups_objs):
      #skip if they are empty
      if not subj or not obj:
        continue

      #skip if a token without word id (e.g. padding) has been predicted
      boundary_words = (word_ids[subj[0]], word_ids[subj[-1]], word_ids[obj[0]], word_ids[obj[-1]])
      if any(word is None for word in boundary_words):
        continue

      #word span with exclusive end, i.e. first word of the run .. last word of the run + 1
      subj_start, subj_end = boundary_words[0], boundary_words[1] + 1
      obj_start, obj_end = boundary_words[2], boundary_words[3] + 1

      rel = {
          'subject': {
              'start_idx': subj_start,
              'end_idx': subj_end,
          },

          'relation': idx2rel[answers[i][subj[0]][obj[0]].item()],

          'object': {
              'start_idx': obj_start,
              'end_idx': obj_end,
          }
      }

      #Avoid duplicate
      if rel not in relations:
        relations.append(rel)

    all_relations.append(relations)

  return all_relations

In [None]:
def getLabelsIndices(labels: torch.Tensor) -> torch.Tensor:
  '''
  Locate every cell of `labels` that holds an actual relation, i.e. whose value is
  neither 0 nor -100 (the value used as ignore_index by the losses).

  The result is what `torch.where` returns for a boolean mask: one index tensor per
  dimension of `labels`.
  '''
  is_relation = (labels != 0) & (labels != -100)
  return torch.where(is_relation)

def computeRelations(indices: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], result: torch.Tensor, words_ids: List[List[int]]) -> List[List[Dict]]:
  '''
  Decode a relation table into word-level relations.

  Pipeline: group the non-empty cells by sample (`split_answer_in_samples`), cut them
  into subject/object runs (`getTuples`) and map every run pair back to word spans and
  a relation name (`getOriginalRelations`, which reads the relation id from `result`).
  '''
  per_sample = split_answer_in_samples(indices)
  subject_runs, object_runs = getTuples(per_sample['subjects_sample'], per_sample['objects_sample'])
  return getOriginalRelations(subject_runs, object_runs, result, words_ids)

def score(targets: List[List[Dict]], predictions: List[List[Dict]]) -> Tuple[float, float, float]:
    '''
    Micro-averaged F1, precision and recall over relation triples
    (taken from the Homework Repository).

    A relation is identified by ((subject start, subject end), relation name,
    (object start, object end)). Sentences are paired positionally; a prediction counts
    as a true positive when the identical triple appears among the gold triples of the
    same sentence.

    Returns:
        (f1_score, precision, recall); each value is 0.0 when its denominator is zero.
    '''

    def as_triples(rel_dicts: List[Dict]) -> List[Tuple[Tuple[int, int], str, Tuple[int, int]]]:
        triples = []
        for rel_dict in rel_dicts:
            subject_dict = rel_dict["subject"]
            object_dict = rel_dict["object"]
            subject_span = (subject_dict["start_idx"], subject_dict["end_idx"])
            object_span = (object_dict["start_idx"], object_dict["end_idx"])
            triples.append((subject_span, rel_dict["relation"], object_span))
        return triples

    true_positives = 0
    num_golds = 0
    num_preds = 0

    for gold_rels, pred_rels in zip(targets, predictions):
        gold_triples = as_triples(gold_rels)
        pred_triples = as_triples(pred_rels)
        # matches are counted on sets, totals on the raw lists
        true_positives += len(set(gold_triples) & set(pred_triples))
        num_golds += len(gold_triples)
        num_preds += len(pred_triples)

    if num_preds > 0:
        precision = true_positives / num_preds
    else:
        precision = 0.0

    if num_golds > 0:
        recall = true_positives / num_golds
    else:
        recall = 0.0

    if precision + recall > 0:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return f1_score, precision, recall

## Collate_fn

In [None]:
def getWordIds(tokens_batch: List[List[str]]) -> Tuple[Any, List[List[int]]]:
  '''
  Tokenize a batch of pre-split sentences with the global `tokenizer` and compute, for
  every sentence, the mapping from each produced token back to the index of the word it
  came from.

  Args:
    tokens_batch: one list of words per sentence.

  Returns:
    (tokenized, words_ids): the tokenizer output (PyTorch tensors, padded to the longest
    sentence of the batch) and, per sentence, the word index of every token
    (None for tokens that do not belong to a word).
  '''
  tokenized = tokenizer(tokens_batch, return_tensors="pt",
                                    padding='longest',
                                    is_split_into_words=True)
  words_ids = [
    tokenized[sample_idx].word_ids
    for sample_idx in range(len(tokenized['input_ids']))
  ]

  return tokenized, words_ids

In [None]:
def find_indices(arr: List[int], target: int) -> List[int]:
    '''
    Collect, in increasing order, every position of `arr` whose value equals `target`.
    Used to find all tokens that were produced from one original word.
    '''
    positions = []
    for position, value in enumerate(arr):
        if value == target:
            positions.append(position)
    return positions

In [None]:
def genTrueLabels(tokenized: Dict, relations_batch: List[List[Dict]], words_ids: List[List[int]]) -> torch.Tensor:
  '''
  Build the ground-truth relation table of every sentence in the batch.

  For each sentence a (seq_len x seq_len) table is created, where seq_len is the padded
  token length. Cells start at -100 (the value ignored by the loss); every cell whose row
  and column token both have a word id is set to 0, which the rest of the notebook treats
  as "no relation". For each gold relation, cells [subject token, object token] are then
  set to rel2idx[relation]:
    - when subject and object span a different number of tokens, every
      (subject token, object token) pair is filled;
    - when they span the same number of tokens, only the position-wise pairs are filled
      (k-th subject token with k-th object token).

  Args:
    tokenized: tokenizer output; only 'input_ids' is read, to get the padded length.
    relations_batch: per sentence, the gold relations with word-level spans
      ('start_idx' inclusive, 'end_idx' exclusive).
    words_ids: per sentence, the word index of every token (None where there is none).

  Returns:
    The stacked tables, one per sentence.
  '''

  # not used below
  all_ner = []
  table_re = []

  #loop over the batch
  for i, (text, relations) in enumerate(zip(tokenized['input_ids'], relations_batch)):

    #for each sample i create a matrix with shape (max_len X max_len) and init all the values equals -100
    table_re_item = torch.ones( (text.shape[0], text.shape[0]) ) * -100

    # positions of the tokens that map to a word (word id is not None)
    indexes = np.where(np.array(words_ids[i]) != None)[0]
    rows, cols = np.ix_(indexes, indexes)

    #set to 'no_relation' every cell whose row and column are both word tokens
    table_re_item[rows, cols] = 0

    for relation in relations:
      subj = relation['subject']
      obj = relation['object']
      rel = relation['relation']

      # word indices covered by the subject / object (end_idx is exclusive)
      subj_tokens_indices = list( range(subj['start_idx'], subj['end_idx']) )
      obj_tokens_indices = list( range(obj['start_idx'], obj['end_idx']) )

      word_ids = words_ids[i]

      #given the span in the original sentence, finds the new span in the tokenized sentence
      subj_bert_indices = []
      for tok in subj_tokens_indices:
        subj_bert_indices += find_indices(word_ids, tok)

      obj_bert_indices = []
      for tok in obj_tokens_indices:
        obj_bert_indices += find_indices(word_ids, tok)

      subj_bert_indices = torch.tensor(subj_bert_indices)
      obj_bert_indices = torch.tensor(obj_bert_indices)

      #set the relation id between the subject and object tokens in the tokenized matrix
      if len(subj_bert_indices) > len(obj_bert_indices):
        # (n_subj, 1) against (n_obj,) broadcasts to every subject/object pair
        table_re_item[subj_bert_indices[:, None], obj_bert_indices] = rel2idx[rel]
      elif len(subj_bert_indices) < len(obj_bert_indices):
        # (n_subj,) against (n_obj, 1) also broadcasts to every subject/object pair
        table_re_item[subj_bert_indices, obj_bert_indices[:, None]] = rel2idx[rel]
      else:
        # equal lengths: the two index vectors are paired element by element
        table_re_item[subj_bert_indices, obj_bert_indices] = rel2idx[rel]

    table_re.append(table_re_item)

  table_re = torch.stack(table_re)

  return table_re


In [None]:
# tokens_batch, relations_batch = tokens_s[0:2], relations_s[0:2]
# tokenized = tokenizer(tokens_batch, return_tensors="pt",
#                                     padding='longest',
#                                     is_split_into_words=True)

# words_ids = [
#   tokenized[i].word_ids
#   for i in range(len(tokenized['input_ids']))
# ]

# table_re = genTrueLabels(tokenized, relations_batch, words_ids)

#print for the re table
#SAMPLE 0
# print(table_re[0,27,2])
# print(table_re[0,28,2])
# print(table_re[0,2,27])
# print(table_re[0,2,28])
# print()

# #SAMPLE 1
# print(table_re[1,122,23])
# print(table_re[1,123,23])
# print()

# #SAMPLE 2
# print(table_re[2,10,7])
# print(table_re[2,10,8])

## Dataset

In [None]:
import torch
from torch.utils.data import Dataset

import copy



class DatasetRe(Dataset):
  '''
  Map-style dataset pairing every tokenized sentence with its list of relations.
  '''

  def __init__(self,
               sentences,
               relations) -> None:
    '''
    Args:
      sentences: one list of words per sample.
      relations: one list of relation dicts per sample, aligned with `sentences`.
    '''
    self.sentences = sentences
    self.relations = relations

  def __len__(self) -> int:
    ''' returns the number of samples '''
    return len(self.sentences)

  def __getitem__(self, idx:int) -> Tuple[List[str], List[Dict]]:
    ''' returns the (sentence, relations) pair stored at position idx '''
    return self.sentences[idx], self.relations[idx]

## Hyper Parameters

In [None]:
#hyperparameters
class hypers:
    # Plain namespace of run settings, read as class attributes (e.g. hypers.device).
    # In the visible part of the notebook only `learning_rate` and `device` are referenced.

    # checkpoint saving: flag and destination path
    save_model = False
    save_model_path = '/content/drive/MyDrive/AI/NLP_HW3/re_table_final_doccia.pth'

    # checkpoint loading: flag and source path
    load_model = True

    load_model_path = '/content/drive/MyDrive/AI/NLP_HW3/re_table_final_doccia.pth' #'/content/drive/MyDrive/AI/final_combo_re_hw3_1.pth'

    embedding_dim = 768
    input_size = 768
    learning_rate = 1e-3  # passed to the Adam optimizer
    epochs = 5
    batch_size = 8
    print_step = 100
    device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available


## Models

### Element-wise Product and Projection

In [None]:
import torch
import torch.nn.functional as F

class REWise(torch.nn.Module):
    """
    Table-filling relation classifier.

    Every (token i, token j) pair is represented by the element-wise product of the two
    token embeddings, gated by a projection of the last-layer attention weights between
    i and j, and then classified into one of `num_labels` relation ids.

    The layer sizes are read from the encoder configuration (hidden size and number of
    attention heads) instead of being hard-coded to 768 / 12, so encoders of other sizes
    work as well; for 768-wide, 12-head encoders the parameter shapes are unchanged.
    """

    def __init__(self, language_model_name: str, num_labels: int, fine_tune_lm: bool = True, *args, **kwargs) -> None:
        """
        Args:
            language_model_name: checkpoint name passed to `AutoModel.from_pretrained`.
            num_labels: number of relation classes (size of the output layer).
            fine_tune_lm: when False, the encoder weights are frozen.
        """
        super().__init__()
        self.num_labels = num_labels
        # layers
        self.transformer_model = AutoModel.from_pretrained(language_model_name, output_hidden_states=True, output_attentions=True)
        if not fine_tune_lm:
            for param in self.transformer_model.parameters():
                param.requires_grad = False

        hidden_size = self.transformer_model.config.hidden_size
        num_heads = self.transformer_model.config.num_attention_heads

        # NOTE: layers are created in the original order so that seeded initialisation
        # and state_dict keys stay the same.
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(
            100, num_labels, bias=False
        )

        # projects the per-head attention weights of a token pair to the embedding size
        self.fcAttention1 = nn.Linear(num_heads, hidden_size)
        # not used in forward(); kept so existing checkpoints still load
        self.fcAttentionClassifier = nn.Linear(100, num_labels, bias=False)

        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 100)
        self.relu = nn.ReLU()

    def forward(
        self,
        input_ids: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        token_type_ids: torch.Tensor = None,
        labels: torch.Tensor = None,
        compute_predictions: bool = False,
        compute_loss: bool = True,
        mask: torch.Tensor = None,
        *args,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            input_ids, attention_mask, token_type_ids: tokenizer outputs
                (`token_type_ids` is forwarded only when given).
            labels: (batch, seq, seq) table of relation ids, -100 for ignored cells.
            compute_predictions: also return the argmax over the classes.
            compute_loss: return the loss when `labels` is given.
            mask: accepted for interface compatibility; not used.

        Returns:
            A dict with "logits" (batch, seq, seq, num_labels) and, when requested,
            "predictions" (batch, seq, seq) and "loss" (scalar).
        """
        # group model inputs and pass to the model
        model_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }
        # not every model supports token_type_ids
        if token_type_ids is not None:
            model_kwargs["token_type_ids"] = token_type_ids
        transformers_outputs = self.transformer_model(**model_kwargs)

        # token embeddings: sum of the last four hidden layers -> (batch, seq, hidden)
        token_states = torch.stack(transformers_outputs.hidden_states[-4:], dim=0).sum(dim=0)
        token_states = self.dropout(token_states)

        # last-layer attention, heads moved last: (batch, heads, seq, seq) -> (batch, seq, seq, heads)
        attention_matrix = transformers_outputs.attentions[-1].permute(0, 2, 3, 1)
        attention_matrix = self.dropout(self.relu(self.fcAttention1(attention_matrix)))

        # element-wise product of every token pair -> (batch, seq, seq, hidden)
        pair_states = torch.einsum('bik,bjk->bijk', token_states, token_states)

        # gate the pair representation with the projected attention
        pair_states = attention_matrix * pair_states

        # two linear layers to decrease the size, then the classifier
        pair_states = self.dropout(self.relu(self.fc1(pair_states)))
        pair_states = self.dropout(self.relu(self.fc2(pair_states)))
        logits = self.classifier(pair_states)

        output = {"logits": logits}

        if compute_predictions:
            predictions = logits.argmax(dim=-1)
            output["predictions"] = predictions

        if compute_loss and labels is not None:
            output["loss"] = self.compute_loss(logits, labels)

        return output

    def compute_loss(
        self, logits: torch.Tensor, labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute the loss of the model.
        Args:
            logits (`torch.Tensor`):
                The logits of the model.
            labels (`torch.Tensor`):
                The labels of the model; cells equal to -100 are ignored.
        Returns:
            obj:`torch.Tensor`: The cross-entropy loss over all non-ignored cells.
        """
        return F.cross_entropy(
            logits.view(-1, self.num_labels),
            labels.view(-1).to(torch.long),
            ignore_index=-100,
        )

### Baseline model

In [None]:
import torch
import torch.nn.functional as F

class FranzModel(torch.nn.Module):
    """
    Baseline table-filling relation classifier.

    Every (token i, token j) pair is represented by the element-wise product of the two
    token embeddings, passed through one dense layer and classified into one of
    `num_labels` relation ids.

    The layer sizes follow the encoder's hidden size instead of a hard-coded 768; for
    768-wide encoders the parameter shapes are unchanged.
    """

    def __init__(self, language_model_name: str, num_labels: int, fine_tune_lm: bool = True, *args, **kwargs) -> None:
        """
        Args:
            language_model_name: checkpoint name passed to `AutoModel.from_pretrained`.
            num_labels: number of relation classes (size of the output layer).
            fine_tune_lm: when False, the encoder weights are frozen.
        """
        super().__init__()
        self.num_labels = num_labels
        # layers
        self.transformer_model = AutoModel.from_pretrained(language_model_name, output_hidden_states=True, output_attentions=True)
        if not fine_tune_lm:
            for param in self.transformer_model.parameters():
                param.requires_grad = False

        hidden_size = self.transformer_model.config.hidden_size

        self.dropout = torch.nn.Dropout(0.2)
        self.dense = torch.nn.Linear(hidden_size, hidden_size, bias=True)
        self.classifier = torch.nn.Linear(hidden_size, num_labels, bias=True)

    def forward(
        self,
        input_ids: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        token_type_ids: torch.Tensor = None,
        labels: torch.Tensor = None,
        compute_predictions: bool = False,
        compute_loss: bool = True,
        mask: torch.Tensor = None,
        *args,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            input_ids, attention_mask, token_type_ids: tokenizer outputs
                (`token_type_ids` is forwarded only when given).
            labels: (batch, seq, seq) table of relation ids, -100 for ignored cells.
            compute_predictions: also return the argmax over the classes.
            compute_loss: return the loss when `labels` is given.
            mask: accepted for interface compatibility; not used.

        Returns:
            A dict with "logits" (batch, seq, seq, num_labels) and, when requested,
            "predictions" (batch, seq, seq) and "loss" (scalar).
        """
        # group model inputs and pass to the model
        model_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask
        }
        # not every model supports token_type_ids
        if token_type_ids is not None:
            model_kwargs["token_type_ids"] = token_type_ids
        transformers_outputs = self.transformer_model(**model_kwargs)

        # token embeddings: sum of the last four hidden layers -> (batch, seq, hidden)
        token_states = torch.stack(transformers_outputs.hidden_states[-4:], dim=0).sum(dim=0)
        token_states = self.dropout(token_states)

        # element-wise product of every token pair -> (batch, seq, seq, hidden)
        pair_states = torch.einsum('bik,bjk->bijk', token_states, token_states)

        pair_states = self.dropout(self.dense(pair_states))
        logits = self.classifier(pair_states)

        output = {"logits": logits}

        if compute_predictions:
            predictions = logits.argmax(dim=-1)
            output["predictions"] = predictions

        if compute_loss and labels is not None:
            output["loss"] = self.compute_loss(logits, labels)

        return output

    def compute_loss(
        self, logits: torch.Tensor, labels: torch.Tensor
    ) -> torch.Tensor:
        """
        Compute the loss of the model.
        Args:
            logits (`torch.Tensor`):
                The logits of the model.
            labels (`torch.Tensor`):
                The labels of the model; cells equal to -100 are ignored.
        Returns:
            obj:`torch.Tensor`: The cross-entropy loss over all non-ignored cells.
        """
        return F.cross_entropy(
            logits.view(-1, self.num_labels),
            labels.view(-1).to(torch.long),
            ignore_index=-100,
        )

## Dataloader




In [None]:
# One DatasetRe per split, each pairing the token lists with the gold relations extracted above.
datasetReDev = DatasetRe(tokens_s, relations_s)
datasetReTest = DatasetRe(test_tokens_s, test_relations_s)
datasetReTrain = DatasetRe(train_tokens_s, train_relations_s)


In [None]:
def collate_fn(data):
  '''
  Collate a list of (sentence, relations) samples into one batch.

  The sentences are tokenized through the shared `getWordIds` helper (same tokenizer
  call that used to be duplicated here) and the gold relation tables are built with
  `genTrueLabels`.

  Args:
    data: list of (words, relations) pairs as returned by `DatasetRe.__getitem__`.

  Returns:
    A dict with
      'sentences': the original word lists (they are re-tokenized by the train/eval loops),
      'table_re': the stacked label tables produced by `genTrueLabels`.
  '''
  sentences = [sample[0] for sample in data]
  relations = [sample[1] for sample in data]

  tokenized, words_ids = getWordIds(sentences)
  table_re = genTrueLabels(tokenized, relations, words_ids)

  rets = {
      'sentences': sentences,
      'table_re': table_re
  }

  return rets

In [None]:
# All loaders take their batch size from `hypers.batch_size` (8) instead of repeating the
# literal. NOTE: split_answer_in_samples is sized for batches of 8 samples, so keep the two
# in sync if the batch size is changed.
# NOTE(review): the training loader is not shuffled — confirm that this is intended.
dataloaderDev = DataLoader(
    datasetReDev,
    batch_size=hypers.batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

dataloaderTest = DataLoader(
    datasetReTest,
    batch_size=hypers.batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

dataloaderTrain = DataLoader(
    datasetReTrain,
    batch_size=hypers.batch_size,
    shuffle=False,
    collate_fn=collate_fn
)

## RUN

### Model Init

In [None]:
# model = REWise("bert-base-uncased", len(rel2idx), fine_tune_lm=False).to(hypers.device)
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Build the classifier on top of the selected encoder; the encoder is frozen (fine_tune_lm=False)
# and there is one output class per entry of rel2idx.
model = REWise(language_model, len(rel2idx), fine_tune_lm=False).to(hypers.device)
# model = FranzModel(language_model, len(rel2idx), fine_tune_lm=False).to(hypers.device)
# tokenizer = AutoTokenizer.from_pretrained(language_model, add_prefix_space=True)

# Adam over model.parameters(), with the learning rate taken from hypers
optimizer = torch.optim.Adam(model.parameters(), lr=hypers.learning_rate)


### Load pretrained weights

In [None]:
# Load a saved state_dict from Google Drive, mapped onto the current device.
# NOTE(review): the path is hard-coded here; hypers.load_model / hypers.load_model_path are not consulted.
pretrained_weights = torch.load('/content/drive/MyDrive/AI/NLP_HW3/re_table_noncela_9.pth', map_location=torch.device(hypers.device))

# Load the weights into the model
model.load_state_dict(pretrained_weights)

### Main Train

In [None]:
# Training loop: one optimizer step per batch, one checkpoint per epoch.
model.train()
best_loss = float('+inf')  # only needed by the commented-out validation block below
train_losses = []  # loss of every batch, accumulated across epochs
# the epoch numbers (5 and 6) are only used in the checkpoint file name
# (presumably continuing the numbering of an earlier run — TODO confirm)
for epoch in range(5,7):
  bar_tqdm = tqdm(enumerate(dataloaderTrain), total = len(dataloaderTrain), position = 0, leave = True)

  for i, batch in bar_tqdm:
    optimizer.zero_grad()

    sentences = batch['sentences']
    re_labels = batch['table_re']

    # the raw words are tokenized again here; the label tables come from collate_fn
    tokens = tokenizer(sentences,
                      return_tensors="pt",
                      padding=True,
                      is_split_into_words=True)

    tokens['labels'] = re_labels

    # move every tensor (inputs and labels) to the training device
    batch = {k: v.to(hypers.device) for k, v in tokens.items()}

    outputs = model(**batch)
    loss = outputs["loss"]
    loss.backward()
    optimizer.step()

    if i % 100 == 0:
      print(f'iteration: {i}, loss: {loss.item()}')
    train_losses.append(loss.item())

  # validation on the dev split is currently disabled
  # test_loss = test(model, dataloaderDev)
  # print(f'train_loss: {np.array(train_losses).mean()}, test_loss: {test_loss}, best_loss: {best_loss}')
  # if test_loss < best_loss:
  #   print("best loss found")
  #   best_loss = test_loss
  # save the weights at the end of every epoch
  save_path = f'/content/drive/MyDrive/AI/NLP_HW3/re_table_franz_wise_combo_lr_low_{epoch}.pth'
  torch.save(model.state_dict(), save_path)



  0%|          | 2/7025 [00:00<14:08,  8.28it/s]

iteration: 0, loss: 0.012628231197595596


  1%|▏         | 102/7025 [00:09<09:44, 11.84it/s]

iteration: 100, loss: 0.00919689703732729


  3%|▎         | 202/7025 [00:17<08:45, 12.99it/s]

iteration: 200, loss: 0.00552060641348362


  4%|▍         | 301/7025 [00:26<09:36, 11.67it/s]

iteration: 300, loss: 0.009421775117516518


  6%|▌         | 402/7025 [00:35<09:30, 11.62it/s]

iteration: 400, loss: 0.00416558887809515


  7%|▋         | 502/7025 [00:43<08:28, 12.83it/s]

iteration: 500, loss: 0.006601123604923487


  8%|▊         | 566/7025 [00:49<09:21, 11.50it/s]


KeyboardInterrupt: 

In [None]:
# Evaluate on the test split: collect the per-batch loss and decode both the gold label
# tables and the predicted tables back into word-level relations for scoring.
# No optimizer call is needed here: the model runs in eval mode under torch.no_grad().
all_test_losses = []
model.eval()

all_relations_pred_labels = []  # relations decoded from the gold label tables
all_relations_pred = []  # relations decoded from the model predictions

for i, batch in tqdm(enumerate(dataloaderTest), total = len(dataloaderTest), position = 0, leave = True):
  sentences = batch['sentences']
  re_labels = batch['table_re']

  tokens = tokenizer(sentences,
                    return_tensors="pt",
                    padding=True,
                    is_split_into_words=True)

  tokens['labels'] = re_labels

  # word index of every token, per sentence (needed to map tokens back to words)
  words_ids = [
    tokens[sample_idx].word_ids
    for sample_idx in range(len(tokens['input_ids']))
  ]

  batch = {k: v.to(hypers.device) for k, v in tokens.items()}
  batch['compute_loss'] = True
  batch['compute_predictions'] = True

  with torch.no_grad():
    outputs = model(**batch)
  loss = outputs["loss"]
  pred = outputs["predictions"]

  #Extract Original Relations
  labels_indices = getLabelsIndices(batch['labels'])  # gold cells: neither 0 nor -100
  prediction_indices = torch.where(pred != 0)  # predicted cells: any class other than 0

  all_relations_pred_labels += computeRelations(labels_indices, batch['labels'], words_ids)
  all_relations_pred += computeRelations(prediction_indices, pred, words_ids)

  all_test_losses.append(loss.item())
  if i % 100 == 0:
    print(f'iteration: {i}, loss: {loss.item()}')



  2%|▏         | 4/250 [00:00<00:16, 14.50it/s]

iteration: 0, loss: 0.005053708329796791


 41%|████      | 102/250 [00:04<00:07, 19.19it/s]

iteration: 100, loss: 0.0037056799046695232


 82%|████████▏ | 205/250 [00:09<00:02, 19.24it/s]

iteration: 200, loss: 0.011923396028578281


100%|██████████| 250/250 [00:12<00:00, 20.64it/s]


In [None]:
# Compare the relations decoded from the gold tables with the predicted ones.
# NOTE(review): in the displayed triple the first value equals the harmonic mean of the
# other two, so it looks like F1 next to precision/recall — confirm the order in `score`.
score(all_relations_pred_labels, all_relations_pred)


(0.4136742315426601, 0.38095238095238093, 0.4525455688246386)

### Test

In [None]:
# Restore a saved state dict from Drive; map_location places the tensors on hypers.device.
# NOTE(review): this file name differs from the pattern used by the training cell's
# torch.save call — confirm this is the intended checkpoint.
pretrained_weights = torch.load('/content/drive/MyDrive/AI/NLP_HW3/re_table_franz_dev_1.pth', map_location=torch.device(hypers.device))
model.load_state_dict(pretrained_weights)

<All keys matched successfully>

In [None]:
# Evaluate the restored checkpoint on the test loader: collect the per-batch loss
# and the relations decoded from both the gold table and the predicted table.
all_test_losses = []
model.eval()

all_relations_pred_labels = []
all_relations_pred = []

for i, batch in tqdm(enumerate(dataloaderTest), total = len(dataloaderTest), position = 0, leave = True):
  # No optimizer call here: this loop only runs inference under torch.no_grad(),
  # so the cell works right after loading weights, without running the training cell.

  sentences = batch['sentences']
  # ner_labels = batch['ner_labels']
  re_labels = batch['table_re']

  tokens = tokenizer(sentences,
                    return_tensors="pt",
                    padding=True,
                    is_split_into_words=True)

  tokens['labels'] = re_labels

  # Word index of every sub-token, one list per sentence of the batch
  words_ids = [
    tokens[sample_idx].word_ids
    for sample_idx in range(len(tokens['input_ids']))
  ]

  # Move every tensor to the target device and ask the model for loss + predictions
  batch = {k: v.to(hypers.device) for k, v in tokens.items()}
  batch['compute_loss'] = True
  batch['compute_predictions'] = True

  with torch.no_grad():
    outputs = model(**batch)
  loss = outputs["loss"]
  pred = outputs["predictions"]

  #Extract Original Relations
  labels_indices = getLabelsIndices(batch['labels'])
  # Every non-zero cell of the predicted table is treated as a predicted relation
  prediction_indices = torch.where(pred != 0)

  all_relations_pred_labels += computeRelations(labels_indices, batch['labels'], words_ids)
  all_relations_pred += computeRelations(prediction_indices, pred, words_ids)

  all_test_losses.append(loss.item())
  if i % 100 == 0:
    print(f'iteration: {i}, loss: {loss.item()}')



  1%|          | 3/250 [00:00<00:25,  9.75it/s]

iteration: 0, loss: 0.004184530582278967


 41%|████      | 102/250 [00:06<00:10, 13.94it/s]

iteration: 100, loss: 0.00247001089155674


 81%|████████  | 203/250 [00:12<00:03, 14.24it/s]

iteration: 200, loss: 0.010397104546427727


100%|██████████| 250/250 [00:16<00:00, 15.37it/s]


In [None]:
# Compare the relations decoded from the gold tables with the ones predicted by the restored checkpoint.
# NOTE(review): in the displayed triple the first value equals the harmonic mean of the
# other two, so it looks like F1 next to precision/recall — confirm the order in `score`.
score(all_relations_pred_labels, all_relations_pred)

(0.4257320588681535, 0.41155764153710767, 0.4409176618478944)

# **Pipeline Approach**

## Import libraries

In [None]:
!git clone https://github.com/SapienzaNLP/nlp2023-hw3

Cloning into 'nlp2023-hw3'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 23 (delta 0), reused 20 (delta 0), pack-reused 0[K
Receiving objects: 100% (23/23), 8.01 MiB | 10.04 MiB/s, done.


In [None]:
!pip install seqeval
!pip install -q transformers

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=48db206e80673d54d14dce495791058065faa11eebc42a2b3fef4c65b7ad5e8a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import gdown
from tqdm.auto import tqdm
import json
import shutil
import torch.nn as nn
import numpy as np
import os
import torch
import pandas as pd
from matplotlib.pyplot import savefig
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader, Dataset, IterableDataset
import matplotlib.pyplot as plt
import pickle
import torch.nn.functional as F
import torch.optim as optim
import gdown
import seaborn as sns
import pprint
import itertools
from collections import Counter
from sklearn.metrics import classification_report, accuracy_score, f1_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score as seq_f1
from transformers import AutoTokenizer, BertTokenizerFast, BertForTokenClassification, AutoModel, BertForSequenceClassification
from seqeval.scheme import IOB2, IOB1
from google.colab import drive
drive.mount('/content/drive/')

# Seed NumPy and PyTorch and request deterministic cuDNN kernels.
# NOTE(review): Python's built-in `random` module is not seeded in this cell.
SEED:int = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## From sentence to vocab

In [None]:
#Convert original words into vocabulary indices
def fromWordToVocab(sentence, vocab): # TODO: rename to sentence2vocab
  '''Return the list obtained by looking up every item of `sentence` in `vocab`.'''
  encoded = []
  for item in sentence:
    encoded.append(vocab[item])
  return encoded

def fromVocabToWord(sentence, idx2vocab):
  '''Return the list obtained by looking up every index of `sentence` in `idx2vocab`.'''
  decoded = []
  for index in sentence:
    decoded.append(idx2vocab[index])
  return decoded

## Hyperparameters

In [None]:
#hyperparameters
class hypers:
    '''
    Namespace of notebook-wide configuration constants (used without instantiation).

    NOTE(review): the NER cells of this section hard-code their own learning rate
    (0.0001), number of epochs (2) and batch size (32); confirm which of the fields
    below, apart from `device`, are still in use.
    '''

    # Model-size settings — presumably left over from a recurrent model; TODO confirm usage
    input_size = 5
    hidden_size = 512
    num_layers = 2
    num_classes = 5

    dropout_rate = 0.2

    learning_rate = 1e-3

    epochs = 50
    batch_size = 64
    print_step = 100
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Phase 1: NER

### Tokenizer and dictionaries

In [None]:
# Fast tokenizer: the NER collate function relies on its word_ids() to align labels with sub-tokens
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

#Dictionary for labels
# Tag name -> class index, and the inverse mapping used to decode predictions
label2idx = {"O":0, "B-ENT":1, "I-ENT":2}
idx2label = {0: "O", 1: "B-ENT", 2: "I-ENT"}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Downloader

In [None]:
class DatasetDownloader():
  '''
  Reads a JSON-lines dataset from disk and builds the token-level labels for the NER task.

  Every sample is a dict with a "tokens" list and a "relations" list; each relation
  has a "subject" and an "object" carrying "start_idx" / "end_idx" (end is exclusive).
  '''

  def __init__(self, path, label2idx):
    self.data = self.downloadDataset(path)
    self.label2idx = label2idx

  def generateLabels(self):
    '''Add an encoded "labels" list to every sample of `self.data` (in place).'''
    for sample in tqdm(self.data, total = len(self.data)):
      sample['labels'] = self.getTrueLabels(sample["tokens"], sample["relations"], self.label2idx)
    return

  def getTrueLabels(self, tokens, rel, label2idx):
    '''
    Build the tag sequence of one sentence and encode it through `label2idx`.

    The first token of every subject/object span is tagged "B-ENT", the remaining
    tokens of the span "I-ENT", everything else "O". Spans whose start index does
    not fall inside the sentence are ignored.
    '''
    labels = ["O"] * len(tokens)

    #Loop over all relations
    for relation in rel:
      # Process the entity that starts first (subject on ties); sorted() is stable
      spans = sorted((relation["subject"], relation["object"]), key=lambda ent: ent["start_idx"])
      for entity in spans:
        start, end = entity["start_idx"], entity["end_idx"]
        if not 0 <= start < len(tokens):
          continue
        labels[start] = "B-ENT"
        # Start from start + 1 so the "B-ENT" tag of the first token is preserved
        for k in range(start + 1, end):
          labels[k] = "I-ENT"

    return [label2idx[p] for p in labels] #From sentence to vocab

  def setVocab(self, label2idx):
    '''Replace the tag -> index mapping used by generateLabels.'''
    self.label2idx = label2idx

  def downloadDataset(self, path):
    '''Parse the JSON-lines file at `path` into a list of dicts (blank lines are skipped).'''
    data = []
    # Context manager so the file handle is released even if a line fails to parse
    with open(path, encoding="utf-8") as dataset_file:
      for line in dataset_file:
        if line.strip():
          data.append(json.loads(line))
    return data

### Dataset

In [None]:
class NerDataset(Dataset):
  '''
  Map-style dataset serving (tokens, encoded labels) pairs for the NER task.
  '''

  def __init__(self, data, label2idx):
    self.data = data
    self.label2idx = label2idx

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx]
    return sample["tokens"], sample["labels"]

  def get_original_item(self, idx):
    '''Return the tokens together with the gold relations of sample `idx`.'''
    sample = self.data[idx]
    return sample["tokens"], sample["relations"]



In [None]:
def collate_fn(batch):
  '''
  Collate (tokens, encoded labels) samples into a padded NER batch.

  Returns the tokenizer output for the batch and a LongTensor of labels aligned
  with the sub-tokens: the first sub-token of each word carries the word label,
  every other position (special tokens, padding, later sub-tokens) is -100.

  Uses the notebook-level `tokenizer`.
  '''
  tokens, labels_enc = zip(*batch)

  tokenized = tokenizer(tokens, padding=True, return_tensors="pt", is_split_into_words=True)

  labels = []
  #Loop over all the batch
  for i, label in enumerate(labels_enc):
    previous_word_idx = None
    label_ids = []

    for word_idx in tokenized.word_ids(batch_index=i):
      if word_idx is None or word_idx == previous_word_idx:
        # PAD, [CLS], [SEP] and non-initial sub-tokens are ignored
        label_ids.append(-100)
      else:
        # First sub-token of a word takes the word label
        label_ids.append(label[word_idx])
      previous_word_idx = word_idx
    labels.append(label_ids)

  # padding=True already makes every row the same length, so the label rows
  # form a rectangular tensor and no further padding is required.
  final_labels = torch.tensor(labels, dtype=torch.long)

  return tokenized, final_labels

### Model

In [None]:
class NerModel(nn.Module):
  '''
  Wrapper around BertForTokenClassification ('bert-base-uncased') used to tag entities.
  '''

  def __init__(self, label_size, len_tokenizer, device, load=False):
    super().__init__()
    self.device = device
    self.label_size = label_size
    # it contains a dropout with p=0.1
    self.transformer = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=label_size)
    self.transformer.resize_token_embeddings(len_tokenizer)

  def forward(self, x, labels=None, compute_loss=True, compute_predictions=False):
    '''
    With compute_loss the raw transformer output (computed with `labels`) is returned;
    otherwise, with compute_predictions, a (flattened logits, argmax predictions) pair.
    '''
    ids = x["input_ids"].to(self.device, dtype=torch.long)
    mask = x["attention_mask"].to(self.device, dtype=torch.long)

    # Loss branch: the token-classification head computes the loss from the labels
    if compute_loss:
      return self.transformer(ids, attention_mask=mask, labels=labels.long())

    # Prediction branch: one row of logits per token position
    if compute_predictions:
      flat_logits = self.transformer(ids, attention_mask=mask)[0].view(-1, self.label_size)
      return flat_logits, flat_logits.argmax(-1)

### Handler class

In [None]:
class HandlerNer():
  '''
  This class implements the methods to train and validate the NER model.
  It also instantiates the optimizer and prints the metrics during the training.
  '''

  def __init__(self, model, label2idx, tokenizer, device, num_ckpt=0):
    self.model = model
    self.device = device
    self.label2idx = label2idx
    self.tokenizer = tokenizer

  def setDataset(self, train_dataset, valid_dataset, epochs):
    '''Register the train/validation loaders and the number of epochs to run.'''
    self.train_dataset = train_dataset
    self.valid_dataset = valid_dataset
    self.epochs = epochs

  def train(self, config=None):
    '''Create the Adam optimizer and run the training loop.'''
    self.optimizer = optim.Adam(self.model.parameters(), 0.0001)
    self.train_epoch()
    return

  def train_epoch(self):
    '''Run `self.epochs` epochs, validating and printing the metrics after each one.'''
    print("Start Training")

    # Mixed precision is only meaningful on a CUDA device
    use_amp = str(self.device).startswith("cuda")
    # NOTE(review): float16 autocast is used without a gradient scaler — confirm
    # that gradient underflow is not an issue for this model.

    for epoch in range(self.epochs):

      losses = []
      accuracies = []

      self.model.train()

      #Train over the batches
      for batch in tqdm(self.train_dataset, total=len(self.train_dataset), desc="Batch"):
        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
          inputs, label = batch
          inputs = inputs.to(self.device)
          label = label.to(self.device)

          out = self.model(inputs, label)
        loss, logits = out[0], out[1]

        # Get the predictions
        label = label.view(-1)
        preds = logits.view(-1, self.model.label_size).argmax(axis=1)

        # Create a mask to remove the positions labelled -100
        mask = label != -100

        # compute accuracy
        accuracies.append(accuracy_score(label[mask].cpu().numpy(), preds[mask].cpu().numpy()))

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        losses.append(loss.item())

      train_loss_mean = np.array(losses).mean()
      train_accuracy_mean = np.array(accuracies).mean()
      valid_loss_mean, eval_accuracy_mean, micro_f1 = self.validate(self.valid_dataset)

      print(f'train_loss: {train_loss_mean:0.4f}  val_loss: {valid_loss_mean:0.4f} \n train_acc: {train_accuracy_mean:0.4f} val_acc: {eval_accuracy_mean:0.4f} \n F1: {micro_f1:0.4f}')

    print("End Training")
    return

  def validate(self, valid_dataset):
    '''
    Evaluate the model on `valid_dataset`.

    Returns (mean loss, mean per-batch accuracy, micro F1 computed by seqeval).
    '''
    self.model.eval()

    # Decode with the handler's own vocabulary instead of a notebook global
    idx2label = {index: tag for tag, index in self.label2idx.items()}

    total_preds, total_labels = [], []
    accuracies = []
    losses = []

    print("Start Validation")

    with torch.no_grad():
      for batch in tqdm(valid_dataset, total = len(valid_dataset)):
        inputs, label = batch
        inputs = inputs.to(self.device)
        label = label.to(self.device)

        out = self.model(inputs, label)
        loss, logits = out[0], out[1]

        losses.append(loss.item())

        # get the predictions
        label = label.view(-1)
        preds = torch.argmax(logits.view(-1, self.model.label_size), axis=1)

        # keep only the positions that carry a real label
        mask = label != -100
        labels = label[mask].cpu()
        predictions = preds[mask].cpu()

        total_labels.extend(labels.tolist())
        total_preds.extend(predictions.tolist())

        # compute accuracy
        accuracies.append(accuracy_score(labels.numpy(), predictions.numpy()))

    #compute f1_score converting from vocabulary indices to Original NER tokens (i.e. "B-ENT")
    # seqeval expects (y_true, y_pred)
    micro_f1 = seq_f1([fromVocabToWord(total_labels, idx2label)], [fromVocabToWord(total_preds, idx2label)], mode="strict", average="micro", scheme=IOB2)

    print("End Validation")
    return np.array(losses).mean(), np.array(accuracies).mean(), micro_f1

  def predict(self, x):
    '''Return (logits, predictions) for the tokenized batch `x` without tracking gradients.'''
    self.model.eval()
    with torch.no_grad():
      logits, predictions = self.model(x, compute_loss = False, compute_predictions=True)
      return logits, predictions


### Instantiate Dataset

In [None]:
# Parse the three JSON-lines splits shipped with the cloned repository
train_downloader = DatasetDownloader("nlp2023-hw3/data/train.jsonl", label2idx)
valid_downloader = DatasetDownloader("nlp2023-hw3/data/dev.jsonl", label2idx)
test_downloader = DatasetDownloader("nlp2023-hw3/data/test.jsonl", label2idx)

# Add the encoded NER label sequence to every sample (in place)
train_downloader.generateLabels()
valid_downloader.generateLabels()
test_downloader.generateLabels()

# Wrap the labelled samples so that they can be served by a DataLoader
train_dataset = NerDataset(train_downloader.data, label2idx)
valid_dataset = NerDataset(valid_downloader.data, label2idx)
test_dataset = NerDataset(test_downloader.data, label2idx)

  0%|          | 0/56196 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# Batch the splits with the NER collate function; only the training split is shuffled.
# NOTE(review): batch_size is hard-coded to 32 while hypers.batch_size is 64 — confirm which one is intended.
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn, shuffle=False)

### Train

In [None]:
#Instantiate the model
# One output class per NER tag; the embedding matrix is resized to the tokenizer length
model = NerModel(label_size=len(label2idx), len_tokenizer=len(tokenizer), device=hypers.device).to(hypers.device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#instantiate the handler
# The handler wraps the model and provides the train / validate / predict entry points
handlerNer = HandlerNer(model=model, label2idx=label2idx, tokenizer=tokenizer, device=hypers.device)

In [None]:
# Register the train/validation loaders; the last argument is the number of training epochs
handlerNer.setDataset(train_loader, valid_loader, 2)

In [None]:
# Create the optimizer and run the training loop (validation runs after every epoch)
handlerNer.train()

### Test

In [None]:
# Plain tokenizer (no extra tokens) matching the vocabulary the NER model is sized for
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Rebuild the NER model and restore its weights from Drive.
# NOTE(review): the HandlerNer training loop does not save any checkpoint, so
# this file has to be produced outside the visible cells — confirm its origin.
model = NerModel(label_size=len(label2idx), len_tokenizer=len(tokenizer), device=hypers.device).to(hypers.device)
model_weights = torch.load('/content/drive/MyDrive/AI/NLP_HW3/pipeline/final/model_0.pth', map_location=torch.device(hypers.device))
model.load_state_dict(model_weights)

handlerNer = HandlerNer(model=model, label2idx=label2idx, tokenizer=tokenizer, device=hypers.device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Collect predicted and gold tag indices over the whole test split,
# keeping only the positions that carry a real label (i.e. not -100).
final_pred = []
final_labels = []
for (x, label) in tqdm(test_loader, total = len(test_loader)):

        x = x.to(hypers.device)

        label = label.to(hypers.device)
        logits, preds = handlerNer.predict(x)

        # Flatten both tensors so that one mask indexes them consistently
        preds = preds.view(-1)
        labels = label.view(-1)

        # -100 marks special tokens, padding and non-initial sub-tokens
        indices = labels != -100

        predictions = preds[indices]
        labels = labels[indices]

        final_pred.extend(predictions.cpu().tolist())
        final_labels.extend(labels.tolist())


  0%|          | 0/63 [00:00<?, ?it/s]

In [None]:
# seqeval's classification_report expects (y_true, y_pred): gold labels go first,
# otherwise precision and recall are swapped and support is counted on the predictions.
print(classification_report([fromVocabToWord(final_labels, idx2label)],  [fromVocabToWord(final_pred, idx2label)], mode="strict", scheme=IOB2))

## Phase 2: RE

### CONST

In [None]:
# Relation name -> id mapping shipped with the dataset; the context manager
# closes the file even when the JSON cannot be parsed.
with open("nlp2023-hw3/data/relations2id.json") as f_relations2id:
  rel_vocab = json.load(f_relations2id)

# Inverse mapping, used to decode the predicted relation ids
idx2rel = {v: k for k, v in rel_vocab.items()}

# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

#Token annotations to specify to the RE model who are the entities identified by the NER Model
tokenizer.add_tokens(['[SUB]', '[/SUB]', '[OBJ]', '[/OBJ]'])

4

### RE Build

In [None]:
#FUNCTIONS USED TO BUILD THE NEW DATASET FOR RE TASK

def padLabels(input_ids, labels):
  '''
  Align word-level `labels` with the sub-tokens of the tokenized sentence `input_ids`.

  The first sub-token of each word receives the word label; special tokens,
  padding and the remaining sub-tokens receive -100. Returns a tensor.
  '''
  aligned = []
  last_word = None

  for word in input_ids.word_ids():
    starts_new_word = word is not None and word != last_word
    aligned.append(labels[word] if starts_new_word else -100)
    last_word = word

  return torch.tensor(aligned)

def getPredictionsAndLabelsPad(input, model, label):
    '''
    Run the NER model on one pre-split sentence and keep one prediction per word.

    Returns (predictions as a numpy array, gold labels as a tensor), both restricted
    to the first sub-token of every word. Uses the notebook-level `tokenizer`.
    The model's train/eval mode is left untouched: callers that need deterministic
    predictions should put it in eval mode first.
    '''
    tokens = tokenizer(input, return_tensors="pt", is_split_into_words=True)

    # Special tokens and non-initial sub-tokens are marked with -100
    labels_padded = padLabels(tokens, label)

    # Pure inference: do not build the autograd graph
    with torch.no_grad():
        logits, preds = model(tokens, compute_loss=False, compute_predictions=True)

    # Filter out the positions marked with -100
    indices = labels_padded != -100

    # The mask lives on the CPU, the predictions on the model device
    predictions = preds[indices.to(preds.device)]
    labels = labels_padded[indices]

    return predictions.cpu().numpy(), labels

def getPermutatedEntities(entity_idx, sentence):
  '''
  Turn (subject span, object span) pairs into the corresponding token slices.

  Every span is indexed as [start, end) and each pair yields one
  {"subject": tokens, "object": tokens} dict, in input order.
  '''
  return [
      {
          "subject": sentence[subj_span[0]: subj_span[1]],
          "object": sentence[obj_span[0]: obj_span[1]],
      }
      for subj_span, obj_span in entity_idx
  ]

def getEntitiesIndices(labels):
    '''
    Return the [start, end) spans of the entities found in a tag sequence.

    A span opens on "B-ENT" and closes at the next tag that is not "I-ENT"
    (a new "B-ENT" closes the open span and opens another one).
    '''
    spans = []
    open_at = None  # start index of the entity being read, if any

    for position, tag in enumerate(labels):
        if tag == "B-ENT":
            # A new entity starts: flush the one that was still open
            if open_at is not None:
                spans.append([open_at, position])
            open_at = position
            continue

        if tag == "I-ENT" or open_at is None:
            continue

        # Any other tag ends the open entity
        spans.append([open_at, position])
        open_at = None

    # An entity reaching the last tag is closed at the sequence length
    if open_at is not None:
        spans.append([open_at, len(labels)])

    return spans

def addAnnotations(sentence, entities, entities_ids, relations):
  '''
  Wrap the subject/object spans of `sentence` in marker tokens and look up their gold relation.

  Args:
    sentence: list of tokens.
    entities: {"subject": tokens, "object": tokens} for the candidate pair.
    entities_ids: (subject span, object span), each indexed as [start, end).
    relations: gold relations of the sentence; matched on the entity text.

  Returns:
    (annotated token list, list of matching relation names or ["no_relation"]).

  The marker layout must stay identical to the one used at prediction time.
  '''
  # Subject and object text for the candidate pair
  subj_text = ' '.join(entities["subject"])
  obj_text = ' '.join(entities["object"])

  start_subject, end_subject = entities_ids[0][0], entities_ids[0][1]
  start_object, end_object = entities_ids[1][0], entities_ids[1][1]

  sentences_annotated = []
  for i, tok in enumerate(sentence):
    # Closing markers go first so adjacent entities give "[/OBJ] [SUB]" and not "[SUB] [/OBJ]"
    if i == end_subject:
      sentences_annotated.append("[/SUB]")
    if i == end_object:
      sentences_annotated.append("[/OBJ]")
    if i == start_subject:
      sentences_annotated.append("[SUB]")
    if i == start_object:
      sentences_annotated.append("[OBJ]")
    sentences_annotated.append(tok)

  # Spans are end-exclusive: an entity reaching the last token ends at len(sentence),
  # an index the loop above never visits.
  if end_subject == len(sentence):
    sentences_annotated.append("[/SUB]")
  if end_object == len(sentence):
    sentences_annotated.append("[/OBJ]")

  # Gold relations whose subject and object text match the candidate pair
  new_relation = [
      rel["relation"]
      for rel in relations
      if rel["subject"]["text"] == subj_text and rel["object"]["text"] == obj_text
  ]

  # If no relation has been found between the entities, return no_relation
  if not new_relation:
    new_relation.append("no_relation")

  return sentences_annotated, new_relation

def getAllPermutations(entities):
  '''Return every ordered pair of entities taken at two different positions.'''
  return [
      (first, second)
      for i, first in enumerate(entities)
      for j, second in enumerate(entities)
      if i != j
  ]

def generateReDataset(dataset, model):
    '''
    Build the RE dataset from the NER dataset.

    For every sentence the NER model predicts the entities; each ordered pair of
    predicted entities produces one item holding the marker-annotated sentence
    ("Sentence") and the gold relation(s) of the pair or no_relation ("Relation").
    '''
    rel_dataset = []

    for idx, elem in tqdm(enumerate(dataset), total = len(dataset)):
      sentence_tokens, sentence_labels = elem[0], elem[1]
      gold_relations = dataset.get_original_item(idx)[1]

      # Entities predicted by the NER model, one tag per word
      predicted_tags, _ = getPredictionsAndLabelsPad(sentence_tokens, model, sentence_labels)

      # Spans of the predicted entities and every ordered pair of them
      spans = getEntitiesIndices(fromVocabToWord(predicted_tags, idx2label))
      span_pairs = getAllPermutations(spans)
      token_pairs = getPermutatedEntities(span_pairs, sentence_tokens)

      # One annotated sentence per candidate pair
      for span_pair, token_pair in zip(span_pairs, token_pairs):
        annotated, relation = addAnnotations(sentence_tokens, token_pair, span_pair, gold_relations)
        rel_dataset.append({"Sentence": annotated, "Relation": relation})

    return rel_dataset

In [None]:
#CREATE A NEW DATASET FOR THE RE TASK WITH THE ANNOTATIONS TOKENS [SUB], [OBJ]

# model_ner = NerModel(label_size=len(label2idx), len_tokenizer=len(tokenizer), device=hypers.device).to(hypers.device)

# model_weights = torch.load('/content/drive/MyDrive/AI/NLP_HW3/pipeline/final/model_0.pth', map_location=torch.device(hypers.device))
# model_ner.load_state_dict(model_weights)

# rel_train_dataset = generateReDataset(train_dataset, model_ner)
# rel_val_dataset = generateReDataset(valid_dataset, model_ner)

# f_train = open("train_rel.jsonl", 'w')
# f_val = open("val_rel.jsonl", 'w')

# for item in rel_train_dataset:
#   f_train.write(json.dumps(item) + "\n")

# for item in rel_val_dataset:
#   f_val.write(json.dumps(item) + "\n")

# f_train.close()
# f_val.close()

### Aux Functions for prediction

In [None]:
#predicts the entities for each sentences
def getPredictions(input, model, tokenizer):
    '''
    Predict one NER tag index per word of a single pre-split sentence.

    Args:
      input: list of words.
      model: NER model called as model(tokenized, compute_loss=False, compute_predictions=True).
      tokenizer: fast tokenizer able to return the offset mapping.

    Returns:
      numpy array with the prediction of the first sub-token of every word.
    '''
    tokenized = tokenizer(input, return_tensors="pt", padding=True, return_offsets_mapping=True, is_split_into_words=True) #tokenize the input

    # Mixed precision only when a CUDA device is actually available
    with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=torch.cuda.is_available()):
      with torch.no_grad():
        logits, preds = model(tokenized, compute_loss=False, compute_predictions=True)

    # Offsets are relative to each word: the first sub-token starts at 0 and is
    # not empty, while special tokens have the (0, 0) offset.
    offset = tokenized["offset_mapping"].squeeze(0)
    prediction_mask = (offset[:, 0] == 0) & (offset[:, 1] != 0)

    #get the filtered predictions
    filtered_predictions = preds[prediction_mask.to(preds.device)]

    return filtered_predictions.cpu().numpy()


#perform predictions of the relation for each relation canditates tuple
def addAnnotationsPred(sentence, entities, model, tokenizer_re):
  '''
  Annotate one candidate (subject, object) pair and predict its relation.

  Args:
    sentence: list of tokens.
    entities: (subject span, object span), each indexed as [start, end).
    model: RE model called as model(tokenized, compute_predictions=True).
    tokenizer_re: tokenizer that knows the marker tokens.

  Returns:
    numpy array with the predicted relation id(s).

  The marker layout must stay identical to the one used to build the training set.
  '''
  start_subj, end_subj = entities[0][0], entities[0][1]
  start_obj, end_obj = entities[1][0], entities[1][1]

  sentences_annotated = []
  for idx, token in enumerate(sentence):
    # Closing markers go first so adjacent entities give "[/OBJ] [SUB]" and not "[SUB] [/OBJ]"
    if idx == end_subj:
      sentences_annotated.append("[/SUB]")
    if idx == end_obj:
      sentences_annotated.append("[/OBJ]")
    if idx == start_subj:
      sentences_annotated.append("[SUB]")
    if idx == start_obj:
      sentences_annotated.append("[OBJ]")

    sentences_annotated.append(token)

  # Spans are end-exclusive: an entity reaching the last token ends at len(sentence),
  # an index the loop above never visits.
  if end_subj == len(sentence):
    sentences_annotated.append("[/SUB]")
  if end_obj == len(sentence):
    sentences_annotated.append("[/OBJ]")

  tokenized = tokenizer_re(sentences_annotated, return_tensors="pt", is_split_into_words=True) #tokenize the input

  # Pure inference: no autograd graph, mixed precision only on CUDA
  with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=torch.cuda.is_available()):
    with torch.no_grad():
      logits, preds = model(tokenized, compute_predictions=True)

  mask = preds != -100 #remove padding

  predictions = preds[mask]
  return predictions.cpu().numpy()

def completePredict(sentence):
  '''
  Run the whole pipeline (NER, then RE on every ordered entity pair) on one sentence.

  Returns a list of {"subject": span, "relation": name, "object": span} dicts where
  a span is {"start_idx", "end_idx"} with an exclusive end. Relies on the
  notebook-level model_ner / tokenizer_ner / model_re / tokenizer_re objects.
  '''
  predictions = []

  #Phase 1: NER
  label_pred = getPredictions(sentence, model_ner, tokenizer_ner)
  sentence_converted = fromVocabToWord(label_pred, idx2label)

  #get the ids of subject and object predicted
  label_ids = getEntitiesIndices(sentence_converted)

  #Extract all the possible combinations of entities predicted in phase 1
  relation_permutations = getAllPermutations(label_ids)

  for subj_span, obj_span in relation_permutations:
    #phase 2, compute the RE task
    annotated_relations = addAnnotationsPred(sentence, (subj_span, obj_span), model_re, tokenizer_re)

    #filter out sentences without relations
    # presumably id 0 is the "no relation" class — TODO confirm against relations2id.json
    annotated_relations = annotated_relations[annotated_relations != 0]

    if len(annotated_relations) == 0:
      continue

    relation_converted = fromVocabToWord(annotated_relations, idx2rel)

    # The spans reported are those of the pair that was just classified
    predictions.append({
        "subject": {"start_idx": subj_span[0], "end_idx": subj_span[1]},
        "relation": relation_converted[0],
        "object": {"start_idx": obj_span[0], "end_idx": obj_span[1]},
    })

  return predictions

In [None]:
# NOTE(review): scratch cell — range(5, 4) is empty, so the loop body never runs and
# the final print shows whatever value `i` already holds in the kernel (it raises
# NameError when `i` has never been defined).
for i in range(5, 4):
  print(i)
print(i)

4


In [None]:
def get_tupled_labels(rel_dicts):
  '''
  Convert relation dicts into hashable ((subj_start, subj_end), relation, (obj_start, obj_end)) tuples.
  '''
  def as_span(entity):
    return (entity["start_idx"], entity["end_idx"])

  return [
      (as_span(rel_dict["subject"]), rel_dict["relation"], as_span(rel_dict["object"]))
      for rel_dict in rel_dicts
  ]

def score(total_labels, total_preds):
  '''
  Micro-averaged F1 between gold and predicted relations.

  Args:
    total_labels: one collection of hashable gold relations per sentence.
    total_preds: one collection of hashable predicted relations per sentence.

  Returns:
    The F1 score as a float; 0.0 when there is nothing to score.
  '''
  true_positives = 0
  num_golds = 0
  num_preds = 0
  for sent_targets, sent_predictions in zip(total_labels, total_preds):
    #sent_targets, sent_predictions = get_tupled_labels(sent_targets), get_tupled_labels(sent_predictions)
    true_positives += len(set(sent_targets).intersection(set(sent_predictions)))
    num_golds += len(sent_targets)
    num_preds += len(sent_predictions)

  # Metrics are derived once from the totals, so empty input is well defined
  precision = true_positives / num_preds if num_preds > 0 else 0.0
  recall = true_positives / num_golds if num_golds > 0 else 0.0

  if precision + recall > 0:
    return (2 * precision * recall) / (precision + recall)
  return 0.0

### Collate_fn

In [None]:
def collate_fn_re(batch):
  '''
  Collate (annotated tokens, encoded relation) samples into an RE batch.

  Returns the tokenizer output for the batch and a LongTensor with the labels.
  Uses the notebook-level `tokenizer`.
  '''
  sentences, relation_ids = zip(*batch)

  encoded = tokenizer(sentences,
                      padding=True,
                      return_tensors="pt",
                      is_split_into_words=True)

  #pad the batch with -100
  label_rows = torch.LongTensor(np.array(relation_ids))
  labels_padded = pad_sequence(label_rows, batch_first=True, padding_value=-100)

  return encoded, torch.LongTensor(np.array(labels_padded))

### RE Dataset

In [None]:
class RelationDataset(Dataset):
  '''
  Dataset class for Relation Extraction task.

  Reads the previously generated JSON-lines file, where every line holds an
  annotated "Sentence" (token list) and its "Relation" (list of relation names).
  '''

  def __init__(self, path, relation_vocab):
    self.rel_dataset = []

    #retrieve the previously generated dataset
    # Context manager so the file handle is released; blank lines are skipped
    with open(path, encoding="utf-8") as dataset_file:
      for line in dataset_file:
        if line.strip():
          self.rel_dataset.append(json.loads(line))

    self.relation_vocab = relation_vocab

  def __getitem__(self, idx):
    '''Return (annotated tokens, relation ids encoded through the relation vocabulary).'''
    data = self.rel_dataset[idx]

    sentence = data["Sentence"]
    relation = data["Relation"]

    return sentence, fromWordToVocab(relation, self.relation_vocab) #encoded relation

  def __len__(self):
    return len(self.rel_dataset)


### RE Model

In [None]:
class ReModel(nn.Module):
  '''
  Sentence-level relation classifier: a 'bert-base-uncased' encoder followed by
  a hidden linear layer and a linear classifier over `num_classes` relations.
  '''
  def __init__(self, hidden_size, num_classes, len_tokenizer, device):#, checkpoint_path, load=False):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_classes = num_classes
    self.transformer = AutoModel.from_pretrained('bert-base-uncased')
    # Resize the embeddings to the tokenizer length passed by the caller
    self.transformer.resize_token_embeddings(len_tokenizer)

    # Encoder hidden size -> hidden_size -> num_classes
    self.fc1 = nn.Linear(self.transformer.config.hidden_size, self.hidden_size)
    self.classifier = nn.Linear(self.hidden_size, self.num_classes)
    self.dropout = nn.Dropout(p=0.2)

    self.device = device

  def forward(self, batch, compute_predictions=False):
      '''
      Classify every sentence of `batch` (needs "input_ids" and "attention_mask").

      Returns the classifier output, or the (logits, argmax predictions) pair
      when compute_predictions is True.
      '''
      # Only these two entries of the batch are moved to the device and used
      model_kwargs = {
          "input_ids": batch["input_ids"].to(self.device, dtype=torch.long),
          "attention_mask": batch["attention_mask"].to(self.device, dtype=torch.long)
        }

      x = self.transformer(**model_kwargs)[0] #last hidden state
      # Dropout is applied to the whole sequence before the first position is selected
      x = self.dropout(x)

      #Take only the [CLS] token to classify the sentence in order to identify the relation that occurs if any
      x = x[:, 0, :]

      #Linear layer and classifier
      x = self.dropout(F.relu(self.fc1(x)))
      out = self.classifier(x)

      if compute_predictions:

        # One row of logits per sentence; the prediction is the highest-scoring class
        logits = out.view(-1, self.num_classes)
        predictions = torch.argmax(logits, -1)
        return logits, predictions

      return out

### Handler

In [None]:
class HandlerRE():
  '''
  This class implements the methods to train and validate the RE model.
  It also instantiates the optimizer and prints the metrics during the training.
  '''

  def __init__(self, model, rel_vocab, tokenizer, device, num_ckpt=0):
    '''
    Args:
      model: model to train; it must expose num_classes and be callable on a batch.
      rel_vocab: relation vocabulary, stored on the handler.
      tokenizer: tokenizer, stored on the handler.
      device: device the inputs and labels are moved to.
      num_ckpt: kept for interface compatibility; not used by this class.
    '''
    self.model = model
    self.device = device
    # Targets equal to -100 do not contribute to the loss
    self.loss_function = nn.CrossEntropyLoss(ignore_index=-100)
    self.rel_vocab = rel_vocab
    self.tokenizer = tokenizer

  def setDataset(self, train_dataset, valid_dataset, epochs):
    '''Store the iterables of training/validation batches and the number of epochs.'''
    self.train_dataset = train_dataset
    self.valid_dataset = valid_dataset
    self.epochs = epochs

  def train(self, config=None):
    '''
    Create the Adam optimizer (learning rate 0.0001) and run the training loop.
    setDataset must have been called before. config is accepted but not used.
    '''
    # Referenced through the torch package so that no separate `optim` import is needed
    self.optimizer = torch.optim.Adam(self.model.parameters(), 0.0001)
    self.train_epoch()
    return

  def _loss_and_predictions(self, y_pred, label):
    '''Flatten output and targets; return the loss plus the labels/predictions whose label is not -100.'''
    label = label.view(-1)
    logits = y_pred.view(-1, self.model.num_classes)
    preds = torch.argmax(logits, dim=1)

    loss = self.loss_function(logits, label)

    # Drop the positions that the loss ignores as well
    mask = label != -100
    return loss, label[mask], preds[mask]

  def train_epoch(self):
    '''Train for self.epochs epochs, validating and printing the metrics after each epoch.'''
    print("Start training")

    # Mixed precision only makes sense on CUDA; on any other device autocast stays disabled.
    # NOTE(review): float16 autocast is used without a GradScaler - confirm that
    # gradients do not underflow for this model.
    use_amp = torch.device(self.device).type == 'cuda'

    for epoch in range(self.epochs):
      print(f"Start Epoch {epoch+1}")
      losses = []
      accuracies = []

      # validate() switches the model to eval mode, so train mode is restored every epoch
      self.model.train()

      for batch in tqdm(self.train_dataset, total = len(self.train_dataset), desc="Batch"):
        inputs, label = batch
        inputs = inputs.to(self.device)
        label = label.to(self.device)

        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
          y_pred = self.model(inputs)

        loss, labels, predictions = self._loss_and_predictions(y_pred, label)

        #compute accuracy
        accuracies.append(accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy()))

        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        losses.append(loss.item())

      train_loss_mean = np.array(losses).mean()
      train_accuracy_mean = np.array(accuracies).mean()
      valid_loss_mean, eval_accuracy_mean, micro_f1 = self.validate(self.valid_dataset)

      print(f'train_loss: {train_loss_mean:0.4f}  val_loss: {valid_loss_mean:0.4f} \n train_acc: {train_accuracy_mean:0.4f} val_acc: {eval_accuracy_mean:0.4f} \n F1: {micro_f1:0.4f}')

    print("End training")
    return

  def validate(self, valid_dataset):
    '''
    Evaluate the model on valid_dataset without computing gradients.

    Returns:
      (mean batch loss, mean per-batch accuracy, micro F1 over all samples).
    '''
    self.model.eval()
    total_preds, total_labels = [], []

    accuracies = []
    losses = []

    print("Start Validation")

    with torch.no_grad():
      for batch in tqdm(valid_dataset, total = len(valid_dataset)):
        inputs, label = batch

        inputs = inputs.to(self.device)
        label = label.to(self.device)

        y_pred = self.model(inputs)

        loss, labels, predictions = self._loss_and_predictions(y_pred, label)

        losses.append(loss.item())

        total_labels.extend(labels.tolist())
        total_preds.extend(predictions.cpu().tolist())

        #compute accuracy
        accuracies.append(accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy()))

    #compute f1_score: ground truth first, predictions second
    micro_f1 = f1_score(fromVocabToWord(total_labels, idx2rel), fromVocabToWord(total_preds, idx2rel), average="micro")

    print("End Validation")

    return np.array(losses).mean(), np.array(accuracies).mean(), micro_f1

  def predict(self, batch):
    '''Run the model in eval mode without gradients and return (logits, predictions).'''
    self.model.eval()
    with torch.no_grad():
      logits, predictions = self.model(batch=batch, compute_predictions=True)
      return logits, predictions


### Instantiate dataset

In [None]:
# RE datasets built from the previously generated JSON Lines files
rel_train_dataset = RelationDataset(
    path="/content/drive/MyDrive/Magistrale/Natural Language Processing/NLP-HW3/train_rel.jsonl",
    relation_vocab=rel_vocab,
)
rel_val_dataset = RelationDataset(
    path="/content/drive/MyDrive/Magistrale/Natural Language Processing/NLP-HW3/val_rel.jsonl",
    relation_vocab=rel_vocab,
)

# Batches of 32 samples; only the training loader is shuffled
rel_train_loader = DataLoader(
    dataset=rel_train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_re,
)
rel_val_loader = DataLoader(
    dataset=rel_val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn_re,
)

### Train

In [None]:
# RE model, created and moved to the configured device
modelRe = ReModel(
    hidden_size=hypers.hidden_size,
    num_classes=len(rel_vocab),
    len_tokenizer=len(tokenizer),
    device=hypers.device,
).to(hypers.device)

# Handler that drives training and validation of the RE model
handlerRe = HandlerRE(
    model=modelRe,
    rel_vocab=rel_vocab,
    tokenizer=tokenizer,
    device=hypers.device,
)

# Attach the data loaders and train for 2 epochs
handlerRe.setDataset(train_dataset=rel_train_loader, valid_dataset=rel_val_loader, epochs=2)

In [None]:
# Train the model
# HandlerRE.train builds the Adam optimizer and then runs the epochs set through
# setDataset, validating and printing the metrics after each epoch.
handlerRe.train()

## A-Z Test

In [None]:
# Load the tokenizers
# Both tokenizers start from the same 'bert-base-uncased' vocabulary.
tokenizer_ner = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenizer_re = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Only the RE tokenizer is extended with four marker tokens (presumably wrapping the
# subject/object spans - confirm against the code that builds the RE inputs).
# add_tokens is the last expression, so its return value is the cell output.
tokenizer_re.add_tokens(['[SUB]', '[/SUB]', '[OBJ]', '[/OBJ]'])

4

In [None]:
# Instantiate both models of the pipeline on the configured device;
# each one is sized after its own tokenizer.
model_ner = NerModel(
    label_size=len(label2idx),
    len_tokenizer=len(tokenizer_ner),
    device=hypers.device,
).to(hypers.device)
model_re = ReModel(
    hidden_size=hypers.hidden_size,
    num_classes=len(rel_vocab),
    len_tokenizer=len(tokenizer_re),
    device=hypers.device,
).to(hypers.device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the checkpoints
# NOTE(review): absolute Google Drive paths, in a different Drive folder than the
# RE dataset files - confirm both locations exist before running.
checkpoint_path_ent = '/content/drive/MyDrive/AI/NLP_HW3/pipeline/final/model_0.pth'
checkpoint_path_rel = '/content/drive/MyDrive/AI/NLP_HW3/pipeline/final/model_rel_0.pth'

# torch.load unpickles the file: only load checkpoints that come from a trusted source.
# map_location remaps the stored tensors onto hypers.device.
model_weights = torch.load(checkpoint_path_ent, map_location=torch.device(hypers.device))
model_ner.load_state_dict(model_weights)


# model_weights is reused for the RE state dict; the result of this last
# load_state_dict call is what the cell displays.
model_weights = torch.load(checkpoint_path_rel, map_location=torch.device(hypers.device))
model_re.load_state_dict(model_weights)

<All keys matched successfully>

In [None]:
# Number of test samples scored by this cell (a positive integer), or None to
# score the whole test set. 11 keeps the quick-check subset that used to be
# hard-coded in the loop, so the score below refers to 11 samples only.
MAX_TEST_SAMPLES = 11

final_preds = []
final_labels = []
# total lets tqdm draw a real progress bar instead of a bare iteration counter
for i, (x, _) in tqdm(enumerate(test_dataset), total=MAX_TEST_SAMPLES):

  pred = completePredict(x)

  final_preds.append(get_tupled_labels(pred))
  final_labels.append(get_tupled_labels(test_dataset.get_original_item(i)[1]))

  # Stop right after the last requested sample, without fetching another item
  if MAX_TEST_SAMPLES is not None and i + 1 >= MAX_TEST_SAMPLES:
    break

0it [00:00, ?it/s]

In [None]:
# The result gets its own name: binding it to `f1_score` would shadow the
# f1_score metric function that HandlerRE.validate calls, so re-running
# training/validation after this cell would fail.
test_f1 = score(final_labels, final_preds)
print(test_f1)

0.8648648648648649
