# **Natural Language Processing Homework 1: Event Detection (ED)**

*Authors:*

*   Lorenzo Ciarpaglini (student ID: 1813738)

This is the notebook running my implementation of the ED task.

# Preliminary

Run this section to import and/or download the libraries needed for the proper functioning of the notebook and to download the dataset from source.

In [None]:
! rm -rf sample_data/
! git clone https://github.com/SapienzaNLP/nlp2023-hw1

Cloning into 'nlp2023-hw1'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 30 (delta 3), reused 3 (delta 3), pack-reused 21[K
Receiving objects: 100% (30/30), 1.95 MiB | 20.76 MiB/s, done.
Resolving deltas: 100% (6/6), done.


In [None]:
#import libraries here
import os
import copy
import gc
import random
import numpy as np

import torch

import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader, random_split

import torchvision
import torchvision.transforms as transforms

from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
from typing import List #, Union, Set, Callable
from xml.dom import minidom
import xml.etree.ElementTree as ET

import json

# from google.colab import drive
# drive.mount('/content/drive/')

SEED:int = 1234

# Apply the seed to every random number generator imported above; declaring
# the constant alone does not make weight initialisation or dropout repeatable.
random.seed(SEED)
np.random.seed(SEED)
_ = torch.manual_seed(SEED)  # assigned to `_` so the returned Generator is not echoed as cell output

In [None]:
#some global parameters & constants
DATASET_DIR = "nlp2023-hw1/data"  # data folder inside the repository cloned in the Preliminary section
PRINT_BAR = '-' * 10  # separator string made of ten dashes

file_type_dir = "jsonl"  #or "xml"; used by downloadDataset as the file extension

# Dataset playground

This section contains code to inspect the dataset and comments to interpret its main features. Many of the following functions will be reused in the final `Dataset` class. Open this section for further explanations of the strategies adopted to build the final version of the dataset.

In [None]:
def downloadDataset(dataset_prefix, data_dir=None, file_type=None):
  """Load one dataset split from disk as a list of parsed JSON records.

  Despite the name nothing is downloaded: the file
  ``<data_dir>/<dataset_prefix>.<file_type>`` is opened and every non-blank
  line is decoded with ``json.loads``.

  Args:
    dataset_prefix: file name without extension (e.g. 'train').
    data_dir: folder containing the file; defaults to the notebook-level
      DATASET_DIR.
    file_type: extension without the leading dot; defaults to the
      notebook-level file_type_dir.

  Returns:
    A list with one decoded object per non-blank line, in file order.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  if data_dir is None:
    data_dir = DATASET_DIR
  if file_type is None:
    file_type = file_type_dir

  data_path = os.path.join(data_dir, dataset_prefix) + '.' + file_type

  # Iterate over the file object rather than calling str.splitlines():
  # splitlines() also breaks on characters such as U+2028 or U+0085, which may
  # legally appear inside a JSON string and would cut a record in half.
  # Blank lines (e.g. a trailing newline) are skipped instead of crashing.
  with open(data_path, encoding='utf-8') as f:
    return [json.loads(line) for line in f if line.strip()]

In [None]:
# Read the three splits in the order train, test, dev.
train_set, test_set, dev_set = [
    downloadDataset(split_name) for split_name in ('train', 'test', 'dev')
]

In [None]:
dev_set[0]

{'idx': 0,
 'tokens': ['However',
  ',',
  'as',
  'the',
  'day',
  'progressed',
  ',',
  'Morris',
  'improved',
  'while',
  'Park',
  'fell',
  'away',
  '.'],
 'labels': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']}

# From dataset to dictionary

In [None]:
def createVocabolary(data_set):
  """Build token->index and label->index dictionaries from a dataset.

  Indices follow first-appearance order. Two special entries, 'UNK' and
  'PAD', are then stored right after the last word index.

  Args:
    data_set: iterable of records exposing 'tokens' and 'labels' sequences.

  Returns:
    (vocab, tags): two dicts mapping each distinct token / label to an int.
    For an empty data_set, vocab only holds 'UNK' and 'PAD' and tags is empty.
  """
  vocab = {}
  tags = {}

  for data in data_set:
    # Dict membership is O(1); scanning a list for every token made the
    # construction quadratic in the vocabulary size.
    for token in data['tokens']:
      if token not in vocab:
        vocab[token] = len(vocab)

    for label in data['labels']:
      if label not in tags:
        tags[label] = len(tags)

  # Use the number of collected words rather than a leaked loop index, so an
  # empty dataset no longer raises NameError.
  num_words = len(vocab)
  vocab['UNK'] = num_words
  vocab['PAD'] = num_words + 1

  return vocab, tags

def createWordsFile(data_set):
  """Placeholder: performs no work, ignores ``data_set`` and returns None."""
  return None

In [None]:
def map_to_dict(data_set, vocab, tags):
  """Return shallow copies of the records enriched with integer encodings.

  Each returned record keeps the original keys and gains:
    'tokens_mapped': vocab index of every token ('UNK' index when unknown),
    'labels_mapped': tags index of every label,
    'len': number of tokens.

  The input records are not modified. A label missing from ``tags`` raises
  KeyError.
  """
  encoded = []

  for record in data_set:
    entry = record.copy()
    entry['tokens_mapped'] = [
        vocab[word] if word in vocab else vocab['UNK']
        for word in record['tokens']
    ]
    entry['labels_mapped'] = [tags[tag] for tag in record['labels']]
    entry['len'] = len(record['tokens'])
    encoded.append(entry)

  return encoded

In [None]:
def get_max_len(data_set):
  """Return the largest 'len' value among the records, or 0 if there is none.

  The leading 0 reproduces the initial value of the running maximum, so an
  empty collection (or one without positive lengths) yields 0.
  """
  return max([0, *(record['len'] for record in data_set)])



def mask_to_tensor(len_list, batch_size):
    """Build a (max_len x batch_size) integer mask from per-sample lengths.

    Column ``j`` holds 1 in its first ``len_list[j]['len']`` rows and 0 below,
    where max_len is the longest 'len' found in ``len_list``.
    """
    longest = get_max_len(len_list)
    mask = torch.zeros(longest, batch_size, dtype=torch.long)
    for column, sample in enumerate(len_list):
        mask[:sample['len'], column] = 1

    return mask

In [None]:
def generate_batches(data_set, pad_idx=None):
  """Pad a list of encoded records to a common length and stack them.

  Args:
    data_set: records carrying 'tokens_mapped', 'labels_mapped' and 'len'
      (as produced by map_to_dict).
    pad_idx: index used to pad the token rows. When omitted, the 'PAD' entry
      of the notebook-level ``vocab`` is used, so existing calls keep working.

  Returns:
    (tokens, labels): two LongTensors of shape (len(data_set), max_len).
    Token rows are padded with ``pad_idx``, label rows with -1.

  Raises:
    RuntimeError: from torch.stack when ``data_set`` is empty.
  """
  if pad_idx is None:
    # Falling back to the global keeps the original call signature usable.
    pad_idx = vocab['PAD']

  max_len = max((sample['len'] for sample in data_set), default=0)

  token_rows = []
  label_rows = []
  for sample in data_set:
    n_pad = max_len - sample['len']
    # Concatenation creates fresh lists, so the caller's records are left
    # untouched without deep-copying every record (raw tokens included).
    token_rows.append(torch.tensor(sample['tokens_mapped'] + [pad_idx] * n_pad, dtype=torch.long))
    label_rows.append(torch.tensor(sample['labels_mapped'] + [-1] * n_pad, dtype=torch.long))

  return torch.stack(token_rows), torch.stack(label_rows)



In [None]:
#REMEMBER: createVocabolary adds the 'UNK' and 'PAD' entries to the vocabulary,
#and generate_batches fills padded label positions with -1.
#Open question kept from the original note: should 'UNK' tokens be labelled 'O'?
#The dictionaries are built from the training split only.
vocab, tags = createVocabolary(train_set)

In [None]:
# Encode every split with the dictionaries built from the training data.
train_set_mapped, test_set_mapped, dev_set_mapped = [
    map_to_dict(split, vocab, tags) for split in (train_set, test_set, dev_set)
]

In [None]:
# Vocabulary size (including 'UNK' and 'PAD') and number of distinct tags found in the training split.
print(len(vocab))
print(len(tags))

# Hyperparameters

In [None]:
#hyper parameters
hypers = {

    'vocab_size': len(vocab),  # includes the 'UNK' and 'PAD' entries added by createVocabolary

    'embedding_dim': 128,

    'lstm_hidden_dim': 128,  # per direction; Net builds a bidirectional LSTM

    'number_of_tags': len(tags),

    'input_size': 768,  # not read by any code visible in this notebook

    'hidden_size': 0,  # not read by any code visible in this notebook

    'num_classes': 2,  # not read by any code visible in this notebook

    'learning_rate': 2e-5,  # passed to the Adam optimizer

    'batch_size': 16,  # number of sentences per batch

    'epochs': 1,  # NOTE: the training cell does not read this value; it runs 100 epochs

    'dropout_rate': 0.2,

    'print_step': 10,  # not read by any code visible in this notebook

    'device': 'cuda' if torch.cuda.is_available() else 'cpu'

}

print(hypers['device'])

class Dict2Class:
    """Attribute-style view of a dictionary: every key becomes an instance attribute."""

    def __init__(self, my_dict):
        for name, value in my_dict.items():
            setattr(self, name, value)

# Rebind `hypers` from the plain dict to its attribute-style wrapper (e.g. hypers.device).
hypers = Dict2Class(hypers)


# Model

In [None]:
class Net(nn.Module):
    """Bidirectional-LSTM token tagger.

    Pipeline: embedding lookup -> BiLSTM -> linear projection -> dropout ->
    log-softmax. The output has one row of tag log-probabilities per token
    position, with batch and time dimensions flattened together.
    """

    def __init__(self, hypers):
        super().__init__()

        # The table has one row more than hypers.vocab_size.
        self.embedding = nn.Embedding(hypers.vocab_size + 1, hypers.embedding_dim)

        # Consumes the embedded sentence; inputs are (batch, seq, feature).
        self.lstm = nn.LSTM(
            hypers.embedding_dim,
            hypers.lstm_hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward states are concatenated, hence the factor 2.
        self.fc = nn.Linear(hypers.lstm_hidden_dim * 2, hypers.number_of_tags)

        self.dropout = nn.Dropout(hypers.dropout_rate)

        # Registered as a sub-module but not used by forward().
        self.relu = nn.ReLU()

    def forward(self, s):
        """Map a (batch_size, seq_len) tensor of token indices to a
        (batch_size*seq_len, number_of_tags) tensor of log-probabilities."""
        embedded = self.embedding(s)          # batch_size x seq_len x embedding_dim

        lstm_out, _ = self.lstm(embedded)     # batch_size x seq_len x 2*lstm_hidden_dim

        # One row per token position.
        per_token = lstm_out.reshape(-1, lstm_out.shape[2])   # batch_size*seq_len x 2*lstm_hidden_dim

        # NOTE(review): dropout acts on the tag scores themselves, right before
        # the softmax - confirm this placement is intended.
        scores = self.dropout(self.fc(per_token))             # batch_size*seq_len x number_of_tags

        return F.log_softmax(scores, dim=1)


In [None]:
def loss_fn(outputs, labels):
    """Mean negative log-likelihood over the non-padding tokens.

    Args:
        outputs: (N, num_tags) tensor of log-probabilities, one row per token
            position.
        labels: integer tensor with N elements in total; negative entries mark
            padding and are excluded from the loss.

    Returns:
        Scalar tensor: the negated sum of the gold-tag log-probabilities over
        real tokens, divided by their count. A batch made only of padding
        yields 0 instead of NaN.
    """
    #flatten the labels to a vector of length batch_size*seq_len
    labels = labels.reshape(-1)

    #True for real tokens, False for 'PAD' positions
    mask = labels >= 0

    #number of real tokens, kept as a tensor (no host synchronisation) and
    #clamped so an all-padding batch does not divide by zero
    num_tokens = mask.sum().clamp(min=1)

    #padding labels are replaced by a valid index instead of relying on
    #negative indexing; their contribution is zeroed out just below
    safe_labels = labels.clamp(min=0)
    picked = outputs.gather(1, safe_labels.unsqueeze(1)).squeeze(1)
    picked = picked.masked_fill(~mask, 0.0)

    #cross entropy loss for all non 'PAD' tokens
    return -picked.sum() / num_tokens

# Instantiation and Training Lightning





In [None]:
# Build the network and place it on the configured device; `.to` returns the
# module itself, and the bare name below displays its architecture.
model = Net(hypers).to(hypers.device)
model

Net(
  (embedding): Embedding(38009, 128)
  (lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=11, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)

In [None]:
# output = model(batches_tokens)

In [None]:
# loss_fn(output, batches_labels)

In [None]:
# accuracy = Accuracy(task="multiclass", num_classes=10)
# criterion = nn.CrossEntropyLoss()
# Adam over all model parameters; the step size comes from hypers.learning_rate (2e-5).
# NOTE(review): 2e-5 looks small for a model whose embeddings are not loaded from a pretrained source - confirm it is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=hypers.learning_rate)

In [None]:

# Per-batch training losses, appended to by the training loop.
losses = []
# Only referenced by a commented-out line in the training loop, so it stays empty.
accuracies = []

In [None]:
# Number of passes over the training set.
# NOTE: hypers.epochs (set to 1 in the hyper-parameter cell) is not consulted here.
NUM_EPOCHS = 100

model.train()
for epoch in range(NUM_EPOCHS):

    # Walk over the data in strides of one batch. The last batch may be
    # smaller than hypers.batch_size; it is kept rather than discarded.
    for start in range(0, len(train_set_mapped), hypers.batch_size):

        batches_tokens, batches_labels = generate_batches(train_set_mapped[start:start + hypers.batch_size])

        batches_tokens = batches_tokens.to(hypers.device)
        batches_labels = batches_labels.to(hypers.device)

        output = model(batches_tokens)

        loss = loss_fn(output, batches_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Store a plain float: appending the tensor itself would keep one
        # device tensor (and its autograd references) alive per step.
        losses.append(loss.item())
        # accuracies.append(accuracy(output, labels).item())

    # Loss of the last batch of the epoch.
    print("epoch: ", epoch, " loss: ", loss.item())



print("End")

In [None]:
# Build one padded batch from the first 8 dev sentences and move it to the
# configured device; the following cells inspect it step by step.
batches_tokens, batches_labels = generate_batches(dev_set_mapped[0:8])

batches_tokens = batches_tokens.to(hypers.device)
batches_labels = batches_labels.to(hypers.device)

In [None]:
# Flatten the labels to one entry per token position, as loss_fn does.
batches_labels = batches_labels.reshape(-1)
print(batches_labels)

In [None]:
# 1.0 for real tokens, 0.0 for padding (padded label positions hold -1).
mask = (batches_labels >= 0).float()
print(mask)

In [None]:
# Number of non-padding token positions in the inspected batch.
num_tokens = int(torch.sum(mask).item())
print(num_tokens)

190


In [None]:
# Forward pass on the inspected batch.
# NOTE(review): model.train() was called in the training cell and model.eval()
# is only called further below, so dropout is still active here when the
# cells run top to bottom.
output = model(batches_tokens)
print(output)

In [None]:
# Row indices 0..N-1, one per flattened token position.
list(range(output.shape[0]))

In [None]:
# Log-probability assigned to the gold tag at each position; at padded
# positions the label -1 selects the last tag column.
output[list(range(output.shape[0])), batches_labels]


In [None]:
# Shape check: one selected value per row of `output`.
output[range(output.shape[0]), batches_labels].shape

In [None]:
# Multiply by the mask so that padded positions contribute zero.
out = output[list(range(output.shape[0])), batches_labels] * mask

In [None]:
# Back from log-probabilities to probabilities; masked entries are zero, so they show up as exp(0) = 1.
out.exp()

In [None]:
# NOTE: the evaluation cell that follows re-initialises this list itself, so this cell is redundant.
losses_pred = []

In [None]:
# Per-batch losses on the dev split.
losses_pred = []

model.eval()  # disables dropout

# No parameter is updated here, so skip building autograd graphs.
with torch.no_grad():

    # Stride over the whole dev split; the final, possibly shorter, batch is
    # evaluated too instead of being dropped.
    for start in range(0, len(dev_set_mapped), hypers.batch_size):

        batches_tokens, batches_labels = generate_batches(dev_set_mapped[start:start + hypers.batch_size])

        batches_tokens = batches_tokens.to(hypers.device)
        batches_labels = batches_labels.to(hypers.device)

        output = model(batches_tokens)

        loss = loss_fn(output, batches_labels)

        losses_pred.append(loss.item())
        # accuracies.append(accuracy(output, labels).item())

# The previous message printed `epoch`, a variable left over from the training
# cell that has no meaning here; report the last dev batch loss directly.
if losses_pred:
  print("dev loss (last batch): ", losses_pred[-1])



print("End")

epoch:  99  loss:  0.1873970776796341
End


In [None]:
torch.tensor(losses_pred).mean()

tensor(0.1858)