# Assignment 3
Training a simple neural named entity recognizer (NER)

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from statistics import mean
import warnings

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Silences every Python warning for the rest of the session (the accepted actions are listed on the right).
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

In this assignment you are required to build a full training and testing pipeline for a neural sequential tagger for named entities, using LSTM.

The dataset that you will be working on is called ReCoNLL 2003, which is a corrected version of the CoNLL 2003 dataset: https://www.clips.uantwerpen.be/conll2003/ner/


The three files (train, test and eval) are available from the course git repository (https://github.com/kfirbar/nlp-course)

As you can see, the annotated texts are labeled according to the IOB annotation scheme, for 3 entity types: Person, Organization, Location.

## **Task 1**

Write a function *read_data* for reading the data from a single file (either train, test or eval). This function receives a filepath and returns a list of sentences. Every sentence is encoded as a pair of lists: one list contains the words and the other contains the labels.

In [None]:
def read_data(filepath):
  """Read a CoNLL-style file into a list of [words, tags] sentences.

  Every non-blank line is expected to hold a word followed by its tag,
  separated by whitespace; a blank line ends the current sentence.

  Args:
    filepath: path of the text file to read.

  Returns:
    A list with one [words, tags] pair per sentence, where `words` and
    `tags` are equally long lists of strings.

  Raises:
    IndexError: if a non-blank line has fewer than two fields.
  """
  data = []
  words = []
  tags = []
  with open(filepath, encoding="utf-8") as file:
    for line in file:
      # Whitespace split also strips the trailing newline from the tag.
      fields = line.split()
      if not fields:
        # Sentence boundary; repeated blank lines must not create empty sentences.
        if words:
          data.append([words, tags])
          words = []
          tags = []
        continue
      words.append(fields[0])
      tags.append(fields[1])
  # Keep the last sentence when the file does not end with a blank line.
  if words:
    data.append([words, tags])
  return data

# Fetch the course repository that holds the three data files.
# Re-running this cell makes git report that the destination directory already exists.
!git clone https://github.com/kfirbar/nlp-course
# The paths assume the clone landed under /content (presumably the Colab working directory) — adjust when running elsewhere.
train = read_data('/content/nlp-course/connl03_train.txt')
test = read_data('/content/nlp-course/connl03_test.txt')
dev = read_data('/content/nlp-course/connl03_dev.txt')

fatal: destination path 'nlp-course' already exists and is not an empty directory.


In [None]:
# Training split: sentence count, then the words and tags of the first sentence.
print(len(train), train[0][0], train[0][1], sep="\n")

1750
['Portuguesa', '2', 'Parana', '0']
['B-ORG', 'O', 'B-ORG', 'O']


In [None]:
# Test split: sentence count, then the words and tags of the first sentence.
print(len(test), test[0][0], test[0][1], sep="\n")

500
['Ally', 'McCoist', 'is', 'also', 'in', 'great', 'scoring', 'form', 'at', 'the', 'moment', '.', '"']
['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# Dev split: sentence count, then the words and tags of the first sentence.
print(len(dev), dev[0][0], dev[0][1], sep="\n")

250
['Everton', "'s", 'Duncan', 'Ferguson', ',', 'who', 'scored', 'twice', 'against', 'Manchester', 'United', 'on', 'Wednesday', ',', 'was', 'picked', 'on', 'Thursday', 'for', 'the', 'Scottish', 'squad', 'after', 'a', '20-month', 'exile', '.']
['B-ORG', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


The following Vocab class can serve as a dictionary that maps words and tags to IDs. The UNK_TOKEN should be used for words that are not part of the training data.

## **Task 2:** 

Write a function *prepare_data* that takes one of the [train, dev, test] and the Vocab instance, for converting each pair of (words,labels) to a pair of indexes (from Vocab). Each pair should be added to *data_sequences*, which is returned back from the function.

In [None]:
UNK_TOKEN = 0

class Vocab:
  """Two-way lookup tables between words/tags and their integer ids.

  Word ids are handed out on demand by `index_word`; id 0 (UNK_TOKEN) is
  reserved for the "__unk__" placeholder. The tag inventory is fixed.
  """

  def __init__(self):
    self.word2id = {"__unk__": UNK_TOKEN}
    self.id2word = {UNK_TOKEN: "__unk__"}
    self.n_words = 1

    # The position in this list is the tag id.
    tag_names = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC", "B-ORG", "I-ORG"]
    self.tag2id = {tag: tag_id for tag_id, tag in enumerate(tag_names)}
    self.id2tag = dict(enumerate(tag_names))

  def index_words(self, words):
    """Return the ids of `words`, registering any word not seen before."""
    return list(map(self.index_word, words))

  def index_tags(self, tags):
    """Return the ids of `tags`; an unknown tag raises KeyError."""
    lookup = self.tag2id
    return [lookup[tag] for tag in tags]

  def index_word(self, w):
    """Return the id of `w`, assigning the next free id on first sight."""
    word_id = self.word2id.get(w)
    if word_id is None:
      word_id = self.n_words
      self.word2id[w] = word_id
      self.id2word[word_id] = w
      self.n_words += 1
    return word_id

  def getTagsSize(self):
    """Number of distinct tags."""
    return len(self.tag2id)

  def getWordsSize(self):
    """Number of word ids handed out so far, including the unknown token."""
    return self.n_words
            

In [None]:
# One shared vocabulary; the prepare_data calls below fill it with words.
vocab = Vocab()

def prepare_data(data, vocab):
  """Convert (words, tags) sentences into [word_ids, tag_ids] pairs.

  Args:
    data: iterable of (words, tags) pairs.
    vocab: object providing `index_words` and `index_tags`.

  Returns:
    A tuple (data_sequences, vocab): the converted sentences in input
    order and the very same `vocab` object that was passed in.
  """
  data_sequences = [
      [vocab.index_words(sentence_words), vocab.index_tags(sentence_tags)]
      for sentence_words, sentence_tags in data
  ]
  return data_sequences, vocab

# NOTE(review): Vocab.index_word gives a fresh id to every unseen word, so the dev and test
# calls below also grow the vocabulary instead of mapping unknown words to UNK_TOKEN — confirm this is intended.
train_sequences, vocab = prepare_data(train, vocab)
dev_sequences, vocab = prepare_data(dev, vocab)
test_sequences, vocab = prepare_data(test, vocab)

In [None]:
# Indexed training split: sentence count, then word ids and tag ids of the first sentence.
print(len(train_sequences), train_sequences[0][0], train_sequences[0][1], sep="\n")

1750
[1, 2, 3, 4]
[5, 0, 5, 0]


In [None]:
# Indexed test split: sentence count, then word ids and tag ids of the first sentence.
print(len(test_sequences), test_sequences[0][0], test_sequences[0][1], sep="\n")

500
[7769, 7770, 126, 259, 31, 4983, 4236, 5047, 46, 36, 7771, 24, 203]
[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Indexed dev split: sentence count, then word ids and tag ids of the first sentence.
print(len(dev_sequences), dev_sequences[0][0], dev_sequences[0][1], sep="\n")

250
[2407, 163, 5327, 2431, 82, 459, 544, 2464, 44, 2919, 428, 35, 182, 82, 94, 3153, 35, 308, 29, 36, 7162, 454, 318, 65, 7163, 6495, 24]
[5, 0, 1, 2, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## **Task 3:** 

Write NERNet, a PyTorch Module for labeling words with NER tags. 

*input_size:* the size of the vocabulary

*embedding_size:* the size of the embeddings

*hidden_size:* the LSTM hidden size

*output_size:* the number of tags we are predicting for

*n_layers:* the number of layers we want to use in LSTM

*directions:* can be 1 or 2, indicating unidirectional or bidirectional LSTM, respectively

The input for your forward function should be a single sentence tensor.

In [None]:
class NERNet(nn.Module):
  """Embedding -> LSTM -> linear tagger that scores every word of one sentence.

  Args:
    input_size: number of rows of the embedding table (vocabulary size).
    embedding_size: width of each word embedding.
    hidden_size: LSTM hidden size per direction.
    output_size: number of tags to score.
    n_layers: number of stacked LSTM layers.
    directions: 2 selects a bidirectional LSTM; any other value a
      unidirectional one. The linear layer expects
      `directions * hidden_size` input features.
    custom_embedding: optional tensor that replaces the embedding weights;
      when given, the embedding weights are frozen (no gradient).
  """

  def __init__(self, input_size, embedding_size, hidden_size, output_size, n_layers, directions, custom_embedding = None):
    super().__init__()
    self.embedding = nn.Embedding(input_size, embedding_size)
    if custom_embedding is not None:
      # Swap in the supplied vectors and keep them fixed during training.
      self.embedding.weight = nn.Parameter(custom_embedding, requires_grad=False)
    self.lstm = nn.LSTM(
        embedding_size,
        hidden_size,
        num_layers=n_layers,
        bidirectional=(directions == 2),
    )
    self.out = nn.Linear(directions * hidden_size, output_size)

  def forward(self, input_sentence):
    """Return one row of tag scores per word of `input_sentence`."""
    sentence_length = len(input_sentence)
    # The LSTM is fed (sequence, batch=1, features).
    embedded = self.embedding(input_sentence).view(sentence_length, 1, -1)
    lstm_states, _ = self.lstm(embedded)
    return self.out(lstm_states.view(sentence_length, -1))


## **Task 4:** 

Write a training loop, which takes a model (an instance of NERNet) and the number of epochs to train for. The loss is always CrossEntropyLoss and the optimizer is always Adam.

In [None]:
# example from notebook 3
# https://colab.research.google.com/drive/1JQZDXAE7jLcJj-J-s72tKHEIFsmFFtRv?usp=drive_open#scrollTo=Qi4H9mxLQ-Xl

def train_loop(model, n_epochs, sequences, lr=0.0001):
  """Train `model` one sentence at a time with Adam and cross-entropy loss.

  Args:
    model: module that maps a 1-D LongTensor of word ids to one row of tag
      scores per word.
    n_epochs: number of full passes over `sequences`; must be at least 1.
    sequences: non-empty sized collection of (word_ids, tag_ids) pairs.
    lr: Adam learning rate; defaults to the previously hard-coded 0.0001.

  Returns:
    The mean of the per-epoch average training losses.

  Raises:
    ValueError: if `n_epochs` is smaller than 1 or `sequences` is empty.
  """
  # Fail early with a clear message instead of a StatisticsError / ZeroDivisionError later.
  if n_epochs < 1:
    raise ValueError("n_epochs must be at least 1")
  if len(sequences) == 0:
    raise ValueError("sequences must not be empty")

  print('Start training:')
  criterion = nn.CrossEntropyLoss() # Loss function
  optimizer = torch.optim.Adam(model.parameters(), lr=lr) # Optimizer (ADAM is a fancy version of SGD)

  # Build the inputs on the device the model actually lives on, so the
  # loop does not depend on a notebook-level global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")

  # Make sure train-only layers (e.g. dropout) are active.
  model.train()

  train_loss_array = []

  for e in range(1, n_epochs + 1):
    train_loss = 0.0

    for words, tags in sequences:
      words = torch.tensor(words).to(target_device)
      tags = torch.tensor(tags).to(target_device)
      y_pred = model(words)

      loss = criterion(y_pred, tags)
      optimizer.zero_grad() # Zero the gradients
      loss.backward()   # Perform a backward pass (backpropagation)
      optimizer.step()  # Update the parameters

      train_loss += loss.item()

    # Average loss per sentence for this epoch.
    train_loss_calc = train_loss / len(sequences)
    print('[epoch %d] train loss: %.3f' %(e, train_loss_calc))
    train_loss_array.append(train_loss_calc)

  return mean(train_loss_array)

## **Task 5:** 

Write an evaluation loop for a trained model, using the dev and test datasets. This function prints the true positive rate (TPR), also known as recall, and the precision (the fraction of predictions of a label that are correct), for each label separately (7 labels in total) and for all 6 labels (except O) together. The caption argument should be used for printing: include it as a prefix of what you print.

In [None]:
def evaluate(model, caption, sequences, model_num, labels=None):
    """Print and return per-tag and aggregate precision/recall of `model`.

    Args:
      model: module that maps a 1-D LongTensor of word ids to one row of tag
        scores per word.
      caption: text printed in the report header (e.g. the split name).
      sequences: iterable of (word_ids, tag_ids) pairs.
      model_num: value printed after the caption in the header.
      labels: tag names ordered by tag id, with the outside tag at index 0.
        Defaults to the keys of the notebook-level `vocab.tag2id`.

    Returns:
      A DataFrame with columns "precision" and "recall": one row per label
      plus an "all 6 labels except O" row holding the micro-average over
      every label but the first. A ratio whose denominator is zero is NaN.
    """
    labels = list(vocab.tag2id.keys()) if labels is None else list(labels)
    num_of_tags = len(labels)
    # confusion_matrix[p, g] = number of tokens predicted as tag p whose gold tag is g.
    confusion_matrix = np.zeros((num_of_tags, num_of_tags))
    line_seperator = "*" * 40

    # Build the inputs on the device the model actually lives on.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Evaluate in eval mode, then restore whatever mode the caller had.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for words, tags in sequences:
                word_ids = torch.tensor(words, dtype=torch.long, device=target_device)
                predicted = torch.argmax(model(word_ids), dim=1).tolist()
                # Unbuffered add: repeated (pred, gold) pairs in a sentence all count.
                np.add.at(confusion_matrix, (predicted, list(tags)), 1)
    finally:
        model.train(was_training)

    def ratio(hits, total):
        # 0/0 (label never predicted / never present) is reported as NaN.
        return hits / total if total else float("nan")

    correct = np.diag(confusion_matrix)
    predicted_totals = confusion_matrix.sum(axis=1)
    gold_totals = confusion_matrix.sum(axis=0)

    rows = [
        [ratio(correct[i], predicted_totals[i]), ratio(correct[i], gold_totals[i])]
        for i in range(num_of_tags)
    ]

    # Micro-average over the entity labels: only exact label matches are
    # hits, so confusing one entity type with another is counted as an error.
    entity_hits = correct[1:].sum()
    rows.append([
        ratio(entity_hits, predicted_totals[1:].sum()),
        ratio(entity_hits, gold_totals[1:].sum()),
    ])

    df = pd.DataFrame(rows, index=labels + ["all 6 labels except O"], columns=["precision", "recall"])
    print("\n","Evaluation : ", caption ,model_num)
    print(line_seperator)
    print(df)
    return df

## **Task 6:** 

Train and evaluate a few models, all with embedding_size=300, and with the following hyper parameters (you may use that as captions for the models as well):

Model 1: (hidden_size: 500, n_layers: 1, directions: 1)

Model 2: (hidden_size: 500, n_layers: 2, directions: 1)

Model 3: (hidden_size: 500, n_layers: 3, directions: 1)

Model 4: (hidden_size: 500, n_layers: 1, directions: 2)

Model 5: (hidden_size: 500, n_layers: 2, directions: 2)

Model 6: (hidden_size: 500, n_layers: 3, directions: 2)

Model 7: (hidden_size: 800, n_layers: 1, directions: 2)

Model 8: (hidden_size: 800, n_layers: 2, directions: 2)

Model 9: (hidden_size: 800, n_layers: 3, directions: 2)

In [None]:
# Shared settings for the model runs below.
epochs = 10
# Embedding table size = number of word ids handed out by the vocabulary so far.
input_size = vocab.getWordsSize()
embedding_size = 300
# NOTE(review): the two hidden-size names are not referenced by the cells visible here
# (train_and_evaluate_all spells the sizes out as literals).
hidden_size_500 = 500
hidden_size_800 = 800
# One output score per tag.
output_size = vocab.getTagsSize()

In [None]:
def print_model_header(model_number):
  """Print a three-line banner that announces model `model_number`."""
  border = "=========================================================================================="
  title = "======================================== Model #{} ========================================".format(model_number)
  print(border, title, border, sep="\n")

In [None]:
def train_and_evaluate(input_size, hidden_size, output_size = output_size, n_layers = 1, directions = 1, epochs = 10, model_number = 1, custom_embedding = None):
  """Build one NERNet, train it on the training split and report test/dev scores.

  The embedding size is fixed at 300. The model is moved to the notebook's
  `device`, trained on `train_sequences`, and then evaluated on
  `test_sequences` followed by `dev_sequences`.

  Args:
    input_size: vocabulary size passed to NERNet.
    hidden_size: LSTM hidden size.
    output_size: number of tags (defaults to the notebook-level value
      captured when this function was defined).
    n_layers: number of LSTM layers.
    directions: 1 or 2, forwarded to NERNet.
    epochs: number of training epochs.
    model_number: number shown in the banner and the evaluation headers.
    custom_embedding: optional pre-trained embedding tensor for NERNet.
  """
  network = NERNet(
      input_size = input_size,
      embedding_size = 300,
      hidden_size = hidden_size,
      output_size = output_size,
      n_layers = n_layers,
      directions = directions,
      custom_embedding = custom_embedding,
  )
  network = network.to(device)

  print_model_header(model_number)

  print("hidden size {}, number of layers {}, directions {}\n".format(hidden_size, n_layers, directions))
  mean_train_loss = train_loop(network, epochs, train_sequences)
  print('\nmean loss: %.3f\n' %(mean_train_loss))
  for split_caption, split_sequences in (("test", test_sequences), ("dev", dev_sequences)):
    evaluate(network, split_caption, split_sequences, model_number)

In [None]:
def train_and_evaluate_all(custom_embedding = None):
  """Train and evaluate the nine configurations, numbering them from 1.

  Args:
    custom_embedding: optional pre-trained embedding tensor that is handed
      to every model.
  """
  # (hidden_size, n_layers, directions) for models 1..9.
  configurations = [
      (500, 1, 1), (500, 2, 1), (500, 3, 1),
      (500, 1, 2), (500, 2, 2), (500, 3, 2),
      (800, 1, 2), (800, 2, 2), (800, 3, 2),
  ]
  for model_number, (hidden, layers, dirs) in enumerate(configurations, start=1):
    train_and_evaluate(input_size, hidden, output_size, layers, dirs, 10, model_number, custom_embedding)
    print("\n\n")

In [None]:
# Run all nine configurations with embeddings learned from scratch (no custom embedding is passed).
train_and_evaluate_all()

hidden size 500, number of layers 1, directions 1

Start training:
[epoch 1] train loss: 0.934
[epoch 2] train loss: 0.558
[epoch 3] train loss: 0.364
[epoch 4] train loss: 0.245
[epoch 5] train loss: 0.163
[epoch 6] train loss: 0.104
[epoch 7] train loss: 0.065
[epoch 8] train loss: 0.040
[epoch 9] train loss: 0.025
[epoch 10] train loss: 0.017

mean loss: 0.251


 Evaluation :  test 1
****************************************
                       precision    recall
O                       0.921191  0.961169
B-PER                   0.681159  0.649770
I-PER                   0.793774  0.689189
B-LOC                   0.828571  0.676385
I-LOC                   0.909091  0.566038
B-ORG                   0.583072  0.531429
I-ORG                   0.590909  0.260000
all 6 labels except O   0.816679  0.677804

 Evaluation :  dev 1
****************************************
                       precision    recall
O                       0.916948  0.966408
B-PER                   0.661458 

## **Task 7:** 

Download the GloVe embeddings from https://nlp.stanford.edu/projects/glove/ (use the 300-dim vectors from glove.6B.zip). Then initialize the nn.Embedding module in your NERNet with these embeddings, so that you can start your training with pre-trained vectors. Repeat Task 6 and print the results for each model.

Note: make sure that the vectors are aligned with the IDs in your Vocab; in other words, make sure that, for example, the word with ID 0 is the first vector in the GloVe matrix of vectors that you initialize nn.Embedding with. For a discussion on how to do that, see this link:
https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222

In [None]:
import os

# Download and unpack the GloVe archive only when the zip file is not already present.
# NOTE(review): the check looks at the zip only, so a present zip that was never unpacked is not extracted — confirm.
if not os.path.exists("/content/glove.6B.zip"):
  !wget "http://nlp.stanford.edu/data/glove.6B.zip"
  !unzip "/content/glove.6B.zip"
print("The files are ready")

--2021-06-07 19:17:49--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-06-07 19:17:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-06-07 19:17:50--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# helpful link mentioned in the piazza https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76

def _build_glove_matrix(glove_path, word2id, n_rows, dim=300):
  """Return an (n_rows, dim) array whose row i is the GloVe vector of word id i.

  The glove.6B vocabulary is uncased, so each vocabulary word is matched by
  its lower-cased form; words that differ only in case share one vector.
  Rows of words without a GloVe entry stay all-zero.

  Args:
    glove_path: path of a GloVe text file ("token v1 v2 ... v_dim" per line).
    word2id: mapping from vocabulary word to its row index.
    n_rows: number of rows of the returned matrix.
    dim: vector width of the GloVe file.
  """
  matrix = np.zeros((n_rows, dim))

  # Group vocabulary ids by lower-cased spelling so one GloVe line can fill
  # every casing variant (e.g. "Paris" and "paris") in a single assignment.
  ids_by_lowercase = {}
  for vocab_word, vocab_id in word2id.items():
    ids_by_lowercase.setdefault(vocab_word.lower(), []).append(vocab_id)

  with open(glove_path, "r", encoding = 'utf-8') as f:
    for line in f:
      fields = line.split()
      if not fields:
        continue
      matching_ids = ids_by_lowercase.get(fields[0])
      # Explicit None check: never rely on the truthiness of an id.
      if matching_ids is not None:
        matrix[matching_ids] = np.array(fields[1:], dtype ='float32')
  return matrix

embeddings = _build_glove_matrix("/content/glove.6B.300d.txt", vocab.word2id, vocab.getWordsSize())
embeddings = torch.from_numpy(embeddings).float().to(device)


In [None]:
# Repeat the nine runs, this time passing the GloVe matrix as the custom embedding (NERNet freezes it).
train_and_evaluate_all(embeddings)

hidden size 500, number of layers 1, directions 1

Start training:
[epoch 1] train loss: 0.762
[epoch 2] train loss: 0.534
[epoch 3] train loss: 0.465
[epoch 4] train loss: 0.424
[epoch 5] train loss: 0.390
[epoch 6] train loss: 0.367
[epoch 7] train loss: 0.350
[epoch 8] train loss: 0.337
[epoch 9] train loss: 0.326
[epoch 10] train loss: 0.316

mean loss: 0.427


 Evaluation :  test 1
****************************************
                       precision    recall
O                       0.958244  0.950510
B-PER                   0.794007  0.488479
I-PER                   0.778226  0.652027
B-LOC                   0.702341  0.612245
I-LOC                   0.593750  0.358491
B-ORG                   0.419183  0.674286
I-ORG                   0.396875  0.635000
all 6 labels except O   0.812030  0.837709

 Evaluation :  dev 1
****************************************
                       precision    recall
O                       0.950517  0.949289
B-PER                   0.813433 

**Good luck!**