In [23]:
# Type Your full names
Student_1 = "Mostafa Mohamed Elgendy"
Student_2 = "Mostafa Wael Kamal"

# Named Entity Recognition Assignment
NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc. In this assignment, you will train a named entity recognition system and test it on a test data. \
Let's get started

In [24]:
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from utils import get_params, get_vocab
import random as rnd

# Importing and discovering the data

In [25]:
vocab, tag_map = get_vocab('data/large/words.txt', 'data/large/tags.txt')
t_sentences, t_labels, t_size = get_params(vocab, tag_map, 'data/large/train/sentences.txt', 'data/large/train/labels.txt')
v_sentences, v_labels, v_size = get_params(vocab, tag_map, 'data/large/val/sentences.txt', 'data/large/val/labels.txt')
test_sentences, test_labels, test_size = get_params(vocab, tag_map, 'data/large/test/sentences.txt', 'data/large/test/labels.txt')

`vocab` is a dictionary that translates a word string to a unique number. Given a sentence, you can represent it as an array of numbers translating with this dictionary. The dictionary contains a `<PAD>` token. 

When training an LSTM using batches, all your input sentences must be the same size. To accomplish this, you set the length of your sentences to a certain number and add the generic `<PAD>` token to fill all the empty spaces. 

In [26]:
# vocab translates from a word to a unique number
print('vocab["the"]:', vocab["the"])
# Pad token
print('padded token:', vocab['<PAD>'])

vocab["the"]: 9
padded token: 35180


In [27]:
# The possible tags
print(tag_map)

{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


So the coding scheme that tags the entities is a minimal one where B- indicates the first token in a multi-token entity, and I- indicates one in the middle of a multi-token entity. If you had the sentence 

**"Sharon flew to Miami on Friday"**

the outputs would look like:

```
Sharon B-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim
```

your tags would reflect three tokens beginning with B-, since there are no multi-token entities in the sequence. But if you added Sharon's last name to the sentence: 

**"Sharon Floyd flew to Miami on Friday"**

```
Sharon B-per
Floyd  I-per
flew   O
to     O
Miami  B-geo
on     O
Friday B-tim
```

then your tags would change to show first "Sharon" as B-per, and "Floyd" as I-per, where I- indicates an inner token in a multi-token sequence.

In [28]:
# Exploring information about the data
print('The number of outputs is tag_map', len(tag_map))
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 17
Num of vocabulary words: 35181
The vocab size is 35181
The training size is 33570
The validation size is 7194
An example of the first sentence is [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 9, 15, 1, 16, 17, 18, 19, 20, 21]
An example of its corresponding label is [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0]


# NERDataset
The class that impelements the dataset for NER

In [29]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    # pad every column in x and y to have the same length
    # create two tensors one for x and the other for labels
    # self.x = torch.tensor([sentence + [pad] * (max([len(sentence) for sentence in x]) - len(sentence)) for sentence in x])
    # self.y = torch.tensor([sentence + [0] * (max([len(sentence) for sentence in y]) - len(sentence)) for sentence in y])

    self.x = nn.utils.rnn.pad_sequence(
        [torch.tensor(i) for i in x], batch_first=True, padding_value=pad)
    self.y = nn.utils.rnn.pad_sequence(
        [torch.tensor(j) for j in y], batch_first=True, padding_value=0)
    
    #################################################################################################################

  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    # return the length of the dataset
    return len(self.x)
    ###########################################################################################

  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    # return a tuple of x and y
    return self.x[idx], self.y[idx]
    ##########################################################################################

In [30]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
mini_dataset = NERDataset(mini_sentences, mini_labels, vocab['<PAD>'])
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

torch.Size([5, 30]) torch.Size([5, 30]) torch.Size([3, 30]) torch.Size([3, 30])
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,     9,    15,     1,    16,    17,
           18,    19,    20,    21, 35180, 35180, 35180, 35180, 35180, 35180]) 
 tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])


#### Expected output
torch.Size([5, 30]) torch.Size([5, 30]) torch.Size([3, 30]) torch.Size([3, 30])\
tensor([    0,     1,     2,     3,     4,     5,     6,     7,     8,     9,
           10,    11,    12,    13,    14,     9,    15,     1,    16,    17,
           18,    19,    20,    21, 35180, 35180, 35180, 35180, 35180, 35180]) \
tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])

# NER
The class that implementss the pytorch model for NER

In [31]:
class NER(nn.Module):
  def __init__(self, vocab_size=35181, embedding_dim=50, hidden_size=50, n_classes=len(tag_map)):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(NER, self).__init__()
    # (1) Create the embedding layer
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])

    # (2) Create an LSTM layer with hidden size = hidden_size and batch_first = True
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)

    # (3) Create a linear layer with number of neorons = n_classes
    self.linear = nn.Linear(hidden_size, n_classes)
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """
    # (1) Pass the sentences through the embedding layer
    embeddingsOutput = self.embedding(sentences)

    # (2) Pass the output of the embedding layer through the LSTM layer
    lstmOutput, _ = self.lstm(embeddingsOutput)

    # (3) Pass the output of the LSTM layer through the linear layer
    final_output = self.linear(lstmOutput)
    
    ###############################################################################################
    return final_output

In [32]:
model = NER()
print(model)

NER(
  (embedding): Embedding(35181, 50, padding_idx=35180)
  (lstm): LSTM(50, 50, batch_first=True)
  (linear): Linear(in_features=50, out_features=17, bias=True)
)


#### Expected output
NER( \
  (embedding): Embedding(35181, 50) \
  (lstm): LSTM(50, 50, batch_first=True) \
  (linear): Linear(in_features=50, out_features=17, bias=True) \
)

# Training

In [33]:
def train(model, train_dataset, batch_size=512, epochs=5, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = torch.nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()
  print("Using device: ", device)
  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0
    for train_input, train_label in tqdm(train_dataloader):

      # (4) move the train input to the device
      train_input = train_input.to(device)

      # (5) move the train label to the device
      train_label = train_label.to(device)

      # (6) do the forward pass
      output = model(train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))

      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss.item()
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      # acc = (torch.argmax(output, dim=-1) == train_label).sum().item()
      # or we can calculate the accuracy without using the argmax function
      acc = (output.argmax(dim=-1) == train_label).sum().item()
      total_acc_train += acc

      # (10) zero your gradients
      optimizer.zero_grad()

      # (11) do the backward pass
      batch_loss.backward()

      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)

    # (13) calculate the accuracy
    epoch_acc = total_acc_train / (len(train_dataset) * train_dataset[0][0].shape[0])

    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')

  ##############################################################################################################

In [34]:
train_dataset = NERDataset(t_sentences, t_labels, vocab['<PAD>'])
val_dataset = NERDataset(v_sentences, v_labels, vocab['<PAD>'])
test_dataset = NERDataset(test_sentences, test_labels, vocab['<PAD>'])

In [35]:
train(model, train_dataset, batch_size=256, epochs=5)

Using device:  cpu


100%|██████████| 132/132 [00:17<00:00,  7.60it/s]


Epochs: 1 | Train Loss: 0.0009047294219210632         | Train Accuracy: 0.9587028253248092



100%|██████████| 132/132 [00:18<00:00,  7.00it/s]


Epochs: 2 | Train Loss: 0.0001874915819358783         | Train Accuracy: 0.9876833138562361



100%|██████████| 132/132 [00:19<00:00,  6.67it/s]


Epochs: 3 | Train Loss: 0.00012044604686814332         | Train Accuracy: 0.9913518824041613



100%|██████████| 132/132 [00:17<00:00,  7.45it/s]


Epochs: 4 | Train Loss: 9.623553774782614e-05         | Train Accuracy: 0.9927582433949727



100%|██████████| 132/132 [00:21<00:00,  6.28it/s]

Epochs: 5 | Train Loss: 8.290905526503233e-05         | Train Accuracy: 0.9935834421759355






#### Expected train accuracy after 5 epochs to be above 0.99

# Evaluation

In [36]:
def evaluate(model, test_dataset, batch_size=256):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_label = test_label.to(device)

      # (4) move the test label to the device
      test_input = test_input.to(device)

      # (5) do the forward pass
      output = model(test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (torch.argmax(output, dim=-1) == test_label).sum().item()
      total_acc_test += acc
    
    # (6) calculate the over all accuracy
    total_acc_test /= (len(test_dataset) * test_dataset[0][0].shape[0])
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [37]:
evaluate(model, test_dataset)

100%|██████████| 29/29 [00:01<00:00, 23.89it/s]


Test Accuracy: 0.9860101672028277





#### Expected test accuracy to be above 0.98

# Thank you