# Named Entity Recognition Assignment
NER is a subtask of information extraction that locates and classifies named entities in a text. The named entities could be organizations, persons, locations, times, etc. In this assignment, you will train a named entity recognition system and test it on a test data. \
Let's get started

In [2]:
import os 
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from utils_2 import get_params, get_vocab, get_tags
import random as rnd

# Importing and discovering the data

In [9]:
vocab = get_vocab('processed_input/train_vocab.txt')
tags = get_tags('processed_input/train_complex_topping_tags.txt')

t_sentences, t_labels, t_size = get_params(vocab, tags, 'processed_input/train_complex_topping.txt', 'processed_input/train_complex_topping_labels.txt')

test_sentences, test_labels, test_size = get_params(vocab, tags, 'processed_input/dev_complex_topping.txt', 'processed_input/dev_complex_topping_labels.txt')

In [10]:
# The possible tags
print(tags)

{'QUANTITY': 0, 'TOPPING': 1, 'TOPPING_S': 2, 'QUANTITY_S': 3, 'NONE': 4}


In [11]:
# Exploring information about the data
print('The number of outputs is tag_map', len(tags))
# The number of vocabulary tokens (including <PAD>)
g_vocab_size = len(vocab)
print(f"Num of vocabulary words: {g_vocab_size}")
print('The vocab size is', len(vocab))
print('The training size is', t_size)
# print('The validation size is', v_size)
print('An example of the first sentence is', t_sentences[0])
print('An example of its corresponding label is', t_labels[0])

The number of outputs is tag_map 5
Num of vocabulary words: 307
The vocab size is 307
The training size is 439054
An example of the first sentence is [201, 103]
An example of its corresponding label is [3, 2]


In [12]:
class NERDataset(torch.utils.data.Dataset):

  def __init__(self, x, y, pad, tag_none):
    """
    This is the constructor of the NERDataset
    Inputs:
    - x: a list of lists where each list contains the ids of the tokens
    - y: a list of lists where each list contains the label of each token in the sentence
    - pad: the id of the <PAD> token (to be used for padding all sentences and labels to have the same length)
    """
    x_max = max(x, key= lambda z: len(z))
    x = [sentence + [pad] * (len(x_max) - len(sentence)) for sentence in x]
    y_max = max(y, key= lambda z: len(z))
    y = [sentence + [tag_none] * (len(y_max) - len(sentence)) for sentence in y]
    self.x_tensor = torch.tensor(x)
    self.y_tensor = torch.tensor(y)
    
  def __len__(self):
    """
    This function should return the length of the dataset (the number of sentences)
    """
    return self.x_tensor.shape[0]
    
  def __getitem__(self, idx):
    """
    This function returns a subset of the whole dataset
    """
    return self.x_tensor[idx], self.y_tensor[idx]

In [13]:
batch_size = 5
mini_sentences = t_sentences[0: 8]
mini_labels = t_labels[0: 8]
mini_dataset = NERDataset(mini_sentences, mini_labels, vocab['<PAD>'], tags["NONE"])
dummy_dataloader = torch.utils.data.DataLoader(mini_dataset, batch_size=5)
dg = iter(dummy_dataloader)
X1, Y1 = next(dg)
X2, Y2 = next(dg)
print(Y1.shape, X1.shape, Y2.shape, X2.shape)
print(X1[0][:], "\n", Y1[0][:])

torch.Size([5, 5]) torch.Size([5, 5]) torch.Size([3, 5]) torch.Size([3, 5])
tensor([201, 103, 306, 306, 306]) 
 tensor([3, 2, 4, 4, 4])


# NER
The class that implementss the pytorch model for NER

In [14]:
class NER(nn.Module):
  def __init__(self, vocab_size=len(vocab), embedding_dim=50, hidden_size=50, n_classes=len(tags)):
    """
    The constructor of our NER model
    Inputs:
    - vacab_size: the number of unique words
    - embedding_dim: the embedding dimension
    - n_classes: the number of final classes (tags)
    """
    super(NER, self).__init__()
    ####################### TODO: Create the layers of your model #######################################
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
    self.dropout = nn.Dropout(p=0.25)
    self.linear = nn.Linear(hidden_size * 2, n_classes) # x2 cuz bi-directional
    #####################################################################################################

  def forward(self, sentences):
    """
    This function does the forward pass of our model
    Inputs:
    - sentences: tensor of shape (batch_size, max_length)

    Returns:
    - final_output: tensor of shape (batch_size, max_length, n_classes)
    """

    ######################### TODO: implement the forward pass ####################################
    embedding = self.embedding(sentences)
    lstm, _ = self.lstm(embedding)
    dropout = self.dropout(lstm)
    final_output = self.linear(dropout)
    ###############################################################################################
    return final_output

# Training

In [15]:
def train(model, train_dataset, batch_size=512, epochs=10, learning_rate=0.01):
  """
  This function implements the training logic
  Inputs:
  - model: the model ot be trained
  - train_dataset: the training set of type NERDataset
  - batch_size: integer represents the number of examples per step
  - epochs: integer represents the total number of epochs (full training pass)
  - learning_rate: the learning rate to be used by the optimizer
  """

  ############################## TODO: replace the Nones in the following code ##################################
  
  # (1) create the dataloader of the training set (make the shuffle=True)
  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size, shuffle=True)

  # (2) make the criterion cross entropy loss
  criterion = nn.CrossEntropyLoss()

  # (3) create the optimizer (Adam)
  optimizer = torch.optim.Adam(model.parameters(), learning_rate)

  # GPU configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda:0" if use_cuda else "cpu")
  # print(torch.cuda.get_device_name(device))
  if use_cuda:
    model = model.to(device)
    criterion = criterion.cuda(device)
    pass

  for epoch_num in range(epochs):
    total_acc_train = 0
    total_loss_train = 0
  
    for train_input, train_label in tqdm(train_dataloader):
  
      # (4) move the train input to the device
      train_input = train_input.to(device)
  
      # (5) move the train label to the device
      train_label = train_label.to(device)
  
  
      # (6) do the forward pass
      output = model.forward(train_input)
      
      # (7) loss calculation (you need to think in this part how to calculate the loss correctly)
      batch_loss = criterion(output.view(-1, output.shape[-1]), train_label.view(-1))
  
      # (8) append the batch loss to the total_loss_train
      total_loss_train += batch_loss
      
      # (9) calculate the batch accuracy (just add the number of correct predictions)
      acc = (torch.argmax(output, dim=-1) == train_label).sum().item()
      total_acc_train += acc
  
      # (10) zero your gradients
      optimizer.zero_grad()
  
      # (11) do the backward pass
      batch_loss.backward()
  
      # (12) update the weights with your optimizer
      optimizer.step()
      
    # epoch loss
    epoch_loss = total_loss_train / len(train_dataset)
  
    # (13) calculate the accuracy
    sample_count = len(train_dataset)
    seq_length = train_dataset[0][0].shape[0]
    epoch_acc = total_acc_train / (sample_count * seq_length)
  
  
    print(
        f'Epochs: {epoch_num + 1} | Train Loss: {epoch_loss} \
        | Train Accuracy: {epoch_acc}\n')
      
  ##############################################################################################################

In [16]:
train_dataset = NERDataset(t_sentences, t_labels, vocab['<PAD>'], tags["NONE"])
# val_dataset = NERDataset(v_sentences, v_labels, vocab['<PAD>'])
test_dataset = NERDataset(test_sentences, test_labels, vocab['<PAD>'], tags["NONE"])

In [17]:
model = NER(embedding_dim=70, hidden_size=500, n_classes=len(tags), vocab_size=len(vocab))
print(model)

NER(
  (embedding): Embedding(307, 70)
  (lstm): LSTM(70, 500, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (linear): Linear(in_features=1000, out_features=5, bias=True)
)


In [18]:
train(model, train_dataset, epochs=2, batch_size=32)
train(model, test_dataset, epochs=1, batch_size=32)

100%|██████████| 13721/13721 [00:53<00:00, 258.50it/s]


Epochs: 1 | Train Loss: 1.4979475054133218e-05         | Train Accuracy: 0.9998402816054517



100%|██████████| 13721/13721 [00:52<00:00, 259.62it/s]


Epochs: 2 | Train Loss: 1.4042359939778204e-12         | Train Accuracy: 1.0



100%|██████████| 3/3 [00:00<00:00, 215.59it/s]

Epochs: 1 | Train Loss: 0.0018356989603489637         | Train Accuracy: 0.9905882352941177






# Evaluation

In [19]:
def evaluate(model, test_dataset, batch_size=64):
  """
  This function takes a NER model and evaluates its performance (accuracy) on a test data
  Inputs:
  - model: a NER model
  - test_dataset: dataset of type NERDataset
  """
  ########################### TODO: Replace the Nones in the following code ##########################

  # (1) create the test data loader
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size,shuffle=False)

  # GPU Configuration
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  if use_cuda:
    model = model.cuda()

  total_acc_test = 0
  
  # (2) disable gradients
  with torch.no_grad():

    for test_input, test_label in tqdm(test_dataloader):
      # (3) move the test input to the device
      test_input = test_input.to(device)

      # (4) move the test label to the device
      test_label = test_label.to(device)

      # (5) do the forward pass
      output = model.forward(test_input)

      # accuracy calculation (just add the correct predicted items to total_acc_test)
      acc = (torch.argmax(output, dim=-1) == test_label).sum().item()
      total_acc_test += acc 
    
    # (6) calculate the over all accuracy
    total_acc_test /= len(test_dataset) * test_dataset[0][0].shape[0]
  ##################################################################################################

  
  print(f'\nTest Accuracy: {total_acc_test}')

In [20]:
evaluate(model, test_dataset)

100%|██████████| 2/2 [00:00<00:00, 362.23it/s]


Test Accuracy: 1.0





In [28]:
inv_order_tags = {}
for tag, value in tags.items():
  inv_order_tags[value] = tag
  
def test_sample(sample):
  s = [vocab[token] if token in vocab
                 else vocab['<UNK>']
                 for token in sample.split(' ') if token != '']
  x_tensor = torch.tensor(s)
  device = torch.device("cuda:0")
  output = model.forward(x_tensor.to(device))
  output = torch.argmax(output, dim=-1).to("cpu")
  print([inv_order_tags[x.item()] for x in output])


test_sample("some love")

['QUANTITY_S', 'TOPPING_S']


In [22]:
def save_model_state(model, path):
    """
    Saves only the model's parameters
    """
    torch.save(model.state_dict(), path)

def load_model_state(model, path):
    """
    Loads the model's parameters into a pre-defined architecture
    """
    model.load_state_dict(torch.load(path))
    model.eval()  # Set to evaluation mode
    device = torch.device("cuda:0")
    model.to(device)
    return model

save_model_state(model, "models/complex_x100.0.pth")