<a href="https://colab.research.google.com/github/Nabarup-Maity/Deep_Learning/blob/main/NER_prediction_with_lstm_and_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**NER prediction with LSTM and Transformers**

Here we will be exploring the use of lstms and transformers for named entity recognition (NER) tasks. In this case, we will be looking at recognizing word tagging (e.g., classifying each word as a business, a place, etc...)

Data used:  ner_dataset.csv

Data source: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus?select=ner_dataset.csv

Import the libraries we will need.

In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from itertools import chain


Let's look at the structure of the data

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
data = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/dataset/ner_dataset.csv', encoding= 'unicode_escape')
data.head(15)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


We next need to create a mapping between tokens, tags, and ids. Each token should map to a unique id, and each tag should map to a unique class.

In [8]:
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}

    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))

    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')

print("Token for id 0 :", idx2token[0])
print("Tag for id 0: ", idx2tag[0])

Token for id 0 : Council
Tag for id 0:  I-nat


In [35]:
print('token2idx: ',token2idx)
print('idx2token: ', idx2token)
print('tag2idx: ', tag2idx)
print('idx2tag: ', idx2tag)

tag2idx:  {'I-nat': 0, 'O': 1, 'B-tim': 2, 'B-eve': 3, 'I-gpe': 4, 'I-org': 5, 'B-art': 6, 'I-per': 7, 'B-nat': 8, 'I-geo': 9, 'I-tim': 10, 'I-eve': 11, 'I-art': 12, 'B-org': 13, 'B-geo': 14, 'B-per': 15, 'B-gpe': 16}
idx2tag:  {0: 'I-nat', 1: 'O', 2: 'B-tim', 3: 'B-eve', 4: 'I-gpe', 5: 'I-org', 6: 'B-art', 7: 'I-per', 8: 'B-nat', 9: 'I-geo', 10: 'I-tim', 11: 'I-eve', 12: 'I-art', 13: 'B-org', 14: 'B-geo', 15: 'B-per', 16: 'B-gpe'}


In [10]:
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)
data.head(10)


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,351,1
1,,of,IN,O,24631,1
2,,demonstrators,NNS,O,28097,1
3,,have,VBP,O,25557,1
4,,marched,VBN,O,12533,1
5,,through,IN,O,31436,1
6,,London,NNP,B-geo,26837,14
7,,to,TO,O,15843,1
8,,protest,VB,O,8307,1
9,,the,DT,O,25939,1


Now you might have noticed that each sentece is split into multiple rows. We need to transform this data into sequences of words and tags.

In [12]:
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
# Visualise data
data_group.head(5)

  data_group = data_fillna.groupby(


Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[351, 24631, 28097, 25557, 12533, 31436, 26837...","[1, 1, 1, 1, 1, 1, 14, 1, 1, 1, 1, 1, 14, 1, 1..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[8927, 26320, 31609, 24420, 22972, 15843, 8465...","[16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[10588, 3139, 7116, 27676, 7200, 2123, 23619, ...","[1, 1, 2, 1, 1, 1, 1, 1, 14, 1, 1, 1, 1, 1, 13..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[12831, 28091, 10689, 1266, 22364, 13604, 2743...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[22833, 17469, 11868, 24069, 22782, 18722, 104...","[14, 1, 1, 15, 7, 1, 2, 1, 14, 1, 16, 1, 16, 1..."


Next we split the data into training and testing

In [16]:
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= n_token - 1)

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    #Split train, test and validation set
    tokens_, test_tokens, tags_, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    train_tokens, val_tokens, train_tags, val_tags = train_test_split(tokens_,tags_,test_size = 0.25,train_size =0.75, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntest_tags:', len(test_tags),
        '\nval_tokens:', len(val_tokens),
        '\nval_tags:', len(val_tags),
    )

    return train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags

train_tokens, val_tokens, test_tokens, train_tags, val_tags, test_tags = get_pad_train_test_val(data_group, data)

train_tokens length: 32372 
train_tokens length: 32372 
test_tokens length: 4796 
test_tags: 4796 
val_tokens: 10791 
val_tags: 10791


# Model Training

Before creating our model, we need to find some pre-trained embeddings to help us with the task...for example, GloVe embeddings https://nlp.stanford.edu/projects/glove/

In [17]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

--2023-08-09 14:42:56--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-08-09 14:42:56--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-08-09 14:45:36 (5.16 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [18]:
#!pip install patool
import patoolib

Collecting patool
  Downloading patool-1.12-py2.py3-none-any.whl (77 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/77.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m71.7/77.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.5/77.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: patool
Successfully installed patool-1.12


In [19]:
patoolib.extract_archive("glove.6B.zip", outdir="./")

patool: Extracting glove.6B.zip ...
patool: running /usr/bin/7z x -o./ -- glove.6B.zip
patool: ... glove.6B.zip extracted to `./'.


'./'

Let's a mapping between the vocabulary in our pretained models and their corresponding embeddings.

In [21]:
def load_embeddings():
  lines = open("./glove.6B.100d.txt", "r").readlines()

  w2e = {}
  for l in lines:
    s = l.split(" ")
    word = s[0]
    embedding = np.zeros( (1, len(s)-1))
    for k, x in enumerate(s[1:]):
      embedding[0,k] = float(x.strip())

    w2e[word] = embedding

  return w2e

w2e = load_embeddings()


We will be using the embedding layer from pytorch to build our network. To do so, we first need to create a weight matrix for each token in our vocabulary. That is, the weight matrix in row i should correspond to the word i in the vocabulary.

In [38]:
np.zeros((2,4))

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [22]:
def create_weight_matrix(w2e, token2idx, idx2token):
  vocab_length = len(list(token2idx.keys()))
  embed_dim = 100
  weight_matrix = np.zeros((vocab_length, embed_dim))

  for i in range(len(list(token2idx.keys()))):
    try:
        w = idx2token[i].lower()
        weight_matrix[i] = w2e[w]
    except KeyError:
        weight_matrix[i] = np.random.normal(scale=0.6, size=(embed_dim, ))

  return weight_matrix

Next, let's make a function that created the embedding layer and loads a weight matrix of pre-trained embeddings.

In [23]:
def create_emb_layer(weight_matrix, non_trainable=False):
    num_embeddings = weight_matrix.shape[0]
    embedding_dim = weight_matrix.shape[1]
    weights = torch.from_numpy(weight_matrix)
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, embedding_dim

Next, let's build the network. Make sure you include a layer that is able to capture sequential dependency.

In [24]:
class Net(nn.Module):
    def __init__(self, weight_matrix, lstm_hidden_dim, num_of_tags):
        super(Net, self).__init__()

        #maps each token to an embedding_dim vector
        self.embedding, embedding_dim = create_emb_layer(weight_matrix)

        #the LSTM takens embedded sentence
        self.lstm = nn.LSTM(embedding_dim, lstm_hidden_dim, batch_first=True)

        #fc layer transforms the output to give the final output layer
        self.fc = nn.Linear(lstm_hidden_dim, num_of_tags)



    def forward(self, s):
        #apply the embedding layer that maps each token to its embedding
        s = self.embedding(s)   # dim: batch_size x batch_max_len x embedding_dim

        #Feeds the embeddings through the LSTM layer. run the LSTM along the sentences of length batch_max_len
        s, _ = self.lstm(s)     # dim: batch_size x batch_max_len x lstm_hidden_dim

        #reshape the LSTM output tensor. reshape the Variable so that each row contains one token
        s = s.reshape(-1, s.shape[2])  # dim: batch_size*batch_max_len x lstm_hidden_dim

        #apply the fully connected layer and obtain the output for each token
        s = self.fc(s)          # dim: batch_size*batch_max_len x num_tags

        return F.log_softmax(s, dim=1)   # dim: batch_size*batch_max_len x num_tags

__Create the loss function used for training our network__

In [25]:

def loss_fn(outputs, labels):
    #reshape labels to give a flat vector of length batch_size*seq_len
    labels = labels.view(-1)

    #the number of tokens is the sum of elements in mask
    num_tokens = labels.shape[0]

    #pick the values corresponding to labels and multiply by mask
    outputs = outputs[range(outputs.shape[0]), labels]

    #cross entropy loss for all non 'PAD' tokens
    return -torch.sum(outputs)/ num_tokens

In [29]:
train_labels, val_labels, test_labels = [], [], []
for x in train_tags:
  train_labels.append( np.argmax(x, axis=1) )
train_labels = np.asarray(train_labels).squeeze()

for x in val_tags:
  val_labels.append( np.argmax(x, axis=1) )
val_labels = np.asarray(val_labels).squeeze()

for x in test_tags:
  test_labels.append( np.argmax(x, axis=1) )
test_labels = np.asarray(test_labels).squeeze()

print('train_labels: ', train_labels.shape)
print('val_labels: ', val_labels.shape)
print('test_labels: ', test_labels.shape)


train_labels:  (32372, 104)
val_labels:  (10791, 104)
test_labels:  (4796, 104)


# training

In [27]:
# Create training routine

weight_matrix = create_weight_matrix(w2e, token2idx, idx2token)
network = Net(weight_matrix, lstm_hidden_dim=128, num_of_tags=len(list(idx2tag.keys())))

optimizer = optim.Adam( network.parameters(), lr=1e-3 )

batch_size=128
indices = np.arange(train_labels.shape[0])
epochs = 2000

losses = []
for e in range(epochs):
  np.random.shuffle(indices)
  selected_indices = indices[:batch_size]
  selected_x = train_tokens[selected_indices]
  selected_y = train_labels[selected_indices]

  selected_x = torch.from_numpy(selected_x)
  selected_y = torch.from_numpy(selected_y)

  probs = network.forward(selected_x)
  loss = loss_fn(probs, selected_y)

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  losses.append(loss.detach())
  if e % 100 == 0:
    print("Epoch: %d - %.6f" %(e, np.mean(losses)))
    losses = []



Epoch: 0 - 2.859882
Epoch: 100 - 0.360054
Epoch: 200 - 0.131084
Epoch: 300 - 0.089995
Epoch: 400 - 0.064764
Epoch: 500 - 0.049594
Epoch: 600 - 0.039611
Epoch: 700 - 0.034407
Epoch: 800 - 0.031621
Epoch: 900 - 0.029189
Epoch: 1000 - 0.026639
Epoch: 1100 - 0.026052
Epoch: 1200 - 0.023835
Epoch: 1300 - 0.023819
Epoch: 1400 - 0.023080
Epoch: 1500 - 0.021442
Epoch: 1600 - 0.020840
Epoch: 1700 - 0.019218
Epoch: 1800 - 0.019230
Epoch: 1900 - 0.018787


Now that we have our model trained. Let's evaluate using the test data in terms of precision, recall, and f1.

In [28]:
#Evaluate precision-recall-f1
from sklearn.metrics import precision_recall_fscore_support

batch_size = 128
k = 0

preds = None
while k < test_tokens.shape[0]:
  x = test_tokens[k:k+batch_size] if k+batch_size < test_tokens.shape[0] else test_tokens[k:]
  y = test_labels[k:k+batch_size] if k+batch_size < test_labels.shape[0] else test_labels[k:]

  x = torch.from_numpy(x)
  y = torch.from_numpy(y)

  probs = network.forward(x).detach().numpy()
  yhat = np.argmax(probs, axis=1)

  preds = yhat if preds is None else np.hstack( (preds, yhat) )

  k += batch_size

labels = [ idx2tag[i] for i in range(len(list(idx2tag.keys()))) ]
test_labels = test_labels.reshape((-1,))

p, r, f, s = precision_recall_fscore_support( test_labels, preds)

for i in range(len(labels)):
  print("Label: %s - Precision: %.4f - Recall: %.4f - f1: %.4f - Support: %.4f" %(labels[i], p[i], r[i], f[i], s[i]) )


Label: I-nat - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 7.0000
Label: O - Precision: 0.9975 - Recall: 0.9983 - f1: 0.9979 - Support: 482811.0000
Label: B-tim - Precision: 0.8940 - Recall: 0.8335 - f1: 0.8627 - Support: 1994.0000
Label: B-eve - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 34.0000
Label: I-gpe - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 22.0000
Label: I-org - Precision: 0.7964 - Recall: 0.7213 - f1: 0.7570 - Support: 1697.0000
Label: B-art - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 28.0000
Label: I-per - Precision: 0.8534 - Recall: 0.8927 - f1: 0.8726 - Support: 1715.0000
Label: B-nat - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 17.0000
Label: I-geo - Precision: 0.7854 - Recall: 0.7577 - f1: 0.7713 - Support: 681.0000
Label: I-tim - Precision: 0.7488 - Recall: 0.7033 - f1: 0.7253 - Support: 674.0000
Label: I-eve - Precision: 0.0000 - Recall: 0.0000 - f1: 0.0000 - Support: 23.0000
Label: I-

  _warn_prf(average, modifier, msg_start, len(result))


#Observation
__Classes with small number of examples tend to perform worse than others. This is likely due to data imbalance in training data. Possible ways to address that is, oversample minority class, undersample majority class, or balance mini-batches during training. Another option could also be to include all small classes as a single group, if that level of granularity is not needed__

# End