In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/MyDrive/DATA")
!ls

Mounted at /content/drive
Emotions.txt	   glove.6B.50d.txt  ner.csv	      theta.txt
glove.6B.200d.txt  model.png	     ner_dataset.csv


# Load the dataset

In [2]:
import pandas as pd
df = pd.read_csv('ner_dataset.csv', encoding= 'unicode_escape')


In [3]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


# Extract mappings

In [4]:
from itertools import chain
def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [5]:
token2idx, idx2token = get_dict_map(df, 'token')
tag2idx, idx2tag = get_dict_map(df, 'tag')

In [6]:
token2idx['<PAD>'] = len(list(set(df['Word'].to_list())))+1

In [7]:
print(token2idx['<PAD>'])

35179


In [8]:
idx2token[35178] = '<PAD>'

In [9]:
print(idx2token[35178])

<PAD>


In [10]:
token2idx['UNK'] = len(list(set(df['Word'].to_list())))+2

In [11]:
print(token2idx['UNK'])

35180


In [12]:
idx2token[35179] = '<PAD>'

In [13]:
print(idx2token[35179])

<PAD>


In [14]:
df['Word_idx'] = df['Word'].map(token2idx)
df['Tag_idx'] = df['Tag'].map(tag2idx)
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,2444,1
1,,of,IN,O,10670,1
2,,demonstrators,NNS,O,7679,1
3,,have,VBP,O,5362,1
4,,marched,VBN,O,7464,1


In [15]:
df_fill = df.fillna(method='ffill', axis=0)

In [16]:
df_fill.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,Thousands,NNS,O,2444,1
1,Sentence: 1,of,IN,O,10670,1
2,Sentence: 1,demonstrators,NNS,O,7679,1
3,Sentence: 1,have,VBP,O,5362,1
4,Sentence: 1,marched,VBN,O,7464,1


In [17]:
df_group = df_fill.groupby(['Sentence #'],as_index=False)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))

  """Entry point for launching an IPython kernel.


In [18]:
df_group.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,Tag_idx
0,Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[2444, 10670, 7679, 5362, 7464, 25009, 25531, ...","[1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 3, 1, 1, ..."
1,Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,...","[28742, 2655, 29064, 19335, 14707, 16913, 2035...","[13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,..."
2,Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O...","[21587, 13142, 27714, 16334, 11098, 6530, 4807...","[1, 1, 8, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 2, ..."
3,Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]","[2780, 21402, 33583, 7831, 1168, 31223, 8058, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo...","[9123, 32170, 8197, 14705, 2800, 21084, 3681, ...","[3, 1, 1, 5, 6, 1, 8, 1, 3, 1, 13, 1, 13, 1, 1..."


# Split the dataset

In [19]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [20]:
def get_pad_train_test_val(data_group, data):

    #get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))

    #Pad tokens (X var)    
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value= token2idx['<PAD>'])

    #Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    #Split train, test and validation set
    train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)
    

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length:', len(test_tokens),
        '\ntrain_tags:', len(train_tags),
        '\ntest_tags:', len(test_tags),
    )
    
    return train_tokens, test_tokens, train_tags, test_tags

train_tokens, test_tokens, train_tags, test_tags = get_pad_train_test_val(df_group, df)

train_tokens length: 43163 
test_tokens length: 4796 
train_tags: 43163 
test_tags: 4796


In [21]:
train_tokens.shape

(43163, 104)

In [22]:
train_tokens[0]

array([  151, 29064, 14633,  6545,  8254,  4081, 16913,  8348, 28500,
       24746, 27929, 26205, 19335, 33321,  1733, 16447, 12268, 18307,
        3780, 15283,  4807,  7374,  6182, 25131,  1359, 17255, 25879,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179, 35179,
       35179, 35179, 35179, 35179, 35179], dtype=int32)

In [23]:
train_tags[0]

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

# Model architecture

In [24]:
import numpy as np
import tensorflow 
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model

In [25]:
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [26]:
input_dim = len(list(set(df['Word'].to_list())))+3
output_dim = 64
input_length = max([len(s) for s in df_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  35181 
output_dim:  64 
input_length:  104 
n_tags:  17


In [27]:
def get_bilstm_lstm_model():
    model = Sequential()

    # Add Embedding layer
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))

    # Add timeDistributed Layer
    model.add(TimeDistributed(Dense(n_tags, activation="relu")))

    #Optimiser 
    adam = tensorflow.keras.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    
    return model

In [28]:
def train_model(X, y, model):
    loss = list()
    for i in range(20):
        hist = model.fit(X, y, batch_size=1000, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [34]:
results = pd.DataFrame()
model_bilstm_lstm = get_bilstm_lstm_model()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)



In [35]:
test_loss, test_acc = model_bilstm_lstm.evaluate(test_tokens, np.array(test_tags))
print("test accuracy = ", test_acc)

test accuracy =  0.9698707461357117


In [74]:
i = 75
p = model_bilstm_lstm.predict([test_tokens[i]])
p.shape

(104, 1, 17)

In [75]:
p = np.argmax(p, axis=-1)
p.shape

(104, 1)

In [76]:
print(p)

[[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [5]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]]


In [77]:
true = np.argmax(np.array(test_tags)[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(test_tokens[i], true, p[:,0]):
    if w != 0:
        print("{:15}: {:5} {}".format(idx2token[w-1], idx2tag[t], idx2tag[pred]))

Word           ||True ||Pred
Moustafa       : O     O
Steven         : O     O
lives          : O     O
pedestrians    : O     O
serpent        : O     O
Suez           : O     O
DEA            : O     O
pedestrians    : O     O
releases       : B-tim O
immigration    : I-tim O
once-a-decade  : I-tim O
Chesnot        : I-tim B-per
Moscow-led     : I-tim O
Nomura         : O     O
Pradip         : O     O
Jonathan       : O     O
Ericsson       : O     O
pedestrians    : O     O
sweep          : O     O
once-a-decade  : O     O
Pradip         : O     O
Chairperson    : O     O
Separatists    : O     O
reacting       : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : O     O
<PAD>          : 

# Testing with your own sentence

In [72]:
def predict(sentence, model, vocab = token2idx, tag_map = tag2idx):
    x_test_sent = pad_sequences(sequences=[[token2idx.get(token, 0) for token in sentence.split(' ')]], padding="post", value=0, maxlen=input_length)
    p = model.predict(x_test_sent[0])
    p = np.argmax(p, axis=-1)
    print("{:15}||{}".format("Word", "Prediction"))
    print(30 * "=")
    for w, pred in zip(sentence.split(' '), p[:,0]):
      print("{:15}: {:5}".format(w, idx2tag[pred])) 

In [73]:
sentence = "In the context of prehistory, antiquity and contemporary indigenous peoples, the title may refer to tribal kingship. Germanic kingship is cognate with Indo-European traditions of tribal rulership"
predictions = predict(sentence, model_bilstm_lstm, token2idx , tag2idx)

Word           ||Prediction
In             : O    
the            : O    
context        : O    
of             : O    
prehistory,    : B-per
antiquity      : B-per
and            : O    
contemporary   : O    
indigenous     : O    
peoples,       : B-per
the            : O    
title          : O    
may            : O    
refer          : O    
to             : O    
tribal         : O    
kingship.      : B-per
Germanic       : B-geo
kingship       : B-per
is             : O    
cognate        : B-per
with           : O    
Indo-European  : B-per
traditions     : O    
of             : O    
tribal         : O    
rulership      : B-per
