# LSTM for named entities

In [2]:
import numpy as np
import tensorflow
from tensorflow.keras import Sequential, Model, Input, optimizers
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.utils import plot_model
import pandas as pd
from itertools import chain
from sklearn.model_selection import train_test_split
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors

In [3]:
train_txt_path = '../../conll2003.train.conll'
dev_txt_path = '../../conll2003.dev.conll'

In [4]:
from numpy.random import seed
seed(1)
tensorflow.random.set_seed(2)

In [5]:
paths = [train_txt_path, dev_txt_path]
eval_split = 'dev'
output_path = 'lsmt-out.csv'
path_emb = '../../data/GoogleNews-vectors-negative300.bin'

In [9]:
def convert_data(paths):

    data = []
    sent_id = 1
    for path in paths:
        split = path.split('.')[-2]
        with open(path) as infile:
            lines = infile.read().split('\n')
        for n, line in enumerate(lines):
            ll = line.split('\t')
            if len(ll) > 2:
                d = dict()
                d['Sentence #'] = f'Sentence: {sent_id}'
                d['Word'] = ll[0]
                d['POS'] = ll[1]
                d['Chunk'] = ll[2]
                d['Tag'] = ll[-1]
                d['Split'] = split
                data.append(d)

            else:
                sent_id += 1
    data = pd.DataFrame(data)
    return data

In [10]:
data = convert_data(paths)
data.head()

Unnamed: 0,Sentence #,Word,POS,Chunk,Tag,Split
0,Sentence: 1,EU,NNP,B-NP,B-ORG,train
1,Sentence: 1,rejects,VBZ,B-VP,O,train
2,Sentence: 1,German,JJ,B-NP,B-MISC,train
3,Sentence: 1,call,NN,I-NP,O,train
4,Sentence: 1,to,TO,B-VP,O,train


In [11]:
def get_dict_map(data, token_or_tag, embedding_model=None):
    
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Word'].to_list()))
    else:
        vocab = list(set(data['Tag'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}   
    
    return tok2idx, idx2tok

In [12]:
token2idx, idx2token = get_dict_map(data, 'token')
tag2idx, idx2tag = get_dict_map(data, 'tag')
print(len(token2idx))
print(len(tag2idx))

26883
9


In [13]:
w2v_model = KeyedVectors.load_word2vec_format(path_emb, binary=True)

In [14]:
# Create embedding matrix with zero vectors for oov words
emb_dim = 300
embedding_matrix = np.zeros((len(token2idx) + 1, emb_dim))
print(embedding_matrix.shape)
for word, i in token2idx.items():
    if word in w2v_model.key_to_index:
        embedding_vector = w2v_model[word]
    else:
        embedding_vector = None
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

(26884, 300)


In [15]:
# Check dimensions, store number of vector dimensions in variable
print(embedding_matrix.shape)
emb_dim = embedding_matrix.shape[1]
print(emb_dim)

(26884, 300)
300


In [16]:
# Add index info to dataframe
data['Word_idx'] = data['Word'].map(token2idx)
data['Tag_idx'] = data['Tag'].map(tag2idx)

data.head()

Unnamed: 0,Sentence #,Word,POS,Chunk,Tag,Split,Word_idx,Tag_idx
0,Sentence: 1,EU,NNP,B-NP,B-ORG,train,11872,5
1,Sentence: 1,rejects,VBZ,B-VP,O,train,7481,4
2,Sentence: 1,German,JJ,B-NP,B-MISC,train,10272,1
3,Sentence: 1,call,NN,I-NP,O,train,521,4
4,Sentence: 1,to,TO,B-VP,O,train,7614,4


In [17]:
# Group data by sentences
# Fill na
data_fillna = data.fillna(method='ffill', axis=0)
# Groupby and collect columns
data_group = data_fillna.groupby(
['Sentence #'],as_index=False
)['Word', 'POS', 'Tag', 'Word_idx', 'Tag_idx', 'Split'].agg(lambda x: list(x))
# Visualise data
data_group.head()

  data_group = data_fillna.groupby(


Unnamed: 0,Sentence #,Word,POS,Chunk,Tag,Word_idx,Tag_idx,Split
0,Sentence: 1,"[EU, rejects, German, call, to, boycott, Briti...","[NNP, VBZ, JJ, NN, TO, VB, JJ, NN, .]","[B-NP, B-VP, B-NP, I-NP, B-VP, I-VP, B-NP, I-N...","[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O]","[11872, 7481, 10272, 521, 7614, 16881, 9411, 8...","[5, 4, 1, 4, 4, 4, 1, 4, 4]","[train, train, train, train, train, train, tra..."
1,Sentence: 10,"[But, Fischler, agreed, to, review, his, propo...","[CC, NNP, VBD, TO, VB, PRP$, NN, IN, DT, NNP, ...","[O, B-NP, B-VP, I-VP, I-VP, B-NP, I-NP, B-PP, ...","[O, B-PER, O, O, O, O, O, O, O, B-ORG, O, O, O...","[24100, 26689, 20772, 7614, 11873, 5391, 21838...","[4, 3, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, ...","[train, train, train, train, train, train, tra..."
2,Sentence: 100,"[The, Syrians, are, confused, ,, they, are, de...","[DT, NNPS, VBP, VBN, ,, PRP, VBP, RB, JJ, ,, C...","[B-NP, I-NP, B-VP, B-ADJP, O, B-NP, B-VP, I-VP...","[O, B-MISC, O, O, O, O, O, O, O, O, O, O, O, O...","[8089, 854, 13577, 14901, 15792, 23441, 13577,...","[4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...","[train, train, train, train, train, train, tra..."
3,Sentence: 1000,"[The, youth, side, replied, with, 246, for, se...","[DT, NN, NN, VBD, IN, CD, IN, CD, .]","[B-NP, I-NP, I-NP, B-VP, B-PP, B-NP, B-PP, B-N...","[O, O, O, O, O, O, O, O, O]","[8089, 24700, 19792, 19891, 616, 8125, 8356, 1...","[4, 4, 4, 4, 4, 4, 4, 4, 4]","[train, train, train, train, train, train, tra..."
4,Sentence: 10000,"[Men, 's, 3,000, metres, :]","[NN, POS, CD, NNS, :]","[B-NP, B-NP, I-NP, I-NP, O]","[O, O, O, O, O]","[24834, 7500, 12506, 8144, 16997]","[4, 4, 4, 4, 4]","[train, train, train, train, train]"


In [18]:
def get_pad_train_test_val(data_group, data, eval_split='dev'):
    
    # Get max token and tag length
    n_token = len(list(set(data['Word'].to_list())))
    n_tag = len(list(set(data['Tag'].to_list())))
    print(n_token)
    
    # Pad tokens (X var) 
    tokens = data_group['Word_idx'].tolist()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int64', padding='post', value= 26883)
    print('padding', len(pad_tokens[0]))
    
    # Pad Tags (y var) and convert it into one hot encoding
    tags = data_group['Tag_idx'].tolist()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int64', padding='post', value= tag2idx["O"])
    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]
    
    train_tokens = []
    dev_tokens = []
    train_tags = []
    dev_tags = []
    for i, row in data_group.iterrows():
        if 'train' in row['Split']:
            train_tokens.append(pad_tokens[i])
            train_tags.append(pad_tags[i])
        elif eval_split in row['Split']:
            dev_tokens.append(pad_tokens[i])
            dev_tags.append(pad_tags[i])

    print(
        'train_tokens length:', len(train_tokens),
        '\ntrain_tokens length:', len(train_tokens),
        '\nval_tokens:', len(dev_tokens),
        '\nval_tags:', len(dev_tags))
 
    return np.array(train_tokens), np.array(dev_tokens),  np.array(train_tags), np.array(dev_tags)

In [19]:
train_tokens, dev_tokens, train_tags, dev_tags = get_pad_train_test_val(data_group, data, eval_split= eval_split)

26883
padding 113
train_tokens length: 14041 
train_tokens length: 14041 
val_tokens: 3250 
val_tags: 3250


In [20]:
input_dim = len(list(set(data['Word'].to_list()))) +1
output_dim = emb_dim # number of dimensions
input_length = max([len(s) for s in data_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', 
      input_dim, '\noutput_dim: ', 
      output_dim, '\ninput_length: ', 
      input_length, '\nn_tags: ', n_tags)
print('emb dim', emb_dim)

input_dim:  26884 
output_dim:  300 
input_length:  113 
n_tags:  9
emb dim 300


In [21]:
def get_bilstm_lstm_model(embedding_matrix, embedding_dim):
    
    model = Sequential()
    # Add Embedding layer original, trainable
    # model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    print(len(token2idx))
    embedding_layer = Embedding(len(token2idx)+1 ,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=input_length,
                            trainable=False)
    model.add(embedding_layer)

    # Add bidirectional LSTM
    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

    # Add LSTM
    model.add(TimeDistributed(Dense(n_tags, activation="sigmoid")))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
    model.summary()
    
    return model

In [22]:
def train_model(X, y, model):
    loss = list()
    for i in range(3):
        hist = model.fit(X, y, batch_size=200, verbose=1, epochs=1, validation_split=0.2)
        loss.append(hist.history['loss'][0])
    return loss

In [24]:
results = pd.DataFrame()
embedding_dim = 300
model_bilstm_lstm = get_bilstm_lstm_model(embedding_matrix, embedding_dim)
plot_model(model_bilstm_lstm)
results['with_add_lstm'] = train_model(train_tokens, train_tags, model_bilstm_lstm)

26883
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 113, 300)          8065200   
                                                                 
 bidirectional_1 (Bidirectio  (None, 113, 600)         1442400   
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 113, 9)           5409      
 tributed)                                                       
                                                                 
Total params: 9,513,009
Trainable params: 1,447,809
Non-trainable params: 8,065,200
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [25]:
results = model_bilstm_lstm.evaluate(dev_tokens, np.array(dev_tags), batch_size=1)
print("test loss, test acc:", results)

test loss, test acc: [0.03041241690516472, 0.992005467414856]


In [14]:
from sklearn.metrics import classification_report
import numpy as np

In [27]:
y_pred = model_bilstm_lstm.predict(dev_tokens)
y_pred = np.argmax(y_pred, axis=-1)
y_dev = np.argmax(dev_tags, axis=-1)



In [63]:
pred_labels = []
for tag in y_pred:
    for i in tag:
        label = idx2tag[i]
        pred_labels.append(label)
    
dev_labels = []
for tag in y_dev:
    for i in tag:
        label = idx2tag[i]
        dev_labels.append(label) 

In [57]:
print(len(pred_labels))
print(len(dev_labels))

367250
367250


In [9]:
classes = np.unique(dev_labels)
classes = classes.tolist()
print(classes)

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [10]:
classes = classes.copy()
classes.pop()
classes

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER']

In [46]:
print(classification_report(dev_labels, pred_labels, labels=classes, digits = 3))

              precision    recall  f1-score   support

       B-LOC      0.916     0.643     0.755      1837
      B-MISC      0.788     0.696     0.739       922
       B-ORG      0.480     0.879     0.621      1341
       B-PER      0.923     0.832     0.875      1842
       I-LOC      0.758     0.183     0.295       257
      I-MISC      0.715     0.341     0.462       346
       I-ORG      0.607     0.483     0.538       751
       I-PER      0.957     0.747     0.839      1307

   micro avg      0.749     0.702     0.725      8603
   macro avg      0.768     0.601     0.641      8603
weighted avg      0.802     0.702     0.727      8603



In [44]:
import sklearn
print(classes)
print(sklearn.metrics.confusion_matrix(dev_labels, pred_labels, labels=classes))

['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER']
[[1181   84  431   13    5    0    9    0]
 [  11  642  130   13    0    3    5    0]
 [   8   22 1179   13    0    0   20    0]
 [   3    3  147 1533    0    1    7   21]
 [  48    0   34    2   47    8   85    9]
 [   1   37   35    1    4  118   43    4]
 [  25    4  209    6    6    7  363    9]
 [   3    0   35   68    0    1   22  976]]
