In [0]:
#Enhancing LSTMs With Character Embeddings For Named Entity Recognition
#preparing of Dataset
import pandas as pd
import numpy as np
data=pd.read_csv("ner_dataset.csv",encoding="latin1")

In [0]:
#fill the null values with ffill: propagate last valid observation forward to next valid backfill
data = data.fillna(method="ffill")

In [87]:
#ner_dataset contain three columns "Word","POS","Tag"
data.dtypes

Sentence #    object
Word          object
POS           object
Tag           object
dtype: object

In [88]:
#define the the words's values in set and make a list ,count the lenght of words  
words = list(set(data["Word"].values))
n_words = len(words); n_words

35178

In [89]:
#count the lenght of tag values in a given data set :ex-B-gep
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [0]:
#So we have 47959 sentences containing 35178 different words with 17 different tags. 
#We use the SentenceGetter class from last post to retrieve sentences with their labels. 
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:

getter = SentenceGetter(data)

In [0]:
sent = getter.get_next()

In [93]:
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [0]:
sentences = getter.sentences

In [95]:
sentences[:3]

[[('Thousands', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('demonstrators', 'NNS', 'O'),
  ('have', 'VBP', 'O'),
  ('marched', 'VBN', 'O'),
  ('through', 'IN', 'O'),
  ('London', 'NNP', 'B-geo'),
  ('to', 'TO', 'O'),
  ('protest', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('war', 'NN', 'O'),
  ('in', 'IN', 'O'),
  ('Iraq', 'NNP', 'B-geo'),
  ('and', 'CC', 'O'),
  ('demand', 'VB', 'O'),
  ('the', 'DT', 'O'),
  ('withdrawal', 'NN', 'O'),
  ('of', 'IN', 'O'),
  ('British', 'JJ', 'B-gpe'),
  ('troops', 'NNS', 'O'),
  ('from', 'IN', 'O'),
  ('that', 'DT', 'O'),
  ('country', 'NN', 'O'),
  ('.', '.', 'O')],
 [('Iranian', 'JJ', 'B-gpe'),
  ('officials', 'NNS', 'O'),
  ('say', 'VBP', 'O'),
  ('they', 'PRP', 'O'),
  ('expect', 'VBP', 'O'),
  ('to', 'TO', 'O'),
  ('get', 'VB', 'O'),
  ('access', 'NN', 'O'),
  ('to', 'TO', 'O'),
  ('sealed', 'JJ', 'O'),
  ('sensitive', 'JJ', 'O'),
  ('parts', 'NNS', 'O'),
  ('of', 'IN', 'O'),
  ('the', 'DT', 'O'),
  ('plant', 'NN', 'O'),
  ('Wednesday', 'NNP', 'B-tim'),
  ('

In [0]:
#Prepare the tokens
max_len = 75
max_len_char = 10

In [0]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}

In [98]:
print(word2idx["Obama"])
print(tag2idx["B-geo"])

35002
8


In [0]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]

In [0]:
X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

In [101]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

98


In [0]:
char2idx = {c: i + 2 for i, c in enumerate(chars)}

char2idx["UNK"] = 1
char2idx["PAD"] = 0
  

In [103]:
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):#max_len=75
        word_seq = []
        for j in range(max_len_char):#max_len_char=10
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    
    X_char.append(np.array(sent_seq))
)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[50, 67, 60, 0, 0, 0, 0, 0, 0, 0], [32, 75, 87, 54, 60, 26, 0, 0, 0, 0]]


In [0]:
char2idx.get(sentences[2][0][1])

In [105]:

sentences[2][0][1]

'NN'

In [0]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [0]:
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')

In [0]:
from sklearn.model_selection import train_test_split

In [0]:

X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D

In [0]:
# input and embedding for words
#n_words=35178+2,o/p dim:20
#35180*20=703600 input dimension of embedding layer 
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len, mask_zero=True)(word_in)



In [114]:
n_chars

98

In [0]:
# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
# 100*10=1000 parameter require for tD1
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char, mask_zero=True))(char_in)


In [0]:
# character LSTM to get word encodings by characters
#LSTM Calculation :4*((input+1)*output+(output^2))
#LSTM cal:4*(11*20+(20)^2)
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)



In [0]:
# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
#36400=4*((41*50)+50^2)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)


In [0]:
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)
#param=1818,ip=100,op=18
#param=ip*op+op

model = Model([word_in, char_in], out)

In [0]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [120]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 75, 10)       0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 75)           0                                            
__________________________________________________________________________________________________
time_distributed_4 (TimeDistrib (None, 75, 10, 10)   1000        input_4[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 75, 20)       703600      input_3[0][0]                    
__________________________________________________________________________________________________
time_distr

In [123]:
history = model.fit([X_word_tr,
                     np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],
                    np.array(y_tr).reshape(len(y_tr), max_len, 1),
                    batch_size=32, epochs=10, validation_split=0.1, verbose=1)

Train on 38846 samples, validate on 4317 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [0]:
y_pred = model.predict([X_word_te,
                        np.array(X_char_te).reshape((len(X_char_te),
                                                     max_len, max_len_char))])

In [0]:
#model.save("my_model.h5")

In [125]:

i = 1925
p = np.argmax(y_pred[i], axis=-1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_word_te[i], y_te[i], p):
    if w != 0:
        print("{:15}: {:5} {}".format(idx2word[w], idx2tag[t], idx2tag[pred]))

Word           ||True ||Pred
On             : O     O
Monday         : B-tim B-tim
,              : O     O
British        : B-org B-gpe
Foreign        : I-org O
Secretary      : B-per B-per
Jack           : I-per I-per
Straw          : I-per I-per
said           : O     O
his            : O     O
government     : O     O
has            : O     O
found          : O     O
no             : O     O
evidence       : O     O
the            : O     O
Bush           : B-org B-per
administration : O     O
requested      : O     O
permission     : O     O
to             : O     O
fly            : O     O
terror         : O     O
suspects       : O     O
through        : O     O
Britain        : B-geo B-geo
or             : O     O
its            : O     O
airspace       : O     O
.              : O     O


In [126]:
!git remote add origin https://github.com/ManaliSethi/Name-Entity-recognition-Enhanced-LSTM-with-Character-Embeddings-.git
!git push -u origin master

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
