In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import keras

In [2]:
print(tf.__version__)
print(keras.__version__)

2.9.1
2.9.0


In [3]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
class SentenceGetter(object):
  def __init__(self, data):
    self.n_sent = 1
    self.data = data
    self.empty = False
    agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
    self.grouped = self.data.groupby("Sentence #").apply(agg_func)
    self.sentences = [s for s in self.grouped]
    
getter = SentenceGetter(data)
sentences = getter.sentences
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [5]:
print("No. of sentences in dataset:", len(sentences))
maxlen = max([len(s) for s in sentences])
print ('Maximum sentence length:', maxlen)

No. of sentences in dataset: 47959
Maximum sentence length: 104


In [6]:
words = list(set(data["Word"].values))
n_words = len(words)
tags = list(set(data["Tag"].values))
n_tags = len(tags)
print(tags)

['B-gpe', 'I-org', 'I-gpe', 'I-tim', 'O', 'I-art', 'I-nat', 'I-eve', 'I-geo', 'B-art', 'B-per', 'B-org', 'B-eve', 'B-geo', 'B-nat', 'I-per', 'B-tim']


In [23]:
#Hyperparameters
BATCH_SIZE = 32
EPOCHS = 3
MAX_LEN = 75
EMBEDDING = 20

In [8]:
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

In [9]:
idx2word = {i: w for w, i in word2idx.items()}

In [10]:
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

In [11]:
idx2tag = {i: w for w, i in tag2idx.items()}

In [14]:
from keras_preprocessing.sequence import pad_sequences

# Convert the sentence with words to sentence with corresponding index to each word
X = [[word2idx[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"]) #Padding

In [15]:
# Converting tags to indices
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["PAD"])

In [17]:
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# One-Hot encode
y = [to_categorical(i, num_classes=n_tags+1) for i in y]  # n_tags+1(PAD)

#Splitting the dataset
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
X_tr.shape, X_te.shape, np.array(y_tr).shape, np.array(y_te).shape

((43163, 75), (4796, 75), (43163, 75, 18), (4796, 75, 18))

In [19]:
from keras.models import Model
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Input
import tensorflow_addons as tfa

In [20]:
#Model definition
input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, 
                  input_length=MAX_LEN)(input)  
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model) 

model = TimeDistributed(Dense(50, activation="relu"))(model)  
#crf = tfa.layers.CRF(n_tags+1) 
#output = crf(model)  
output = tfa.layers.CRF(n_tags+1)(model)[1] 

In [21]:
model = Model(input, output)
model.compile(optimizer="rmsprop", loss='CategoricalCrossentropy', metrics=['Accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 75)]              0         
                                                                 
 embedding (Embedding)       (None, 75, 20)            703600    
                                                                 
 bidirectional (Bidirectiona  (None, 75, 100)          28400     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 75, 50)           5050      
 ibuted)                                                         
                                                                 
 crf (CRF)                   [(None, 75),              1278      
                              (None, 75, 18),                    
                              (None,),                       

In [24]:
history = model.fit(X_tr, np.array(y_tr), batch_size=BATCH_SIZE, epochs=EPOCHS,validation_split=0.1, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [25]:
model.save('ner_model.h5')
