In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [2]:
data = pd.read_csv('../datset/entity-annotated-corpus/ner_dataset.csv',encoding="latin1")

In [3]:
data = data.fillna(method="ffill")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [4]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)
n_words

35179

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)

In [7]:
getter.get_next()

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [8]:
tags = list(set(data["Tag"].values))
n_tags = len(tags)
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [9]:
sentences = getter.sentences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]

In [10]:
len(X_word)

47959

In [11]:
X_word= pad_sequences(X_word,maxlen=75,padding='post',truncating='pre')

In [12]:
X_word[0]

array([22027, 10064,  8847,  6649,  7898, 27037,  7932, 25060, 34811,
        3305,  8745,  5933, 22751,  2373, 32633,  3305,  2201, 10064,
       16187, 21776, 17056, 20891, 20034, 15023,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32)

In [13]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [14]:
len(y[0])

24

In [15]:
tag2idx.keys()

dict_keys(['I-eve', 'B-geo', 'I-geo', 'I-per', 'B-gpe', 'B-eve', 'O', 'I-tim', 'B-nat', 'B-per', 'I-gpe', 'I-org', 'I-nat', 'B-org', 'B-art', 'I-art', 'B-tim'])

In [16]:
y = pad_sequences(maxlen=75, sequences=y, padding="post", value=tag2idx["O"],truncating='pre')

In [17]:
len(y[0])

75

In [18]:
X_tr, X_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.2)

In [19]:
model = Sequential()
model.add(Embedding(input_dim = n_words+1,output_dim=20,mask_zero=True))
model.add(Bidirectional(keras.layers.GRU(units=50,return_sequences=True)))
model.add(TimeDistributed(Dense(50, activation="softmax")))

W1212 11:20:48.333614 4458218944 deprecation.py:323] From /anaconda3/envs/generative/lib/python3.6/site-packages/tensorflow_core/python/keras/backend.py:3985: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 20)          703600    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 100)         21600     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 50)          5050      
Total params: 730,250
Trainable params: 730,250
Non-trainable params: 0
_________________________________________________________________


In [21]:
model.compile(optimizer="adam", loss='sparse_categorical_crossentropy', metrics=['acc'])

In [22]:
history = model.fit(X_tr, np.array(y_tr), batch_size=128, epochs=5,validation_data=(X_te,y_te))

Train on 38367 samples, validate on 9592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
