In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize
from tqdm import tqdm

In [2]:
data = pd.read_csv('NER dataset.csv')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [3]:
tags = list(set(data.POS.values))
tags


['VBZ',
 'VB',
 'WP',
 'JJ',
 ';',
 'NNS',
 'WRB',
 'VBP',
 'EX',
 ',',
 'RRB',
 'FW',
 'RBS',
 'WDT',
 'RBR',
 'JJS',
 'MD',
 'UH',
 'PDT',
 'NNP',
 'PRP',
 'CC',
 'CD',
 'JJR',
 'RP',
 'NNPS',
 '.',
 ':',
 'VBG',
 'RB',
 'LRB',
 'VBN',
 'NN',
 'WP$',
 'TO',
 'IN',
 'POS',
 'DT',
 'PRP$',
 'VBD',
 '``',
 '$']

## Dictionary

In [4]:
words = list(set(data.Word.values))
words

['resolved',
 'U.N.-protected',
 'Terry',
 'popular',
 'reconstruct',
 'colder-than-usual',
 'Jutarnji',
 'Monkey',
 'vents',
 'Cloud',
 'mosquito',
 'reopens',
 'disrupting',
 'Istanbul',
 'consulted',
 'Daughter',
 'seems',
 'Ventures',
 'Vavuniya',
 '3,348',
 'Angolan',
 'Volcanic',
 '300-million-dollar',
 '159',
 'desire',
 'Gachechiladze',
 'caved',
 'conference',
 'soil',
 'reputation',
 'bind',
 'Reduction',
 'cyber',
 'Envoy',
 'Bernard',
 'serpent',
 'suppliers',
 'oxygen',
 'practicing',
 'isolation',
 'big',
 'chapel',
 'eChoupal',
 'Mattoo',
 '60-kilometer',
 'mid-to-late',
 'Yushu',
 'neon',
 'exchange-rate',
 'Chaib',
 'mimics',
 'narrated',
 'applicants',
 'inconvenient',
 'Many',
 'Srinigar',
 'inefficiencies',
 'student-led',
 'notable',
 'headlines',
 'Boedihardjo',
 'IRAQ-POVERTY',
 'Patience',
 'actor-turned-musician',
 'Schearf',
 'Manipur',
 'well-developed',
 'Busta',
 'softer',
 'wheel',
 'Antwerp',
 'Ehud',
 'boat',
 'two-man',
 'geothermal',
 'Abdalla',
 'unaf

In [5]:
np.array(words[:20]).flatten()

array(['resolved', 'U.N.-protected', 'Terry', 'popular', 'reconstruct',
       'colder-than-usual', 'Jutarnji', 'Monkey', 'vents', 'Cloud',
       'mosquito', 'reopens', 'disrupting', 'Istanbul', 'consulted',
       'Daughter', 'seems', 'Ventures', 'Vavuniya', '3,348'], dtype='<U17')

## Adding Word dummy

In [6]:
# adding the word dummy to dictionary so that we can pad sentences
words.append('dummy')

## Creating a class to read sentences

In [7]:
class read_sen():
    def __init__(self, data):
        self.data = data
        agg_func = lambda s : [(w,p,t) for w , p, t in zip(s['Word'].values.tolist(),
                                                          s['POS'].values.tolist(),
                                                          s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [8]:
sentences = read_sen(data).sentences
sentences

[[('Thousands', 'NNS', 'O')],
 [('Iranian', 'JJ', 'B-gpe')],
 [('Helicopter', 'NN', 'O')],
 [('They', 'PRP', 'O')],
 [('U.N.', 'NNP', 'B-geo')],
 [('Mr.', 'NNP', 'B-per')],
 [('He', 'PRP', 'O')],
 [('Some', 'DT', 'O')],
 [('Aid', 'NNP', 'O')],
 [('Lebanese', 'JJ', 'B-gpe')],
 [('In', 'IN', 'O')],
 [('One', 'CD', 'O')],
 [('Lebanon', 'NNP', 'B-geo')],
 [('Syria', 'NNP', 'B-geo')],
 [('The', 'DT', 'O')],
 [('Israeli', 'JJ', 'B-gpe')],
 [('Doctors', 'NNS', 'O')],
 [('The', 'DT', 'O')],
 [('Doctors', 'NNS', 'O')],
 [('Mr.', 'NNP', 'B-per')],
 [('Doctors', 'NNS', 'O')],
 [('The', 'DT', 'O')],
 [('SpaceShipOne', 'NNP', 'B-art')],
 [('To', 'TO', 'O')],
 [('The', 'DT', 'O')],
 [('Three', 'CD', 'O')],
 [('The', 'DT', 'O')],
 [('SpaceShipOne', 'NNP', 'B-art')],
 [('North', 'NNP', 'B-geo')],
 [('The', 'DT', 'O')],
 [('It', 'PRP', 'O')],
 [('The', 'DT', 'O')],
 [('Most', 'JJS', 'O')],
 [('Last', 'JJ', 'O')],
 [('A', 'DT', 'O')],
 [('The', 'DT', 'O')],
 [('Prime', 'JJ', 'O')],
 [('The', 'DT', 'O')]

## Conversion of words and tags

In [9]:
# Convert words and tags into numbers
wordvec = {w : i for i , w in enumerate(words)}
tagvec = {t : i for i, t  in enumerate(tags)}

In [10]:
wordvec

{'resolved': 0,
 'U.N.-protected': 1,
 'Terry': 2,
 'popular': 3,
 'reconstruct': 4,
 'colder-than-usual': 5,
 'Jutarnji': 6,
 'Monkey': 7,
 'vents': 8,
 'Cloud': 9,
 'mosquito': 10,
 'reopens': 11,
 'disrupting': 12,
 'Istanbul': 13,
 'consulted': 14,
 'Daughter': 15,
 'seems': 16,
 'Ventures': 17,
 'Vavuniya': 18,
 '3,348': 19,
 'Angolan': 20,
 'Volcanic': 21,
 '300-million-dollar': 22,
 '159': 23,
 'desire': 24,
 'Gachechiladze': 25,
 'caved': 26,
 'conference': 27,
 'soil': 28,
 'reputation': 29,
 'bind': 30,
 'Reduction': 31,
 'cyber': 32,
 'Envoy': 33,
 'Bernard': 34,
 'serpent': 35,
 'suppliers': 36,
 'oxygen': 37,
 'practicing': 38,
 'isolation': 39,
 'big': 40,
 'chapel': 41,
 'eChoupal': 42,
 'Mattoo': 43,
 '60-kilometer': 44,
 'mid-to-late': 45,
 'Yushu': 46,
 'neon': 47,
 'exchange-rate': 48,
 'Chaib': 49,
 'mimics': 50,
 'narrated': 51,
 'applicants': 52,
 'inconvenient': 53,
 'Many': 54,
 'Srinigar': 55,
 'inefficiencies': 56,
 'student-led': 57,
 'notable': 58,
 'headlin

In [11]:
tagvec

{'VBZ': 0,
 'VB': 1,
 'WP': 2,
 'JJ': 3,
 ';': 4,
 'NNS': 5,
 'WRB': 6,
 'VBP': 7,
 'EX': 8,
 ',': 9,
 'RRB': 10,
 'FW': 11,
 'RBS': 12,
 'WDT': 13,
 'RBR': 14,
 'JJS': 15,
 'MD': 16,
 'UH': 17,
 'PDT': 18,
 'NNP': 19,
 'PRP': 20,
 'CC': 21,
 'CD': 22,
 'JJR': 23,
 'RP': 24,
 'NNPS': 25,
 '.': 26,
 ':': 27,
 'VBG': 28,
 'RB': 29,
 'LRB': 30,
 'VBN': 31,
 'NN': 32,
 'WP$': 33,
 'TO': 34,
 'IN': 35,
 'POS': 36,
 'DT': 37,
 'PRP$': 38,
 'VBD': 39,
 '``': 40,
 '$': 41}

## Prepare input and output data

In [12]:

from keras.preprocessing.sequence import pad_sequences

from keras.preprocessing.sequence import pad_sequences
max_len = 50
X = [[wordvec[w[0]] for w in s] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=len(words)-1)
y = [[tagvec[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tagvec["."])

In [13]:
X.shape

(47959, 50)

In [14]:
y.shape

(47959, 50)

In [15]:
print(len(y))

47959


In [16]:
len(tags)

42

##  Conversion of  output to one-hot bit

In [17]:
from tensorflow.keras.utils import to_categorical
y = [to_categorical(i, num_classes = len(tags) )for i in y]

In [18]:
len(y)

47959

## Training and Testing 

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X, y , test_size=0.20)

In [21]:
len(X_train)

38367

In [22]:
len(y_train)

38367

In [23]:
import tensorflow  as tf
from tensorflow import keras

In [24]:
X.shape[1]

50

In [25]:
model = keras.Sequential([
    keras.layers.Embedding(input_dim = len(words), output_dim = 50, input_length = X.shape[1]),
    keras.layers.Dropout(0.1),
    keras.layers.Bidirectional(keras.layers.LSTM(X.shape[1], return_sequences = True , recurrent_dropout = 0.2)),
    keras.layers.TimeDistributed(keras.layers.Dense(len(tags), activation = 'softmax'))
])

In [26]:
model.compile(optimizer="rmsprop",loss="categorical_crossentropy", metrics = ['accuracy'])

In [27]:
#y_train = tf.stack(y_train)
#X_train = tf.stack(X_train)

In [28]:
model.fit(X_train, np.array(y_train) , epochs = 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1b2fa3c3580>

## Demo Test

In [38]:
# Demo test on one sample. 

i= 1213 # Some test sentence sample
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis =-1)# Map softmax back to a POS index

for i, (w, pred) in enumerate(zip(X_test[i], p[0])): # for every word in the sentence
    print("{:40} -- {}".format(words[w], tags[pred])) # Print word and tag
    if pred == 43 and p[0][i+1] == 43:
        break

He                                       -- PRP
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                                    -- .
dummy                           