# Named Entity Recognition
- using RNN and conditional random fields

# 1)- Importing key modules

In [0]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [0]:
# For data processing and maths

import pandas as pd
import numpy as np
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [3]:
import tensorflow

In [4]:
! pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-39rdxiq7
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-39rdxiq7
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25l[?25hdone
  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp36-none-any.whl size=101065 sha256=80a0b57a1ecd81e03c4022977366a7f8839c1c4b48d120d1fdefe119ea766b9b
  Stored in directory: /tmp/pip-ephem-wheel-cache-74evql1q/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib


In [5]:
# for model

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

Using TensorFlow backend.


In [6]:
! pip install version_information



In [7]:
# first install: pip install version_information
%reload_ext version_information
%version_information pandas,numpy,keras, seaborn, matplotlib

Software,Version
Python,3.6.8 64bit [GCC 8.3.0]
IPython,5.5.0
OS,Linux 4.14.137+ x86_64 with Ubuntu 18.04 bionic
pandas,0.25.3
numpy,1.17.4
keras,2.2.5
seaborn,0.9.0
matplotlib,3.1.1
Tue Nov 26 23:18:22 2019 UTC,Tue Nov 26 23:18:22 2019 UTC


# 2)- Getting Data

https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

In [0]:
data = pd.read_csv("ner_dataset.csv", encoding="latin1")

In [9]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


There are missing values. We shall fill wit ffill. Also it makes sense as sentence 1 contains all words and then comes sentence2.

In [0]:
data = data.fillna(method="ffill")

In [11]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [12]:
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


So, in total we have 47959 sentences in our data. Let's see how many words we have

In [13]:
len(data["Word"])

1048575

Total words are 1048575. But , there might be many repeating words. Let's find unique words or vocab.

In [14]:
len(data["Word"].unique())

35178

In [0]:
# making list of unique "word" feature
words = list(set(data["Word"].values))

In [16]:
words[:5]

['imagine', 'carrier', 'deactivated', 'articles', 'missiles']

In [17]:
n_words = len(words)
n_words

35178

We have 47959 sentences containing 35178 unique words

In [0]:
tags = list(set(data["Tag"].values))

In [19]:
n_tags = len(tags); n_tags

17

In [20]:
# let's see how many tag we have in our data or what kind of tags we have

data.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [21]:
len(data.Tag.value_counts())

17

In [22]:
# how about POS

len(data.POS.value_counts())

42

# 3)- Data Preparation

In [13]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words); n_words

35179

In [14]:
# checking tags

tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [0]:
getter = SentenceGetter(data)

In [0]:
sent = getter.get_next()

In [18]:
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [0]:
# get all data

sentences = getter.sentences

# 4)-Pre-model 

In [0]:
# introduce dictionaries of words and tags.
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [21]:
word2idx["Obama"]

4579

In [22]:
tag2idx["B-geo"]

13

### 4a)- Padding

In [0]:
#map the senctences to a sequence of numbers and then pad the sequence

from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [24]:
X[:1]

[[3504,
  4362,
  23658,
  19123,
  14086,
  14564,
  1865,
  17853,
  27663,
  26852,
  17029,
  27043,
  14937,
  10233,
  8604,
  26852,
  30962,
  4362,
  20829,
  7384,
  7480,
  984,
  6288,
  8859]]

In [0]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)

In [26]:
X[:1]

array([[ 3504,  4362, 23658, 19123, 14086, 14564,  1865, 17853, 27663,
        26852, 17029, 27043, 14937, 10233,  8604, 26852, 30962,  4362,
        20829,  7384,  7480,   984,  6288,  8859, 35178, 35178, 35178,
        35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
        35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
        35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
        35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
        35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
        35178, 35178, 35178]], dtype=int32)

In [0]:
# same for y

y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [0]:
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [29]:
y[:1]

array([[10, 10, 10, 10, 10, 10, 13, 10, 10, 10, 10, 10, 13, 10, 10, 10,
        10, 10,  0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]], dtype=int32)

### 4b)-change the labels y to categorial feature.

In [0]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

### 4c)- Split data into train-test

In [0]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

# 5)- Building Model

- LSTM-CRF network 

In [0]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF

In [0]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=20,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [0]:
model = Model(input, out)

In [0]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

In [0]:
model.summary()

In [0]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)

In [0]:
hist = pd.DataFrame(history.history)

In [0]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
plt.figure(figsize=(12,12))
plt.plot(hist["acc"])
plt.plot(hist["val_acc"])
plt.show()

# 6)-Evaluation

In [0]:
test_pred = model.predict(X_te, verbose=1)

In [0]:
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)

In [0]:
print(classification_report(test_labels, pred_labels))

In [0]:
i = 1927
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
true = np.argmax(y_te[i], -1)
print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
print(30 * "=")
for w, t, pred in zip(X_te[i], true, p[0]):
    if w != 0:
        print("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))