<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Packages" data-toc-modified-id="Load-Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Packages</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Sentence" data-toc-modified-id="Sentence-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Sentence</a></span></li><li><span><a href="#Memorization" data-toc-modified-id="Memorization-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Memorization</a></span></li><li><span><a href="#Random-Forest" data-toc-modified-id="Random-Forest-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Random Forest</a></span><ul class="toc-item"><li><span><a href="#Simple-FE" data-toc-modified-id="Simple-FE-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Simple FE</a></span></li><li><span><a href="#FE" data-toc-modified-id="FE-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>FE</a></span></li></ul></li><li><span><a href="#EOF" data-toc-modified-id="EOF-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>EOF</a></span></li></ul></div>

***
<br>
<span style="font-size:30pt; color:darkslateblue;"><b>
Introduction To Named Entity Recognition  
</b></span>

<img src="ner1.png" alt="Drawing" style="width: 700px;" align="left"/>

In this analysis,  
we use the the following datasets from Kaggle.  
  
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
***

# Load Packages

In [32]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import gc
import os
import psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(font_scale=1.2)

import warnings

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

plt.style.use('ggplot')

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import spacy
from spacy import displacy

from tqdm import tqdm
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

# Load Data

In [2]:
data = pd.read_csv('./ner_dataset.zip')

In [3]:
data.shape

(1048575, 4)

In [4]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


* POS : Part Of Speech
* Tag

Number of tagged entities:

'O': 1146068, 'geo-nam': 58388, 'org-nam': 48034, 'per-nam': 23790, 'gpe-nam': 20680, 'tim-dat': 12786, 'tim-dow': 11404, 'per-tit': 9800, 'per-fam': 8152, 'tim-yoc': 5290, 'tim-moy': 4262, 'per-giv': 2413, 'tim-clo': 891, 'art-nam': 866, 'eve-nam': 602, 'nat-nam': 300, 'tim-nam': 146, 'eve-ord': 107, 'per-ini': 60, 'org-leg': 60, 'per-ord': 38, 'tim-dom': 10, 'per-mid': 1, 'art-add': 1

Essential info about entities:

geo = Geographical Entity  
org = Organization  
per = Person  
gpe = Geopolitical Entity  
tim = Time indicator  
art = Artifact  
eve = Event  
nat = Natural Phenomenon  

In [43]:
data.POS.nunique()

42

In [44]:
data.Tag.nunique()

17

In [5]:
data.Word.nunique()

35178

# Sentence

In [7]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        try:
            sent_f = self.data["Sentence #"] == "Sentence: {}".format(self.n_sent)
            sent_l = self.data["Sentence #"] == "Sentence: {}".format(self.n_sent + 1)
            s = self.data.iloc[self.data[sent_f].index.item():self.data[sent_l].index.item(), ]
            self.n_sent += 1
            return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()
        except:
            self.empty = True
            return None, None, None

In [8]:
sg = SentenceGetter(data=data)

In [9]:
sent, pos, tag = sg.get_next()

In [10]:
print(sent); print(pos); print(tag)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


# Memorization

In [11]:
class MemoryTagger(BaseEstimator, TransformerMixin):

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)

    def predict(self, X, y=None):
        '''
        Predict the the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [12]:
tagger = MemoryTagger()
tagger.fit(sent, tag)

In [13]:
print(tagger.memory)
print(tagger.tags)

{'London': 'B-geo', 'withdrawal': 'O', 'British': 'B-gpe', 'war': 'O', 'that': 'O', 'country': 'O', 'protest': 'O', 'Iraq': 'B-geo', 'from': 'O', 'the': 'O', 'of': 'O', 'through': 'O', 'and': 'O', 'have': 'O', 'marched': 'O', 'in': 'O', '.': 'O', 'demand': 'O', 'demonstrators': 'O', 'troops': 'O', 'Thousands': 'O', 'to': 'O'}
['O', 'B-geo', 'B-gpe']


In [14]:
pd.DataFrame({'sent' : sent, 'tag' : tagger.predict(sent)})

Unnamed: 0,sent,tag
0,Thousands,O
1,of,O
2,demonstrators,O
3,have,O
4,marched,O
5,through,O
6,London,B-geo
7,to,O
8,protest,O
9,the,O


In [15]:
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



In [16]:
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

In [17]:
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)

In [18]:
pred

array(['O', 'O', 'O', ..., 'O', 'O', 'O'], dtype='<U5')

In [19]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

             precision    recall  f1-score   support

      B-art       0.17      0.06      0.09       402
      B-eve       0.50      0.25      0.33       308
      B-geo       0.78      0.85      0.81     37644
      B-gpe       0.94      0.93      0.94     15870
      B-nat       0.40      0.30      0.34       201
      B-org       0.67      0.48      0.56     20143
      B-per       0.76      0.66      0.71     16990
      B-tim       0.86      0.77      0.82     20333
      I-art       0.04      0.01      0.01       297
      I-eve       0.39      0.10      0.16       253
      I-geo       0.72      0.58      0.65      7414
      I-gpe       0.62      0.45      0.52       198
      I-nat       0.00      0.00      0.00        51
      I-org       0.68      0.54      0.60     16784
      I-per       0.74      0.64      0.69     17251
      I-tim       0.58      0.13      0.21      6528
          O       0.97      0.99      0.98    887908

avg / total       0.94      0.95      0.94  

# Random Forest

In [21]:
from sklearn.ensemble import RandomForestClassifier


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



## Simple FE

In [20]:
def feature_map(word):
    return np.array([word.istitle(), word.islower(), word.isupper(), len(word),
                     word.isdigit(),  word.isalpha()])

In [33]:
words = [feature_map(w) for w in tqdm(data['Word'].values.tolist())]

100%|████████████████████████████████████████████████| 1048575/1048575 [00:02<00:00, 457866.80it/s]


In [34]:
pred = cross_val_predict(
    RandomForestClassifier(n_estimators=20), 
    X=words, 
    y=tags, 
    cv=5
)

In [35]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

             precision    recall  f1-score   support

      B-art       0.00      0.00      0.00       402
      B-eve       0.00      0.00      0.00       308
      B-geo       0.26      0.79      0.40     37644
      B-gpe       0.26      0.06      0.09     15870
      B-nat       0.00      0.00      0.00       201
      B-org       0.65      0.17      0.27     20143
      B-per       0.97      0.20      0.33     16990
      B-tim       0.29      0.32      0.30     20333
      I-art       0.00      0.00      0.00       297
      I-eve       0.00      0.00      0.00       253
      I-geo       0.00      0.00      0.00      7414
      I-gpe       0.00      0.00      0.00       198
      I-nat       0.00      0.00      0.00        51
      I-org       0.36      0.03      0.06     16784
      I-per       0.47      0.02      0.04     17251
      I-tim       0.50      0.06      0.11      6528
          O       0.97      0.98      0.97    887908

avg / total       0.88      0.87      0.86  

## FE

In [36]:
from sklearn.preprocessing import LabelEncoder

class FeatureTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()
        
    def fit(self, X, y):
        words = X["Word"].values.tolist()
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self
    
    def transform(self, X, y=None):
        # helper function to check OOPos or not
        def pos_default(p):
            if p in self.pos:
                return self.pos_encoder.transform([p])[0]
            else:
                return -1
        
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        out = []  # output of this function
        
        for i in tqdm(range(len(words))):
            w = words[i]
            p = pos[i]
            
            if i < len(words) - 1:
                # wp : previous word, posp : previous pos
                wp = self.tag_encoder.transform(self.memory_tagger.predict([words[i+1]]))[0]
                posp = pos_default(pos[i+1])
            else:
                wp = self.tag_encoder.transform(['O'])[0]
                posp = pos_default(".")
            
            if i > 0:
                if words[i-1] != ".":
                    wm = self.tag_encoder.transform(self.memory_tagger.predict([words[i-1]]))[0]
                    posm = pos_default(pos[i-1])
                else:
                    wm = self.tag_encoder.transform(['O'])[0]
                    posm = pos_default(".")
            else:
                posm = pos_default(".")
                wm = self.tag_encoder.transform(['O'])[0]
            
            out.append(np.array([
                w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                self.tag_encoder.transform(self.memory_tagger.predict([w]))[0],  # a little leaky
                pos_default(p), 
                wp, wm, posp, posm]))
            
        return out

In [37]:
from sklearn.pipeline import Pipeline

In [45]:
pred = cross_val_predict(
    Pipeline([
        ("feature_map", FeatureTransformer()),
        ("clf", RandomForestClassifier(n_estimators=20,
                                       n_jobs=3))
    ]), X=data, y=tags, cv=5)

In [None]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

# EOF