<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Packages" data-toc-modified-id="Load-Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Packages</a></span></li><li><span><a href="#Memorization" data-toc-modified-id="Memorization-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Memorization</a></span></li></ul></div>

***
<br>
<span style="font-size:30pt; color:darkslateblue;"><b>
Introduction To Named Entity Recognition  
</b></span>

<img src="ner1.png" alt="Drawing" style="width: 700px;" align="left"/>

In this analysis,  
we use the the following datasets from Kaggle.  
  
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
***

# Load Packages

In [81]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import gc
import os
import psutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(font_scale=1.2)

import warnings

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

plt.style.use('ggplot')

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import spacy
from spacy import displacy

from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
data = pd.read_csv('./ner_dataset.zip')

In [7]:
data.shape

(1048575, 4)

In [44]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


Number of tagged entities:

'O': 1146068, 'geo-nam': 58388, 'org-nam': 48034, 'per-nam': 23790, 'gpe-nam': 20680, 'tim-dat': 12786, 'tim-dow': 11404, 'per-tit': 9800, 'per-fam': 8152, 'tim-yoc': 5290, 'tim-moy': 4262, 'per-giv': 2413, 'tim-clo': 891, 'art-nam': 866, 'eve-nam': 602, 'nat-nam': 300, 'tim-nam': 146, 'eve-ord': 107, 'per-ini': 60, 'org-leg': 60, 'per-ord': 38, 'tim-dom': 10, 'per-mid': 1, 'art-add': 1

Essential info about entities:

geo = Geographical Entity  
org = Organization  
per = Person  
gpe = Geopolitical Entity  
tim = Time indicator  
art = Artifact  
eve = Event  
nat = Natural Phenomenon  

In [11]:
data.Word.nunique()

35178

In [48]:
data[data["Sentence #"] == "Sentence: {}".format(1)].index

Int64Index([0], dtype='int64')

In [75]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        try:
            sent_f = self.data["Sentence #"] == "Sentence: {}".format(self.n_sent)
            sent_l = self.data["Sentence #"] == "Sentence: {}".format(self.n_sent + 1)
            s = self.data.iloc[self.data[sent_f].index.item():self.data[sent_l].index.item(), ]
            self.n_sent += 1
            return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()
        except:
            self.empty = True
            return None, None, None

In [76]:
sg = SentenceGetter(data=data)

In [77]:
sent, pos, tag = sg.get_next()

In [79]:
print(sent); print(pos); print(tag)

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


# Memorization

In [82]:
class MemoryTagger(BaseEstimator, TransformerMixin):

    def fit(self, X, y):
        '''
        Expects a list of words as X and a list of tags as y.
        '''
        voc = {}
        self.tags = []
        for x, t in zip(X, y):
            if t not in self.tags:
                self.tags.append(t)
            if x in voc:
                if t in voc[x]:
                    voc[x][t] += 1
                else:
                    voc[x][t] = 1
            else:
                voc[x] = {t: 1}
        self.memory = {}
        for k, d in voc.items():
            self.memory[k] = max(d, key=d.get)

    def predict(self, X, y=None):
        '''
        Predict the the tag from memory. If word is unknown, predict 'O'.
        '''
        return [self.memory.get(x, 'O') for x in X]

In [83]:
tagger = MemoryTagger()
tagger.fit(sent, tag)

In [98]:
print(tagger.memory)
print(tagger.tags)

{'war': 'O', '.': 'O', 'through': 'O', 'troops': 'O', 'in': 'O', 'from': 'O', 'demonstrators': 'O', 'protest': 'O', 'and': 'O', 'marched': 'O', 'to': 'O', 'that': 'O', 'Thousands': 'O', 'withdrawal': 'O', 'of': 'O', 'London': 'B-geo', 'have': 'O', 'Iraq': 'B-geo', 'the': 'O', 'British': 'B-gpe', 'demand': 'O', 'country': 'O'}
['O', 'B-geo', 'B-gpe']


In [97]:
pd.DataFrame({'sent' : sent, 'tag' : tagger.predict(sent)})

Unnamed: 0,sent,tag
0,Thousands,O
1,of,O
2,demonstrators,O
3,have,O
4,marched,O
5,through,O
6,London,B-geo
7,to,O
8,protest,O
9,the,O


In [86]:
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



In [87]:
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

In [88]:
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)

In [93]:
pred

array(['O', 'O', 'O', ..., 'O', 'O', 'O'], dtype='<U5')

In [90]:
report = classification_report(y_pred=pred, y_true=tags)
print(report)

             precision    recall  f1-score   support

      B-art       0.18      0.06      0.10       402
      B-eve       0.50      0.25      0.33       308
      B-geo       0.78      0.85      0.81     37644
      B-gpe       0.94      0.93      0.94     15870
      B-nat       0.43      0.30      0.35       201
      B-org       0.67      0.48      0.56     20143
      B-per       0.79      0.64      0.71     16990
      B-tim       0.86      0.77      0.82     20333
      I-art       0.07      0.01      0.01       297
      I-eve       0.36      0.12      0.18       253
      I-geo       0.73      0.58      0.65      7414
      I-gpe       0.62      0.45      0.53       198
      I-nat       0.00      0.00      0.00        51
      I-org       0.70      0.53      0.60     16784
      I-per       0.73      0.66      0.69     17251
      I-tim       0.59      0.13      0.21      6528
          O       0.97      0.99      0.98    887908

avg / total       0.94      0.95      0.94  