In [51]:
import pandas as pd
import numpy as np
import os 
from os import listdir
# Display settings: allow up to 200 characters per cell and never hide columns.
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', None)

# %matplotlib inline

In [6]:
# !pip install pandas

In [7]:
# Folder holding the corpus CSV files, relative to the working directory.
path='entity-annotated-corpus/'
# os.listdir() returns entries in arbitrary order, but a later cell picks the
# dataset by position (files[1]); sorting makes that index deterministic.
files=sorted(os.path.join(path,name) for name in listdir(path))
files

['entity-annotated-corpus/ner.csv', 'entity-annotated-corpus/ner_dataset.csv']

In [8]:
# Load the token-level dataset. files[1] is the second path of the listing
# (ner_dataset.csv in the output shown for that cell); the file is decoded
# as latin1.
data=pd.read_csv(files[1], encoding='latin1')
# Preview the first rows.
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Sentence #  47959 non-null    object
 1   Word        1048575 non-null  object
 2   POS         1048575 non-null  object
 3   Tag         1048575 non-null  object
dtypes: object(4)
memory usage: 32.0+ MB


In [10]:
# In the raw file 'Sentence #' is only set on the first row of each sentence
# (see the head() preview), so forward-fill it to label every token row.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in pandas 2.x.
data.ffill(inplace=True)

In [11]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [12]:
data.describe()

Unnamed: 0,Sentence #,Word,POS,Tag
count,1048575,1048575,1048575,1048575
unique,47959,35178,42,17
top,Sentence: 22480,the,NN,O
freq,104,52573,145807,887908


In [13]:
# Distinct values of the 'Word' column and their count.
words={token for token in data['Word']}
n_words=len(words)
n_words

35178

In [14]:
data.shape

(1048575, 4)

In [27]:
class SentenceGetter():
    """Walk a token-per-row DataFrame one sentence at a time.

    The frame is expected to have the columns 'Sentence #', 'Word', 'POS'
    and 'Tag', with 'Sentence #' holding 'Sentence: <n>' on every row of
    sentence n. Sentences are requested by number, starting at 1; a number
    with no matching rows is treated as the end of the data.

    Attributes:
        n_sent: number of the sentence the next ``get_next`` call will fetch.
        data: the DataFrame being read.
        empty: True once ``get_next`` has found no rows for ``n_sent``.
    """
    def __init__(self,data):
        self.n_sent=1
        self.data=data
        self.empty=False

    def get_next(self):
        """Return the next sentence as ``(words, pos_tags, ner_tags)`` lists.

        If no row is labelled with the current sentence number, ``empty`` is
        set to True, ``n_sent`` is left unchanged and ``(None, None, None)``
        is returned.

        Raises:
            KeyError: if a required column is missing from ``data``.
        """
        # Boolean-mask selection does not raise for an unknown label, it just
        # yields zero rows, so exhaustion is detected explicitly instead of
        # through exception handling.
        s=self.data[self.data['Sentence #']==f'Sentence: {self.n_sent}']
        if s.empty:
            self.empty=True
            return None, None, None
        self.n_sent+=1
        return s['Word'].values.tolist(),s['POS'].values.tolist(),s['Tag'].values.tolist()
        

In [28]:
getter=SentenceGetter(data)

In [29]:
sent, pos, tag = getter.get_next()

In [30]:
sent, pos, tag

(['Thousands',
  'of',
  'demonstrators',
  'have',
  'marched',
  'through',
  'London',
  'to',
  'protest',
  'the',
  'war',
  'in',
  'Iraq',
  'and',
  'demand',
  'the',
  'withdrawal',
  'of',
  'British',
  'troops',
  'from',
  'that',
  'country',
  '.'],
 ['NNS',
  'IN',
  'NNS',
  'VBP',
  'VBN',
  'IN',
  'NNP',
  'TO',
  'VB',
  'DT',
  'NN',
  'IN',
  'NNP',
  'CC',
  'VB',
  'DT',
  'NN',
  'IN',
  'JJ',
  'NNS',
  'IN',
  'DT',
  'NN',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-gpe',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [19]:
# getter.get_next()

In [40]:
from sklearn.base import BaseEstimator, TransformerMixin
class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorises the most frequent tag of each word.

    Attributes set by ``fit``:
        tags: distinct tags, in order of first appearance in ``y``.
        memory: dict mapping every word seen in training to the tag it was
            most often paired with.

    Note: only ``fit`` and ``predict`` are defined here; there is no
    ``transform`` method.
    """
    def fit(self,X,y):
        """Count (word, tag) pairs and remember each word's most frequent tag.

        Args:
            X: iterable of words (hashable values).
            y: iterable of tags aligned with ``X``; pairs are formed with
                ``zip``, so extra items in the longer one are ignored.

        Returns:
            self, following the scikit-learn estimator convention, so the
            call can be chained.
        """
        voc={}
        # A dict keeps first-seen order while giving constant-time membership.
        seen_tags={}
        for x,t in zip(X,y):
            seen_tags.setdefault(t,None)
            counts=voc.setdefault(x,{})
            counts[t]=counts.get(t,0)+1
        self.tags=list(seen_tags)
        # max() returns the first key with the highest count, so a tie goes
        # to the tag that was seen first for that word.
        self.memory={k:max(d,key=d.get) for k,d in voc.items()}
        return self
    def predict(self,X,y=None):
        """Return the memorised tag for each word in ``X``.

        Words that were not seen during ``fit`` are tagged 'O'. ``y`` is
        accepted but not used.
        """
        return [self.memory.get(x,'O') for x in X]
                

In [41]:
tagger = MemoryTagger()

In [42]:
tagger.fit(sent,tag)

In [52]:
pd.DataFrame([sent,tagger.predict(sent)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,war,in,Iraq,and,demand,the,withdrawal,of,British,troops,from,that,country,.
1,O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-gpe,O,O,O,O,O


In [53]:
tagger.tags

['O', 'B-geo', 'B-gpe']

In [56]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

In [57]:
# Flatten the corpus into two parallel Python lists (tokens and their tags).
# Note: this rebinds `words`, which an earlier cell used for the vocabulary set.
words = data["Word"].tolist()
tags = data["Tag"].tolist()

In [58]:
# Cross-validated predictions with 5 folds: a fresh MemoryTagger is passed as
# the estimator, with the flat token list as X and the tag list as y.
pred = cross_val_predict(estimator=MemoryTagger(), X=words, y=tags, cv=5)

In [59]:
# Per-tag precision / recall / F1 of the cross-validated predictions against
# the gold tags.
report = classification_report(tags, pred)
print(report)

              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

    accuracy              