# NER

- word by word dataset
- "Tag" is name of entity. "Pos" is Part of Speech
-  using ML models

# 1)-Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# For data processing and maths

import pandas as pd
import numpy as np
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# 2)- Loading data

- Loading data
- clean dataset

In [3]:
filename='ner_word-to-word.xlsx'

In [4]:
filename

'ner_word-to-word.xlsx'

In [5]:
df=pd.read_excel(filename)

In [6]:
df.columns

Index(['Document #', 'Sentence #', 'Words', 'Tag', 'Pos'], dtype='object')

In [7]:
df.isnull().sum()

Document #    785
Sentence #    767
Words           0
Tag           734
Pos             0
dtype: int64

In [8]:
df.head()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,,O
1,,,F.UN’s,org,O
2,,,and,,O
3,,,Supplier's contract managers,person,O
4,,,shall,,O


In [9]:
df['Document #'] = df['Document #'].fillna('contract1')

In [10]:
df.head()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,,O
1,contract1,,F.UN’s,org,O
2,contract1,,and,,O
3,contract1,,Supplier's contract managers,person,O
4,contract1,,shall,,O


In [11]:
df['Sentence #']=df['Sentence #'].fillna(method="ffill")

In [12]:
df.isnull().sum()

Document #      0
Sentence #      0
Words           0
Tag           734
Pos             0
dtype: int64

In [13]:
df.head()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,,O
1,contract1,Sentence:1,F.UN’s,org,O
2,contract1,Sentence:1,and,,O
3,contract1,Sentence:1,Supplier's contract managers,person,O
4,contract1,Sentence:1,shall,,O


In [14]:
df.tail()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
781,contract1,Sentence: 19,contract,,O
782,contract1,Sentence: 19,changes,,O
783,contract1,Sentence: 19,to,,O
784,contract1,Sentence: 19,the,,O
785,contract1,Sentence: 19,agreement,,O


In [15]:
df.isnull().sum()

Document #      0
Sentence #      0
Words           0
Tag           734
Pos             0
dtype: int64

still, we have missing values in our "Tag" feature

In [16]:
df['Tag'] = df['Tag'].fillna('other')

In [17]:
df.head()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,other,O
1,contract1,Sentence:1,F.UN’s,org,O
2,contract1,Sentence:1,and,other,O
3,contract1,Sentence:1,Supplier's contract managers,person,O
4,contract1,Sentence:1,shall,other,O


In [18]:
df.tail()

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
781,contract1,Sentence: 19,contract,other,O
782,contract1,Sentence: 19,changes,other,O
783,contract1,Sentence: 19,to,other,O
784,contract1,Sentence: 19,the,other,O
785,contract1,Sentence: 19,agreement,other,O


In [19]:
df.isnull().sum()

Document #    0
Sentence #    0
Words         0
Tag           0
Pos           0
dtype: int64

### save work

In [20]:
df.to_excel('ner_ROOT2.xlsx',index=False)

# 3)-Exploring dataset

In [21]:
data=pd.read_excel('ner_ROOT2.xlsx')

In [22]:
data.shape

(786, 5)

In [23]:
data.head(2)

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,other,O
1,contract1,Sentence:1,F.UN’s,org,O


In [24]:
data.isnull().sum()

Document #    0
Sentence #    0
Words         0
Tag           0
Pos           0
dtype: int64

We have total 786 words with 19 sentences.

In [25]:
len(data["Words"])

786

In [26]:
len(data["Words"].unique())

307

Our vocabulary is very thin. We need a dense Vocabulary. It might be possible with more text documents

In [27]:
# making list of unique "word" feature
words = list(set(data["Words"].values))

In [28]:
words[:5]

['Agreement', 'applies.', 'effiecient ', 'For ', 'date']

In [29]:
n_words = len(words)
n_words

307

In [30]:
# let's see how many tag we have in our data or what kind of tags we have

data.Tag.value_counts()

other       734
org          24
supplier     16
person       12
Name: Tag, dtype: int64

In [31]:
len(data.Tag.value_counts())

4

# 4)- Retrieving sentence

In [32]:
d=data

In [33]:
d=d.groupby(["Sentence #","Tag"]).size()

In [34]:
d.head()

Sentence #    Tag     
Sentence: 10  org          1
              other       43
              person       1
              supplier     2
Sentence: 11  org          1
dtype: int64

In [35]:
d_tag=d.unstack('Tag')

In [36]:
d_tag

Tag,org,other,person,supplier
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sentence: 10,1.0,43.0,1.0,2.0
Sentence: 11,1.0,14.0,,
Sentence: 12,2.0,40.0,2.0,1.0
Sentence: 13,2.0,32.0,,1.0
Sentence: 14,2.0,45.0,,1.0
Sentence: 15,1.0,41.0,2.0,
Sentence: 16,2.0,55.0,,1.0
Sentence: 17,1.0,20.0,,
Sentence: 18,1.0,9.0,,1.0
Sentence: 19,,44.0,,1.0


- We can see that data retrieval is not possible due to so many missing values. To fix this problem, I took some external help
- Thanks to blog by http://www.depends-on-the-definition.com/introduction-named-entity-recognition-python/

We can use a simpler solution for this

In [37]:
data.head(2)

Unnamed: 0,Document #,Sentence #,Words,Tag,Pos
0,contract1,Sentence:1,Only,other,O
1,contract1,Sentence:1,F.UN’s,org,O


In [38]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Words"].values.tolist(),
                                                           s["Tag"].values.tolist(),
                                                           s["Pos"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [39]:
getter = SentenceGetter(data)

In [40]:
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'F.UN contract manager shall decide in its sole direction and and shall notify Supplier in writing email being sufficient as to whether or not  it wishes to proceed with  the implementation of the proposed contract change using the services of  Supplier on the terms of contract  change notice.'

In [41]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
print(labels[0])

['org', 'person', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'supplier', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'other', 'supplier', 'other', 'other', 'other', 'other', 'other', 'other', 'other']


### convert word-index and index-word

In [42]:
tags_vals = list(set(data["Tag"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}

# 5)- Base Model

### 5.1)-most common words

In [43]:
words_counts = {}

from collections import Counter
words_counts = Counter([word for line in data['Words'] for word in line.split(' ')])

# Sorting 
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]

# Top 10
most_common_words[:10]


[('contract', 48),
 ('', 47),
 ('the', 45),
 ('change', 37),
 ('to', 31),
 ('F.UN', 23),
 ('of', 23),
 ('and', 21),
 ('in', 21),
 ('shall', 18)]

### 5.2)- Splitting data into Independent and target feature

In [44]:
X=data['Words']
y=data['Tag']

### 5.3)-Split data for train-test set

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(589,)
(197,)
(589,)
(197,)


### 5.4)- Vectorization

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_test):
    """
        X_train, X_val, X_test - input text       
        return TF-IDF vectorizer for each dataset
    """
    
    # filter out too rare words (occur less than in 5 titles) and too frequent words (occur more than in 90% of the tweets)
    # ngram!!! -->  ngram_range=(1,2)
    tfidf_vectorizer = TfidfVectorizer(max_features= 500, max_df=0.9, min_df=5, token_pattern='(\S+)')
    
    # Fit and transform the vectorizer on the train set
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    
    # Only Transform the test sets 
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    return X_train_tfidf, X_test_tfidf, tfidf_vectorizer.vocabulary_
    
    
X_train_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_test)

In [47]:
# convert to dense array
X_train_tfidf=X_train_tfidf.toarray()

In [48]:
type(X_train_tfidf)

numpy.ndarray

In [49]:
type(X_train)

pandas.core.series.Series

In [50]:
pd.DataFrame(X_train_tfidf)[:5]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
X_test_tfidf=X_test_tfidf.toarray()

### 5.5)-Building Naive Bayes Model

In [52]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_tfidf,y_train)
predmnb = mnb.predict(X_test_tfidf)

In [53]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predmnb)

0.9898477157360406

An amazing result achieved that for such thin data. I really hope we are on right track and we can provide excellent results.

In [54]:
from sklearn.metrics import classification_report
print (classification_report(y_test, predmnb))

              precision    recall  f1-score   support

         org       0.86      1.00      0.92         6
       other       0.99      0.99      0.99       186
      person       0.00      0.00      0.00         1
    supplier       1.00      1.00      1.00         4

    accuracy                           0.99       197
   macro avg       0.71      0.75      0.73       197
weighted avg       0.99      0.99      0.99       197



We have pretty good precision scores as well

- "org" has 86% score which is strong.
- "Other" has been found most precisely as we have alot of sample points where Other was given.
- "Person" has zero precision. It does not mean our model did bad. Actually we only had one entity in our test set that was person and that only one our model could not pick. So, I hope once we have more data we won't have this problem.
- "Supplier" has been detected very well. It occured 4 times in data and our model picked it all the time.