First two cells are requried for execution in Google Colab notebook. And can be avoided for local notebook execution

In [1]:
# Reference: https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2
# To mount gdrive into Google Colab 
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
# Copy data to google colab from google drive and unzip
# This may take 1-2 minutes
!cp gdrive/My\ Drive/ADBI\ Project/Dataset/ner_dataset.csv.zip .
!cp gdrive/My\ Drive/ADBI\ Project/Dataset/barack.txt .
!unzip ner_dataset.csv.zip

Archive:  ner_dataset.csv.zip
  inflating: ner_dataset.csv         


## Implementation

In [0]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

### Load Data

In [4]:
df = pd.read_csv('ner_dataset.csv', encoding = "ISO-8859-1")
df = df[:10000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
df.isnull().sum()

Sentence #    9543
Word             0
POS              0
Tag              0
dtype: int64

## Data Preprocessing

In [6]:
df = df.fillna(method='ffill')
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

(457, 2746, 17)

#### Tags not evenly distributed

In [7]:
df.groupby('Tag').size().reset_index(name='counts')

Unnamed: 0,Tag,counts
0,B-art,28
1,B-eve,10
2,B-geo,244
3,B-gpe,303
4,B-nat,5
5,B-org,176
6,B-per,160
7,B-tim,149
8,I-art,20
9,I-eve,10


###Train Test split - 70-30%

In [8]:
X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0)
print(X_train.shape, y_train.shape)
new_classes = classes.copy()
new_classes.pop()

(6700, 3242) (6700,)


'O'

## CRF - Conditional Random Fields 

In [9]:
# Installation of sklearn_crfsuite
! pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3 (from sklearn_crfsuite)
[?25l  Downloading https://files.pythonhosted.org/packages/2f/86/cfcd71edca9d25d3d331209a20f6314b6f3f134c29478f90559cee9ce091/python_crfsuite-0.9.6-cp36-cp36m-manylinux1_x86_64.whl (754kB)
[K     |████████████████████████████████| 757kB 4.1MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.6 sklearn-crfsuite-0.3.6


In [0]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

### Sentences

In [0]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

### Feature Extraction and Train-Test split

In [0]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
  
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, label in sent]

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

### CRF model

In [13]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
y_pred = crf.predict(X_test)

print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

              precision    recall  f1-score   support

       B-art       0.50      0.40      0.44         5
       B-eve       0.00      0.00      0.00         2
       B-geo       0.79      0.68      0.73        77
       B-gpe       0.75      0.88      0.81        91
       B-nat       0.00      0.00      0.00         2
       B-org       0.77      0.68      0.72        53
       B-per       0.85      0.92      0.88        61
       B-tim       0.95      0.89      0.92        45
       I-art       0.00      0.00      0.00         4
       I-eve       0.00      0.00      0.00         1
       I-geo       0.75      0.38      0.50        16
       I-gpe       0.67      0.57      0.62         7
       I-nat       0.00      0.00      0.00         2
       I-org       0.74      0.70      0.72        50
       I-per       0.87      0.97      0.92        75
       I-tim       0.33      1.00      0.50         1

   micro avg       0.80      0.78      0.79       492
   macro avg       0.50   

  'precision', 'predicted', average, warn_for)


## NER with Spacy
On comparing the performance of Spacy and the above implemented CRF model, it is observed that Spacy has a better performance. Hence, we use Spacy further in our project.

In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
nlp = en_core_web_sm.load()
from pprint import pprint

###  Preprocess the dataset and convert words into paragraph

In [0]:
agg_func = lambda s: [(w, p, t) for w, p, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['Tag'].values.tolist())]
        
        
grouped = df.groupby('Sentence #').apply(agg_func)
grouped = [s for s in grouped]
sentences = []

iob_tag_list = []
no_of_words = 0
for sent in grouped:
  no_of_words += len(sent)
  
  sentences.append(" ".join([x[0] for x in sent]))
  iob_tag_list.append([(x[0],x[2]) for x in sent])

### Spacy model evalution

In [20]:
# Process actual pair to split '-' words into separate rows
def process_actual_pair(actual_pair):
  modified_actual_pair = []
  for pair in actual_pair:
    if '-' in pair[0] and not re.search(r'\d', pair[0]) and '-'*len(pair[0]) != pair[0]:
      words = [ w for w in pair[0].split('-') if w.strip() != '']
      count = len(words) - 1
      for word in words:
        if word == '':
          print("Space")
        modified_actual_pair.append((word, pair[1]))
        if count > 0:
          modified_actual_pair.append(('-', pair[1]))
          count -= 1
    elif re.search(r'\d', pair[0]) and len(pair[0].split('-')) > 2:
      words = pair[0].split('-')
      first, last =  words[:2], words[2:-1]
    else:
        modified_actual_pair.append(pair)
  return modified_actual_pair
        

def calculate_spacy_performance_metrics(predictions, actual_pair):
  modified_actual_pair = process_actual_pair(actual_pair)
  no_pred = len(predictions)
  no_mod = len(modified_actual_pair)
  truth = []
  test = []
  if no_pred == no_mod:
    for pred, actual in zip(predictions, modified_actual_pair):  
      if str(pred[0]) == str(actual[0]) :
        test.append(pred[1])
        truth.append(actual[1])
    return test, truth
  else:
#     print("Modified")
#     print(no_pred, no_mod)
    return [], []


truth = []
test = []

for index, sentence in enumerate(sentences):
  doc = nlp(sentence)
  iob_tags = iob_tag_list[index]
  predictions = [(X, X.ent_iob_) for X in doc]
  result = calculate_spacy_performance_metrics(predictions, iob_tags)
  test += result[0]
  truth += result[1]

print("Spacy: ")
print("\t \t Precision \t Recall \t F-Score")
print("Micro: " + str(precision_recall_fscore_support(truth, test, average='micro')))
print("Macro: " + str(precision_recall_fscore_support(truth, test, average='macro')))
print("Weighted: " + str(precision_recall_fscore_support(truth, test, average='weighted')))
print("Accuracy: " + str(accuracy_score(truth, test)))

Spacy: 
	 	 Precision 	 Recall 	 F-Score
Micro: (0.7886391007627459, 0.7886391007627459, 0.7886391007627459, None)
Macro: (0.05160705607292501, 0.04897323237044654, 0.0502556591978153, None)
Weighted: (0.8310528083280995, 0.7886391007627459, 0.809290625911828, None)
Accuracy: 0.7886391007627459


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
