# Deliverable 2

- Deliverable 2 will be a NER (Named entity recognition system).


# 1. Prepate the data

url = https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

Read the data.

In [2]:
data = pd.read_csv("../data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1")

In [3]:
data.head(70)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
65,,Hyde,NNP,B-geo
66,,Park,NNP,I-geo
67,,.,.,O
68,Sentence: 4,Police,NNS,O


Notice that one sentence spans various rows. The sentence is written along the column "Words" and has NaN's in the "Sentence #" column, except for the first word of each sentence, which has the number of the sentence in such column. Therefore, the first thing we should do is fix this structural problem. This will be done by filling with "Sentence: k" for each k.

## Indexing Sentences

In [4]:
sentence_formatter = "Sentence: {}"

In [5]:
last_n = 2000
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]

In [6]:
data = data[0:end]

In [7]:
n_sentences = len(list(set(data["Sentence #"])))
first_n = 1
last_n = last_n -1
print(n_sentences)

2000


In [8]:
%%time 
sentence_formatter = "Sentence: {}"

for s_id in  range(first_n, last_n):
    print("current {}/{}".format(s_id,last_n), end="\r")
    sentence_id = sentence_formatter.format(s_id)
    sentence_id_next = sentence_formatter.format(s_id + 1)
    start = data.index[data["Sentence #"] == sentence_id][0]
    end   = data.index[data["Sentence #"] == sentence_id_next][0]
    data["Sentence #"][start:end] = sentence_id
    
sentence_id = sentence_formatter.format(last_n)
start = data.index[data["Sentence #"] == sentence_id][0]
end   = data.shape[0]
data["Sentence #"][start:end] = sentence_id

CPU times: user 8.78 s, sys: 140 ms, total: 8.92 s
Wall time: 8.48 s


Now, we have the sentences properly identified.

In [9]:
data.head(70)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
65,Sentence: 3,Hyde,NNP,B-geo
66,Sentence: 3,Park,NNP,I-geo
67,Sentence: 3,.,.,O
68,Sentence: 4,Police,NNS,O


# 2. Feature mapper
What remains is taken from notebook *09_structured_perceptron.ipynb* (from *Lecture10*).

In [10]:
#import all the required packages for doing the steps required on what remains

import scipy
import numpy as np

import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import skseq


from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
from skseq.sequences import sequence_list
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences import label_dictionary
from skseq.sequences.label_dictionary import LabelDictionary
from skseq.sequences import extended_feature

## Determining the training and testing sequences

### Building the dictionaries

In [11]:
n_sentences = 2000

X = []
Y = []

sentence_formatter = "Sentence: {}"

for i in range(1,n_sentences):
    s = sentence_formatter.format(i)
    X.append(list(data[data["Sentence #"]==s]["Word"].values))
    Y.append(list(data[data["Sentence #"]==s]["Tag"].values))

In [12]:
def build_word_to_pos(X):

    word_to_pos = {}
    i = 0
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i +=1
                
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i +=1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}

    return tag_to_pos, pos_to_tag

In [13]:
word_to_pos, pos_to_word = build_word_to_pos(X)
tag_to_pos, pos_to_tag  = build_tag_to_pos(Y)

### Splitting the data into train and test

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)    

### Constructing the SequenceList object

In [16]:
#convert X and Y into integers
X_train = [[word_to_pos[w] for w in s] for s in X_train]
Y_train = [[tag_to_pos[t] for t in s] for s in Y_train]

#convert X and Y into integers
X_test = [[word_to_pos[w] for w in s] for s in X_test]
Y_test = [[tag_to_pos[t] for t in s] for s in Y_test]

In [17]:
list_seq_train = []
for i in np.arange(len(X_train)):
    seq = skseq.sequences.sequence.Sequence(X_train[i], Y_train[i])
    list_seq_train.append(seq)
    
    
list_seq_test = []
for i in np.arange(len(X_test)):
    seq = skseq.sequences.sequence.Sequence(X_test[i], Y_test[i])
    list_seq_test.append(seq)

In [18]:
word_to_pos2 = LabelDictionary(word_to_pos)
word_to_pos2.label_names = word_to_pos2.keys()
tag_to_pos2 = LabelDictionary(tag_to_pos)
tag_to_pos2.label_names = tag_to_pos2.keys()

In [19]:
train_seq = SequenceList(x_dict = word_to_pos2, y_dict = tag_to_pos2)
test_seq = SequenceList(x_dict = word_to_pos2, y_dict = tag_to_pos2)

In [20]:
train_seq.seq_list = list_seq_train
test_seq.seq_list = list_seq_test

In [21]:
print(type(train_seq))
print(type(test_seq))

<class 'skseq.sequences.sequence_list.SequenceList'>
<class 'skseq.sequences.sequence_list.SequenceList'>


In [22]:
print(train_seq.seq_list[0])
print(test_seq.seq_list[0])

1620/2 1189/0 957/0 1564/7 19/0 9/0 980/0 1035/0 1150/0 7/0 1781/0 7/0 102/0 9/0 105/0 11/0 1634/1 11/0 994/0 1147/7 21/0 
1817/1 151/0 968/0 9/0 340/0 51/0 1/0 1790/0 1053/0 1054/0 1081/0 1490/0 1/0 666/1 21/0 


In [23]:
print(train_seq[0].x)
print(train_seq[0].y)

[1620, 1189, 957, 1564, 19, 9, 980, 1035, 1150, 7, 1781, 7, 102, 9, 105, 11, 1634, 11, 994, 1147, 21]
[2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 7, 0]


## Create the feature mapper

In [24]:
feature_mapper = skseq.sequences.extended_feature.ExtendedFeatures(train_seq) #initialize
feature_mapper.build_features() #get the features

In [25]:
#Show the tags
set([x.split(":")[0] for x in feature_mapper.feature_dict.keys()])

{'KeyWord',
 'LettersNumbers',
 'Numeric',
 'PuncSign',
 'capitalized',
 'final_prev_tag',
 'id',
 'init_tag',
 'prev_tag',
 'suffix',
 'uppercase'}

In [26]:
#number of entries in the dictionary
len(feature_mapper.feature_dict)

7141

In [27]:
feature_mapper.feature_dict

{'init_tag:B-gpe': 0,
 'id:Japanese::B-gpe': 1,
 'suffix:ese::B-gpe': 2,
 'capitalized:B-gpe': 3,
 'id:media::O': 4,
 'prev_tag:B-gpe::O': 5,
 'id:reported::O': 6,
 'prev_tag:O::O': 7,
 'id:Monday::B-tim': 8,
 'capitalized:B-tim': 9,
 'KeyWord:monday::B-tim': 10,
 'prev_tag:O::B-tim': 11,
 'id:that::O': 12,
 'prev_tag:B-tim::O': 13,
 'id:the::O': 14,
 'id:two::O': 15,
 'id:countries::O': 16,
 'id:hope::O': 17,
 'id:to::O': 18,
 'id:arrange::O': 19,
 'id:hold::O': 20,
 'id:talks::O': 21,
 'id:in::O': 22,
 'id:Beijing::B-geo': 23,
 'capitalized:B-geo': 24,
 'prev_tag:O::B-geo': 25,
 'prev_tag:B-geo::O': 26,
 'id:early::O': 27,
 'id:November::B-tim': 28,
 'suffix:er::B-tim': 29,
 'id:.::O': 30,
 'PuncSign:O': 31,
 'final_prev_tag:O': 32,
 'init_tag:O': 33,
 'id:The::O': 34,
 'capitalized:O': 35,
 'id:boat::O': 36,
 'id:originated::O': 37,
 'id:from::O': 38,
 'id:Somalia::B-geo': 39,
 'id:and::O': 40,
 'id:most::O': 41,
 'id:of::O': 42,
 'id:those::O': 43,
 'id:on::O': 44,
 'id:board::O': 

## Inspection

Let us see some of the entries of the dictionary.

In [28]:
# Looking at some features and the position they have assigned
c =0
print("First 5 features in the dicitionary\n")
for i in feature_mapper.feature_dict:
    print(i, ":", feature_mapper.feature_dict[i])
    c +=1
    if c>=5:
        break

First 5 features in the dicitionary

init_tag:B-gpe : 0
id:Japanese::B-gpe : 1
suffix:ese::B-gpe : 2
capitalized:B-gpe : 3
id:media::O : 4


## Training the perceptron

In [29]:
import skseq.sequences.structured_perceptron as spc

In [30]:
sp = spc.StructuredPerceptron(word_to_pos2, tag_to_pos2, feature_mapper) #create the perceptron

In [31]:
%%time
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs) #training the perceptron

Epoch: 0 Accuracy: 0.890524
Epoch: 1 Accuracy: 0.919482
Epoch: 2 Accuracy: 0.933634
Epoch: 3 Accuracy: 0.937775
Epoch: 4 Accuracy: 0.944326
Epoch: 5 Accuracy: 0.950594
Epoch: 6 Accuracy: 0.950736
Epoch: 7 Accuracy: 0.955557
Epoch: 8 Accuracy: 0.959556
Epoch: 9 Accuracy: 0.958989
Epoch: 10 Accuracy: 0.960691
Epoch: 11 Accuracy: 0.962307
Epoch: 12 Accuracy: 0.961882
Epoch: 13 Accuracy: 0.963867
Epoch: 14 Accuracy: 0.964009
CPU times: user 3min, sys: 384 ms, total: 3min
Wall time: 3min


### Inspection of the trained perceptron

In [32]:
len(sp.parameters)

7141

In [33]:
sp.parameters

array([5.86666667, 4.13333333, 8.93333333, ..., 0.        , 0.        ,
       4.6       ])

## Evaluating model quality

In [34]:
def evaluate_corpus(sequences, sequences_predictions):
    """
    Evaluate classification accuracy at corpus level, comparing with gold standard.
    """
    total = 0.0
    correct = 0.0
    for i, sequence in enumerate(sequences):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            if sequence.y[j] == y_hat:
                correct += 1
            total += 1
    return correct / total

In [35]:
# Make predictions for the various sequences using the trained model.
pred_train = sp.viterbi_decode_corpus(train_seq)
pred_test = sp.viterbi_decode_corpus(test_seq)

In [36]:
# Evaluate and print accuracies
eval_train = evaluate_corpus(train_seq.seq_list, pred_train)
#eval_dev = evaluate_corpus(dev_seq.seq_list, pred_dev)
eval_test = evaluate_corpus(test_seq.seq_list, pred_test)
print("SP -  Accuracy Train: %.3f Test: %.3f"%(eval_train, eval_test))

SP -  Accuracy Train: 0.966 Test: 0.947


### Inspecting the trained perceptron on the testing sequences

In [37]:
pred_test[0].to_words(train_seq)

'Egypt/B-geo has/O had/O the/O largest/O number/O of/O human/O bird/O flu/O cases/O outside/O of/O Asia/B-geo ./O '

In [38]:
pred_test[9].to_words(train_seq)

'Earlier/O in/O the/O week/O ,/O Cheney/B-per visited/O Japan/B-geo and/O the/O U.S./B-org Pacific/B-geo island/O of/O Guam/B-geo to/O discuss/O regional/O security/O issues/O ,/O and/O rally/O support/O for/O the/O Iraq/B-geo war/O ./O '