# Deliverable 2

- Deliverable 2 will be a NER (Named entity recognition system).


## Overview of the data

url = https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [59]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
import skseq
from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
import csv
import os.path
import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import tqdm
from collections import Counter
import pickle
from sklearn.metrics import confusion_matrix
import skseq.sequences.structured_perceptron as spc

The data is located in the 'data' folder

In [15]:
!ls ../NLP/project_2/

ls: ../NLP/project_2/: No such file or directory


We can take a look at the first lines of the dataset

In [16]:
#/Users/Julian/4_Semester/NLP/project_2/entity-annotated-corpus/ner_dataset.csv
my_path = os.path.dirname(os.path.abspath("__file__"))
my_path
data = pd.read_csv(my_path+ "/data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1")

In [17]:
data.head(70)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
65,,Hyde,NNP,B-geo
66,,Park,NNP,I-geo
67,,.,.,O
68,Sentence: 4,Police,NNS,O


# Data Preparation

 Fill with "Sentence: k" for each k

In [18]:
sentences = list(set(data["Sentence #"]))
sentences[0] = "nan"
sentences.sort()
len(sentences)

47960

In [19]:
sentences[0:3]

['Sentence: 1', 'Sentence: 10', 'Sentence: 100']

In [20]:
set(data["Tag"])

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [21]:
for tag in set(data["Tag"]):
    print("\nTAG:",tag)
    print(data[data["Tag"] == tag]["Word"][0:10])


TAG: I-org
98        Party
155      Atomic
156      Energy
157      Agency
235       Union
249    Security
250     Council
329      Berger
360       Shell
544       Qaida
Name: Word, dtype: object

TAG: I-nat
5045            Jing
5074            Jing
12509          Acute
12510    Respiratory
12511       Syndrome
22948        Katrina
23055        Katrina
29719        Katrina
34813        Katrina
68389        Katrina
Name: Word, dtype: object

TAG: I-eve
4854      Summer
4855    Olympics
5036     Olympic
5171      Medusa
5764         War
6730        Open
6756     Classic
6834        Open
9990         War
9991          II
Name: Word, dtype: object

TAG: B-gpe
18     British
102    English
113    Britain
126    British
173       Iran
181       Iran
196    Iranian
238       U.S.
245       Iran
259     Tehran
Name: Word, dtype: object

TAG: B-art
263       Nuclear
3769     Saltillo
3810    Pentastar
3814     Chrysler
3816        Dodge
3818         Jeep
3820          Ram
3863        Vioxx
39

How many sentences do we have?

In [22]:
"Sentence: 47959" in sentences, "Sentence: 47960" in sentences

(True, False)

## Indexing Sentences

In [23]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(0) in sentences

False

In [24]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(1) in sentences

True

In [25]:
i = 1
sentence_id      = sentence_formatter.format(i)
sentence_id_next = sentence_formatter.format(i+1)
sentence_id, sentence_id_next

('Sentence: 1', 'Sentence: 2')

In [26]:
print(data.index[data["Sentence #"] == sentence_id])
print(data.index[data["Sentence #"] == sentence_id_next])

Int64Index([0], dtype='int64')
Int64Index([24], dtype='int64')


In [27]:
start = data.index[data["Sentence #"] == sentence_id][0]
end   =  data.index[data["Sentence #"] == sentence_id_next][0]
start, end

(0, 24)

In [28]:
data["Sentence #"][start:end] = sentence_id

In [29]:
data["Sentence #"][start:end]

0     Sentence: 1
1     Sentence: 1
2     Sentence: 1
3     Sentence: 1
4     Sentence: 1
5     Sentence: 1
6     Sentence: 1
7     Sentence: 1
8     Sentence: 1
9     Sentence: 1
10    Sentence: 1
11    Sentence: 1
12    Sentence: 1
13    Sentence: 1
14    Sentence: 1
15    Sentence: 1
16    Sentence: 1
17    Sentence: 1
18    Sentence: 1
19    Sentence: 1
20    Sentence: 1
21    Sentence: 1
22    Sentence: 1
23    Sentence: 1
Name: Sentence #, dtype: object

# Selecting Training and Validation Set

In [30]:
data = pd.read_csv(my_path+ "/data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1")


# Train set: From "Sentence: 1" to "Sentence: 35970"
last_n = 35971
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]
data = data[0:end]
# n_sentences = len(list(set(data["Sentence #"])))
# # first_n = 1
# # last_n = last_n -1
# print(n_sentences)

In [31]:
data = data.fillna(method='ffill')
data['Sentence #'].nunique(), data.Word.nunique(), data.Tag.nunique()
n_sentences = len(list(set(data["Sentence #"])))
print(n_sentences)

35970


In [32]:
data.tail(2)

Unnamed: 0,Sentence #,Word,POS,Tag
786876,Sentence: 35970,year,NN,O
786877,Sentence: 35970,.,.,O


In [59]:
# %%time 
# sentence_formatter = "Sentence: {}"

# for s_id in  range(first_n, last_n):
#     print("current {}/{}".format(s_id,last_n), end="\r")
#     sentence_id = sentence_formatter.format(s_id)
#     sentence_id_next = sentence_formatter.format(s_id + 1)
#     start = data.index[data["Sentence #"] == sentence_id][0]
#     end   = data.index[data["Sentence #"] == sentence_id_next][0]
#     data["Sentence #"][start:end] = sentence_id
    
# sentence_id = sentence_formatter.format(last_n)
# start = data.index[data["Sentence #"] == sentence_id][0]
# end   = data.shape[0]
# data["Sentence #"][start:end] = sentence_id


## Building X and Y

In [33]:
n_sentences

35970

In [34]:
X = []
Y = []

sentence_formatter = "Sentence: {}"

for i in range(1,n_sentences):
     s = sentence_formatter.format(i)
     X.append(list(data[data["Sentence #"]==s]["Word"].values))
     Y.append(list(data[data["Sentence #"]==s]["Tag"].values))

In [35]:
pickle.dump(X, open("data/X", 'wb'))
pickle.dump(Y, open("data/Y", 'wb'))

In [101]:
#X = pickle.load(open('data/X', 'rb'))
#Y = pickle.load(open('data/Y', 'rb'))

In [37]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X[i],Y[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

In [38]:
def build_word_to_pos(X):

    word_to_pos = {}
    i = 0
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i +=1
                
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i +=1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}

    return tag_to_pos, pos_to_tag

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(
X, Y, test_size=0.33, random_state=42)

In [40]:
word_to_pos_train, pos_to_word_test = build_word_to_pos(X_train)
tag_to_pos_train, pos_to_tag_test  = build_tag_to_pos(y_train)
len(word_to_pos_train), len(tag_to_pos_train)

(25940, 17)

In [41]:
word_to_pos_validation, pos_to_word_validation = build_word_to_pos(X_validation)
tag_to_pos_validation, pos_to_tag_validation  = build_tag_to_pos(y_validation)
len(word_to_pos_validation), len(tag_to_pos_validation)

(18789, 17)

# Selecting Testing Set

In [46]:
data = pd.read_csv(my_path+ "/data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1")

# Test set: From "Sentence: 35971" to "Sentence: 47959"
first_n = 35971
# last_n = 5000
start   = data.index[data["Sentence #"] == sentence_formatter.format(first_n)][0]
# end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]
data = data[start:]
# n_sentences = len(list(set(data["Sentence #"])))
# last_n = last_n -1
# print(n_sentences)

In [47]:
data = data.fillna(method='ffill')
data['Sentence #'].nunique(), data.Word.nunique(), data.Tag.nunique()
n_sentences = len(list(set(data["Sentence #"])))
# last_n = last_n -1
print(n_sentences)

11989


In [48]:
data

Unnamed: 0,Sentence #,Word,POS,Tag
786878,Sentence: 35971,Officials,NNS,O
786879,Sentence: 35971,said,VBD,O
786880,Sentence: 35971,Sunday,NNP,B-tim
786881,Sentence: 35971,Jean,NNP,B-per
786882,Sentence: 35971,Rene,NNP,I-per
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


## Building X and Y

In [73]:
X_test = []
Y_test = []

 sentence_formatter = "Sentence: {}"

for i in range(35971,47960):
    s = sentence_formatter.format(i)
    X_test.append(list(data[data["Sentence #"]==s]["Word"].values))
    Y_test.append(list(data[data["Sentence #"]==s]["Tag"].values   )) 
                       
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X_test[i],Y_test[i])]
 " ".join(xy)

In [74]:
# import pickle
pickle.dump(X_test, open("data/X_test", 'wb'))
pickle.dump(Y_test, open("data/Y_test", 'wb'))

In [49]:
#X_test = pickle.load(open('data/X_test', 'rb'))
#Y_test = pickle.load(open('data/Y_test', 'rb'))

In [50]:
word_to_pos_test, pos_to_word_test = build_word_to_pos(X_test)
tag_to_pos_test, pos_to_tag_test  = build_tag_to_pos(Y_test)
len(word_to_pos_test), len(tag_to_pos_test)

(17838, 17)

# Creating sequence list for 3 datasubsets

In [51]:
#Sequence things
from tqdm import tqdm
import skseq
from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary
import skseq.sequences.structured_perceptron as spc

In [52]:
sequence_list_train = SequenceList(LabelDictionary(word_to_pos_train), LabelDictionary(tag_to_pos_train))
for x,y in zip(tqdm(X_train),y_train):
    #needs to be a Label dict, since it uses the get_id function of that class
    sequence_list_train.add_sequence(x,y, LabelDictionary(word_to_pos_train), LabelDictionary(tag_to_pos_train))

100%|██████████| 24099/24099 [08:27<00:00, 47.47it/s]


In [53]:
sequence_list_validation = SequenceList(LabelDictionary(word_to_pos_validation), LabelDictionary(tag_to_pos_validation))
for x,y in zip(tqdm(X_validation),y_validation):
    #needs to be a Label dict, since it uses the get_id function of that class
    sequence_list_validation.add_sequence(x,y, LabelDictionary(word_to_pos_validation), LabelDictionary(tag_to_pos_validation))

100%|██████████| 11870/11870 [02:55<00:00, 67.51it/s]


In [54]:
sequence_list_test = SequenceList(LabelDictionary(word_to_pos_test), LabelDictionary(tag_to_pos_test))
for x,y in zip(tqdm(X_test),Y_test):
    #needs to be a Label dict, since it uses the get_id function of that class
    sequence_list_test.add_sequence(x,y, LabelDictionary(word_to_pos_test), LabelDictionary(tag_to_pos_test))

100%|██████████| 11989/11989 [02:19<00:00, 86.08it/s]


In [55]:
sequence_list_test

[0/0 1/0 2/1 3/2 4/3 5/3 6/0 7/0 8/0 9/0 10/0 11/0 12/0 13/0 14/0 15/0 16/0 17/0 18/4 19/0 , 20/0 21/0 22/0 9/0 23/0 24/0 25/0 26/0 27/0 28/0 29/2 30/3 28/0 6/0 7/0 8/0 9/0 10/0 11/0 31/5 32/0 19/0 , 33/2 30/3 28/0 14/0 34/0 27/0 17/0 35/4 36/0 37/0 38/0 28/0 6/0 39/0 17/0 40/0 41/0 17/0 9/0 42/1 43/6 44/0 45/0 46/0 33/2 47/3 48/0 49/0 14/0 50/0 19/0 , 31/5 51/0 52/0 53/0 54/0 55/0 14/0 56/0 57/0 8/0 58/0 59/0 8/0 60/0 11/0 61/0 12/0 62/0 19/0 , 63/0 64/2 65/0 66/0 67/0 68/0 17/0 14/0 69/4 70/0 71/0 28/0 72/0 73/0 74/0 28/0 75/0 76/0 77/0 78/0 79/0 80/0 17/0 81/0 82/1 19/0 , 83/0 84/0 51/0 85/0 86/0 87/0 88/0 89/0 48/0 90/0 91/0 92/0 93/0 70/0 94/0 75/0 95/0 96/0 19/0 , 69/4 97/0 51/0 85/0 86/0 87/0 98/0 48/0 99/0 100/0 101/0 14/0 102/7 103/8 104/8 48/0 105/0 106/0 107/0 55/0 108/0 109/0 19/0 , 110/0 111/0 112/0 113/0 114/0 115/0 116/0 73/0 74/0 107/0 8/0 117/0 118/0 17/0 14/0 69/4 119/0 120/0 73/0 121/0 28/0 14/0 97/0 122/0 123/0 124/0 125/0 55/0 79/0 48/0 126/0 127/0 128/0 129/0 130/

In [56]:
len(sequence_list_train.x_dict)

25940

In [57]:
len(sequence_list_validation.x_dict)

18789

In [58]:
len(sequence_list_test.x_dict)

17838

In [104]:
pickle.dump(sequence_list_train, open("data/sequence_list_train", 'wb'))
pickle.dump(sequence_list_validation, open("data/sequence_list_validation", 'wb'))
pickle.dump(sequence_list_test, open("data/sequence_list_test", 'wb'))

# Training Structured Perceptron- default FM

## Building Simple Feature Mapper

In [107]:
from skseq.sequences import extended_feature
import pprint

feature_mapper = skseq.sequences.extended_feature.ExtendedFeatures(sequence_list_train)
feature_mapper.additional_features = [0] #Use no extra features
feature_mapper.build_features()
pickle.dump(feature_mapper, open("data/feature_mapper", 'wb'))
pprint.pprint(list(feature_mapper.__dict__.keys()))

['feature_dict',
 'feature_list',
 'add_features',
 'dataset',
 'node_feature_cache',
 'initial_state_feature_cache',
 'final_state_feature_cache',
 'edge_feature_cache',
 'additional_features']


In [61]:
len(feature_mapper.feature_dict),list(feature_mapper.feature_dict)[0:10]

(43701,
 ['init_tag:O',
  'id:We::O',
  'suffix:e::O',
  'id:will::O',
  'suffix:l::O',
  'suffix:ll::O',
  'suffix:ill::O',
  'prev_tag:O::O',
  'id:get::O',
  'suffix:t::O'])

In [62]:
inv_feature_dict = {word: pos for pos, word in feature_mapper.feature_dict.items()}
id_seq = 6
feature_type = ["Initial features", "Transition features", "Final features", "Emission features"]

for feat,feat_ids in enumerate(feature_mapper.get_sequence_features(sequence_list_train[id_seq])):
    print(feature_type[feat])
    for id_list in feat_ids:
        print ("\t",id_list)
        for k,id_val in enumerate(id_list):
            print ("\t\t", inv_feature_dict[id_val] )
    print("\n")

Initial features
	 [324]
		 init_tag:B-geo


Transition features
	 [31]
		 prev_tag:B-geo::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [66]
		 prev_tag:O::B-per
	 [71]
		 prev_tag:B-per::I-per
	 [72]
		 prev_tag:I-per::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O
	 [7]
		 prev_tag:O::O


Final features
	 [92]
		 final_prev_tag:O


Emission features
	 [325, 326, 327, 328]
		 id:Saddam::B-geo
		 suffix:m::B-geo
		 suffix:am::B-geo
		 suffix:dam::B-geo
	 [53, 54]
		 id:'s::O
		 suffix:s::O
	 [329, 75, 330, 331]
		 id:head::O
		 suffix:d::O
		 suffix:ad::O
		 suffix:ead::O
	 [332, 2, 111, 333]
		 id:defense::O
		 suffix:e::O
		 suffix:se::O
		 suffix:nse::O
	 [334,

## Training Weights of Perceptron

In [63]:
sp = spc.StructuredPerceptron(sequence_list_train.x_dict, sequence_list_train.y_dict, feature_mapper)
sp.num_epochs = 5

In [64]:
%%time
num_epochs = 10
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.907107
Epoch: 1 Accuracy: 0.932806
Epoch: 2 Accuracy: 0.940094
Epoch: 3 Accuracy: 0.944213
Epoch: 4 Accuracy: 0.947934
Epoch: 5 Accuracy: 0.950148
Epoch: 6 Accuracy: 0.951618
Epoch: 7 Accuracy: 0.953421
Epoch: 8 Accuracy: 0.954620
Epoch: 9 Accuracy: 0.955111
CPU times: user 47min 49s, sys: 21.4 s, total: 48min 10s
Wall time: 51min 29s


In [67]:
pickle.dump(sp, open("data/sp_01", 'wb'))

In [102]:
#sp = pickle.load(open('data/sp_01', 'rb'))

In [103]:
# Make predictions for the various sequences using the trained model.
pred_train = sp.viterbi_decode_corpus(sequence_list_train)
pred_dev   = sp.viterbi_decode_corpus(sequence_list_validation)
pred_test = sp.viterbi_decode_corpus(sequence_list_test)

In [105]:
pickle.dump(pred_train, open("data/pred_train_1", 'wb'))
pickle.dump(pred_dev, open("data/pred_dev_1", 'wb'))
pickle.dump(pred_test, open("data/pred_test_1", 'wb'))

# Training Structured Perceptron- Extended FM

## Building Extended Feature Mapper

In [113]:
from skseq.sequences.id_feature import IDFeatures
from skseq.sequences.id_feature import UnicodeFeatures

# ----------
# Feature Class
# Extracts features from a labeled corpus (only supported features are extracted
# ----------
class ExtendedFeatures(IDFeatures):
    
    def __init__(self, dataset):
        IDFeatures.__init__(self, dataset) #Get all the previous init of the inherited class
        self.additional_features = [] #If empty, use all, if 0, use none
    
    def add_emission_features(self, sequence, pos, y, features):
        x = sequence.x[pos]
        # Get tag name from ID.
        y_name = self.dataset.y_dict.get_label_name(y)

        # Get word name from ID.
        if isinstance(x, str):
            x_name = x
        else:
            x_name = self.dataset.x_dict.get_label_name(x)

        word = str(x_name)
        # Generate feature name.
        feat_name = "id:%s::%s" % (word, y_name)
        # Get feature ID from name.
        feat_id = self.add_feature(feat_name)
        # Append feature.
        if feat_id != -1:
            features.append(feat_id)
        
        # Suffixes
        max_suffix = 3
        for i in range(max_suffix):
            if len(word) > i+1:
                suffix = word[-(i+1):]
                # Generate feature name.
                feat_name = "suffix:%s::%s" % (suffix, y_name)
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)  
                    
                    
        if len(self.additional_features) == 0 or 2 in self.additional_features:
        #If there is a capital letter at the beggining
            if word[0].isupper():
                # Generate feature name.
                feat_name = "upper::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)

        if len(self.additional_features) == 0 or 3 in self.additional_features:
            #Hyphen
            if str.find(word, "-") != -1:
                # Generate feature name.
                feat_name = "hyphen::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)

        if len(self.additional_features) == 0 or 4 in self.additional_features:
            #If there the word ends with -ed   
            if str.endswith(word,'ed'):
                # Generate feature name.
                feat_name = "verb_ed::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)

        if len(self.additional_features) == 0 or 5 in self.additional_features:
            #If there the word ends with -ly      
            if str.endswith(word,'ly'):
                # Generate feature name.
                feat_name = "adverb_ly::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)


        if len(self.additional_features) == 0 or 6 in self.additional_features:
            #If the word ends with 'ing'
            if str.endswith(word,'ing'):
                # Generate feature name.
                feat_name = "end_ing::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)
                    
        if len(self.additional_features) == 0 or 7 in self.additional_features:
            #If the word ends with 'or'
            if str.endswith(word,'or'):
                # Generate feature name.
                feat_name = "or_suf::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)        
        
        if len(self.additional_features) == 0 or 8 in self.additional_features:
            #If the word starts with 'up'
            if str.startswith(word,'up'):
                # Generate feature name.
                feat_name = "up_pref::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)


        if len(self.additional_features) == 0 or 9 in self.additional_features: 
            #If the word contains a full stop 
            if str.find(word, ".") != -1:
                # Generate feature name.
                feat_name = "points::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)

        if len(self.additional_features) == 0 or 10 in self.additional_features:      
            #If the word is 'to'
            if word == 'to':
                # Generate feature name.
                feat_name = "to_prep::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)


        if len(self.additional_features) == 0 or 11 in self.additional_features:
            #If the word is 'of'
            if word == 'of':
                # Generate feature name.
                feat_name = "of_prep::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)



        if len(self.additional_features) == 0 or 12 in self.additional_features:
            #If the word is 'from'
            if word == 'from':
                # Generate feature name.
                feat_name = "from_prep::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)
                    

        if len(self.additional_features) == 0 or 13 in self.additional_features:
            #If the word is 'the'
            if word == 'the':
                # Generate feature name.
                feat_name = "the_art::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)



        if len(self.additional_features) == 0 or 14 in self.additional_features:
            #If the word is 'in'
            if word == 'in':
                # Generate feature name.
                feat_name = "in_prep::%s" % y_name
                # Get feature ID from name.
                feat_id = self.add_feature(feat_name)
                # Append feature.
                if feat_id != -1:
                    features.append(feat_id)


                    
        return features

In [114]:
# https://github.com/LxMLS/lxmls-toolkit/blob/master/lxmls/sequences/extended_feature.py
feature_mapper_2 = ExtendedFeatures(sequence_list_train)
feature_mapper_2.build_features()

In [94]:
inv_feature_dict = {word: pos for pos, word in feature_mapper.feature_dict.items()}
id_seq = 6
feature_type = ["Initial features", "Transition features", "Final features", "Emission features"]

for feat,feat_ids in enumerate(feature_mapper_2.get_sequence_features(sequence_list_train[id_seq])):
    print(feature_type[feat])
    for id_list in feat_ids:
        print ("\t",id_list)
        for k,id_val in enumerate(id_list):
            print ("\t\t", inv_feature_dict[id_val] )
    print("\n")

Initial features
	 [345]
		 suffix:ld::O


Transition features
	 [36]
		 suffix:ay::B-tim
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [75]
		 suffix:d::O
	 [81]
		 suffix:ver::O
	 [82]
		 id:a::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O
	 [8]
		 id:get::O


Final features
	 [105]
		 suffix:es::O


Emission features
	 [346, 347, 348, 349, 29]
		 suffix:uld::O
		 id:appeal::O
		 suffix:eal::O
		 id:decision::O
		 suffix:my::O
	 [61, 62]
		 suffix:man::O
		 id:,::O
	 [350, 85, 351, 352]
		 id:asked::O
		 suffix:ort::O
		 id:for::O
		 id:halt::O
	 [353, 2, 124, 354]
		 suffix:lt::O
		 suffix:e::O
		 id:who::O
		 suffix:alt::O
	 [355, 13, 14, 356]
		 id:court::O
		 suffix:er::O
		 suffix:her::O
		 suffix:urt::O
	 [357, 358, 359, 360, 74

In [95]:
sp = spc.StructuredPerceptron(sequence_list_train.x_dict, sequence_list_train.y_dict, feature_mapper_2)
sp.num_epochs = 5

## Training Weights of Perceptron

In [96]:
%%time
num_epochs = 10
sp.fit(feature_mapper_2.dataset, num_epochs)

Epoch: 0 Accuracy: 0.930445
Epoch: 1 Accuracy: 0.943236
Epoch: 2 Accuracy: 0.947381
Epoch: 3 Accuracy: 0.950591
Epoch: 4 Accuracy: 0.952001
Epoch: 5 Accuracy: 0.953811
Epoch: 6 Accuracy: 0.955037
Epoch: 7 Accuracy: 0.956422
Epoch: 8 Accuracy: 0.956791
Epoch: 9 Accuracy: 0.957908
CPU times: user 44min 15s, sys: 9.29 s, total: 44min 24s
Wall time: 44min 30s


In [108]:
pickle.dump(sp, open("data/sp_02", 'wb'))

## Predictions

In [97]:
# Make predictions for the various sequences using the trained model.
pred_train = sp.viterbi_decode_corpus(sequence_list_train)
pred_dev   = sp.viterbi_decode_corpus(sequence_list_validation)
pred_test = sp.viterbi_decode_corpus(sequence_list_test)

In [98]:
pickle.dump(pred_train, open("data/pred_train_2", 'wb'))
pickle.dump(pred_dev, open("data/pred_dev_2", 'wb'))
pickle.dump(pred_test, open("data/pred_test_2", 'wb'))

# Perceptron evaluation using the testing sentences

In [109]:
p = "The programmers from Barcelona might write a sentence without a spell checker. The programmers from Barchelona cannot write a sentence without a spell checker. Jack London went to Parris. Jack London went to Paris. We never though Microsoft would become such a big company. We never though Microsof would become such a big company. The president of U.S.A though they could win the war The president of the United States of America though they could win the war The king of Saudi Arabia wanted total control. Robin does not want to go to Saudi Arabia. Apple is a great company. I really love apples and oranges."

new_seq = skseq.sequences.sequence.Sequence(x=p.split(), y=[int(0) for w in p.split()])
new_seq

The/0 programmers/0 from/0 Barcelona/0 might/0 write/0 a/0 sentence/0 without/0 a/0 spell/0 checker./0 The/0 programmers/0 from/0 Barchelona/0 cannot/0 write/0 a/0 sentence/0 without/0 a/0 spell/0 checker./0 Jack/0 London/0 went/0 to/0 Parris./0 Jack/0 London/0 went/0 to/0 Paris./0 We/0 never/0 though/0 Microsoft/0 would/0 become/0 such/0 a/0 big/0 company./0 We/0 never/0 though/0 Microsof/0 would/0 become/0 such/0 a/0 big/0 company./0 The/0 president/0 of/0 U.S.A/0 though/0 they/0 could/0 win/0 the/0 war/0 The/0 president/0 of/0 the/0 United/0 States/0 of/0 America/0 though/0 they/0 could/0 win/0 the/0 war/0 The/0 king/0 of/0 Saudi/0 Arabia/0 wanted/0 total/0 control./0 Robin/0 does/0 not/0 want/0 to/0 go/0 to/0 Saudi/0 Arabia./0 Apple/0 is/0 a/0 great/0 company./0 I/0 really/0 love/0 apples/0 and/0 oranges./0 

In [110]:
feature_mapper_2.get_sequence_features(new_seq)

([[0]],
 [[8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8],
  [8]],
 [[105]],
 [[106, 2, 23, 3],
  [36208, 62, 135, 136],
  [867, 270, 868, 869, 870],
  [36092, 310, 311, 36093, 3],
  [2029, 10, 2030, 2031],
  [21757, 2, 445, 2625],
  [94],
  [9623, 2, 140, 504],
  [4258, 10, 453, 557],
  [94],
  [5, 6, 772],
  [6935, 103]

In [111]:
sp.viterbi_decode(new_seq)

(The/0 programmers/0 from/0 Barcelona/1 might/0 write/0 a/0 sentence/0 without/0 a/0 spell/5 checker./6 The/0 programmers/0 from/0 Barchelona/3 cannot/4 write/0 a/0 sentence/0 without/0 a/0 spell/5 checker./6 Jack/0 London/1 went/0 to/0 Parris./5 Jack/6 London/6 went/0 to/0 Paris./5 We/6 never/0 though/0 Microsoft/3 would/0 become/0 such/0 a/0 big/0 company./5 We/6 never/0 though/0 Microsof/0 would/0 become/0 such/0 a/0 big/5 company./6 The/0 president/0 of/0 U.S.A/3 though/0 they/0 could/0 win/0 the/0 war/0 The/0 president/0 of/0 the/0 United/1 States/7 of/7 America/7 though/0 they/0 could/0 win/0 the/0 war/0 The/0 king/0 of/0 Saudi/1 Arabia/7 wanted/0 total/0 control./5 Robin/6 does/0 not/0 want/0 to/0 go/0 to/0 Saudi/3 Arabia./4 Apple/4 is/0 a/0 great/0 company./5 I/6 really/0 love/0 apples/0 and/0 oranges./5 ,
 3143.200000000003)

In [112]:
sp.viterbi_decode(new_seq)[0].to_words(sequence_list_train,
                                       only_tag_translation=True)

'The/O programmers/O from/O Barcelona/B-geo might/O write/O a/O sentence/O without/O a/O spell/B-per checker./I-per The/O programmers/O from/O Barchelona/B-org cannot/I-org write/O a/O sentence/O without/O a/O spell/B-per checker./I-per Jack/O London/B-geo went/O to/O Parris./B-per Jack/I-per London/I-per went/O to/O Paris./B-per We/I-per never/O though/O Microsoft/B-org would/O become/O such/O a/O big/O company./B-per We/I-per never/O though/O Microsof/O would/O become/O such/O a/O big/B-per company./I-per The/O president/O of/O U.S.A/B-org though/O they/O could/O win/O the/O war/O The/O president/O of/O the/O United/B-geo States/I-geo of/I-geo America/I-geo though/O they/O could/O win/O the/O war/O The/O king/O of/O Saudi/B-geo Arabia/I-geo wanted/O total/O control./B-per Robin/I-per does/O not/O want/O to/O go/O to/O Saudi/B-org Arabia./I-org Apple/I-org is/O a/O great/O company./B-per I/I-per really/O love/O apples/O and/O oranges./B-per '

# HMM Model

In [70]:
def logzero():
    return -np.inf


def safe_log(x):
    print(x)
    if x == 0:
        return logzero()
    return np.log(x)


def logsum_pair(logx, logy):
    """
    Return log(x+y), avoiding arithmetic underflow/overflow.

    logx: log(x)
    logy: log(y)

    Rationale:

    x + y    = e^logx + e^logy
             = e^logx (1 + e^(logy-logx))
    log(x+y) = logx + log(1 + e^(logy-logx)) (1)

    Likewise,
    log(x+y) = logy + log(1 + e^(logx-logy)) (2)

    The computation of the exponential overflows earlier and is less precise
    for big values than for small values. Due to the presence of logy-logx
    (resp. logx-logy), (1) is preferred when logx > logy and (2) is preferred
    otherwise.
    """
    if logx == logzero():
        return logy
    elif logx > logy:
        return logx + np.log1p(np.exp(logy-logx))
    else:
        return logy + np.log1p(np.exp(logx-logy))


def logsum(logv):
    """
    Return log(v[0]+v[1]+...), avoiding arithmetic underflow/overflow.
    """
    res = logzero()
    for val in logv:
        res = logsum_pair(res, val)
    return res

In [71]:
class HMM(object):
    
    def __init__(self, word_to_pos={}, state_to_pos={}):
        self.fitted = False
        self.counts = {"emission": None, "transition":None, "final":None, "initial":None}
        self.probs  = {"emission": None, "transition":None, "final":None, "initial":None}
        self.scores = {"emission": None, "transition":None, "final":None, "initial":None}
        self.decode = set(["posterior", "viterbi"])
        self.word_to_pos  = word_to_pos
        self.state_to_pos = state_to_pos
        self.pos_to_word  = {v: k for k, v in word_to_pos.items()}
        self.pos_to_state = {v: k for k, v in state_to_pos.items()}
    
        self.n_states     = len(state_to_pos)
        self.n_words      = len(word_to_pos)
        self.fitted = False

    def fit(self, observation_lables: list, state_labels: list):
        """
        Computes and saves: counts, probs, scores.
        """
        if self.state_to_pos is None or self.word_to_pos is None:
            print("Error state_to_pos or word_to_pos needed to be defined")
            return
            
        self.counts = self.sufficient_statistics_hmm(observation_lables, state_labels)       
        self.probs  = self.compute_probs(self.counts)  
        self.scores = self.compute_scores(self.probs)  
        self.fitted = True
        
    def sufficient_statistics_hmm(self, observation_lables, state_labels):

        state_to_pos, word_to_pos = self.state_to_pos, self.word_to_pos
        
        def update_initial_counts(initial_counts, seq_x, state_to_pos):
            initial_counts[state_to_pos[seq_x[0]]] +=  1
            
        def update_transition_counts(transition_counts, seq_y, state_to_pos):
            for (t_prev, t) in zip(seq_y[:-1], seq_y[1:]):
                transition_counts[state_to_pos[t], state_to_pos[t_prev]] += 1 

        def update_emission_counts(emission_counts, seq_x, seq_y, state_to_pos, word_to_pos):
            for (t,x) in zip(seq_y, seq_x):
                emission_counts[state_to_pos[t], word_to_pos[x]] += 1 
                
        def update_final_counts(final_counts, seq_y, state_to_pos):
            final_counts[state_to_pos[seq_y[-1]]] +=1

        n_states = len(state_to_pos)
        n_words  = len(word_to_pos)
        initial_counts      = np.zeros((n_states))
        transition_counts   = np.zeros((n_states, n_states))
        final_counts        = np.zeros((n_states))
        emission_counts     = np.zeros((n_states, n_words))

        for seq_x, seq_y in zip(observation_lables, state_labels):
            update_initial_counts(initial_counts, seq_y, state_to_pos)
            update_transition_counts(transition_counts, seq_y,  state_to_pos)
            update_emission_counts(emission_counts, seq_x, seq_y, state_to_pos, word_to_pos) 
            update_final_counts(final_counts, seq_y,  state_to_pos) 

        return {"emission":   emission_counts, 
                "transition": transition_counts,
                "final":      final_counts, 
                "initial":    initial_counts}
    
    def compute_probs(self, counts):
        
        initial_counts    = counts['initial']
        transition_counts = counts['transition']
        emission_counts   = counts['emission']
        final_counts      = counts['final']

        initial_probs    = (initial_counts / np.sum(initial_counts))
        transition_probs = transition_counts/(np.sum(transition_counts,0) + final_counts)
        final_probs      = final_counts/(np.sum(transition_counts, 0) + final_counts )
        emission_probs   = (emission_counts.T / np.sum(emission_counts, 1)).T
    
        return {"emission":   emission_probs, 
                "transition": transition_probs,
                "final":      final_probs, 
                "initial":    initial_probs}
    
    def compute_scores(self, probs):
         return {"emission":   np.log(probs["emission"]), 
                 "transition": np.log(probs["transition"]),
                 "final":      np.log(probs["final"]), 
                 "initial":    np.log(probs["initial"])}
        
    def forward_computations(self, x: list):
        forward_x = None
        return forward_x
    
    def backward_computations(self, x:list):
        backward_x = None
        return backward_x
    
    def log_forward_computations(self, x: list):
        """
        Compute the log_forward computations

        Assume there are S possible states and a sequence of length N.
        This method will compute iteritavely the log_forward quantities.

        * log_f is a S x N Array.
        * log_f_x[:,i] will contain the forward quantities at position i.
        * log_f_x[:,i] is a vector of size S.
        
        Returns
        - log_f_x: Array of size K x N
        """ 
        n_x = len(x)
        
        # log_f_x initialized to -Inf because log(0) = -Inf
        log_f_x = np.zeros((self.n_states, n_x)) - np.Inf
        x_emission_scores = np.array([hmm.scores['emission'][:, hmm.word_to_pos[w]] for w in x]).T
        
        log_f_x[:,0] = x_emission_scores[:, 0] + self.scores['initial']
        for n in range(1, n_x):
            for s in range(self.n_states):
                log_f_x[s,n] = logsum(log_f_x[:,n-1] + self.scores['transition'][s,:]) + x_emission_scores[s,n]

        log_likelihood = logsum(log_f_x[:,n_x-1] + self.scores['final']) 
        return log_f_x, log_likelihood # log(P(X=x))
    
    
    def log_backward_computations(self, x: list):
        n_x = len(x)
        
        # log_f_x initialized to -Inf because log(0) = -Inf
        log_b_x = np.zeros((self.n_states, n_x)) - np.Inf
        x_emission_scores = np.array([hmm.scores['emission'][:, hmm.word_to_pos[w]] for w in x]).T
        log_b_x[:,-1] = self.scores['final']

        for n in range(n_x-2, -1, -1):
            for s in range(self.n_states):
                log_b_x[s,n] = logsum(log_b_x[:,n+1] + self.scores['transition'][:,s] + x_emission_scores[:,n+1])

        log_likelihood = logsum(log_b_x[:,0] + self.scores['initial'] + x_emission_scores[:,0]) 
        return log_b_x, log_likelihood  # log(P(X=x))
        
    def predict_labels(self, x: list, decode="posterior"):
        """
        Retuns a sequence of states for each word in **x**.
        The output depends on the **decode** method chosen.
        """
        assert decode in self.decode, "decode `{}` is not valid".format(decode)
        
        if decode is 'posterior':
            return self.posterior_decode(x)
        
        if decode is 'viterbi':
            return self.viterbi_decode(x)

    def compute_state_posteriors(self, x:list):
        log_f_x, log_likelihood = self.log_forward_computations(x)
        log_b_x, log_likelihood = self.log_backward_computations(x)
        state_posteriors = np.zeros((self.n_states, len(x)))
        
        for pos in range(len(x)):
            state_posteriors[:, pos] = log_f_x[:, pos] + log_b_x[:, pos] - log_likelihood
        return state_posteriors

    def posterior_decode(self, x: list, decode_states=True):
        
        state_posteriors = self.compute_state_posteriors(x)
        y_hat = state_posteriors.argmax(axis=0)
        
        if decode_states:
            y_hat = [hmm.pos_to_state[y] for y in y_hat]
            
        return y_hat

In [117]:
ind_to_word  = {v: k for k, v in sequence_list_train.x_dict.items()}
ind_to_state = {v: k for k, v in sequence_list_train.y_dict.items()}
word_to_ind  = sequence_list_train.x_dict
state_to_ind = sequence_list_train.y_dict

In [118]:
X = []
Y = []
for i in range(len(sequence_list_train)):
    xy = sequence_list_train[i]
    X.append([ind_to_word[x_i] for x_i in xy.x])
    Y.append([ind_to_state[y_i] for y_i in xy.y])

In [119]:
X[1],Y[1]

(['The',
  'commission',
  'says',
  'China',
  'continues',
  'to',
  'harass',
  ',',
  'abuse',
  'and',
  'detain',
  'religious',
  'believers',
  'who',
  'practice',
  'their',
  'faith',
  'outside',
  'state-controlled',
  'religious',
  'venues',
  ',',
  'in',
  'particular',
  'the',
  'Muslim',
  'Uighur',
  'minority',
  '.'],
 ['O',
  'O',
  'O',
  'B-geo',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-org',
  'I-org',
  'O',
  'O'])

In [120]:
hmm = HMM(word_to_ind, state_to_ind)
hmm.fit(X, Y)



## training data

In [135]:
Y_hat = []
for x in tqdm.tqdm(X):
    Y_hat.append(hmm.predict_labels(x))

100%|██████████| 11989/11989 [09:36<00:00, 20.80it/s]


In [136]:
pickle.dump(Y_hat, open("data/Y_hat", 'wb'))
pickle.dump(Y, open("data/Y", 'wb'))

## validation data

In [123]:
ind_to_word  = {v: k for k, v in sequence_list_validation.x_dict.items()}
ind_to_state = {v: k for k, v in sequence_list_validation.y_dict.items()}
word_to_ind  = sequence_list_validation.x_dict
state_to_ind = sequence_list_validation.y_dict

In [124]:
X = []
Y_2 = []
for i in range(len(sequence_list_validation)):
    xy = sequence_list_validation[i]
    X.append([ind_to_word[x_i] for x_i in xy.x])
    Y_2.append([ind_to_state[y_i] for y_i in xy.y])

In [125]:
hmm = HMM(word_to_ind, state_to_ind)
hmm.fit(X, Y_2)



In [126]:
Y_hat_2 = []
for x in tqdm.tqdm(X):
    Y_hat_2.append(hmm.predict_labels(x))

100%|██████████| 11870/11870 [08:11<00:00, 24.14it/s]


In [127]:
pickle.dump(Y_hat_2, open("data/Y_hat_2", 'wb'))
pickle.dump(Y_2, open("data/Y_2", 'wb'))

In [128]:
ind_to_word  = {v: k for k, v in sequence_list_test.x_dict.items()}
ind_to_state = {v: k for k, v in sequence_list_test.y_dict.items()}
word_to_ind  = sequence_list_test.x_dict
state_to_ind = sequence_list_test.y_dict

In [129]:
X = []
Y_3 = []
for i in range(len(sequence_list_test)):
    xy = sequence_list_test[i]
    X.append([ind_to_word[x_i] for x_i in xy.x])
    Y_3.append([ind_to_state[y_i] for y_i in xy.y])

In [130]:
hmm = HMM(word_to_ind, state_to_ind)
hmm.fit(X, Y_3)



In [133]:
Y_hat_3 = []
for x in tqdm.tqdm(X):
    Y_hat_3.append(hmm.predict_labels(x))

100%|██████████| 11989/11989 [09:20<00:00, 21.38it/s]


In [134]:
pickle.dump(Y_hat_3, open("data/Y_hat_3", 'wb'))
pickle.dump(Y_3, open("data/Y_3", 'wb'))