# Deliverable 2

- Deliverable 2 will be a NER (Named entity recognition system).


## Overview of the data

url = https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import re
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from Spelling_Correction_c  import Spelling_Correction_c 

The data is located in the 'data' folder

In [2]:
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")

In [3]:
data.head(70)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
65,,Hyde,NNP,B-geo
66,,Park,NNP,I-geo
67,,.,.,O
68,Sentence: 4,Police,NNS,O


 Fill with "Sentence: k" for each k

In [4]:
sentences = list(set(data["Sentence #"]))
sentences[0] = "nan"
sentences.sort()
len(sentences)

47960

In [5]:
sentences[0:3]

['Sentence: 1', 'Sentence: 10', 'Sentence: 100']

In [6]:
set(data["Tag"])

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [7]:
for tag in set(data["Tag"]):
    print("\nTAG:",tag)
    print(data[data["Tag"] == tag]["Word"][0:10])


TAG: O
0         Thousands
1                of
2     demonstrators
3              have
4           marched
5           through
7                to
8           protest
9               the
10              war
Name: Word, dtype: object

TAG: I-art
264     Non-Proliferation
3811                  V-6
4016               Simple
4017                 Life
4142              Morning
4143              America
5248               Mirror
5923                   De
5924               Gaulle
5935        International
Name: Word, dtype: object

TAG: B-eve
4853          2012
4887         Games
5001         Games
5035          2008
5170     Operation
5763          Gulf
6729    Australian
6755       Kooyong
6833    Australian
9989         World
Name: Word, dtype: object

TAG: I-geo
66            Park
347          State
350          State
381          Delta
561           Arab
796           West
797       Frontier
798       Province
1112    Waziristan
1122           Wam
Name: Word, dtype: object

TAG: I-org


How many sentences do we have?

In [8]:
"Sentence: 47959" in sentences, "Sentence: 47960" in sentences

(True, False)

## Indexing Sentences

In [9]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(0) in sentences

False

In [10]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(1) in sentences

True

In [11]:
i = 1
sentence_id      = sentence_formatter.format(i)
sentence_id_next = sentence_formatter.format(i+1)
sentence_id, sentence_id_next

('Sentence: 1', 'Sentence: 2')

In [12]:
print(data.index[data["Sentence #"] == sentence_id])
print(data.index[data["Sentence #"] == sentence_id_next])

Int64Index([0], dtype='int64')
Int64Index([24], dtype='int64')


In [13]:
start = data.index[data["Sentence #"] == sentence_id][0]
end   =  data.index[data["Sentence #"] == sentence_id_next][0]
start, end

(0, 24)

In [14]:
data["Sentence #"][start:end] = sentence_id

In [15]:
data["Sentence #"][start:end]

0     Sentence: 1
1     Sentence: 1
2     Sentence: 1
3     Sentence: 1
4     Sentence: 1
5     Sentence: 1
6     Sentence: 1
7     Sentence: 1
8     Sentence: 1
9     Sentence: 1
10    Sentence: 1
11    Sentence: 1
12    Sentence: 1
13    Sentence: 1
14    Sentence: 1
15    Sentence: 1
16    Sentence: 1
17    Sentence: 1
18    Sentence: 1
19    Sentence: 1
20    Sentence: 1
21    Sentence: 1
22    Sentence: 1
23    Sentence: 1
Name: Sentence #, dtype: object

## Selecting a subset and writting an identifier

In [16]:
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")

last_n = 2000
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]

In [17]:
data = data[0:end]

In [18]:
n_sentences = len(list(set(data["Sentence #"])))
first_n = 1
last_n = last_n -1
print(n_sentences)

2000


In [19]:
%%time 
sentence_formatter = "Sentence: {}"

for s_id in  range(first_n, last_n):
    print("current {}/{}".format(s_id,last_n), end="\r")
    sentence_id = sentence_formatter.format(s_id)
    sentence_id_next = sentence_formatter.format(s_id + 1)
    start = data.index[data["Sentence #"] == sentence_id][0]
    end   = data.index[data["Sentence #"] == sentence_id_next][0]
    data["Sentence #"][start:end] = sentence_id
    
sentence_id = sentence_formatter.format(last_n)
start = data.index[data["Sentence #"] == sentence_id][0]
end   = data.shape[0]
data["Sentence #"][start:end] = sentence_id


CPU times: user 15.9 s, sys: 133 ms, total: 16.1 s
Wall time: 15.8 s


## Building X and Y

In [20]:
n_sentences

2000

In [21]:
X = []
Y = []

sentence_formatter = "Sentence: {}"

for i in range(1,n_sentences):
    s = sentence_formatter.format(i)
    X.append(list(data[data["Sentence #"]==s]["Word"].values))
    Y.append(list(data[data["Sentence #"]==s]["Tag"].values))

In [22]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X[i],Y[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

In [23]:
def build_word_to_pos(X):

    word_to_pos = {}
    i = 0
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i +=1
                
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i +=1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}

    return tag_to_pos, pos_to_tag

In [24]:
X = [[w for w in s] for s in X]
Y = [[t for t in s] for s in Y]

In [25]:
from sklearn.model_selection import train_test_split
train_idx, val_idx, _, _ = train_test_split(np.arange(len(X)), np.arange(len(X)), test_size=0.2, random_state=42)

X_train = [X[i] for i in train_idx]
Y_train = [Y[i] for i in train_idx]
X_val = [X[i] for i in val_idx]
Y_val = [Y[i] for i in val_idx]

In [26]:
word_to_pos, pos_to_word = build_word_to_pos(X_train)
tag_to_pos, pos_to_tag  = build_tag_to_pos(Y_train)

len(word_to_pos), len(tag_to_pos)

(6268, 17)

In [27]:
tag_to_pos

{'B-gpe': 0,
 'O': 1,
 'B-tim': 2,
 'B-geo': 3,
 'B-per': 4,
 'B-org': 5,
 'I-per': 6,
 'I-org': 7,
 'B-art': 8,
 'I-gpe': 9,
 'I-geo': 10,
 'I-tim': 11,
 'B-nat': 12,
 'I-nat': 13,
 'I-art': 14,
 'B-eve': 15,
 'I-eve': 16}

In [28]:
words = []
for i in range(len(X_train)):
    words = words + X_train[i]

In [52]:
#Load the words of our corpus
import nltk
nltk.download('words')
words2 = nltk.corpus.words.words()
words2.extend(['online', 'Quora'])
words.extend(words2)

[nltk_data] Downloading package words to /home/laia/nltk_data...
[nltk_data]   Package words is already up-to-date!


## Spelling mistakes

In [54]:
#Create the spelling correction object (it will create the BK tree)
spelling_c = Spelling_Correction_c(words, tol = 1)

In [55]:
sentence =['An', 'Algerian', 'man', 'madee', 'I']
sentence_cl = spelling_c.correct_text(sentence)

sentence_cl

['An', 'Algerian', 'man', 'made', 'I']

In [56]:
i=0
print(len(X_val))
X_val_cleaned = []
for w in X_val:
    print(i)
    sentence = X_val[i]
    sentence_cl = spelling_c.correct_text(sentence)
    X_val_cleaned.append(sentence_cl)
    i+=1

400
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
27

# HMM

In [32]:
from HMM import HMM

In [33]:
hmm = HMM(word_to_pos, tag_to_pos)

In [34]:
hmm.fit(X_train, Y_train)

  return {"emission":   np.log(probs["emission"]),
  "transition": np.log(probs["transition"]),
  "final":      np.log(probs["final"]),
  "initial":    np.log(probs["initial"])}


##### Train acc

In [35]:
Y_hat = []
for x in tqdm(X_train):
    Y_hat.append(hmm.predict_labels(x))

correct = 0
total   = 0
for y,y_hat in zip(Y_train,Y_hat):
    for y_hat_k, y_k in zip(y,y_hat):
        total +=1
        if y_hat_k == y_k:
            correct +=1

print("Accuracy posterior decode train data", correct/total)

HBox(children=(FloatProgress(value=0.0, max=1599.0), HTML(value='')))

  state_posteriors[:, pos] = log_f_x[:, pos] + log_b_x[:, pos] - log_likelihood



Accuracy posterior decode train data 0.941660285317224


#### Validation acc

In [36]:
Y_hat = []
Y_val_new = []

i=0
for x in tqdm(X_val):
    try:
        Y_hat.append(hmm.predict_labels(x))
        Y_val_new.append(Y_val[i])
    except:
        print("Error: ", i)
    i+=1
    
correct = 0
total   = 0
for y,y_hat in zip(Y_val_new,Y_hat):
    for y_hat_k, y_k in zip(y,y_hat):
        total +=1
        if y_hat_k == y_k:
            correct +=1

print("Accuracy posterior decode validation data", correct/total)

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))

Error:  1
Error:  2
Error:  3
Error:  4
Error:  8
Error:  9
Error:  12
Error:  13
Error:  14
Error:  15
Error:  18
Error:  19
Error:  20
Error:  21
Error:  22
Error:  23
Error:  24
Error:  25
Error:  26
Error:  28
Error:  29
Error:  30
Error:  31
Error:  32
Error:  33
Error:  34
Error:  35
Error:  37
Error:  38
Error:  39
Error:  40
Error:  41
Error:  43
Error:  44
Error:  45
Error:  46
Error:  47
Error:  48
Error:  49
Error:  53
Error:  54
Error:  55
Error:  56
Error:  57
Error:  58
Error:  59
Error:  60
Error:  61
Error:  63
Error:  64
Error:  65
Error:  66
Error:  67
Error:  69
Error:  70
Error:  71
Error:  72
Error:  73
Error:  74
Error:  75
Error:  76
Error:  77
Error:  78
Error:  79
Error:  80
Error:  81
Error:  82
Error:  83
Error:  85
Error:  86
Error:  87
Error:  90
Error:  92
Error:  94
Error:  96
Error:  97
Error:  98
Error:  100
Error:  101
Error:  103
Error:  104
Error:  105
Error:  106
Error:  107
Error:  108
Error:  109
Error:  110
Error:  111
Error:  112
Error:  113
Err

In [57]:
Y_hat_cleaned = []
Y_val_cleaned = []
i=0
for x in tqdm(X_val_cleaned):
    try:
        Y_hat_cleaned.append(hmm.predict_labels(x))
        Y_val_cleaned.append(Y_val[i])
    except:
        print("Error: ", i)
    i+=1
        
correct = 0
total   = 0
for y,y_hat in zip(Y_val_cleaned,Y_hat_cleaned):
    for y_hat_k, y_k in zip(y,y_hat):
        total +=1
        if y_hat_k == y_k:
            correct +=1

print("Accuracy posterior decode validation data (cleaned)", correct/total)

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))

Error:  1
Error:  2
Error:  3
Error:  4
Error:  8
Error:  9
Error:  12
Error:  13
Error:  14
Error:  15
Error:  18
Error:  19
Error:  20
Error:  21
Error:  22
Error:  23
Error:  24
Error:  25
Error:  26
Error:  28
Error:  29
Error:  30
Error:  31
Error:  32
Error:  33
Error:  34
Error:  35
Error:  37
Error:  38
Error:  39
Error:  40
Error:  41
Error:  43
Error:  44
Error:  45
Error:  46
Error:  47
Error:  48
Error:  49
Error:  52
Error:  53
Error:  54
Error:  55
Error:  56
Error:  57
Error:  59
Error:  60
Error:  61
Error:  63
Error:  64
Error:  65
Error:  66
Error:  67
Error:  69
Error:  70
Error:  71
Error:  72
Error:  73
Error:  74
Error:  75
Error:  76
Error:  77
Error:  78
Error:  79
Error:  80
Error:  81
Error:  82
Error:  83
Error:  85
Error:  86
Error:  87
Error:  90
Error:  92
Error:  94
Error:  96
Error:  97
Error:  98
Error:  100
Error:  101
Error:  103
Error:  104
Error:  105
Error:  106
Error:  107
Error:  108
Error:  109
Error:  110
Error:  111
Error:  112
Error:  113
Err

# Structured perceptron

In [58]:
import skseq
from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.label_dictionary import LabelDictionary
import skseq.sequences.structured_perceptron as spc
import time


In [39]:
def generate_sequence_list(X, y, word_to_pos, tag_to_pos):
    # Generate x and y dicts
    x_dict = LabelDictionary(word_to_pos.keys())
    y_dict = LabelDictionary(tag_to_pos.keys())
    # Generate SequenceList
    seq_list = SequenceList(x_dict, y_dict)
    # Add words/tags to sequencelist
    for i in range(len(X)):
        seq_list.add_sequence(X[i], y[i], x_dict, y_dict)
    return seq_list

In [40]:
train_seq = generate_sequence_list(X_train, Y_train, word_to_pos, tag_to_pos)

In [41]:
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

### Train perceptron

In [42]:
sp = spc.StructuredPerceptron(word_to_pos, tag_to_pos, feature_mapper)

In [43]:
%%time
num_epochs = 15
sp.fit(feature_mapper.dataset, num_epochs)

Epoch: 0 Accuracy: 0.780283
Epoch: 1 Accuracy: 0.838339
Epoch: 2 Accuracy: 0.865907
Epoch: 3 Accuracy: 0.889730
Epoch: 4 Accuracy: 0.905471
Epoch: 5 Accuracy: 0.914490
Epoch: 6 Accuracy: 0.924076
Epoch: 7 Accuracy: 0.935421
Epoch: 8 Accuracy: 0.937463
Epoch: 9 Accuracy: 0.945461
Epoch: 10 Accuracy: 0.950651
Epoch: 11 Accuracy: 0.955529
Epoch: 12 Accuracy: 0.955359
Epoch: 13 Accuracy: 0.954678
Epoch: 14 Accuracy: 0.960237
CPU times: user 4min 19s, sys: 455 ms, total: 4min 19s
Wall time: 4min 19s


### Make predictions

In [44]:
p = "Egypt had been asked to write Asia for Angel hlashfo ."
new_seq = skseq.sequences.sequence.Sequence(x=p.split(), y=[int(0) for w in p.split()])
new_seq


Egypt/0 had/0 been/0 asked/0 to/0 write/0 Asia/0 for/0 Angel/0 hlashfo/0 ./0 

In [45]:
sp.viterbi_decode(new_seq)[0].to_words(train_seq,
                                       only_tag_translation=True)

'Egypt/B-geo had/O been/O asked/O to/O write/O Asia/B-geo for/O Angel/O hlashfo/O ./O '

In [59]:
pred_val_cleaned = []
for i in range(len(X_val_cleaned)):
    p = X_val_cleaned[i]
    new_seq = skseq.sequences.sequence.Sequence(x=p, y=[int(0) for w in p])
    res = sp.viterbi_decode(new_seq)[0]
    pred_val_cleaned.append(res)

In [47]:
pred_val = []
for i in range(len(X_val)):
    p = X_val[i]
    new_seq = skseq.sequences.sequence.Sequence(x=p, y=[int(0) for w in p])
    res = sp.viterbi_decode(new_seq)[0]
    pred_val.append(res)

### Evaluate performance

In [48]:
# Make predictions for the various sequences using the trained model.
pred_train = sp.viterbi_decode_corpus(train_seq)

In [49]:
def evaluate_corpus(sequences, sequences_predictions):
    """Evaluate classification accuracy at corpus level, comparing with
    gold standard."""
    total = 0.0
    correct = 0.0
    for i, sequence in enumerate(sequences):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            if sequence.y[j] == y_hat:
                correct += 1
            total += 1
    return correct / total


def evaluate_predictions(y_seq, sequences_predictions):
    """Evaluate classification accuracy at corpus level, comparing with
    gold standard."""
    total = 0.0
    correct = 0.0
    for i, ys in enumerate(y_seq):
        pred = sequences_predictions[i]
        for j, y_hat in enumerate(pred.y):
            if tag_to_pos[ys[j]] == y_hat:
                correct += 1
            total += 1
    return correct / total

In [60]:
# Evaluate and print accuracies
eval_train = evaluate_corpus(train_seq.seq_list, pred_train)
eval_val = evaluate_predictions(Y_val, pred_val)
eval_val_cleaned = evaluate_predictions(Y_val, pred_val_cleaned)
print("SP -  Accuracy Train: %.3f Validation: %.3f, Validation cleaned: %.3f"%(eval_train, eval_val, eval_val_cleaned))

SP -  Accuracy Train: 0.973 Validation: 0.939, Validation cleaned: 0.937


### Save the model

In [67]:
sp.save_model("perceptron_15_iter")

### Load existing model

In [68]:
sp2 = spc.StructuredPerceptron(word_to_pos, tag_to_pos, feature_mapper)
sp2.load_model(dir="perceptron_15_iter")