# Deliverable 2

- Deliverable 2 will be a named entity recognition (NER) system.


## Overview of the data

Dataset URL: https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus


Essential info about entities:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```


In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

The data is located in the 'data' folder

In [2]:
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")

In [3]:
data.head(70)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
5,,through,IN,O
6,,London,NNP,B-geo
7,,to,TO,O
8,,protest,VB,O
9,,the,DT,O


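Only the first row of each sentence carries its "Sentence: k" label; the following rows are empty and have to be filled with the same "Sentence: k" label.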

In [4]:
# Collect the distinct sentence labels. Rows that continue a sentence have a
# missing label (NaN); it is mapped explicitly to the string "nan" rather than
# assuming NaN is the first element of an unordered set, so the list holds only
# strings and can be sorted.
sentences = sorted(data["Sentence #"].fillna("nan").unique())
len(sentences)

47960

In [5]:
sentences[0:3]

['Sentence: 1', 'Sentence: 10', 'Sentence: 100']

In [6]:
set(data["Tag"])

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [7]:
# Print the first ten words annotated with each distinct tag.
for tag in set(data["Tag"]):
    rows_with_tag = data["Tag"] == tag
    print("\nTAG:", tag)
    print(data.loc[rows_with_tag, "Word"].iloc[:10])


TAG: B-nat
2723       H5N1
4554       H5N1
5044       Jing
5073       Jing
5606       H5N1
12506      SARS
12508    Severe
13162       HIV
13164      AIDS
22260      AIDS
Name: Word, dtype: object

TAG: B-art
263       Nuclear
3769     Saltillo
3810    Pentastar
3814     Chrysler
3816        Dodge
3818         Jeep
3820          Ram
3863        Vioxx
3951        Vioxx
3962        Vioxx
Name: Word, dtype: object

TAG: B-geo
6        London
12         Iraq
65         Hyde
94      Britain
106    Brighton
118        Iraq
133      London
146        Rome
148       Paris
151      Madrid
Name: Word, dtype: object

TAG: I-gpe
1225    States
1264     Korea
2713      Binh
2932     Ababa
3466      City
5241     Lanka
5313     Korea
5361     Korea
5370     Korea
5390     Korea
Name: Word, dtype: object

TAG: I-eve
4854      Summer
4855    Olympics
5036     Olympic
5171      Medusa
5764         War
6730        Open
6756     Classic
6834        Open
9990         War
9991          II
Name: Word, dtyp

How many sentences do we have?

In [8]:
"Sentence: 47959" in sentences, "Sentence: 47960" in sentences

(True, False)

## Indexing Sentences

In [9]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(0) in sentences

False

In [10]:
sentence_formatter = "Sentence: {}"
sentence_formatter.format(1) in sentences

True

In [11]:
i = 1
sentence_id      = sentence_formatter.format(i)
sentence_id_next = sentence_formatter.format(i+1)
sentence_id, sentence_id_next

('Sentence: 1', 'Sentence: 2')

In [12]:
print(data.index[data["Sentence #"] == sentence_id])
print(data.index[data["Sentence #"] == sentence_id_next])

Int64Index([0], dtype='int64')
Int64Index([24], dtype='int64')


In [13]:
# Index of the row that carries the label of the current sentence and of the
# next one; rows in [start, end) lie between the two labels.
start = data.index[data["Sentence #"] == sentence_id][0]
end   =  data.index[data["Sentence #"] == sentence_id_next][0]
start, end

(0, 24)

In [14]:
# Label every row of the sentence with a single .loc assignment. Chained
# indexing (data[col][a:b] = value) may write to a temporary copy and raises
# SettingWithCopyWarning. .loc slices are label-based and inclusive, hence
# end - 1.
data.loc[start:end - 1, "Sentence #"] = sentence_id

In [15]:
data["Sentence #"][start:end]

0     Sentence: 1
1     Sentence: 1
2     Sentence: 1
3     Sentence: 1
4     Sentence: 1
5     Sentence: 1
6     Sentence: 1
7     Sentence: 1
8     Sentence: 1
9     Sentence: 1
10    Sentence: 1
11    Sentence: 1
12    Sentence: 1
13    Sentence: 1
14    Sentence: 1
15    Sentence: 1
16    Sentence: 1
17    Sentence: 1
18    Sentence: 1
19    Sentence: 1
20    Sentence: 1
21    Sentence: 1
22    Sentence: 1
23    Sentence: 1
Name: Sentence #, dtype: object

## Selecting a subset and writing an identifier

In [16]:
# Reload the raw file; this rebinds `data`, so the exploratory in-place edit
# made on the previous frame is discarded.
data = pd.read_csv("data/ner_dataset.csv", encoding="latin1")

# Index of the row labelled "Sentence: <last_n>"; it serves as the exclusive
# upper bound of the subset taken in the next cell.
last_n = 2000
end   = data.index[data["Sentence #"] == sentence_formatter.format(last_n)][0]

In [17]:
# Keep the rows before `end` (exclusive). The explicit copy makes the subset an
# independent frame, so later writes to it do not target a slice of the full
# dataset (which triggers SettingWithCopyWarning).
data = data.iloc[:end].copy()

In [18]:
# NOTE(review): the set is built from the raw column, which still contains the
# missing-label marker of the unlabeled rows, so this count appears to be one
# more than the number of real sentence labels - confirm before reusing it.
n_sentences = len(list(set(data["Sentence #"])))
first_n = 1
# The subset stops just before the row labelled "Sentence: 2000", so the last
# sentence it contains is last_n - 1.
# Caution: this cell is not idempotent; re-running it decrements last_n again.
last_n = last_n -1
print(n_sentences)

2000


In [19]:
%%time 
sentence_formatter = "Sentence: {}"

for s_id in  range(first_n, last_n):
    print("current {}/{}".format(s_id,last_n), end="\r")
    sentence_id = sentence_formatter.format(s_id)
    sentence_id_next = sentence_formatter.format(s_id + 1)
    start = data.index[data["Sentence #"] == sentence_id][0]
    end   = data.index[data["Sentence #"] == sentence_id_next][0]
    data["Sentence #"][start:end] = sentence_id
    
sentence_id = sentence_formatter.format(last_n)
start = data.index[data["Sentence #"] == sentence_id][0]
end   = data.shape[0]
data["Sentence #"][start:end] = sentence_id


Wall time: 9.26 s


## Building X and Y

In [20]:
n_sentences

2000

In [21]:
# Collect the words (X) and tags (Y) of every sentence as parallel lists.
# A single groupby pass replaces two full-frame boolean scans per sentence;
# sort=False keeps the sentences in order of first appearance (1, 2, 3, ...)
# rather than in lexicographic label order.
X = []
Y = []

sentence_formatter = "Sentence: {}"

for _, sentence_rows in data.groupby("Sentence #", sort=False):
    X.append(list(sentence_rows["Word"].values))
    Y.append(list(sentence_rows["Tag"].values))

In [22]:
# Render one sentence as space-separated "word/tag" pairs for a quick check.
i = 0
word_tag_pairs = zip(X[i], Y[i])
xy = ["{}/{}".format(word, tag) for word, tag in word_tag_pairs]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

In [23]:
def build_word_to_pos(X):
    """Assign each distinct word an integer id in order of first appearance.

    Parameters
    ----------
    X : iterable of iterables
        Sentences, each one a sequence of words.

    Returns
    -------
    tuple of (dict, dict)
        The word -> id mapping and its inverse id -> word mapping.
    """
    word_to_pos = {}
    for sentence in X:
        for word in sentence:
            # len() is the next unused id; known words keep their existing id.
            word_to_pos.setdefault(word, len(word_to_pos))

    pos_to_word = dict(zip(word_to_pos.values(), word_to_pos.keys()))
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    """Assign each distinct tag an integer id in order of first appearance.

    Parameters
    ----------
    Y : iterable of iterables
        Tag sequences, one per sentence.

    Returns
    -------
    tuple of (dict, dict)
        The tag -> id mapping and its inverse id -> tag mapping.
    """
    tag_to_pos = {}
    next_id = 0
    # Walk all tags of all sentences in reading order.
    for tag in (t for tags in Y for t in tags):
        if tag in tag_to_pos:
            continue
        tag_to_pos[tag] = next_id
        next_id += 1

    pos_to_tag = {idx: tag for tag, idx in tag_to_pos.items()}
    return tag_to_pos, pos_to_tag

In [24]:
word_to_pos, pos_to_word = build_word_to_pos(X)
tag_to_pos, pos_to_tag  = build_tag_to_pos(Y)

len(word_to_pos), len(tag_to_pos)

(7047, 17)

In [25]:
tag_to_pos

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [26]:
#X = [[word_to_pos[w] for w in s] for s in X]
#Y = [[tag_to_pos[t] for t in s] for s in Y]

In [27]:
# Keep the raw strings: rebuild X and Y as fresh lists holding a shallow copy of
# every sentence, without mapping words or tags to integer ids.
X = [list(s) for s in X]
Y = [list(s) for s in Y]

In [37]:
from sklearn.model_selection import train_test_split

In [39]:
# Split the sentence indices 80/20 into train and validation sets with a fixed
# seed. The index array only needs to be passed once; train_test_split then
# returns exactly the (train, validation) pair.
train_idx, val_idx = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

In [44]:
# Materialise the train and validation sentences (X_*) and their tag sequences
# (Y_*) from the index split, keeping words and tags aligned per sentence.
X_train = [X[i] for i in train_idx]
Y_train = [Y[i] for i in train_idx]
X_val = [X[i] for i in val_idx]
Y_val = [Y[i] for i in val_idx]

# HMM

In [28]:
from HMM import HMM

In [29]:
hmm = HMM(word_to_pos, tag_to_pos)

In [45]:
hmm.fit(X_train, Y_train)

  return {"emission":   np.log(probs["emission"]),
  "transition": np.log(probs["transition"]),
  "final":      np.log(probs["final"]),
  "initial":    np.log(probs["initial"])}


#### Train acc

In [47]:
# Predict a label sequence for every training sentence.
Y_hat = [hmm.predict_labels(x) for x in tqdm(X_train)]

# Token-level accuracy: compare gold and predicted tags position by position.
correct = 0
total   = 0
for y, y_hat in zip(Y_train, Y_hat):
    # zip(y, y_hat) yields (gold tag, predicted tag) pairs, in that order.
    for y_k, y_hat_k in zip(y, y_hat):
        total += 1
        if y_hat_k == y_k:
            correct += 1

print("Accuracy posterior decode train data", correct/total)

HBox(children=(FloatProgress(value=0.0, max=1599.0), HTML(value='')))


Accuracy posterior decode train data 0.9699934768427919


#### Validation acc

In [48]:
# Predict a label sequence for every validation sentence.
Y_hat = [hmm.predict_labels(x) for x in tqdm(X_val)]

# Token-level accuracy: compare gold and predicted tags position by position.
correct = 0
total   = 0
for y, y_hat in zip(Y_val, Y_hat):
    # zip(y, y_hat) yields (gold tag, predicted tag) pairs, in that order.
    for y_k, y_hat_k in zip(y, y_hat):
        total += 1
        if y_hat_k == y_k:
            correct += 1

print("Accuracy posterior decode validation data", correct/total)

HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))


Accuracy posterior decode validation data 0.8725318121983326


# Structured perceptron

In [49]:
import skseq
from skseq.sequences import sequence
from skseq.sequences.sequence import Sequence