### Muthu Palaniappan M - NLP LAB EX 5

### Importing Pacakges

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from hmmlearn import hmm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


### Loading Data

In [35]:
data = pd.read_csv("ner_data.csv", encoding='latin1')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


### Data Cleaning

In [36]:
data = data.fillna(method="ffill")
data = data.rename(columns={'Sentence #': 'sentence'})
data.head(5)

Unnamed: 0,sentence,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [38]:
' '.join(data[data['sentence'] == 'Sentence: 1'].Word.tolist())

'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .'

In [39]:
tags = list(set(data.Tag.values)) 
words = list(set(data.Word.values))

In [41]:
print(f"Total Tags: {len(tags)}")
print()
print(tags)

Total Tags: 17

['O', 'I-geo', 'I-per', 'I-eve', 'B-gpe', 'B-art', 'B-per', 'I-gpe', 'I-nat', 'B-eve', 'I-org', 'I-tim', 'B-geo', 'I-art', 'B-org', 'B-nat', 'B-tim']


### Data Preparation

We cannot split data normally with `train_test_split` because doing that makes some parts of a sentence in the training set while some others in the testing set. Instead, we use `GroupShuffleSplit`.

In [42]:
y = data.Tag
X = data.drop('Tag', axis=1)

In [43]:
gs = GroupShuffleSplit(n_splits=2, test_size=.33, random_state=42)
train_ix, test_ix = next(gs.split(X, y, groups=data['sentence']))

In [44]:
data_train = data.loc[train_ix]
data_test = data.loc[test_ix]

In [45]:
data_train

Unnamed: 0,sentence,Word,POS,Tag
24,Sentence: 2,Families,NNS,O
25,Sentence: 2,of,IN,O
26,Sentence: 2,soldiers,NNS,O
27,Sentence: 2,killed,VBN,O
28,Sentence: 2,in,IN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [46]:
tags = list(set(data_train.Tag.values)) 
words = list(set(data_train.Word.values))

In [47]:
dfupdate = data_train.sample(frac=.15, replace=False, random_state=42)
dfupdate.Word = 'UNKNOWN'
data_train.update(dfupdate)

In [48]:
words = list(set(data_train.Word.values))
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}
id2tag = {i: t for i, t in enumerate(tags)}

### Model Parameter Estimation

Hidden Markov Models can be trained by using the Baum-Welch algorithm.
- `startprob_`
- `transmat_`
- `emissionprob_`

In [49]:
count_tags = dict(data_train.Tag.value_counts())
count_tags_to_words = data_train.groupby(['Tag']).apply(lambda grp: grp.groupby('Word')['Tag'].count().to_dict()).to_dict()
count_init_tags = dict(data_train.groupby('sentence').first().Tag.value_counts())

In [50]:
count_tags_to_next_tags = np.zeros((len(tags), len(tags)), dtype=int)
sentences = list(data_train.sentence)
pos = list(data_train.Tag)
for i in range(len(sentences)) :
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        prevtagid = tag2id[pos[i - 1]]
        nexttagid = tag2id[pos[i]]
        count_tags_to_next_tags[prevtagid][nexttagid] += 1

#### Calculating Probablites

In [51]:
mystartprob = np.zeros((len(tags),))
mytransmat = np.zeros((len(tags), len(tags)))
myemissionprob = np.zeros((len(tags), len(words)))
num_sentences = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis=1)
for tag, tagid in tag2id.items():
    floatCountTag = float(count_tags.get(tag, 0))
    mystartprob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    for word, wordid in word2id.items():
        myemissionprob[tagid][wordid]= count_tags_to_words.get(tag, {}).get(word, 0) / floatCountTag
    for tag2, tagid2 in tag2id.items():
        mytransmat[tagid][tagid2]= count_tags_to_next_tags[tagid][tagid2] / sum_tags_to_next_tags[tagid]

In [52]:
mystartprob

array([7.16419768e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       6.27100710e-02, 3.42337856e-04, 8.25967882e-02, 0.00000000e+00,
       0.00000000e+00, 2.48972986e-04, 0.00000000e+00, 0.00000000e+00,
       6.89966389e-02, 0.00000000e+00, 5.80418275e-02, 2.17851363e-04,
       1.04257438e-02])

In [53]:
mytransmat

array([[8.89746990e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.48244413e-02, 4.33193256e-04, 1.28715209e-02, 0.00000000e+00,
        0.00000000e+00, 3.00039591e-04, 0.00000000e+00, 0.00000000e+00,
        4.03011758e-02, 0.00000000e+00, 1.93978259e-02, 2.20147392e-04,
        2.19046655e-02],
       [8.70028124e-01, 1.05062274e-01, 0.00000000e+00, 0.00000000e+00,
        4.01767778e-04, 2.00883889e-04, 3.01325834e-03, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 1.80795500e-03, 0.00000000e+00,
        1.94857372e-02],
       [7.18293954e-01, 0.00000000e+00, 2.71195274e-01, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        8.68658791e-05, 0.00000000e+00, 6.94927033e-04, 0.00000000e+00,
        9.72897846e-03],
       [6.95121951e-01, 0.00000000e+00, 0.00000000e+00, 2.926

In [54]:
myemissionprob

array([[0.00000000e+00, 0.00000000e+00, 1.67972090e-06, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.67980210e-05, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.38500329e-04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### HMM

In [55]:
model = hmm.MultinomialHMM(n_components=len(tags), algorithm='viterbi', random_state=42)
model.startprob_ = mystartprob
model.transmat_ = mytransmat
model.emissionprob_ = myemissionprob

In [56]:
data_test.loc[~data_test['Word'].isin(words), 'Word'] = 'UNKNOWN'

In [57]:
word_test = list(data_test.Word)
samples = []
for i, val in enumerate(word_test):
    samples.append([word2id[val]])

In [58]:
len(samples)

345639

In [59]:
lengths = []
count = 0
sentences = list(data_test.sentence)
for i in range(len(sentences)) :
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        count += 1
    elif i > 0:
        lengths.append(count)
        count = 1
    else:
        count = 1

In [60]:
len(lengths)

15826

### HMM - Prediction

In [61]:
ner_predict = model.predict(samples, lengths)

In [62]:
ner_predict

array([ 0,  0,  0, ..., 12,  0,  0], dtype=int32)

### Testing

In [64]:
def reportTest(y_pred, y_test):
    print("The accuracy is {}".format(accuracy_score(y_test, y_pred))) 
    print("The precision is {}".format(precision_score(y_test, y_pred, average='weighted'))) 
    print("The recall is {}".format(recall_score(y_test, y_pred, average='weighted'))) 
    print("The F1-Score is {}".format(f1_score(y_test, y_pred, average='weighted')))

min_length = min(len(pos_predict), len(pos_test))

reportTest(pos_predict[:min_length], pos_test[:min_length])

The accuracy is 0.9560956555705048
The precision is 0.9544313510538452
The recall is 0.9560956555705048
The F1-Score is 0.9547968981401819
