# NER with NLTK HMM with Part-Of-Speech as features

## Results

 |#ID | #Gold Standard | #Found | #Correct | Precision | Recall | F-1 |
 |----|----------------|--------|----------|-----------|--------|-----|
 | 1| 3414 | 1063| 256 | 0.24082784571966134| 0.0750073249340756 | 0.11438784629133154|
 |2|3392|1084|266|  0.24538745387453875 | 0.07841981132075472 | 0.11885612153708669|

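 1. Trained on a random 80% of the data and evaluated on the remaining 20% (dev set) with the NLTK HMM.
 2. Same setup as run 1, repeated on a different random 80/20 split (the split is not seeded, so the counts differ between runs).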

In [1]:
import pandas as pd
import numpy as np
from StringIO import StringIO
from evalNER import eval
from collections import Counter
import itertools
import nltk
from nltk.tag import hmm

In [2]:
# Load the corpus. Sentences are separated by a blank line; each sentence is
# kept as one raw string here and tokenised later.
with open("./gene-trainF17.txt") as corpus_file:
    text = corpus_file.read()
# Skip empty chunks (e.g. produced by trailing blank lines) so that the later
# tab-splitting never meets a "sentence" without any fields.
lines = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
raw_df = pd.DataFrame(lines, columns = ["sentence"])
# Random split: each sentence goes to train with probability 0.8, else to dev.
# The seed below is left disabled, so every run produces a different split;
# enable it to make the split reproducible.
# np.random.seed(1234)
msk = np.random.rand(len(raw_df)) < 0.8
train_df = raw_df[msk]
dev_df = raw_df[~msk]

## Training Phase

In [3]:
# Work on an independent copy so the feature columns added below do not
# modify train_df.
df = train_df.copy(deep=True)

In [4]:
# Break each sentence into its rows (one per line) and each row into its
# tab-separated fields.
df.loc[:, "sentence_token"] = df["sentence"].apply(
    lambda sentence: [row.split("\t") for row in sentence.split("\n")]
)

In [5]:
# From every tokenised row, field 2 is taken as the gold tag and field 1 as
# the word.
df.loc[:, "tags"] = df["sentence_token"].apply(
    lambda rows: [fields[2] for fields in rows]
)
df.loc[:, "words"] = df["sentence_token"].apply(
    lambda rows: [fields[1] for fields in rows]
)
# nltk.pos_tag yields (word, POS) pairs; only the POS tag is kept.
df.loc[:, "pos"] = df["words"].apply(
    lambda words: [pos for _, pos in nltk.pos_tag(words)]
)
# Pair each POS tag with its gold tag, sentence by sentence. The pairs are
# materialised as lists (a bare zip is a one-shot iterator on Python 3) and
# the columns are paired by name rather than by positional access on a
# label-indexed row.
df["pos_tags"] = pd.Series(
    [list(zip(pos_seq, tag_seq)) for pos_seq, tag_seq in zip(df["pos"], df["tags"])],
    index=df.index,
)

In [6]:
# One entry per training sentence, each a sequence of (POS, tag) pairs.
features = list(df["pos_tags"])

In [7]:
# Fit a supervised HMM on the training sequences. Each sequence is a list of
# (POS tag, gold tag) pairs, so the model sees POS tags as its input symbols
# and learns to emit the gold tags for them.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(features)

## Testing Phase

In [8]:
# Derive the same columns for the held-out sentences as were built for the
# training set, on a copy so dev_df itself stays untouched.
test_df = dev_df.copy()
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(
    lambda sentence: [row.split("\t") for row in sentence.split("\n")]
)
# Field 2 of each row is taken as the gold tag, field 1 as the word.
test_df.loc[:, "tags"] = test_df["sentence_token"].apply(
    lambda rows: [fields[2] for fields in rows]
)
test_df.loc[:, "words"] = test_df["sentence_token"].apply(
    lambda rows: [fields[1] for fields in rows]
)
# Keep only the POS half of each (word, POS) pair returned by nltk.pos_tag.
test_df.loc[:, "pos"] = test_df["words"].apply(
    lambda words: [pos for _, pos in nltk.pos_tag(words)]
)

In [9]:
# Tag each POS sequence with the trained HMM and flatten every resulting
# pair into a single tab-separated string.
test_df.loc[:, "prediction"] = test_df["pos"].apply(
    lambda pos_seq: ["\t".join(pair) for pair in tagger.tag(pos_seq)]
)

In [10]:
# Prefix every predicted row with its 1-based position in the sentence, then
# collapse each sentence's rows into one newline-separated string.
# NOTE(review): the middle field of each predicted row comes from the tagger
# input (the POS tag), not the original word; presumably the evaluator only
# compares the tag field. Confirm against evalNER.
test_df.loc[:, "temp1"] = test_df["prediction"].apply(
    lambda rows: ["\t".join((str(position), row)) for position, row in enumerate(rows, 1)]
)
test_df.loc[:, "temp1"] = test_df["temp1"].apply(lambda rows: "\n".join(rows))

# Both streams use a blank line between sentences, like the corpus file.
gold_standard = "\n\n".join(test_df["sentence"].tolist())
predictions = "\n\n".join(test_df["temp1"].tolist())
eval(StringIO(gold_standard), StringIO(predictions))

(3072, ' entities in gold standard.')
(973, ' total entities found.')
(227, ' of which were correct.')
('Precision: ', 0.23329907502569372, 'Recall: ', 0.07389322916666667, 'F1-measure: ', 0.11223733003708282)
