# NER with NLTK HMM with Words as features

## Results

|#ID | #Gold Standard | #Found | #Correct | Precision | Recall | F-1 |
|----|----------------|--------|----------|-----------|--------|-----|
|1|3413|1441|832|0.577376821652|0.243773806036|0.342810053564|
|2|3413|2540|1370|0.53937007874|0.401406387343|0.460272131698|
|3|3413|2434|1308|0.537387017256|0.383240550835|0.447408927655|
|4|3167|2418|1318|0.545078577337|0.416166719293|0.471978513876|

1. Training on 80% of the data and 20% dev with NLTK HMM with words as features
2. Training on 80% of the data and 20% dev with NLTK HMM with words as features
    - Removing low-frequency words (count <= 1)
3. Training on 80% of the data and 20% dev with NLTK HMM with words as features
    - Removing low-frequency words (count <= 2)
4. Random sample training on 80% of the data and 20% dev with NLTK HMM with words as features
    - Removing low-frequency words (count <= 1)

In [1]:
import pandas as pd
import numpy as np
from StringIO import StringIO
from evalNER import eval
from collections import Counter
import itertools
import nltk
from nltk.tag import hmm

In [2]:
# Load the tab-separated corpus; sentence blocks are separated by blank lines.
# The context manager closes the file handle as soon as the text is read.
with open("./gene-trainF17.txt") as corpus_file:
    text = corpus_file.read()
# One entry per sentence block. Blank blocks (e.g. produced by trailing empty
# lines) are dropped: they have no word/tag columns, so the later column
# extraction would fail on them with an IndexError.
lines = [chunk.strip() for chunk in text.split("\n\n") if chunk.strip()]
raw_df = pd.DataFrame(lines, columns=["sentence"])
# Random split: each sentence goes to train with probability 0.8, else dev.
# The seed is left unset, so the split (and the resulting scores) vary from
# run to run; uncomment the next line for a repeatable split.
# np.random.seed(1234)
msk = np.random.rand(len(raw_df)) < 0.8
train_df = raw_df[msk]
dev_df = raw_df[~msk]

## Training Phase

In [15]:
# Work on an independent copy of the training split so the columns added
# below are not written into a slice of raw_df.
df = train_df.copy()

In [16]:
# Break every sentence block into rows (one per line) and every row into its
# tab-separated columns.
df.loc[:, "sentence_token"] = df["sentence"].apply(
    lambda sentence: tuple(row.split("\t") for row in sentence.split("\n"))
)

In [18]:
# Column 2 of each token row is used as the tag and column 1 as the word.
df.loc[:, "tags"] = df["sentence_token"].apply(
    lambda rows: tuple(row[2] for row in rows)
)
df.loc[:, "words"] = df["sentence_token"].apply(
    lambda rows: tuple(row[1] for row in rows)
)

In [19]:
# Number of occurrences of each distinct sentence (a tuple of words).
count_df = df["words"].value_counts()

In [20]:
# Accumulate corpus-wide word frequencies. count_df maps each distinct
# sentence (a tuple of words) to how many times it occurs, so every word
# occurrence in that sentence is credited with the sentence's count.
word_counter = Counter()
# Series.items() is available on both old and current pandas, whereas
# Series.iteritems() no longer exists in pandas 2.x.
for sentence_words, sentence_count in count_df.items():
    for word in sentence_words:
        word_counter[word] += sentence_count

In [21]:
# Vocabulary: every word counted more than once in the training split.
V = {word for word, count in word_counter.items() if count > 1}
print(len(V))

12373


In [22]:
# Swap every word that is not in the vocabulary V for the "UNK" placeholder.
df["words"] = df["words"].apply(
    lambda words: tuple(word if word in V else "UNK" for word in words)
)

In [23]:
# Pair each word with its tag: one list of (word, tag) tuples per sentence.
# The row is indexed by column label rather than by position (integer keys on
# a label-indexed row depend on a positional fallback that recent pandas
# deprecates), and zip is materialised so each sentence is a real list that
# can be iterated more than once on any Python version.
df["words_tags"] = df[["words", "tags"]].apply(
    lambda row: list(zip(row["words"], row["tags"])), axis=1
)
# Plain Python list of labelled sequences, one per training sentence.
features = df["words_tags"].tolist()

In [24]:
# Fit the HMM tagger on the labelled (word, tag) sequences built above.
# No estimator is passed, so NLTK's default probability estimator is used.
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(features)

## Testing Phase

In [25]:
# Prepare the dev split like the training data: tokenize each sentence block,
# pull out the tag column, and map words outside the training vocabulary V
# to "UNK".
test_df = dev_df.copy()
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(
    lambda sentence: [row.split("\t") for row in sentence.split("\n")]
)
test_df.loc[:, "tags"] = test_df["sentence_token"].apply(
    lambda rows: tuple(row[2] for row in rows)
)
test_df.loc[:, "words"] = test_df["sentence_token"].apply(
    lambda rows: tuple(row[1] if row[1] in V else "UNK" for row in rows)
)

In [26]:
# Tag every dev sentence; each (word, tag) pair returned by the tagger is
# flattened into a single "word<TAB>tag" string.
test_df.loc[:, "prediction"] = test_df["words"].apply(
    lambda words: tuple("\t".join(pair) for pair in tagger.tag(words))
)

In [27]:
# Prefix each predicted "word<TAB>tag" line with its 1-based position in the
# sentence and join the lines of a sentence with newlines, giving three
# tab-separated columns per line (presumably mirroring the gold file rows;
# confirm against evalNER's expected input).
test_df.loc[:, "temp1"] = test_df["prediction"].apply(
    lambda rows: "\n".join(
        str(position) + "\t" + row for position, row in enumerate(rows, 1)
    )
)

# Sentences are separated by a blank line in both documents.
gold_standard = "\n\n".join(test_df["sentence"].tolist())
predictions = "\n\n".join(test_df["temp1"].tolist())
# NOTE: `eval` here is the scorer imported from evalNER, not the builtin.
# The gold text keeps the original words while predictions contain "UNK".
eval(StringIO(gold_standard), StringIO(predictions))

(3167, ' entities in gold standard.')
(2418, ' total entities found.')
(1318, ' of which were correct.')
('Precision: ', 0.5450785773366419, 'Recall: ', 0.41616671929270604, 'F1-measure: ', 0.47197851387645484)
|3167|2418|1318|0.545078577337|0.416166719293|0.471978513876|
