# NER with NLTK MEMM

## Results

 |#ID | #Gold Standard | #Found | #Correct | Precision | Recall | F-1 |
 |----|----------------|--------|----------|-----------|--------|-----|
 |1|3413|45|1|0.022222222222222223|0.0002929973630237328|0.000578368999421631|
 |2|3413|291|61|0.209621993127|0.0178728391444|0.0329373650108|
 |3|3413|1843|629|0.341291372762|0.184295341342|0.239345509893|
 |4|3413|1729|660|0.381723539618|0.193378259596|0.256709451575|
 |5|3413|2830|833|0.294346289753|0.244066803399|0.266858881948|
 |6|3413|1950|843|0.432307692308|0.246996777029|0.314376281932|
|7|3413|2090|1009|0.482775119617|0.295634339291|0.366709067781|
|8|3413|2217|1118|0.504285069914|0.327571051861|0.397158081705|
|9|3413|2396|1338|0.558430717863|0.392030471726|0.460664486142|
|10|3413|2355|1319|0.56008492569|0.386463521828|0.457350901526|
|11|3413|2479|1432|0.577652279145|0.41957222385|0.486082824168|
|12|3413|2601|1556|0.598231449443|0.455903896865|0.517459261723|
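|13|3372|2570|1508|0.586770428016|0.447212336892|0.507573207674|

Trained on 80% of the data, with the remaining 20% held out as the dev set, using NLTK's MaxEnt classifier. Each ID in the table above corresponds to the feature set with the same number below: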

 1. Feature : POS, Word Position
 2. Feature : POS, Word Position, Word
 3. Feature : POS, Word Position, Word, word Shape
 4. Feature : POS, Previous POS, Word Position, Word, word Shape
 5. Feature : current pos, previous pos, word position, current word, previous word, word Shape
 6. Feature : current pos, previous pos, word position, current word, previous word, current word shape, previous word shape
 7. Feature : {current, previous, next} pos, word position, {current, previous, next} word, {current, previous, next} word shape
 8. Feature : {current, previous, next} pos, word position, {current, previous, next} word, {current, previous, next} word shape, current word len
 9. Feature : {current, previous, next} {pos, word, word shape}, current word len, word position, suffix3, prefix3
 10. Feature : {current, previous, next} {pos, word, word_shape, lemma},  word position, current word len, suffix3, prefix3
 11. Feature : {current, previous, next} {pos, word, word_shape, lemma, word len},  word position, suffix3, prefix3
 12. Feature : {current, previous, next} {pos, word, word_shape, lemma, word len, suffix3, prefix3},  word position
 13. Feature : {current, previous, next} {pos, word, word_shape, lemma, word len, suffix3, prefix3},  word position
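     - Same features as run 12, but evaluated on a random (unseeded) train/dev split, which is why the gold-standard count differs from runs 1-12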

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from StringIO import StringIO
from evalNER import eval
from collections import Counter
import itertools
import nltk
from nltk import MaxentClassifier
from nltk.chunk import named_entity
from nltk.stem.snowball import SnowballStemmer

In [3]:
# Read the whole corpus; the context manager closes the file handle as soon
# as the text has been read.
with open("./gene-trainF17.txt") as corpus_file:
    text = corpus_file.read()

# Sentences are separated by blank lines. Chunks that are empty after
# stripping (e.g. produced by trailing blank lines) are dropped, because they
# contain no tab-separated token rows for the later cells to parse.
lines = [chunk for chunk in (y.strip() for y in text.split("\n\n")) if chunk]
raw_df = pd.DataFrame(lines, columns = ["sentence"])

# Random split: each sentence goes to the training set with probability 0.8,
# the rest form the dev set. The seed is left disabled, so the split (and the
# resulting scores) change from run to run; uncomment it for a repeatable split.
# np.random.seed(1234)
msk = np.random.rand(len(raw_df)) < 0.8
train_df = raw_df[msk]
dev_df = raw_df[~msk]

# English Snowball stemmer; extract_features uses it for the "*_lemma" features.
stemmer = SnowballStemmer("english")

## Training Phase

In [4]:
# Independent copy of the training split; the following cells add the parsed
# token, tag, word and POS columns to it.
df = train_df.copy()

In [5]:
# Break each sentence into its lines, and each line into its tab-separated fields.
df.loc[:, "sentence_token"] = df["sentence"].apply(
    lambda sentence: [row.split("\t") for row in sentence.split("\n")]
)

In [6]:
# Field 2 of every token row is the gold tag, field 1 is the word.
df.loc[:, "tags"] = df["sentence_token"].apply(lambda rows: [row[2] for row in rows])
df.loc[:, "words"] = df["sentence_token"].apply(lambda rows: [row[1] for row in rows])
# POS-tag each sentence with nltk.pos_tag and keep only the tag of every (word, tag) pair.
df.loc[:, "pos"] = df["words"].apply(lambda words: [tagged[1] for tagged in nltk.pos_tag(words)])
# Pair every POS tag with its gold tag. The pairing is materialised as a list
# so it can be re-read and indexed regardless of whether zip() returns a list
# (Python 2) or a one-shot iterator (Python 3); columns are read by label
# rather than by position.
df["pos_tags"] = df[["pos", "tags"]].apply(lambda row: list(zip(row["pos"], row["tags"])), axis = 1)

In [7]:
def extract_features(word, prev_word, next_word, pos_tag, prev_pos, next_pos, num):
    """Build the feature dictionary describing one token in its sentence context.

    Parameters:
        word: the current token.
        prev_word: the preceding token, or None when there is none.
        next_word: the following token, or None when there is none.
        pos_tag: POS tag of the current token (stored unchanged).
        prev_pos: POS tag of the preceding token (stored unchanged).
        next_pos: POS tag of the following token (stored unchanged).
        num: value stored under "index"; the callers in this file pass the
            1-based position of the token in its sentence.

    Returns:
        A dict with the position plus, for each of the previous/current/next
        tokens: lower-cased 3-character prefix and suffix, length, the raw
        word, the POS tag, the result of named_entity.shape and the stem from
        the module-level ``stemmer``. For a missing (falsy) neighbour the
        length is 0 and the prefix, suffix, shape and lemma are None.
    """
    def _lowered_head(token, size):
        # First `size` characters, lower-cased; None for a missing/empty token.
        if not token:
            return None
        return token[:size].lower()

    def _lowered_tail(token, size):
        # Last `size` characters, lower-cased; None for a missing/empty token.
        if not token:
            return None
        return token[-size:].lower()

    has_prev = bool(prev_word)
    has_next = bool(next_word)

    return {
        "index": num,

        "prev_prefix3": _lowered_head(prev_word, 3),
        "curr_prefix3": _lowered_head(word, 3),
        "next_prefix3": _lowered_head(next_word, 3),

        "prev_suffix3": _lowered_tail(prev_word, 3),
        "curr_suffix3": _lowered_tail(word, 3),
        "next_suffix3": _lowered_tail(next_word, 3),

        "prev_len": len(prev_word) if has_prev else 0,
        "curr_len": len(word),
        "next_len": len(next_word) if has_next else 0,

        "prev_word": prev_word,
        "curr_word": word,
        "next_word": next_word,

        "prev_pos": prev_pos,
        "curr_pos": pos_tag,
        "next_pos": next_pos,

        # The current token is always analysed; neighbours only when present.
        "curr_shape": named_entity.shape(word),
        "prev_shape": named_entity.shape(prev_word) if has_prev else None,
        "next_shape": named_entity.shape(next_word) if has_next else None,

        "curr_lemma": stemmer.stem(word),
        "prev_lemma": stemmer.stem(prev_word) if has_prev else None,
        "next_lemma": stemmer.stem(next_word) if has_next else None,
    }

# Training instances for the classifier: one (feature_dict, gold_tag) pair per token.
features = []

for sentence_words, sentence_pos_tags in zip(df["words"].tolist(), df["pos_tags"].tolist()):
    # Materialise the (word, (pos, tag)) pairing: the look-ahead below indexes
    # into it and takes its length, which a bare zip() only supports on Python 2.
    tokens = list(zip(sentence_words, sentence_pos_tags))
    prev_pos, prev_word = None, None
    for num, (word, (pos, tag)) in enumerate(tokens, 1):
        # enumerate starts at 1, so tokens[num] is the token *after* the
        # current one; the last token of a sentence has no successor.
        if num < len(tokens):
            next_word, next_pos = tokens[num][0], tokens[num][1][0]
        else:
            next_word, next_pos = None, None

        feature = (extract_features(word,
                                    prev_word,
                                    next_word,
                                    pos,
                                    prev_pos,
                                    next_pos,
                                    num), tag)

        # The current token becomes the "previous" context of the next one.
        prev_pos, prev_word = pos, word
        features.append(feature)

In [9]:
# Fit NLTK's maximum-entropy classifier on the (feature_dict, tag) pairs,
# selecting the "megam" training algorithm.
memm_classifier = MaxentClassifier.train(features, algorithm="megam")

Exception RuntimeError: 'generator ignored GeneratorExit' in <generator object find_file_iter at 0x7f443494ea50> ignored


## Testing Phase

In [10]:
# Prepare the held-out split with the same parsing steps used for the training
# split: token rows, gold tags (field 2), words (field 1) and NLTK POS tags.
test_df = dev_df.copy()
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(lambda sentence : [row.split("\t") for row in sentence.split("\n")])
test_df.loc[:, "tags"] = test_df["sentence_token"].apply(lambda rows : [row[2] for row in rows])
test_df.loc[:, "words"] = test_df["sentence_token"].apply(lambda rows : [row[1] for row in rows])
# The comprehension variable gets its own name instead of reusing the lambda
# parameter's name, which made the original line hard to read and relied on
# the argument being evaluated before the loop variable was rebound.
test_df.loc[:, "pos"] = test_df["words"].apply(lambda words : [tagged[1] for tagged in nltk.pos_tag(words)])

In [11]:
def predict_memm(pos, words):
    """Label every word of one sentence with the trained classifier.

    Parameters:
        pos: POS tags of the sentence, aligned with ``words``.
        words: tokens of the sentence.

    Returns:
        A list of (word, predicted_label) tuples, one per token, in sentence
        order. Labels come from ``memm_classifier.classify_many`` applied to
        the per-token dictionaries built by ``extract_features``.
    """
    # Materialise the pairing: the look-ahead below indexes into it and takes
    # its length, which a bare zip() only supports on Python 2.
    tokens = list(zip(words, pos))
    features = []
    prev_pos, prev_word = None, None
    for num, (word, tag) in enumerate(tokens, 1):
        # enumerate starts at 1, so tokens[num] is the token *after* the
        # current one; the last token of a sentence has no successor.
        if num < len(tokens):
            next_word, next_pos = tokens[num]
        else:
            next_word, next_pos = None, None
        features.append(extract_features(word,
                                         prev_word,
                                         next_word,
                                         tag,
                                         prev_pos,
                                         next_pos,
                                         num))
        # The current token becomes the "previous" context of the next one.
        prev_pos, prev_word = tag, word
    return list(zip(words, memm_classifier.classify_many(features)))

# Tag every dev sentence. The row's columns are read by label rather than by
# integer position, which does not depend on positional fallback for a
# label-indexed row.
test_df.loc[:, "prediction"] = test_df[["pos", "words"]].apply(
    lambda row: predict_memm(row["pos"], row["words"]), axis = 1
)

In [12]:
# Render each predicted sentence as newline-separated "position<TAB>word<TAB>label"
# rows (position is 1-based) and keep the text in the "temp1" column.
test_df.loc[:, "temp1"] = test_df["prediction"].apply(
    lambda pairs: "\n".join(
        str(position) + "\t" + "\t".join(pair)
        for position, pair in enumerate(pairs, 1)
    )
)

# Sentences are joined with a blank line, the same separator the corpus file
# was split on when it was loaded.
gold_standard = "\n\n".join(test_df["sentence"].tolist())
predictions = "\n\n".join(test_df["temp1"].tolist())

# evalNER.eval receives two file-like objects: gold standard first, predictions second.
eval(StringIO(gold_standard), StringIO(predictions))

(3372, ' entities in gold standard.')
(2570, ' total entities found.')
(1508, ' of which were correct.')
('Precision: ', 0.5867704280155642, 'Recall: ', 0.4472123368920522, 'F1-measure: ', 0.5075732076741839)
|3372|2570|1508|0.586770428016|0.447212336892|0.507573207674|
