# NER with HMM with Part-Of-Speech as features

## Results

 |#ID | #Gold Standard | #Found | #Correct | Precision | Recall | F-1 |
 |----|----------------|--------|----------|-----------|--------|-----|
 |1|16637 | 5663 | 1287 | 0.2272647006886809 | 0.07735769670012622 | 0.11542600896860986 |
 |2|3413|1123|263|0.2341941228851291|0.07705830647524173|0.11596119929453264|
 |3|3413|1124|263| 0.23398576512455516|0.07705830647524173|0.11593564029094117|
 |4|3413|1124|263| 0.23398576512455516|0.07705830647524173|0.11593564029094117|
 #### Same 20 % dev set
 1. Training on whole and testing on whole corpus with smoothing of tag transition probabilities
 2. Training on 80 % of data and 20 % dev test with smoothing of tag transition probabilites
 3. Training on 80% of data and 20 % dev test with no smoothing
 4. Traning on 80% of data and 20 % dev test with smoothing of emission prbability

In [1]:
import pandas as pd
import numpy as np
from StringIO import StringIO
from evalNER import eval
from collections import Counter
import itertools
import nltk

In [2]:
def viterbi_algo(string_tokens, states, emission_prob, tag_transition_prob, word_vocab, start_of_string_state='<s>',
                 end_of_string_state='</s>'):
    T = len(string_tokens)
    N = len(states)

    viterbi = np.zeros((N + 3, T + 1))
    backpointer = np.zeros((N + 3, T + 1))

    safe_mul = lambda x, y: np.exp(np.log(x) + np.log(y))

    for index, state in enumerate(states, 1):
        tag_trans = tag_transition_prob[(start_of_string_state, state)]
        emission = emission_prob[(state, string_tokens[0])]
        viterbi[index, 1] = safe_mul(tag_trans, emission)
        backpointer[index, 1] = 0

    for o_i, o in enumerate(string_tokens[1:], 2):
        for s_i, s in enumerate(states, 1):

            for _s_i, _s in enumerate(states, 1):
                prev = viterbi[_s_i, o_i - 1]
                tag_trans = tag_transition_prob[(_s, s)]
                temp = safe_mul(prev, tag_trans)
                if temp > viterbi[s_i, o_i]:
                    viterbi[s_i, o_i] = temp
                    backpointer[s_i, o_i] = _s_i
            emission = emission_prob[(s, o)]
            viterbi[s_i, o_i] = safe_mul(viterbi[s_i, o_i], emission)

    for s_i, s in enumerate(states, 1):

        tag_trans = tag_transition_prob[(s, end_of_string_state)]
        temp = safe_mul(viterbi[s_i, T], tag_trans)

        if temp > viterbi[N + 1, T]:
            viterbi[N + 1, T] = temp
            backpointer[N + 1, T] = s_i

    ans = list(np.zeros((len(string_tokens) + 1,)))

    z_i = int(backpointer[N + 1, T])
    ans[T] = states[z_i - 1]

    for index in xrange(T, 1, -1):
        z_i = int(backpointer[z_i, index])
        ans[index - 1] = states[z_i - 1]

    ans = ans[1:]
    return ans


In [3]:
text = open("./gene-trainF17.txt").read()
lines = text.split("\n\n")
raw_df = pd.DataFrame(lines, columns = ["sentence"])
np.random.seed(1234)
msk = np.random.rand(len(raw_df)) < 0.8
train_df = raw_df[msk]
dev_df = raw_df[~msk]

## Training Phase

In [4]:
df = train_df.copy()
df.loc[:,"sentence"] = df["sentence"].apply(lambda x: '0\t' + "<s>" + '\t' + "<s>" + '\n' + x.strip() + '\n-1\t</s>\t</s>')

In [5]:
df.loc[:, "sentence_token"] = df["sentence"].apply(lambda x : [y.split("\t") for y in x.split("\n")])

In [6]:
df.loc[:, "tags"] = df["sentence_token"].apply(lambda x : [y[2] for y in x])
df.loc[:, "words"] = df["sentence_token"].apply(lambda x : [y[1] for y in x])
df.loc[:, "bi_tags"] = df["tags"].apply(lambda x : zip(x, list(x)[1:]))
df.loc[:, "pos"] = df["words"].apply(lambda x : [ y[1] for y in nltk.pos_tag(x[1:-1])])
df["pos_tags"] = df[["pos", "tags"]].apply(lambda x : zip(x[0],x[1][1:-1]), axis = 1)
df.drop('sentence', axis = 1, inplace = True)
df.drop('sentence_token', axis = 1, inplace = True)

In [7]:
df.loc[:, "tags"] = df["tags"].apply(lambda x : Counter(x))
df.loc[:, "bi_tags"] = df["bi_tags"].apply(lambda x : Counter(x))
df.loc[:, "pos_tags"] = df["pos_tags"].apply(lambda x : Counter(x))
df.loc[:, "pos"] = df["pos"].apply(lambda x : Counter(x))

In [8]:
tags_counter = sum(df["tags"].tolist(), Counter())
bi_tags_counter = sum(df["bi_tags"].tolist(), Counter())
pos_tags_counter = sum(df["pos_tags"].tolist(), Counter())
pos_counter = sum(df["pos"].tolist(), Counter())

In [9]:
pos_list = pos_counter.keys()
tag_list = tags_counter.keys()

In [24]:
tag_transition_prob = Counter()
for x in itertools.product(tag_list, tag_list):
    tag_transition_prob[x] = (bi_tags_counter[x] * 1.0)/(tags_counter[x[0]])


In [32]:
pos_tag_prob = Counter()
for pos, tag in itertools.product(pos_list, tag_list):
    pos_tag_prob[(pos, tag)] = (pos_tags_counter[(pos, tag)] + 1.0) / (tags_counter[tag] + len(tag_list))


## Testing Phase

In [33]:
test_df = dev_df.copy()
test_df.loc[:, "sentence"] = test_df["sentence"].apply(lambda x: '0\t' + "<s>" + '\t' + "<s>" + '\n' + x.strip() + '\n-1\t</s>\t</s>')
test_df.loc[:, "sentence_token"] = test_df["sentence"].apply(lambda x : [y.split("\t") for y in x.split("\n")])
test_df.loc[:, "tags"] = test_df["sentence_token"].apply(lambda x : [y[2] for y in x])
test_df.loc[:, "words"] = test_df["sentence_token"].apply(lambda x : [y[1] for y in x])
test_df.loc[:, "pos"] = test_df["words"].apply(lambda x :  [y[1] for y in nltk.pos_tag(x[1:-1])])

In [34]:
pos_tag_prob_t = {(k[1], k[0]):v for k,v in pos_tag_prob.items()}
test_df.loc[:, "prediction"] = test_df["pos"].apply(lambda x: viterbi_algo(x, ["I", "O", "B"], pos_tag_prob_t, tag_transition_prob, []))

In [35]:
test_df["temp1"] = test_df[["words", "prediction"]].apply(lambda x : zip(x[0][1:-1], x[1]), axis = 1)
test_df.loc[:, "temp1"] = test_df["temp1"].apply(lambda x: ["\t".join(y) for y in x])
test_df.loc[:, "temp1"] = test_df["temp1"].apply(lambda x : [str(i) + "\t" + y for i,y in enumerate(x,1)])
test_df.loc[:, "temp1"] = test_df["temp1"].apply(lambda x : "\n".join(x))

predictions = "\n\n".join(test_df["temp1"].tolist())
gold_standard = "\n\n".join(test_df["sentence_token"].apply(lambda x : "\n".join(["\t".join(y) for y in x[1:-1]])))
eval(StringIO(gold_standard), StringIO(predictions))

(3413, ' entities in gold standard.')
(1124, ' total entities found.')
(263, ' of which were correct.')
('Precision: ', 0.23398576512455516, 'Recall: ', 0.07705830647524173, 'F1-measure: ', 0.11593564029094117)


In [15]:
test_df

Unnamed: 0,sentence,sentence_token,tags,words,pos,prediction,temp1
7,0\t<s>\t<s>\n1\tTeratological\tO\n2\tstudy\tO\...,"[[0, <s>, <s>], [1, Teratological, O], [2, stu...","[<s>, O, O, O, O, O, O, O, O, O, O, </s>]","[<s>, Teratological, study, of, etoperidone, i...","[NNP, NN, IN, NN, IN, DT, NN, CC, NN, .]","[O, O, O, O, O, O, O, O, O, O]",1\tTeratological\tO\n2\tstudy\tO\n3\tof\tO\n4\...
8,0\t<s>\t<s>\n1\tColorectal\tO\n2\tpolyps\tO\n3...,"[[0, <s>, <s>], [1, Colorectal, O], [2, polyps...","[<s>, O, O, O, O, O, O, O, O, O, </s>]","[<s>, Colorectal, polyps, should, be, removed,...","[JJ, NNS, MD, VB, VBN, RB, WRB, JJ, .]","[O, O, O, O, O, O, O, O, O]",1\tColorectal\tO\n2\tpolyps\tO\n3\tshould\tO\n...
9,0\t<s>\t<s>\n1\tThe\tO\n2\tvariable\tO\n3\tHMG...,"[[0, <s>, <s>], [1, The, O], [2, variable, O],...","[<s>, O, O, B, O, O, O, O, O, O, O, O, O, O, O...","[<s>, The, variable, HMG, dosage, regimen, was...","[DT, JJ, NNP, NN, NN, VBD, VBN, TO, VB, DT, NN...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",1\tThe\tO\n2\tvariable\tO\n3\tHMG\tO\n4\tdosag...
19,0\t<s>\t<s>\n1\tSelective\tO\n2\tstimulation\t...,"[[0, <s>, <s>], [1, Selective, O], [2, stimula...","[<s>, O, O, O, O, B, I, I, O, O, O, O, O, O, O...","[<s>, Selective, stimulation, of, central, alp...","[JJ, NN, IN, JJ, SYM, :, NNS, VBG, NN, IN, JJ,...","[O, O, O, O, B, I, O, O, O, O, O, O, O, O, O, ...",1\tSelective\tO\n2\tstimulation\tO\n3\tof\tO\n...
24,0\t<s>\t<s>\n1\tStudies\tO\n2\ton\tO\n3\timmun...,"[[0, <s>, <s>], [1, Studies, O], [2, on, O], [...","[<s>, O, O, B, I, O, O, O, O, O, O, O, O, O, O...","[<s>, Studies, on, immunoglobulin, E, :, the, ...","[NNS, IN, JJ, NN, :, DT, NN, IN, DT, NN, IN, N...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]",1\tStudies\tO\n2\ton\tO\n3\timmunoglobulin\tO\...
30,0\t<s>\t<s>\n1\tLate\tO\n2\tcomplications\tO\n...,"[[0, <s>, <s>], [1, Late, O], [2, complication...","[<s>, O, O, O, O, O, O, O, O, </s>]","[<s>, Late, complications, of, catheterisation...","[JJ, NNS, IN, NN, IN, JJ, NN, .]","[O, O, O, O, O, O, O, O]",1\tLate\tO\n2\tcomplications\tO\n3\tof\tO\n4\t...
32,0\t<s>\t<s>\n1\tSings\tO\n2\twhich\tO\n3\tare\...,"[[0, <s>, <s>], [1, Sings, O], [2, which, O], ...","[<s>, O, O, O, O, O, O, O, O, O, O, </s>]","[<s>, Sings, which, are, indicative, of, metap...","[NNS, WDT, VBP, JJ, IN, JJ, NN, VBP, JJ, .]","[O, O, O, O, O, O, O, O, O, O]",1\tSings\tO\n2\twhich\tO\n3\tare\tO\n4\tindica...
37,0\t<s>\t<s>\n1\tAlbumin\tB\n2\tand\tO\n3\tcycl...,"[[0, <s>, <s>], [1, Albumin, B], [2, and, O], ...","[<s>, B, O, O, O, O, O, O, O, O, O, O, </s>]","[<s>, Albumin, and, cyclic, AMP, levels, in, p...","[NNP, CC, JJ, NNP, NNS, IN, JJ, NNS, IN, DT, NN]","[O, O, O, O, O, O, O, O, O, O, O]",1\tAlbumin\tO\n2\tand\tO\n3\tcyclic\tO\n4\tAMP...
39,0\t<s>\t<s>\n1\tThirty\tO\n2\t-\tO\n3\tfive\tO...,"[[0, <s>, <s>], [1, Thirty, O], [2, -, O], [3,...","[<s>, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","[<s>, Thirty, -, five, strains, of, Legionnair...","[NNP, :, CD, NNS, IN, NNP, POS, NN, NNS, VBD, ...","[B, I, I, O, O, O, O, O, O, O, O, O, O, O, O, ...",1\tThirty\tB\n2\t-\tI\n3\tfive\tI\n4\tstrains\...
55,0\t<s>\t<s>\n1\tAll\tO\n2\tgroups\tO\n3\twere\...,"[[0, <s>, <s>], [1, All, O], [2, groups, O], [...","[<s>, O, O, O, O, O, O, O, O, O, O, O, O, O, O...","[<s>, All, groups, were, tested, in, a, Hebb, ...","[DT, NNS, VBD, VBN, IN, DT, NNP, :, NNP, VBP, ...","[O, O, O, O, O, O, B, I, I, O, O, O, O, O, O, ...",1\tAll\tO\n2\tgroups\tO\n3\twere\tO\n4\ttested...
