In [67]:
from seqlearn.datasets import load_conll
from seqlearn.perceptron import StructuredPerceptron
from seqlearn.evaluation import bio_f_score
from seqlearn.hmm import MultinomialHMM
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
import sklearn as sk
import nltk as nl
import os
import re
import sys
import unidecode
from glob import glob


In [68]:
# Directories holding the raw annotated .tsv files (relative to this notebook).
# The "dev" split is what the rest of the notebook uses as its test set.
train_path = "../Data/bio-ner/train"
test_path = "../Data/bio-ner/dev"

# Every training .tsv file found directly under train_path.
train_files = glob("{}/*.tsv".format(train_path))

# Placeholder written as both token and POS tag on sentence-boundary rows.
s_b = '_S_B_'

# Single shared stemmer instance, used by proc_text.
ps = PorterStemmer()

In [69]:
def proc_text(word):
    """Normalise one raw token before it is written to the training file.

    The token is lower-cased, stemmed with the module-level PorterStemmer
    ``ps``, and the stem is then passed through ``unidecode.unidecode``.

    Args:
        word: the raw token string.

    Returns:
        The normalised token string.
    """
    stem = ps.stem(word.lower())
    return unidecode.unidecode(stem)

In [82]:
def create_file(pname, tname, step_one=False):
    """Merge annotated .tsv files into one POS-tagged, tab-separated file.

    Each input line is split on tabs. Lines whose first column is
    ``-DOCSTART-`` are ignored, lines with a single column (e.g. blank lines)
    are treated as sentence boundaries, and every other line contributes a
    token (column 0, normalised with ``proc_text``) and a label (column 3).
    Every sentence is POS-tagged with ``nl.pos_tag`` and followed by a
    boundary row ``[s_b, s_b, 'O']``.

    Args:
        pname: iterable of input file paths (read as UTF-8).
        tname: path of the output file; any existing file is overwritten.
        step_one: when true, only the first character of each label is
            written to the file (the returned rows keep the full label).

    Returns:
        The list of ``[token, pos_tag, label]`` rows that was written,
        starting with one boundary row.

    Raises:
        IndexError: if a non-boundary line has fewer than four columns, or
            if ``step_one`` is set and a label is empty.
    """
    # Pass 1: read each file into a flat list of [token, label] records.
    per_file_records = []
    for file_name in pname:
        records = []
        with open(file_name, encoding='utf-8') as src:
            for line in src:
                # rstrip rather than line[:-1]: a final line with no trailing
                # newline must not lose the last character of its label.
                par_line = line.rstrip('\n').split('\t')
                if par_line[0] == '-DOCSTART-':
                    continue
                if len(par_line) == 1:
                    records.append([s_b, 'O'])
                else:
                    records.append([proc_text(par_line[0]), par_line[3]])
        per_file_records.append(records)

    data_pos = [[s_b, s_b, 'O']]

    def flush(sentence):
        # Fill in the POS column of a finished sentence, then append the
        # sentence and a closing boundary row to the output rows.
        if sentence:
            tagged = nl.pos_tag([row[0] for row in sentence])
            for row, (_, tag) in zip(sentence, tagged):
                row[1] = tag
        data_pos.extend(sentence)
        data_pos.append([s_b, s_b, 'O'])

    # Pass 2: group the records into sentences and tag them.
    for records in per_file_records:
        # A file normally opens with a boundary record (the blank line after
        # -DOCSTART-); skip it so no duplicate boundary row is emitted. Only
        # skip when it really is a boundary, so a real first token is kept.
        start = 1 if records and records[0][0] == s_b else 0
        sentence = []
        for token, label in records[start:]:
            if token == s_b:
                flush(sentence)
                sentence = []
            else:
                sentence.append([token, '', label])
        # A file that does not end with a blank line still has an open
        # sentence here; emit it instead of silently dropping it.
        if sentence:
            flush(sentence)

    # Mode 'w' truncates an existing file, so repeated runs replace the
    # output instead of appending to it. The platform default encoding is
    # kept on purpose so the file round-trips with whatever reads it back.
    with open(tname, 'w') as out:
        for token, tag, label in data_pos:
            if step_one:
                label = label[0]
            out.write("{}\t{}\t{}\n".format(token, tag, label))
    return data_pos

In [74]:
def features(seq, i): # The Feature Extractor! aka 90% of this project
    """Yield the string features for the item at position ``i`` of ``seq``.

    Every element of ``seq`` is a whitespace-separated string whose first
    field is the word and whose second field is its POS tag.

    Features, in the order they are produced:
      * current word and POS tag;
      * previous item: POS tag, three-letter-uppercase flag, digit flag and
        lower-cased word, or START markers when ``i == 0``;
      * following item: POS tag, uppercase flag, digit flag and lower-cased
        word, or END markers when ``i`` is the last index;
      * current word: three-letter-uppercase flag, digit flag, and a flag
        for words longer than 8 characters.

    NOTE(review): the previous/current uppercase checks require length 3 but
    the following-word check does not -- confirm whether that is intended.
    NOTE(review): proc_text in this notebook lower-cases tokens before they
    are written, so the uppercase flags presumably never fire on that data.

    Raises:
        IndexError: if an inspected item has fewer than two fields.
    """
    def contains_digit(text):
        return re.search(r"\d", text.lower()) is not None

    current = seq[i].split()
    word, tag = current[0], current[1]
    yield "word=" + word
    yield "pos=" + tag

    # Left context.
    if i != 0:
        previous = seq[i - 1].split()
        prev_word, prev_tag = previous[0], previous[1]
        yield "prepos=" + prev_tag
        if len(prev_word) == 3 and prev_word.isupper():
            yield "preUpper"
        if contains_digit(prev_word):
            yield "preNumber"
        yield "preword=" + prev_word.lower()
    else:
        yield "preword=" + "START"
        yield "prepos=" + "START"

    # Right context.
    if (i + 1) != len(seq):
        following = seq[i + 1].split()
        next_word, next_tag = following[0], following[1]
        yield "folpos=" + next_tag
        if next_word.isupper():
            yield "folUpper"
        if contains_digit(next_word):
            yield "folNumber"
        yield "folword=" + next_word.lower()
    else:
        yield "folword=" + "END"
        yield "folpos=" + "END"

    # Shape of the current word itself.
    if len(word) == 3 and word.isupper():
        yield "Uppercase"
    if contains_digit(word):
        yield "Number"
    if len(word) > 8:
        yield "Long"

In [93]:
# Build the tagged training file "train" from every .tsv under train_path;
# b keeps the rows that were written.
b = create_file(glob(train_path + "/*.tsv"), "train")

In [94]:
# Read the "train" file back through seqlearn, applying `features` to every
# row; l_train is later passed to fit() alongside X_train and y_train.
# NOTE(review): create_file separates sentences with _S_B_ rows rather than
# blank lines, so load_conll presumably sees the whole file as one sequence --
# confirm this is intended.
X_train, y_train, l_train = load_conll("train", features)

In [95]:
# Build the tagged evaluation file "test" from every .tsv under test_path;
# a keeps the rows that were written.
a = create_file(glob(test_path + "/*.tsv") ,"test")

In [96]:
# Load the "test" file with the same feature extractor used for training.
X_test, y_test, l_test = load_conll("test", features)

In [97]:
# lr_exponent and max_iter were settled on by trial and error.
# random_state pins the estimator's internal RNG so that a fresh
# Restart & Run All reproduces the same model and score.
per = StructuredPerceptron(lr_exponent=0.35, max_iter=300, random_state=0, verbose=1)

per.fit(X_train, y_train, l_train)  # fit and predict
y_p = per.predict(X_test, l_test)

Iteration  1... loss = 0.6970
Iteration  2... loss = 0.3848
Iteration  3... loss = 0.3763
Iteration  4... loss = 0.2763
Iteration  5... loss = 0.3843
Iteration  6... loss = 0.3574
Iteration  7... loss = 0.2784
Iteration  8... loss = 0.2437
Iteration  9... loss = 0.2415
Iteration 10... loss = 0.2571
Iteration 11... loss = 0.2489
Iteration 12... loss = 0.2839
Iteration 13... loss = 0.2572
Iteration 14... loss = 0.2670
Iteration 15... loss = 0.2406
Iteration 16... loss = 0.2817
Iteration 17... loss = 0.2618
Iteration 18... loss = 0.2496
Iteration 19... loss = 0.2569
Iteration 20... loss = 0.2779
Iteration 21... loss = 0.2547
Iteration 22... loss = 0.2661
Iteration 23... loss = 0.2896
Iteration 24... loss = 0.2637
Iteration 25... loss = 0.2311
Iteration 26... loss = 0.2787
Iteration 27... loss = 0.2844
Iteration 28... loss = 0.2522
Iteration 29... loss = 0.2553
Iteration 30... loss = 0.2642
Iteration 31... loss = 0.2434
Iteration 32... loss = 0.2633
Iteration 33... loss = 0.3064
Iteration 

In [100]:
# Score the predicted labels against the gold labels of the test file.
bio_f_score(y_test, y_p)

0.6017302146747837

In [90]:
# Sanity check: inspect the gold labels loaded from the test file.
y_test

array(['O', 'B', 'I', ..., 'I', 'O', 'O'], dtype='<U1')

In [101]:
# Sanity check: inspect the gold labels loaded from the train file.
y_train

array(['O', 'O', 'B', ..., 'B', 'O', 'O'], dtype='<U1')