In [2]:
import nltk as nl
import os
from unidecode import unidecode
from glob import glob
from nltk.stem import PorterStemmer
ps = PorterStemmer()


# def process_text(word):
#     word = word.lower()
#     return unidecode(ps.stem(word))

def process_text(word):
    """Normalise a single token before it is written to the combined file.

    The token is passed through ``unidecode`` (ASCII transliteration) and
    returned; no lower-casing or stemming is applied here.
    """
    ascii_word = unidecode(word)
    return ascii_word

def create_file(path_name, temp_name, step_one=False, return_=False):
    """Merge every ``*.tsv`` file in a directory into one POS-tagged file.

    Each input line is split on tabs; column 0 is taken as the token and
    column 3 as the label. Lines whose first column is ``-DOCSTART-`` are
    ignored and lines with a single column (blank lines) mark a sentence
    boundary. Every sentence is POS-tagged with ``nl.pos_tag`` and written to
    ``temp_name`` as ``token<TAB>pos<TAB>label`` rows, with sentences separated
    by ``_S_B_<TAB>_S_B_<TAB>O`` rows.

    Args:
        path_name: Directory that contains the ``.tsv`` files.
        temp_name: Path of the combined output file; it is overwritten.
        step_one: When true, only the first character of each label is
            written (the IOB component). Otherwise the full label is written,
            except that labels not starting with ``I``, ``O`` or ``B`` are
            replaced by ``O``.
        return_: When true, the list of ``[token, pos, label]`` rows is
            returned; otherwise the function returns ``None``.

    Returns:
        The combined rows (including sentence-break rows) if ``return_`` is
        true, else ``None``.
    """
    sentence_break = '_S_B_'

    def break_row():
        # A fresh list each time so rows in data_pos never alias each other.
        return [sentence_break, sentence_break, 'O']

    # Load all files in the given directory. Sorting makes the order of the
    # combined output reproducible (glob order is otherwise arbitrary).
    file_data = []
    for file_name in sorted(glob(path_name + '/*.tsv')):
        rows = []
        with open(file_name, encoding='utf-8') as f:
            for line in f:
                # Strip only the newline: slicing off the last character
                # would truncate the label when the final line has no '\n'.
                par_line = line.rstrip('\n').split('\t')
                if par_line[0] == '-DOCSTART-':
                    continue
                if len(par_line) == 1:
                    rows.append([sentence_break, 'O'])
                else:
                    rows.append([process_text(par_line[0]), par_line[3]])  # process each word
        file_data.append(rows)

    data_pos = [break_row()]

    def flush(sentence):
        # POS-tag the collected sentence and append it plus a break row.
        pos = nl.pos_tag([w[0] for w in sentence])
        for ii in range(len(sentence)):
            sentence[ii][1] = pos[ii][1]
        data_pos.extend(sentence)
        data_pos.append(break_row())

    # Combine all files into a single list.
    for rows in file_data:
        sentence = []
        # The first row of every file is skipped, as in the original
        # (presumably the blank line that follows -DOCSTART- -- TODO confirm).
        for word in rows[1:]:
            if word[0] == sentence_break:
                flush(sentence)
                sentence = []
            else:
                sentence.append([word[0], '', word[1]])
        # A file that does not end with a blank line would otherwise lose its
        # final sentence.
        if sentence:
            flush(sentence)

    # Save all data together; 'w' replaces any existing file.
    with open(temp_name, 'w') as f:
        for line in data_pos:
            if step_one:
                f.write("{}\t{}\t{}\n".format(line[0], line[1], line[2][0]))
            else:
                if line[2][0] not in ['I', 'O', 'B']:
                    f.write("{}\t{}\t{}\n".format(line[0], line[1], 'O'))
                else:
                    f.write("{}\t{}\t{}\n".format(line[0], line[1], line[2]))
    if return_:
        return data_pos


In [59]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import eli5

# help and inspiration from
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system
# and
# https://towardsdatascience.com/named-entity-recognition-and-classification-with-scikit-learn-f05372f07ba2


def file_opener(file_name):
    """Read a combined token file and group its rows into sentences.

    Each non-blank line is stripped and split on tabs. A row whose first
    field is ``_S_B_`` closes the current sentence; every other row is
    appended to the current sentence as a tuple of its fields.

    Args:
        file_name: Path of the tab-separated file to read.

    Returns:
        A list of sentences, each a non-empty list of field tuples.
    """
    sentence_list = []
    current_sentence = []
    with open(file_name) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line would become the 1-tuple ('',) and break the
                # 3-way unpacking done by the sentence_* helpers.
                continue
            word = stripped.split('\t')
            if word[0] == '_S_B_':
                if current_sentence:
                    sentence_list.append(current_sentence)
                    current_sentence = []
            else:
                current_sentence.append(tuple(word))
    # Keep the final sentence when the file does not end with a break row.
    if current_sentence:
        sentence_list.append(current_sentence)
    return sentence_list


def feature_extractor(sentence, i):
    """Build the feature dict for the token at index ``i`` of ``sentence``.

    ``sentence`` is a sequence of rows whose first two fields are the token
    and its POS tag. The result holds features of the token itself, of the
    previous token (or ``'BOS'`` when there is none) and of the next token
    (or ``'EOS'`` when there is none). Stems come from the module-level
    ``ps`` stemmer.
    """
    token = sentence[i][0]
    tag = sentence[i][1]

    # Features describing the current token.
    features = {}
    features['bias'] = 1.0
    features['word.lower()'] = token.lower()
    features['ps.stem(word)'] = ps.stem(token)
    features['word[-3:]'] = token[-3:]
    features['word[-2:]'] = token[-2:]
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()
    features['pos'] = tag
    features['pos[:2]'] = tag[:2]

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token = sentence[i - 1][0]
        prev_tag = sentence[i - 1][1]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:ps.stem(word)'] = ps.stem(prev_token)
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()
        features['-1:word.isdigit()'] = prev_token.isdigit()
        features['-1:pos'] = prev_tag
        features['-1:pos[:2]'] = prev_tag[:2]

    # Right context, or an end-of-sentence marker.
    if i >= len(sentence) - 1:
        features['EOS'] = True
    else:
        next_token = sentence[i + 1][0]
        next_tag = sentence[i + 1][1]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:ps.stem(word)'] = ps.stem(next_token)
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()
        features['+1:word.isdigit()'] = next_token.isdigit()
        features['+1:pos'] = next_tag
        features['+1:pos[:2]'] = next_tag[:2]

    return features


def sentence_features(sentence):
    """Return one feature dict per token of ``sentence``, in token order."""
    per_token = []
    for position in range(len(sentence)):
        per_token.append(feature_extractor(sentence, position))
    return per_token


def sentence_labels(sentence):
    """Return the label (third field) of every ``(token, pos, label)`` row."""
    collected = []
    for _token, _pos, tag in sentence:
        collected.append(tag)
    return collected


def sentence_tokens(sentence):
    """Return the token (first field) of every ``(token, pos, label)`` row."""
    collected = []
    for text, _pos, _label in sentence:
        collected.append(text)
    return collected


In [60]:
# Load the train and dev splits as lists of sentences. file_opener expects
# '_S_B_'-delimited rows (presumably the files written by create_file --
# TODO confirm).
train_sentences = file_opener("./NER/train")
dev_sentences = file_opener("./NER/dev")

# One feature dict / one label per token, grouped by sentence.
X_train = [sentence_features(s) for s in train_sentences]
y_train = [sentence_labels(s) for s in train_sentences]

X_dev = [sentence_features(s) for s in dev_sentences]
y_dev = [sentence_labels(s) for s in dev_sentences]

# c1 / c2 are the 'best params' printed by the randomized-search cell
# further down in this notebook.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.09684573395986483,
    c2=0.0800864058815976,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
# Drop 'O' from the label list so the F1 below is computed over the
# remaining tags only. `labels` is also read by the search cell's scorer.
labels = list(crf.classes_)
labels.remove('O')
y_predicted = crf.predict(X_dev)

# Debugging aid (disabled): compare the label sets seen in predictions and
# in the dev data.
# y_pred_flat = []
# y_dev_flat = []
# for x in y_predicted:
#     y_pred_flat += x
# for x in y_dev:
#     y_dev_flat += x
# print(set(y_pred_flat) - set(y_dev_flat))
# print(set(y_pred_flat))
# print(set(y_dev_flat))

# Weighted F1 over `labels` on the dev split, then the learned weights.
f1 = metrics.flat_f1_score(y_dev, y_predicted, average='weighted', labels=labels)
print(f1)
eli5.show_weights(crf)

0.776543591581169


From \ To,O,B-Abiotic_Entity,I-Abiotic_Entity,B-Aggregate_Biotic_Abiotic_Entity,I-Aggregate_Biotic_Abiotic_Entity,B-Biotic_Entity,I-Biotic_Entity,B-Eventuality,I-Eventuality,B-Location,I-Location,B-Quality,I-Quality,B-Time,I-Time,B-Unit,I-Unit,B-Value,I-Value
O,3.074,0.6,-2.173,0.923,-2.626,0.778,-1.987,0.69,-2.686,1.209,-2.594,0.745,-0.586,1.263,-3.63,-0.265,-2.157,2.126,-1.84
B-Abiotic_Entity,0.473,-1.661,6.091,-1.43,-1.452,-1.623,-1.02,0.989,-1.176,0.0,0.0,-0.002,-1.014,0.203,0.0,0.0,0.0,0.0,0.0
I-Abiotic_Entity,-0.236,-0.948,2.361,0.0,-0.041,-0.282,0.0,0.359,-0.286,0.0,0.0,0.274,0.0,0.205,-0.144,0.0,0.0,0.0,0.0
B-Aggregate_Biotic_Abiotic_Entity,0.252,-1.054,-0.124,-1.053,6.054,-0.162,-2.476,0.401,-1.745,1.211,-0.226,-0.426,-1.481,0.563,-0.615,-0.055,-0.063,-0.908,0.0
I-Aggregate_Biotic_Abiotic_Entity,0.307,-0.526,-0.49,-0.572,5.418,-0.176,-0.906,-0.599,-0.298,0.0,0.0,-0.386,-0.165,0.281,-0.872,-0.001,0.0,0.0,0.0
B-Biotic_Entity,0.359,-0.292,-0.98,-0.359,-2.568,-1.231,5.237,1.184,-1.863,-0.611,-0.978,0.177,-2.784,-0.016,-0.783,0.0,-0.558,-0.603,-0.014
I-Biotic_Entity,0.264,-0.999,-0.114,-1.173,-0.741,-1.287,4.088,0.48,0.0,-0.949,-0.984,-0.479,-0.766,-0.214,-0.306,0.0,-0.309,-0.286,0.0
B-Eventuality,0.212,0.996,-1.259,1.212,-2.293,1.109,-2.509,-0.272,4.202,0.3,-0.521,0.008,-2.393,0.4,-3.073,-0.203,-0.905,0.459,-0.334
I-Eventuality,0.569,0.177,-0.045,-0.833,-0.357,0.088,-0.518,-0.664,4.009,-0.232,0.0,-0.522,-0.295,-0.122,-1.856,0.0,0.0,0.031,0.0
B-Location,-1.058,0.454,0.0,0.331,-0.65,0.0,-1.881,-1.089,-0.573,-0.271,5.115,-0.274,0.0,-0.0,0.0,0.0,-0.0,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18
+4.620,ps.stem(word):other,,,,,,,,,,,,,,,,,
+4.308,ps.stem(word):have,,,,,,,,,,,,,,,,,
+4.302,ps.stem(word):includ,,,,,,,,,,,,,,,,,
+3.707,ps.stem(word):repres,,,,,,,,,,,,,,,,,
+3.579,ps.stem(word):one,,,,,,,,,,,,,,,,,
+3.542,ps.stem(word):pyramid,,,,,,,,,,,,,,,,,
+3.490,ps.stem(word):differ,,,,,,,,,,,,,,,,,
+3.363,ps.stem(word):measur,,,,,,,,,,,,,,,,,
+3.286,ps.stem(word):investig,,,,,,,,,,,,,,,,,
+3.274,bias,,,,,,,,,,,,,,,,,

Weight?,Feature
+4.620,ps.stem(word):other
+4.308,ps.stem(word):have
+4.302,ps.stem(word):includ
+3.707,ps.stem(word):repres
+3.579,ps.stem(word):one
+3.542,ps.stem(word):pyramid
+3.490,ps.stem(word):differ
+3.363,ps.stem(word):measur
+3.286,ps.stem(word):investig
+3.274,bias

Weight?,Feature
+3.983,ps.stem(word):water
+3.554,word[-3:]:ron
+3.505,ps.stem(word):element
+3.436,word[-3:]:gen
+2.907,ps.stem(word):rock
+2.874,ps.stem(word):miner
+2.818,ps.stem(word):sediment
+2.800,ps.stem(word):chemic
+2.775,ps.stem(word):hydrocarbon
+2.575,ps.stem(word):carbon

Weight?,Feature
+2.192,-1:word.lower():mineral
+1.887,ps.stem(word):nutrient
+1.590,ps.stem(word):water
+1.557,word[-3:]:rgy
+1.557,ps.stem(word):energi
+1.557,word.lower():energy
+1.543,word[-2:]:gy
+1.539,-1:ps.stem(word):miner
+1.455,-1:word.lower():chemical
+1.338,-1:ps.stem(word):chemic

Weight?,Feature
+4.456,ps.stem(word):nutrient
+3.949,ps.stem(word):site
+3.371,ps.stem(word):habitat
+3.318,ps.stem(word):chain
+3.309,ps.stem(word):lake
+3.142,ps.stem(word):estuari
+3.101,ps.stem(word):resourc
+3.055,ps.stem(word):ecosystem
+3.031,ps.stem(word):coast
+3.020,ps.stem(word):region

Weight?,Feature
+2.926,ps.stem(word):region
+2.780,ps.stem(word):web
+2.611,ps.stem(word):chain
+2.475,ps.stem(word):spring
+2.120,ps.stem(word):forest
+2.101,ps.stem(word):seep
+2.095,ps.stem(word):ecosystem
+2.027,ps.stem(word):site
+1.959,-1:word.lower():food
+1.959,-1:ps.stem(word):food

Weight?,Feature
+4.770,word[-2:]:ae
+4.487,ps.stem(word):popul
+3.952,ps.stem(word):seedl
+3.941,ps.stem(word):insect
+3.727,ps.stem(word):organ
+3.709,ps.stem(word):seed
+3.535,ps.stem(word):stem
+3.478,ps.stem(word):anim
+3.476,word.lower():seedling
+3.461,ps.stem(word):plant

Weight?,Feature
+3.888,ps.stem(word):popul
+3.573,-1:ps.stem(word):sp
+3.573,-1:word.lower():sp
+3.284,-1:word.lower():organic
+2.929,ps.stem(word):commun
+2.621,ps.stem(word):seed
+2.530,ps.stem(word):organ
+2.295,-1:word.lower():living
+2.239,word[-2:]:us
+2.222,-1:ps.stem(word):live

Weight?,Feature
+3.828,ps.stem(word):eat
+3.797,ps.stem(word):exhibit
+3.776,ps.stem(word):increas
+3.750,ps.stem(word):transfer
+3.652,ps.stem(word):spread
+3.603,word.lower():connections
+3.539,ps.stem(word):movement
+3.431,ps.stem(word):declin
+3.428,ps.stem(word):role
+3.393,ps.stem(word):decreas

Weight?,Feature
+3.403,-1:word.lower():shut
+3.403,-1:ps.stem(word):shut
+3.203,ps.stem(word):activ
+2.817,ps.stem(word):transfer
+2.590,-1:ps.stem(word):depend
+2.532,ps.stem(word):cycl
+2.475,-1:word.lower():quick-establishing
+2.475,-1:ps.stem(word):quick-establish
+2.382,ps.stem(word):relationship
+2.349,ps.stem(word):effect

Weight?,Feature
+2.594,-1:ps.stem(word):sticki
+2.594,-1:word.lower():sticky
+2.568,+1:word.lower():population
+2.275,+1:ps.stem(word):popul
+1.817,pos:NNP
+1.796,word.lower():parts
+1.748,ps.stem(word):balkan
+1.723,ps.stem(word):part
+1.711,word[-3:]:rts
+1.659,-1:word.lower():in

Weight?,Feature
+2.316,-1:word.lower():mt
+2.316,-1:ps.stem(word):Mt
+2.009,-1:ps.stem(word):zone
+1.787,word.istitle()
+1.441,-1:ps.stem(word):central
+1.441,-1:word.lower():central
+1.298,ps.stem(word):island
+1.276,ps.stem(word):rainier
+1.276,word.lower():rainier
+1.224,word[-3:]:ier

Weight?,Feature
+4.737,ps.stem(word):number
+4.472,ps.stem(word):color
+4.402,ps.stem(word):temperatur
+4.377,ps.stem(word):size
+4.101,ps.stem(word):nich
+3.975,ps.stem(word):rate
+3.446,ps.stem(word):densiti
+3.371,ps.stem(word):yield
+3.327,ps.stem(word):abund
+3.240,ps.stem(word):morpholog

Weight?,Feature
+3.204,ps.stem(word):rate
+2.293,-1:ps.stem(word):popul
+2.256,-1:word.lower():population
+2.111,ps.stem(word):toler
+2.039,+1:ps.stem(word):medium-sever
+2.039,+1:word.lower():medium-severity
+2.039,-1:ps.stem(word):low-
+2.039,-1:word.lower():low-
+1.921,-1:word.lower():species
+1.921,-1:ps.stem(word):speci

Weight?,Feature
+4.665,ps.stem(word):season
+3.889,word[-2:]:0s
+3.516,ps.stem(word):annual
+3.474,ps.stem(word):winter
+3.215,ps.stem(word):year
+2.748,ps.stem(word):period
+2.622,ps.stem(word):night
+2.581,word.lower():annually
+2.397,word.lower():spring
+2.268,ps.stem(word):spring

Weight?,Feature
+2.953,ps.stem(word):20-month-old
+2.953,word.lower():20-month-old
+2.404,ps.stem(word):month
+2.362,ps.stem(word):season
+2.342,ps.stem(word):year
+2.206,word.lower():spring
+2.199,ps.stem(word):spring
+2.102,word.isdigit()
+2.088,word.lower():september
+2.088,ps.stem(word):septemb

Weight?,Feature
+3.788,ps.stem(word):percentag
+2.641,ps.stem(word):meter
+2.432,-1:ps.stem(word):/
+2.432,-1:word.lower():/
+2.035,word.lower():percentage
+1.913,ps.stem(word):inch
+1.855,ps.stem(word):mile
+1.855,word.lower():miles
+1.756,word.lower():pounds
+1.745,ps.stem(word):pound

Weight?,Feature
+2.093,+1:ps.stem(word):fish
+2.035,+1:word.lower():fish
+1.978,-1:word.lower():per
+1.978,-1:ps.stem(word):per
+1.854,-1:ps.stem(word):squar
+1.854,-1:word.lower():square
+1.672,word.lower():average
+1.656,-1:ps.stem(word):/
+1.656,-1:word.lower():/
+1.653,ps.stem(word):averag

Weight?,Feature
+2.951,pos:CD
+2.951,pos[:2]:CD
+2.746,ps.stem(word):thousand
+2.624,word.lower():hundreds
+2.277,word.lower():individual
+2.042,BOS
+2.040,ps.stem(word):hundr
+1.972,word[-3:]:ens
+1.935,word.lower():dozens
+1.935,ps.stem(word):dozen

Weight?,Feature
+1.357,word.lower():million
+1.352,ps.stem(word):million
+1.149,-1:word.lower():o
+1.149,-1:ps.stem(word):o
+1.148,pos[:2]:CD
+1.148,pos:CD
+0.922,ps.stem(word):thousand
+0.918,word.lower():thousands
+0.890,-1:ps.stem(word):ten
+0.890,-1:word.lower():tens


# Unified

In [1]:
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

In [52]:


# Fixed seed so the randomized search samples the same c1/c2 candidates on
# every Restart & Run All; without it the reported best params cannot be
# reproduced.
SEARCH_SEED = 42

# NOTE: this rebinds `crf` to a fresh, unfitted estimator; the search clones
# it for every candidate. `labels`, `X_train` and `y_train` must already be
# defined by the training cell.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
# Regularisation strengths are drawn from exponential distributions.
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search: 50 sampled candidates x 3-fold CV, on all available cores
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=SEARCH_SEED)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  8.2min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None,...
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-Quality', 'B-Biotic_Entity', 'B-Eventuality', 'I-Biotic_Entity', 'B-Location', 'I-Location', 'B-Time', 'I-Time', 'I-Eventuality', 'B-Value', 'B-Aggregate_Biotic_Abiotic_Entity', 'B-Unit', 'I-Aggregate_Biotic_Abiotic_Entity', 'I-Unit', 'B-Abiotic_Entity', 'I-Abioti

In [53]:
# Summarise the randomized search held in `rs`: best sampled parameters, the
# best cross-validation score, and the best estimator's size_ divided by
# 1,000,000 (printed with an 'M' suffix).
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.09684573395986483, 'c2': 0.0800864058815976}
best CV score: 0.716436871144859
model size: 1.72M


ModuleNotFoundError: No module named 'file_loader'

In [27]:
from glob import glob

# Collect the bare file names of each split so the splits can be compared.
# os.path.basename replaces the hand-counted slice offsets (21 / 19 / 20),
# which silently break if a directory name changes.
train_files = {os.path.basename(x) for x in glob("./Data/bio-ner/train" + '/*.tsv')}
dev_files = {os.path.basename(x) for x in glob("./Data/bio-ner/dev" + '/*.tsv')}
test_files = {os.path.basename(x) for x in glob("./Data/bio-ner/test" + '/*.tsv')}

In [35]:
# File names that appear in both the train and dev splits; a non-empty
# result means the two splits share data.
train_files.intersection(dev_files)

{'13451175_okra.tsv'}

In [34]:

# File names that appear in both the test and dev splits.
test_files.intersection(dev_files)

set()

In [None]:
Final Unified Test Scores
IOB Score: 0.8515465401313206
Class Score: 0.8171233546749452
Overall Score: 0.772362130135599







IOB Score: 0.8471202213420131	Diff: -0.004426318789307437
Class Score: 0.8042069357722814	Diff: -0.012916418902663818
Overall Score: 0.760802348363734	Diff: -0.011559781771865008

In [5]:
# IOB score: second set of results minus the "Final Unified Test Scores".
0.8471202213420131 - 0.8515465401313206

-0.004426318789307437

In [6]:
 # Class score: second set of results minus the "Final Unified Test Scores".
 0.8042069357722814 - 0.8171233546749452

-0.012916418902663818

In [7]:
# Overall score: second set of results minus the "Final Unified Test Scores".
0.760802348363734 - 0.772362130135599

-0.011559781771865008