In [4]:
import matplotlib
matplotlib.use('Agg')
from pylab import savefig
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os
from nltk.metrics.scores import accuracy
from nltk.metrics.scores import recall
from nltk.metrics.scores import precision
from nltk.metrics.scores import f_measure
import zipfile
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report



In [2]:
# Publish the location of the Java executable through the JAVAHOME environment
# variable (presumably read by NLTK's Stanford tagger wrapper — confirm).
# NOTE(review): absolute, machine-specific Windows path; adjust it per machine.
java_path = "C:/Program Files/Java/jdk1.8.0_101/bin/java.exe"
os.environ['JAVAHOME'] = java_path

In [5]:
def show_values(pc, fmt="%.2f", **kw):
    '''
    Write the value of every heatmap cell as text in the middle of that cell.

    Source: http://stackoverflow.com/a/25074150/395857 
    By HYRY

    Parameters
    ----------
    pc : collection returned by ``ax.pcolor``; must provide
        ``update_scalarmappable``, ``axes``, ``get_paths``,
        ``get_facecolors`` and ``get_array``.
    fmt : %-style format applied to each cell value.
    **kw : extra keyword arguments forwarded to ``ax.text``.
    '''
    # Refresh the collection so the face colours are available before reading them.
    pc.update_scalarmappable()
    # `Artist.axes` exists on old and current matplotlib; `get_axes()` was removed.
    ax = pc.axes
    # Built-in zip works on Python 2 and 3 (itertools.izip is Python 2 only).
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        # Mean of all vertices except the last two (presumably the points that
        # close the polygon — confirm), used as the text anchor.
        x, y = p.vertices[:-2, :].mean(0)
        # Black text when every RGB channel is above 0.5, white text otherwise.
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)


def cm2inch(*tupl):
    '''
    Convert lengths in centimetres to inches (for matplotlib figure sizes).

    Accepts either several positional numbers or one tuple of numbers and
    returns a tuple of the same length.

    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    cm_per_inch = 2.54
    # A single tuple argument is unpacked; otherwise the positional values are used.
    lengths = tupl[0] if type(tupl[0]) == tuple else tupl
    return tuple(value / cm_per_inch for value in lengths)


def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20, correct_orientation=False, cmap='RdBu'):
    '''
    Draw a 2-D array as an annotated heatmap on a new matplotlib figure.

    Inspired by:
    - http://stackoverflow.com/a/16124677/395857 
    - http://stackoverflow.com/a/25074150/395857

    Parameters
    ----------
    AUC : 2-D array; rows map to the y axis and columns to the x axis.
    title, xlabel, ylabel : figure title and axis labels.
    xticklabels, yticklabels : one label per column / per row.
    figure_width, figure_height : figure size in centimetres.
    correct_orientation : if True, put the origin at the top left.
    cmap : colormap passed to ``pcolor``.

    The figure stays open as pyplot's current figure so the caller can save
    or close it; nothing is returned.
    '''
    fig, ax = plt.subplots()
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2, cmap=cmap)

    # Put the major ticks at the middle of each cell.
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Remove the blank column that would otherwise appear on the right.
    ax.set_xlim((0, AUC.shape[1]))

    # Hide the tick marks but keep the labels. The tick lines are hidden
    # directly because the `tick1On`/`tick2On` attributes no longer exist on
    # current matplotlib, where assigning them silently does nothing.
    for axis in (ax.xaxis, ax.yaxis):
        for t in axis.get_major_ticks():
            t.tick1line.set_visible(False)
            t.tick2line.set_visible(False)

    # Colour bar for the cell values.
    fig.colorbar(c, ax=ax)

    # Write the numeric value into each cell.
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left).
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()

    # Resize; the requested size is given in centimetres.
    fig.set_size_inches(cm2inch(figure_width, figure_height))



def _parse_classification_report(report):
    '''Return (class_names, metric_rows, supports) parsed from a report string.'''
    class_names, rows, supports = [], [], []
    header_seen = False
    for line in report.split('\n'):
        t = line.split()
        if not t:
            # The per-class rows form one contiguous group; the blank line
            # after it separates them from the summary rows, so stop there.
            if rows:
                break
            continue
        if not header_seen:
            # The first non-blank line holds the column titles.
            header_seen = True
            continue
        if len(t) < 5:
            # Needs a name, three metrics and a support count.
            continue
        # Parse from the right so class names containing spaces stay intact.
        class_names.append(' '.join(t[:-4]))
        rows.append([float(x) for x in t[-4:-1]])
        supports.append(int(t[-1]))
    return class_names, rows, supports


def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
    '''
    Plot scikit-learn classification report.
    Extension based on http://stackoverflow.com/a/31689645/395857 

    Parameters
    ----------
    classification_report : the report text (a string, not the function).
    title : figure title.
    cmap : colormap forwarded to ``heatmap``.

    Only the per-class rows are plotted; the summary rows that follow the
    blank separator line are ignored. The parsed rows, the full matrix and
    the supports are printed, then the heatmap is drawn on a new figure.

    Raises
    ------
    ValueError : if no per-class row can be found in the text.
    '''
    class_names, plotMat, support = _parse_classification_report(classification_report)
    if not plotMat:
        raise ValueError('no per-class rows found in the classification report')

    for v in plotMat:
        print(v)
    print('plotMat: {0}'.format(plotMat))
    print('support: {0}'.format(support))

    xlabel = 'Metrics'
    ylabel = 'Classes'
    xticklabels = ['Precision', 'Recall', 'F1-score']
    # Row labels show the class name followed by its support in parentheses.
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup in enumerate(support)]
    figure_width = 25
    # Figure height grows with the number of classes.
    figure_height = len(class_names) + 7
    correct_orientation = False
    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height, correct_orientation, cmap=cmap)


In [6]:
def load_model(path, jar_path='../libs/stanford-ner.jar', encoding='utf-8'):
    '''
    Build a StanfordNERTagger for the model file at `path`.

    Parameters
    ----------
    path : location of the serialized model file.
    jar_path : location of the Stanford NER jar; defaults to the path the
        notebook has always used.
    encoding : text encoding handed to the tagger.

    Returns the constructed tagger.
    '''
    return StanfordNERTagger(path, jar_path, encoding=encoding)

In [7]:
def to4classes(items):
    '''
    Reduce prefixed tags to their bare class names.

    'O' and 'B-NOT' become 'O'. A tag of the form 'PREFIX-CLASS' becomes
    'CLASS' (the part between the first and second '-'). A tag without any
    '-' is returned unchanged instead of raising IndexError.

    Returns a new list with one entry per input tag.
    '''
    d4 = []
    for item in items:
        if item == 'O' or item == 'B-NOT':
            d4.append('O')
            continue
        parts = item.split("-")
        # Tags that carry no prefix are already bare class names.
        d4.append(parts[1] if len(parts) > 1 else item)
    return d4

In [8]:
def choose_words(lines):
    '''Return the first space-separated field of every line, in order.'''
    return [row.split(" ")[0] for row in lines]

In [9]:
def reference_classes(lines):
    '''
    Return the third space-separated field of every line as text.

    Anything from the first newline onwards is dropped from the field.
    Byte strings are decoded as UTF-8; text strings are returned as they
    are, so the function works on Python 2 and Python 3 (the `unicode`
    builtin used previously exists only on Python 2).

    Raises IndexError if a line has fewer than three fields.
    '''
    arr = []
    for line in lines:
        label = line.split(" ")[2].split("\n")[0]
        # On Python 2 `bytes` is `str`, so this matches unicode(label, "utf-8").
        if isinstance(label, bytes):
            label = label.decode("utf-8")
        arr.append(label)
    return arr

In [10]:
def predict(model, data):
    '''
    Tag `data` with `model` and return the predicted tags reduced by to4classes.

    `model.tag(data)` is expected to yield pairs whose second element is the
    predicted tag.
    '''
    tagged = model.tag(data)
    return to4classes([pair[1] for pair in tagged])

In [11]:
def evaluate(y_true, y_pred, target_names, filename):
    '''
    Plot the classification report for the predictions and save it as a PNG.

    Parameters
    ----------
    y_true, y_pred : reference and predicted labels.
    target_names : labels to report on, in the order the rows should appear.
    filename : path of the PNG file to write.

    The current pyplot figure is closed after saving.
    '''
    # The label list used to be passed as the third positional argument,
    # which scikit-learn binds to `labels`. Naming it keeps that behaviour
    # and also works where those arguments are keyword-only.
    report = classification_report(y_true, y_pred, labels=target_names)
    plot_classification_report(report)
    plt.savefig(filename, dpi=200, format='png', bbox_inches='tight')
    plt.close()

In [65]:
# Open the `.bio` test file and the combined data file for reading.
# NOTE(review): the active test path is absolute and machine-specific; the
# relative variant is kept in the commented-out line below. Neither handle is
# closed in this cell.
#ftest_data = open("../datasets/processed/data_test1.bio", "r")
ftest_data = open("C:/workspace/nlp/ner/ner_historicpapers/datasets/data_test1.bio", "r")
fall_data = open("../datasets/processed/combined_data.bio", "r")

In [None]:
# Load the two taggers that are compared below: the locally stored model1 and
# the eunews.nl CRF model.
my_model = load_model("../models/model1.ser.gz")
out_model = load_model("../models/eunews.nl.crf.gz")

In [66]:
# Read both files fully into memory, then release the file handles, which
# were previously left open for the lifetime of the kernel.
test_data = ftest_data.readlines()
ftest_data.close()
all_data = fall_data.readlines()
fall_data.close()

In [67]:
#getting words and reference classes for different data files

In [68]:
# Tokens: first field of every line in each data set.
test_words = choose_words(test_data)
all_words = choose_words(all_data)
# Reference labels: third field of every line.
y_test = reference_classes(test_data)
y_all_data = reference_classes(all_data)
# Reference labels reduced to bare class names for evaluation.
y_test4 = to4classes(y_test)
y_all_data4 = to4classes(y_all_data)

In [40]:
#predicting

In [69]:
# Predict reduced class labels for the test tokens with both models.
my_model_ypred4 = predict(my_model, test_words)
out_model_ypred4 = predict(out_model, test_words)

In [70]:
#evaluation

In [72]:
# Distinct reference labels of the test set. set() gives no guaranteed order,
# so the row order of the reports below can differ between runs.
classes = list(set(y_test4))

In [73]:
# Report heatmap for my_model on the test set, saved as a PNG.
evaluate(y_test4, my_model_ypred4, classes, "my_model_test0.25.png")

[0.53, 0.63, 0.57]
[0.47, 0.41, 0.44]
[0.98, 0.98, 0.98]
[0.52, 0.43, 0.47]
plotMat: [[0.53, 0.63, 0.57], [0.47, 0.41, 0.44], [0.98, 0.98, 0.98], [0.52, 0.43, 0.47]]
support: [824, 1028, 43599, 170]


In [74]:
# Report heatmap for out_model on the test set, saved as a PNG.
evaluate(y_test4, out_model_ypred4, classes, "out_model_test0.25.png")

[0.28, 0.99, 0.44]
[0.31, 1.0, 0.47]
[1.0, 0.86, 0.92]
[0.08, 0.99, 0.15]
plotMat: [[0.28, 0.99, 0.44], [0.31, 1.0, 0.47], [1.0, 0.86, 0.92], [0.08, 0.99, 0.15]]
support: [824, 1028, 43599, 170]


In [75]:
# Predict on the tokens of the combined data set.
# NOTE(review): this reuses the names that held the test-set predictions, so
# those are overwritten from here on.
my_model_ypred4 = predict(my_model, all_words)
out_model_ypred4 = predict(out_model, all_words)

In [76]:
# Report heatmap for my_model on the combined data; `classes` is still the
# label list derived from the test set.
evaluate(y_all_data4, my_model_ypred4, classes, "my_model_all_data.png")

[0.9, 0.93, 0.91]
[0.89, 0.86, 0.87]
[0.99, 0.99, 0.99]
[0.94, 0.91, 0.93]
plotMat: [[0.9, 0.93, 0.91], [0.89, 0.86, 0.87], [0.99, 0.99, 0.99], [0.94, 0.91, 0.93]]
support: [4448, 4473, 172403, 1158]


In [77]:
# Report heatmap for out_model on the combined data; `classes` is still the
# label list derived from the test set.
evaluate(y_all_data4, out_model_ypred4, classes, "out_model_all_data.png")

[0.41, 0.99, 0.58]
[0.4, 1.0, 0.57]
[1.0, 0.88, 0.94]
[0.13, 0.99, 0.24]
plotMat: [[0.41, 0.99, 0.58], [0.4, 1.0, 0.57], [1.0, 0.88, 0.94], [0.13, 0.99, 0.24]]
support: [4448, 4473, 172403, 1158]


In [79]:
# Text report for out_model on the combined data. `classes` is passed as
# `labels` by name: the third positional slot it used to occupy binds to
# `labels` anyway, and naming it also works where the argument is keyword-only.
print(classification_report(y_all_data4, out_model_ypred4, labels=classes))

             precision    recall  f1-score   support

        LOC       0.41      0.99      0.58      4448
        PER       0.40      1.00      0.57      4473
          O       1.00      0.88      0.94    172403
        ORG       0.13      0.99      0.24      1158

avg / total       0.97      0.89      0.92    182482



In [19]:
#new experiments with new 4-fold evaluation 

In [19]:
# Load the rusnew CRF model used in the experiments below.
model2 = load_model("../models/rusnew.crf.gz")

In [98]:
# Open the rustest `.bio` file; this rebinds `ftest_data`, which earlier
# referred to the first test file.
ftest_data = open("../datasets/processed/rustest.bio", "r")

In [99]:
# Read the whole file, then release the handle, which was previously left open.
test_data = ftest_data.readlines()
ftest_data.close()

In [52]:
# Derive tokens, reference labels and the label set from the newly loaded
# file BEFORE predicting. Previously the prediction cell came first, so a
# top-to-bottom run tagged the old `test_words` and scored them against the
# new file's labels.
test_words = choose_words(test_data)
y_test = reference_classes(test_data)
y_test4 = to4classes(y_test)
classes = list(set(y_test4))
model2_ypred4 = predict(model2, test_words)

In [45]:
# Number of tokens in the current test set.
len(test_words)

45620

In [53]:
# Report heatmap for model2 on the current test set.
# NOTE(review): the file name says "model3" while the model is `model2` — confirm the intended name.
evaluate(y_test4, model2_ypred4, classes, "model3_test2_data.png")

[0.77, 0.62, 0.68]
[0.68, 0.59, 0.63]
[0.98, 0.99, 0.98]
[0.8, 0.28, 0.41]
plotMat: [[0.77, 0.62, 0.68], [0.68, 0.59, 0.63], [0.98, 0.99, 0.98], [0.8, 0.28, 0.41]]
support: [1135, 778, 43412, 295]


In [14]:
# Open the rustestset `.bio` file for reading; the handle is not closed in this cell.
frus = open("../datasets/processed/rustestset.bio", "r")

In [15]:
# Read the whole file, then release the handle, which was previously left open.
rus = frus.readlines()
frus.close()

In [16]:
# Field 0 of every row is the token; field 1 is its reference label, cut at
# the first newline.
words = [row.split(" ")[0] for row in rus]
yref = [row.split(" ")[1].split("\n")[0] for row in rus]
# Distinct reference labels, in arbitrary set order.
ll = list(set(yref))

In [17]:
# Show the distinct reference labels.
ll

['Facility', 'O', 'Project', 'Person', 'Location', 'Org', 'LocOrg']

In [20]:
# Tag every token with model2; each result item is indexed as a pair below,
# with the predicted tag at position 1.
ypred = model2.tag(words)

In [21]:
# Keep only the predicted tag (second element) of each tagged item.
y = [x[1] for x in ypred]

In [22]:
# Distinct predicted tags, in arbitrary set order. This rebinds `classes`,
# which earlier held the reference labels of the test set.
classes = list(set(y))

In [23]:
# Show the distinct predicted tags.
classes

[u'Person', u'LocOrg', u'Location', u'O', u'Org']

In [108]:
# Fold the 'Project' and 'Facility' reference labels into 'O'; every other
# label is kept as it is.
yf = ['O' if label in ('Project', 'Facility') else label for label in yref]

In [95]:
# Report heatmap for model2 on the rustestset data, using the folded
# reference labels and the predicted-tag list as the label set.
evaluate(yf, y, classes, "model_rus_data.png")

[0.76, 0.92, 0.83]
[0.75, 0.46, 0.57]
[0.79, 0.53, 0.63]
[0.98, 0.99, 0.98]
[0.68, 0.6, 0.64]
plotMat: [[0.76, 0.92, 0.83], [0.75, 0.46, 0.57], [0.79, 0.53, 0.63], [0.98, 0.99, 0.98], [0.68, 0.6, 0.64]]
support: [1238, 543, 607, 27448, 1104]


In [109]:
# Text report for model2 on the rustestset data. `classes` is passed as
# `labels` by name: the third positional slot it used to occupy binds to
# `labels` anyway, and naming it also works where the argument is keyword-only.
print(classification_report(yf, y, labels=classes))

             precision    recall  f1-score   support

     Person       0.84      0.83      0.84      2122
     LocOrg       0.63      0.63      0.63       756
   Location       0.59      0.53      0.56       808
          O       0.97      0.99      0.98     52988
        Org       0.76      0.42      0.54      2708

avg / total       0.94      0.95      0.94     59382



In [30]:
def _merge_adjacent_spans(labels, spans):
    '''Merge consecutive entries that share a label and are exactly one position apart.'''
    merged = []
    for label, (start, length) in zip(labels, spans):
        previous = merged[-1] if merged else None
        # Same label and previous start + previous length + 1 == this start:
        # extend the previous entry over the gap and this entry.
        if previous is not None and previous[0] == label and previous[1] + previous[2] + 1 == start:
            previous[2] = previous[2] + 1 + length
        else:
            merged.append([label, start, length])
    return merged


def _write_task1(doc_name, labels, spans):
    '''Write the merged "label start length" lines for one document to its .task1 file.'''
    with open("../datasets/processed/pred/" + doc_name + ".task1", "w+") as out:
        for label, start, length in _merge_adjacent_spans(labels, spans):
            out.write(label + " " + str(start) + " " + str(length) + "\n")


# Walk the rows of `rus` alongside the predicted tags in `y` and write one
# .task1 file per document name (field 4 of each row). Fields 2 and 3 are
# integers (presumably offset and length of the token — confirm).
#
# Fixes over the earlier version of this cell:
# - the last document's predictions are now written (they used to be dropped);
# - a document without any predicted entity yields an empty file, not an IndexError;
# - an empty `rus` writes nothing instead of failing on rus[0].
kol = 0          # number of rows predicted as 'Org'
fname = None     # document currently being collected
lines = []       # output labels collected for the current document
vecs = []        # matching [field 2, field 3] pairs
for c, line in enumerate(rus):
    fields = line.split(" ")
    namefile = fields[4].split("\n")[0]
    if namefile != fname:
        # A new document starts: flush what was collected for the previous one.
        if fname is not None:
            _write_task1(fname, lines, vecs)
        fname = namefile
        lines = []
        vecs = []

    tag = y[c]
    if tag != 'O':
        # 'locorg' is written in full; every other tag is lower-cased and cut
        # to its first three characters.
        short = tag.lower()
        if short != 'locorg':
            short = short[:3]
        lines.append(short)
        vecs.append([int(fields[2]), int(fields[3])])
    if tag == 'Org':
        kol = kol + 1

# Flush the final document.
if fname is not None:
    _write_task1(fname, lines, vecs)
print(kol)
    

1481


In [2]:
# Scratch check of string slicing: prints the string without its first character.
# NOTE(review): nothing visible in the notebook uses this cell.
s = "sdaaadffd"
print(s[1:])

daaadffd
