In [210]:
from IPython.display import display

In [211]:
import numpy
import pandas
pandas.options.display.float_format = '{:,.3f}'.format
import os
import re
from sklearn import metrics

In [212]:
# Task names. Each one is substituted into the prediction filename pattern and
# used as a row label in the result tables built by get_results.
iterations = ['NER', 'ENTITY', 'LKIF', 'YAGO']
# Directory template for the prediction files; '{}' is filled with the dataset name.
base_dirname = '../results/echr/batch/holdout/{}/'

In [213]:
# Averaging modes passed as `average=` to metrics.precision_recall_fscore_support.
averages = ['micro', 'macro', 'weighted']

In [214]:
# Column labels of the per-dataset results table created in get_results.
metrics_columns = ['Precision', 'Recall', 'F1 Score']

In [215]:
def get_input_files(input_dirpath, pattern):
    """Lists the regular files directly inside input_dirpath whose name matches pattern.

    pattern is a regular expression applied with re.match, so it is anchored
    at the start of the file name only. Every returned entry is the file name
    joined to input_dirpath; entries keep the order produced by os.listdir.
    """
    candidates = (
        (name, os.path.join(input_dirpath, name))
        for name in os.listdir(input_dirpath)
    )
    return [
        path for name, path in candidates
        if re.match(pattern, name) and os.path.isfile(path)
    ]

In [216]:
def add_prec_rec(predictions, results, general_results, dataset_name, iteration):
    """Stores precision, recall and F1 score of one task for every averaging mode.

    For each entry of the module-level `averages` list the three scores are
    computed from the `true` and `prediction` columns of `predictions` and
    written to the (iteration, average) row of `results`. The macro-averaged
    scores are additionally written to the (dataset_name, iteration) row of
    `general_results`. Both frames are modified in place; nothing is returned.
    """
    macro_columns = ['Precision(macro)', 'Recall(macro)', 'F1 Score(macro)']
    for average in averages:
        # Keep precision, recall and F1; the fourth returned element (support) is dropped.
        values = list(metrics.precision_recall_fscore_support(
            predictions.true, predictions.prediction,
            average=average, warn_for=()
        )[:3])
        if average == 'macro':
            # One .loc call addressing row and columns together: the chained form
            # `.loc[row][columns] = ...` may assign into a temporary copy and
            # leave general_results unchanged.
            general_results.loc[(dataset_name, iteration), macro_columns] = values
        results.loc[(iteration, average), :] = values

def add_top_bottom(predictions, general_results, dataset_name, iteration):
    """Stores the mean scores of the most and least supported 20% of the classes.

    Per-class precision, recall, F1 score and support are computed from the
    `true` and `prediction` columns of `predictions`. Classes are ranked by
    support; the scores of the top fifth and of the bottom fifth are averaged
    and written to the (dataset_name, iteration) row of `general_results`,
    which is modified in place. Nothing is returned.

    The 'NER' iteration is skipped, leaving its columns untouched.
    """
    if iteration == 'NER':
        return
    per_class = metrics.precision_recall_fscore_support(
        predictions.true, predictions.prediction, average=None, warn_for=())
    # One row per class, one column per metric.
    per_class = pandas.DataFrame(
        numpy.vstack(per_class).T, columns=['Prec', 'Recall', 'F1score', 'Support'])
    # 20% of the classes, rounded down. With fewer than five classes this is 0
    # and the means below are NaN.
    to_take = int(per_class.shape[0] * 0.2)
    row = (dataset_name, iteration)

    # Row and columns are addressed in a single .loc call: the chained form
    # `.loc[row][columns] = ...` may assign into a temporary copy and leave
    # general_results unchanged.
    top_values = per_class.sort_values(
        'Support', ascending=False)[:to_take].mean().values[:3]
    general_results.loc[
        row, ['Top 20% Precision', 'Top 20% Recall', 'Top 20% Fscore']] = top_values

    bottom_values = per_class.sort_values(
        'Support', ascending=True)[:to_take].mean().values[:3]
    general_results.loc[
        row, ['Bottom 20% Precision', 'Bottom 20% Recall', 'Bottom 20% Fscore']] = bottom_values
    

In [217]:
def get_results(base_filename, dataset_names):
    """Computes and displays the classification metrics of every dataset and task.

    Args:
        base_filename: regular-expression template with one '{}' placeholder,
            filled with the task name to locate the predictions file.
        dataset_names: sequence of (name, description) pairs. `name` fills the
            placeholder of the module-level `base_dirname`; `description` is
            printed as the heading of that dataset's table.

    For each dataset a table with precision, recall and F1 score per task and
    averaging mode is displayed. A final table with accuracy, macro scores and
    top/bottom 20% scores of all datasets is displayed at the end. Tasks with
    no matching predictions file are reported and left empty. Returns nothing.
    """
    index = pandas.MultiIndex.from_product([iterations, averages], names=['Task', 'Average'])
    general_columns = ['Accuracy', 'Precision(macro)', 'Recall(macro)', 'F1 Score(macro)',
                       'Top 20% Precision', 'Bottom 20% Precision',
                       'Top 20% Recall', 'Bottom 20% Recall', 'Top 20% Fscore', 'Bottom 20% Fscore']
    general_results = pandas.DataFrame(
        columns=general_columns,
        index=pandas.MultiIndex.from_product(
            [[name for name, _ in dataset_names], iterations], names=['Dataset', 'Task']))
    for dataset_name, dataset_description in dataset_names:
        prediction_dirname = base_dirname.format(dataset_name)
        print('')
        print(dataset_description)
        results = pandas.DataFrame(columns=metrics_columns, index=index)
        for iteration in iterations:
            # Sorted so that, when several files match, the one that is read
            # does not depend on the arbitrary order of os.listdir.
            filenames = sorted(
                get_input_files(prediction_dirname, base_filename.format(iteration)))
            if not filenames:
                print('Error with iteration {} and dataset {}'.format(iteration, dataset_name))
                continue
            predictions = pandas.read_csv(filenames[0])
            # Single .loc call with (row, column): the chained form
            # `.loc[row]['Accuracy'] = ...` may assign into a temporary copy.
            general_results.loc[(dataset_name, iteration), 'Accuracy'] = metrics.accuracy_score(
                predictions.true, predictions.prediction)
            add_prec_rec(predictions, results, general_results, dataset_name, iteration)
            add_top_bottom(predictions, general_results, dataset_name, iteration)
        display(results)

    print('General Results')
    display(general_results)

In [218]:
# (directory name, description) pairs reported under the 'TEST RESULTS' heading.
# The first element fills the placeholder of base_dirname; the second is printed
# as the heading of the dataset's table.
dataset_names = [
    ('handcrafted', 'Handcrafted Features'),
    ('wv_echr', 'Word vectors trained with the ECHR documents'),
    ('wv_wiki', 'Word vectors trained with the Wikipedia documents'),
    ('wv_mixed', 'Word vectors trained with documents from Wikipedia and ECHR')
]
# Regex template for the predictions file; '{}' is replaced by the task name.
prediction_base_filename = r'test_predictions_.*{}.*csv'
print('TEST RESULTS')
get_results(prediction_base_filename, dataset_names)

TEST RESULTS

Handcrafted Features


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.993,0.993,0.993
NER,macro,0.993,0.992,0.993
NER,weighted,0.993,0.993,0.993
ENTITY,micro,0.946,0.946,0.946
ENTITY,macro,0.903,0.896,0.897
ENTITY,weighted,0.946,0.946,0.945
LKIF,micro,0.921,0.921,0.921
LKIF,macro,0.612,0.647,0.612
LKIF,weighted,0.919,0.921,0.916
YAGO,micro,0.924,0.924,0.924



Word vectors trained with the ECHR documents


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.84,0.84,0.84
NER,macro,0.84,0.841,0.84
NER,weighted,0.841,0.84,0.84
ENTITY,micro,0.811,0.811,0.811
ENTITY,macro,0.764,0.742,0.747
ENTITY,weighted,0.804,0.811,0.805
LKIF,micro,0.786,0.786,0.786
LKIF,macro,0.552,0.462,0.494
LKIF,weighted,0.778,0.786,0.775
YAGO,micro,0.776,0.776,0.776



Word vectors trained with the Wikipedia documents


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.86,0.86,0.86
NER,macro,0.861,0.861,0.86
NER,weighted,0.862,0.86,0.86
ENTITY,micro,0.801,0.801,0.801
ENTITY,macro,0.766,0.741,0.753
ENTITY,weighted,0.802,0.801,0.8
LKIF,micro,0.799,0.799,0.799
LKIF,macro,0.542,0.442,0.469
LKIF,weighted,0.789,0.799,0.785
YAGO,micro,0.799,0.799,0.799



Word vectors trained with documents from Wikipedia and ECHR


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.833,0.833,0.833
NER,macro,0.833,0.833,0.833
NER,weighted,0.834,0.833,0.833
ENTITY,micro,0.786,0.786,0.786
ENTITY,macro,0.77,0.725,0.745
ENTITY,weighted,0.786,0.786,0.784
LKIF,micro,0.769,0.769,0.769
LKIF,macro,0.586,0.461,0.497
LKIF,weighted,0.76,0.769,0.752
YAGO,micro,0.781,0.781,0.781


General Results


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision(macro),Recall(macro),F1 Score(macro),Top 20% Precision,Bottom 20% Precision,Top 20% Recall,Bottom 20% Recall,Top 20% Fscore,Bottom 20% Fscore
Dataset,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
handcrafted,NER,0.993,0.993,0.992,0.993,,,,,,
handcrafted,ENTITY,0.946,0.903,0.896,0.897,1.0,0.926,0.995,0.962,0.997,0.943
handcrafted,LKIF,0.921,0.612,0.647,0.612,0.904,0.0,0.912,0.0,0.903,0.0
handcrafted,YAGO,0.924,0.662,0.66,0.652,0.871,0.545,0.917,0.545,0.875,0.545
wv_echr,NER,0.84,0.84,0.841,0.84,,,,,,
wv_echr,ENTITY,0.811,0.764,0.742,0.747,0.841,0.8,0.884,0.923,0.862,0.857
wv_echr,LKIF,0.786,0.552,0.462,0.494,0.808,0.0,0.772,0.0,0.785,0.0
wv_echr,YAGO,0.776,0.371,0.33,0.337,0.773,0.0,0.726,0.0,0.744,0.0
wv_wiki,NER,0.86,0.861,0.861,0.86,,,,,,
wv_wiki,ENTITY,0.801,0.766,0.741,0.753,0.824,0.826,0.874,0.731,0.848,0.776


In [219]:
# (directory name, description) pairs reported under the 'EVALUATION RESULTS'
# heading. NOTE: this rebinds the dataset_names variable of the test-results cell.
dataset_names = [
    ('handcrafted', 'Handcrafted Features using the ECHR trained classifier'),
    ('wv_echr', 'Word vectors trained with the ECHR documents using the ECHR trained classifier'),
    ('wv_wiki', 'Word vectors trained with the Wikipedia documents using the ECHR trained classifier'),
    ('wv_mixed', 'Word vectors trained with documents from Wikipedia and ECHR using the ECHR trained classifier'),
    ('handcrafted_wiki_classifier', 'Handcrafted features using the Wikipedia trained classifier'),
    ('wv_wiki_wiki_classifier',
     'Word vectors trained with the Wikipedia documents using the Wikipedia trained classifier'),
]
# Regex template for the predictions file; '{}' is replaced by the task name.
evaluation_base_filename = r'evaluation_predictions_.*{}.*csv'
print('EVALUATION RESULTS')
get_results(evaluation_base_filename, dataset_names)

EVALUATION RESULTS

Handcrafted Features using the ECHR trained classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.729,0.729,0.729
NER,macro,0.282,0.319,0.3
NER,weighted,0.645,0.729,0.684
ENTITY,micro,0.814,0.814,0.814
ENTITY,macro,0.75,0.453,0.516
ENTITY,weighted,0.789,0.814,0.772
LKIF,micro,0.823,0.823,0.823
LKIF,macro,0.346,0.26,0.267
LKIF,weighted,0.78,0.823,0.776
YAGO,micro,0.821,0.821,0.821



Word vectors trained with the ECHR documents using the ECHR trained classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.629,0.629,0.629
NER,macro,0.3,0.276,0.287
NER,weighted,0.686,0.629,0.656
ENTITY,micro,0.783,0.783,0.783
ENTITY,macro,0.574,0.618,0.584
ENTITY,weighted,0.799,0.783,0.786
LKIF,micro,0.788,0.788,0.788
LKIF,macro,0.273,0.288,0.258
LKIF,weighted,0.795,0.788,0.781
YAGO,micro,0.801,0.801,0.801



Word vectors trained with the Wikipedia documents using the ECHR trained classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.647,0.647,0.647
NER,macro,0.302,0.283,0.292
NER,weighted,0.689,0.647,0.667
ENTITY,micro,0.781,0.781,0.781
ENTITY,macro,0.606,0.659,0.624
ENTITY,weighted,0.815,0.781,0.794
LKIF,micro,0.781,0.781,0.781
LKIF,macro,0.27,0.32,0.278
LKIF,weighted,0.798,0.781,0.782
YAGO,micro,0.792,0.792,0.792



Word vectors trained with documents from Wikipedia and ECHR using the ECHR trained classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,0.635,0.635,0.635
NER,macro,0.294,0.278,0.286
NER,weighted,0.671,0.635,0.652
ENTITY,micro,0.772,0.772,0.772
ENTITY,macro,0.557,0.585,0.559
ENTITY,weighted,0.789,0.772,0.777
LKIF,micro,0.794,0.794,0.794
LKIF,macro,0.305,0.331,0.286
LKIF,weighted,0.787,0.794,0.782
YAGO,micro,0.784,0.784,0.784



Handcrafted features using the Wikipedia trained classifier
Error with iteration NER and dataset handcrafted_wiki_classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,,,
NER,macro,,,
NER,weighted,,,
ENTITY,micro,0.78,0.78,0.78
ENTITY,macro,0.555,0.236,0.258
ENTITY,weighted,0.719,0.78,0.697
LKIF,micro,0.779,0.779,0.779
LKIF,macro,0.114,0.061,0.067
LKIF,weighted,0.663,0.779,0.696
YAGO,micro,0.779,0.779,0.779



Word vectors trained with the Wikipedia documents using the Wikipedia trained classifier
Error with iteration NER and dataset wv_wiki_wiki_classifier


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Task,Average,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NER,micro,,,
NER,macro,,,
NER,weighted,,,
ENTITY,micro,0.753,0.753,0.753
ENTITY,macro,0.352,0.217,0.226
ENTITY,weighted,0.66,0.753,0.68
LKIF,micro,0.761,0.761,0.761
LKIF,macro,0.066,0.046,0.045
LKIF,weighted,0.625,0.761,0.675
YAGO,micro,0.758,0.758,0.758


General Results


Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision(macro),Recall(macro),F1 Score(macro),Top 20% Precision,Bottom 20% Precision,Top 20% Recall,Bottom 20% Recall,Top 20% Fscore,Bottom 20% Fscore
Dataset,Task,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
handcrafted,NER,0.729,0.282,0.319,0.3,,,,,,
handcrafted,ENTITY,0.814,0.75,0.453,0.516,0.823,0.857,0.976,0.615,0.893,0.716
handcrafted,LKIF,0.823,0.346,0.26,0.267,0.849,0.375,0.527,0.5,0.596,0.417
handcrafted,YAGO,0.821,0.329,0.195,0.211,0.627,0.222,0.367,0.222,0.405,0.222
wv_echr,NER,0.629,0.3,0.276,0.287,,,,,,
wv_echr,ENTITY,0.783,0.574,0.618,0.584,0.894,0.434,0.85,0.551,0.872,0.486
wv_echr,LKIF,0.788,0.273,0.288,0.258,0.734,0.083,0.741,0.25,0.707,0.125
wv_echr,YAGO,0.801,0.246,0.212,0.195,0.608,0.0,0.41,0.0,0.433,0.0
wv_wiki,NER,0.647,0.302,0.283,0.292,,,,,,
wv_wiki,ENTITY,0.781,0.606,0.659,0.624,0.908,0.532,0.833,0.641,0.869,0.581


In [268]:
# Loads the pickled class information into echr_classes.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on the original machine.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
# NOTE(review): this import sits far from the import cells at the top of the notebook.
import pickle
with open('/home/ccardellino/datasets/echr/annotated/processed/classes.p', 'rb') as echr_file:
    echr_classes = pickle.load(echr_file)

In [266]:
# Loads one predictions file for the manual inspection done in the following cells.
# NOTE(review): absolute path under a different home directory than the one used
# for classes.p — confirm both locations are current.
predictions = pandas.read_csv('/home/mteruel/mirel/results/echr/batch/holdout/wv_echr/evaluation_predictions_YAGO.csv')

In [269]:
# Displays the loaded object. The output below shows a dict whose values are
# pairs of arrays (a string array and an integer array of the same length).
# Presumably these are class labels and their counts — TODO confirm.
echr_classes

{'ENTITY': (array(['I-', 'I-abstraction', 'I-act', 'I-document', 'I-organization',
         'I-person', 'O'], 
        dtype='<U14'), array([   2,  250,  845,  331,  258,  278, 1985])),
 'LKIF': (array(['I-', 'I-Code', 'I-Crime', 'I-Decision', 'I-Jurisdiction',
         'I-Legal_Document', 'I-Legal_Person', 'I-Legal_Proceeding',
         'I-Legal_Role', 'I-Legal_Speech_Act', 'I-Obligation',
         'I-Professional_Legal_Role', 'I-Prohibition', 'I-Public_Body',
         'I-Regulation', 'I-Right', 'I-Social_Role',
         'I-wordnet_case_law_106535035', 'I-wordnet_deterrence_101076488',
         'O'], 
        dtype='<U30'),
  array([  15,   59,   97,    8,   10,  215,   40,  758,  129,   99,   87,
           90,    4,  258,    7,   52,   16,   16,    4, 1985])),
 'NER': (array(['I', 'O'], 
        dtype='<U1'), array([1964, 1985])),
 'URI': (array(['I-', 'I-European_Convention_on_Human_Rights',
         'I-NOT_IN_WIKIPEDIA_Croatian_Criminal_Code',
         'I-NOT_IN_WIKIPEDIA_Croatian

In [270]:
# Macro scores with the label set taken from echr_classes['YAGO'][0].
print(metrics.precision_recall_fscore_support(predictions.true, predictions.prediction, labels=echr_classes['YAGO'][0], average='macro'))
# Macro scores without an explicit `labels=` argument, for comparison.
print(metrics.precision_recall_fscore_support(predictions.true, predictions.prediction, average='macro'))
# Per-class precision (element [0] of the result) for the labels occurring in predictions.true.
precs = metrics.precision_recall_fscore_support(predictions.true, predictions.prediction, labels=numpy.unique(predictions.true), average=None)[0]

(0.22970854420326137, 0.19799850033263833, 0.18172161283624483, None)
(0.24641462014531673, 0.21239839126592111, 0.19493773013342625, None)


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [265]:
# NOTE(review): this cell's execution count (265) is lower than that of the cell
# assigning precs (270), so the shape shown may belong to an earlier value of
# precs — re-run the cells in order to confirm.
precs.shape

(6,)

In [262]:
# Number of distinct values in the `true` column and in the `prediction` column.
numpy.unique(predictions.true).shape, numpy.unique(predictions.prediction).shape

((59,), (39,))

In [255]:
# Length of the per-class precision array when no `labels=` argument is given.
metrics.precision_recall_fscore_support(predictions.true, predictions.prediction, average=None)[0].shape

  'precision', 'predicted', average, warn_for)


(59,)

In [258]:
# Values that occur in the `true` column but never in the `prediction` column.
set(numpy.unique(predictions.true)).difference(set(numpy.unique(predictions.prediction)))

{0, 4, 5, 8, 11, 12, 14, 16, 20, 21, 24, 28, 34, 40, 43, 45, 51, 53, 54, 57}

In [260]:
# Size of the union of the values found in the `true` and `prediction` columns.
lala = set(numpy.unique(predictions.true))
lala.update(set(numpy.unique(predictions.prediction)))
len(lala)

59