In [1]:
import json

In [2]:
# NOTE(review): looks like a leftover kernel smoke test; it feeds nothing below.
print("test")

test


In [3]:
def format_float(value, precision=3):
    """Render a number as a signed fixed-point string, or a padded 'NaN'.

    Args:
        value: Number to format. NaN is recognised by self-inequality, so any
            object that compares unequal to itself is treated as NaN.
        precision: Number of digits after the decimal point. Defaults to 3,
            which reproduces the previously hard-coded format.

    Returns:
        ``value`` with an explicit sign and ``precision`` decimals (for
        example ``'+0.832'``). For NaN, the text ``'NaN'`` right-justified to
        the width of a single-integer-digit value at the same precision
        (``'   NaN'`` with the default precision) so printed columns line up.
    """
    # NaN is the only float value that is not equal to itself.
    if value != value:
        # Derive the padding from the real format instead of hard-coding six
        # characters, so the placeholder stays aligned for any precision.
        placeholder_width = len(f"{0.0:+.{precision}f}")
        return "NaN".rjust(placeholder_width)
    return f"{value:+.{precision}f}"

In [4]:
# Directories of the two runs being compared. The paths are relative, and the
# trailing slash matters because the file name is appended by plain string
# concatenation when the reports are loaded.
experiment_1 = "pioneer/saves/pioneer/"
experiment_2 = "pioneer/saves/pioneer_lstm/"

# Report file expected inside each directory; it is parsed as JSON despite
# the .txt suffix.
report_path = "classification_report.txt"

In [5]:
# Load the classification report of each experiment as a dict.
# The files are opened in `with` blocks so each handle is closed as soon as
# parsing finishes, instead of being left open until garbage collection.
with open(experiment_1 + report_path) as report_file:
    experiment_1_dict = json.load(report_file)

with open(experiment_2 + report_path) as report_file:
    experiment_2_dict = json.load(report_file)

In [6]:
# Pull the aggregate rows ('macro avg' / 'weighted avg') of both reports into
# individual variables, then strip the aggregate entries so that only the
# remaining (per-class) entries are left in the dicts.
#
# NOTE: this cell mutates both dicts in place. Running it a second time
# without re-running the loading cell raises KeyError, because the aggregate
# keys are already gone.
summary_metrics = ('precision', 'recall', 'f1-score')

e1_macro = experiment_1_dict['macro avg']
e1_weighted = experiment_1_dict['weighted avg']
e1_m_precision, e1_m_recall, e1_m_f1 = (e1_macro[name] for name in summary_metrics)
e1_w_precision, e1_w_recall, e1_w_f1 = (e1_weighted[name] for name in summary_metrics)

e2_macro = experiment_2_dict['macro avg']
e2_weighted = experiment_2_dict['weighted avg']
e2_m_precision, e2_m_recall, e2_m_f1 = (e2_macro[name] for name in summary_metrics)
e2_w_precision, e2_w_recall, e2_w_f1 = (e2_weighted[name] for name in summary_metrics)

# Drop the aggregate rows and the overall accuracy from both reports.
for report in (experiment_1_dict, experiment_2_dict):
    for aggregate_key in ('macro avg', 'weighted avg', 'accuracy'):
        del report[aggregate_key]

In [7]:
# Print the aggregate scores of both experiments, one metric per line.
# NOTE(review): the 'Before:' heading covers both experiments; no matching
# 'After:' section is printed in this cell.
print('Before:')

aggregate_scores = (
    ('Experiment 1', (
        ('macro precision', e1_m_precision),
        ('macro recall', e1_m_recall),
        ('macro f1', e1_m_f1),
        ('weighted precision', e1_w_precision),
        ('weighted recall', e1_w_recall),
        ('weighted f1', e1_w_f1),
    )),
    ('Experiment 2', (
        ('macro precision', e2_m_precision),
        ('macro recall', e2_m_recall),
        ('macro f1', e2_m_f1),
        ('weighted precision', e2_w_precision),
        ('weighted recall', e2_w_recall),
        ('weighted f1', e2_w_f1),
    )),
)

for experiment_label, scores in aggregate_scores:
    for metric_label, score in scores:
        print(f'{experiment_label} {metric_label}: {format_float(score)}')



Before:
Experiment 1 macro precision: +0.832
Experiment 1 macro recall: +0.873
Experiment 1 macro f1: +0.822
Experiment 1 weighted precision: +0.982
Experiment 1 weighted recall: +0.969
Experiment 1 weighted f1: +0.974
Experiment 2 macro precision: +0.856
Experiment 2 macro recall: +0.890
Experiment 2 macro f1: +0.825
Experiment 2 weighted precision: +0.990
Experiment 2 weighted recall: +0.969
Experiment 2 weighted f1: +0.976


In [8]:
# Print experiment 2 minus experiment 1 for every aggregate metric, grouped
# by averaging mode with a blank line between the groups.
metric_deltas = {
    'macro avg': (
        ('precision', e2_m_precision - e1_m_precision),
        ('recall', e2_m_recall - e1_m_recall),
        ('f1', e2_m_f1 - e1_m_f1),
    ),
    'weighted avg': (
        ('precision', e2_w_precision - e1_w_precision),
        ('recall', e2_w_recall - e1_w_recall),
        ('f1', e2_w_f1 - e1_w_f1),
    ),
}

for group_index, (average, deltas) in enumerate(metric_deltas.items()):
    if group_index:
        print()  # blank separator line between the two groups
    for metric, delta in deltas:
        label = f'{average} {metric} difference:'
        # Labels are padded to 36 columns so the signed values line up.
        print(f'{label:<36}{delta:+.3f}')

macro avg precision difference:     +0.023
macro avg recall difference:        +0.017
macro avg f1 difference:            +0.003

weighted avg precision difference:  +0.008
weighted avg recall difference:     +0.000
weighted avg f1 difference:         +0.002


In [9]:
# Build a per-class dict of metric differences (experiment 2 minus
# experiment 1). 'support' is not a score, so it is copied from experiment 1
# directly instead of being subtracted and then overwritten.
diff_dict = {}
for key, e1_metrics in experiment_1_dict.items():
    # Raises KeyError if a class of experiment 1 is absent from experiment 2.
    e2_metrics = experiment_2_dict[key]
    diff_dict[key] = {
        metric: e2_metrics[metric] - e1_value
        for metric, e1_value in e1_metrics.items()
        if metric != 'support'
    }
    diff_dict[key]['support'] = e1_metrics['support']


def _f1_diff_sort_key(item):
    """Sort key: largest f1-score difference first, NaN differences last."""
    f1_diff = item[1]['f1-score']
    is_nan = f1_diff != f1_diff  # NaN is the only value unequal to itself
    # Every comparison against NaN is False, so sorting on the raw value gives
    # an order that depends on where the NaN entries happen to sit. Replacing
    # NaN with a fixed key and flagging it keeps the ordering well defined.
    return (is_nan, 0.0 if is_nan else -f1_diff)


# Sort by f1-score difference, descending. Ties keep their original order.
sorted_diff_dict = dict(sorted(diff_dict.items(), key=_f1_diff_sort_key))

In [10]:
# Print one aligned line per class, in the order of sorted_diff_dict.
print('Sorted by f1-score difference:')
for class_name, diffs in sorted_diff_dict.items():
    # Pad "<class>:" to 17 columns so the metric columns line up.
    label = f'{class_name}:'.ljust(17)
    precision_text = format_float(diffs['precision'])
    recall_text = format_float(diffs['recall'])
    f1_text = format_float(diffs['f1-score'])
    print(
        f'{label}precision: {precision_text}, recall: {recall_text}, '
        f'f1-score: {f1_text}, support: {diffs["support"]}'
    )

Sorted by f1-score difference:
mail:            precision: +0.333, recall: +0.500, f1-score: +0.400, support: 2.0
cmp:             precision: +0.286, recall: +0.000, f1-score: +0.167, support: 5.0
rm:              precision: +0.369, recall: +0.000, f1-score: +0.153, support: 36.0
mount:           precision: +0.167, recall: +0.000, f1-score: +0.133, support: 2.0
netstat:         precision: +0.182, recall: +0.000, f1-score: +0.093, support: 58.0
awk:             precision: +0.167, recall: +0.000, f1-score: +0.091, support: 5.0
trivial-rewrite: precision: +0.188, recall: +0.019, f1-score: +0.083, support: 377.0
mktemp:          precision: +0.107, recall: +0.000, f1-score: +0.066, support: 6.0
wget:            precision: +0.078, recall: +0.041, f1-score: +0.060, support: 74.0
wc:              precision: +0.097, recall: +0.000, f1-score: +0.058, support: 7.0
cleanup:         precision: +0.515, recall: -0.264, f1-score: +0.051, support: 276.0
dhclient:        precision: +0.101, recall: +0.00

In [11]:
# List, for each experiment separately, the classes whose precision is NaN.
# Presumably NaN precision marks a class that was never predicted; confirm
# against how the reports were generated.
def _precision_is_nan(report, class_name):
    """Return True when the stored precision of class_name is NaN."""
    precision = report[class_name]['precision']
    return precision != precision  # only NaN is unequal to itself


# Both lists iterate over the classes of experiment 1.
e1_never_predicted = [
    name for name in experiment_1_dict
    if _precision_is_nan(experiment_1_dict, name)
]
e2_never_predicted = [
    name for name in experiment_1_dict
    if _precision_is_nan(experiment_2_dict, name)
]

print('Classes never predicted in experiment 1:')
print(e1_never_predicted)
print('Classes never predicted in experiment 2:')
print(e2_never_predicted)

Classes never predicted in experiment 1:
['basename', 'expr']
Classes never predicted in experiment 2:
['basename', 'expr']
