In [67]:
import json

In [68]:
def format_float(value):
    """Render a metric as a signed string with three decimals.

    NaN is shown as the literal '   NaN', which is as wide as a
    formatted value such as '+0.500'.
    """
    # NaN is the only float value that compares unequal to itself
    is_nan = value != value
    return '   NaN' if is_nan else format(value, '+.3f')

In [69]:
# Directory prefix of each experiment's saved outputs. The trailing slash is
# required: the loading cell joins these with report_path by plain string
# concatenation.
experiment_1 = 'saves/baseline/'
experiment_2 = 'schedule/saves/deltatime/'

# File name of the report inside each experiment directory. It is parsed with
# json.load, so it must contain JSON despite the .txt extension.
report_path = 'classification_report.txt'

In [70]:
# load both dicts

# Each report file is parsed as JSON. The context managers close the file
# handles as soon as parsing finishes instead of leaving them open in the kernel.
with open(experiment_1 + report_path) as report_file:
    experiment_1_dict = json.load(report_file)
with open(experiment_2 + report_path) as report_file:
    experiment_2_dict = json.load(report_file)

In [71]:
# compare both dicts

# Read the summary rows out of each report first; they are deleted further
# down so that only the remaining (per-class) rows stay in the dicts.
_score_names = ('precision', 'recall', 'f1-score')

e1_m_precision, e1_m_recall, e1_m_f1 = (
    experiment_1_dict['macro avg'][_s] for _s in _score_names
)
e1_w_precision, e1_w_recall, e1_w_f1 = (
    experiment_1_dict['weighted avg'][_s] for _s in _score_names
)

e2_m_precision, e2_m_recall, e2_m_f1 = (
    experiment_2_dict['macro avg'][_s] for _s in _score_names
)
e2_w_precision, e2_w_recall, e2_w_f1 = (
    experiment_2_dict['weighted avg'][_s] for _s in _score_names
)

# remove macro avg, weighted avg and accuracy
# NOTE: this mutates the loaded dicts in place, so the cell can only be run
# once per load; re-running it raises KeyError on the already-removed rows.
for _report in (experiment_1_dict, experiment_2_dict):
    for _summary_key in ('macro avg', 'weighted avg', 'accuracy'):
        del _report[_summary_key]

In [72]:
# print all metrics before and after
print('Before:')

# One (label, value) row per printed line, in display order.
_summary_rows = (
    ('Experiment 1 macro precision', e1_m_precision),
    ('Experiment 1 macro recall', e1_m_recall),
    ('Experiment 1 macro f1', e1_m_f1),
    ('Experiment 1 weighted precision', e1_w_precision),
    ('Experiment 1 weighted recall', e1_w_recall),
    ('Experiment 1 weighted f1', e1_w_f1),
    ('Experiment 2 macro precision', e2_m_precision),
    ('Experiment 2 macro recall', e2_m_recall),
    ('Experiment 2 macro f1', e2_m_f1),
    ('Experiment 2 weighted precision', e2_w_precision),
    ('Experiment 2 weighted recall', e2_w_recall),
    ('Experiment 2 weighted f1', e2_w_f1),
)
for _label, _value in _summary_rows:
    print(_label + ': ' + format_float(_value))



Before:
Experiment 1 macro precision: +0.717
Experiment 1 macro recall: +0.768
Experiment 1 macro f1: +0.684
Experiment 1 weighted precision: +0.924
Experiment 1 weighted recall: +0.884
Experiment 1 weighted f1: +0.890
Experiment 2 macro precision: +0.513
Experiment 2 macro recall: +0.746
Experiment 2 macro f1: +0.548
Experiment 2 weighted precision: +0.912
Experiment 2 weighted recall: +0.856
Experiment 2 weighted f1: +0.867


In [73]:
# print difference in metrics
# Every delta is experiment 2 minus experiment 1. The labels carry their own
# trailing padding so that the numbers line up in a single column.
_macro_deltas = (
    ('macro avg precision difference:     ', e2_m_precision - e1_m_precision),
    ('macro avg recall difference:        ', e2_m_recall - e1_m_recall),
    ('macro avg f1 difference:            ', e2_m_f1 - e1_m_f1),
)
_weighted_deltas = (
    ('weighted avg precision difference:  ', e2_w_precision - e1_w_precision),
    ('weighted avg recall difference:     ', e2_w_recall - e1_w_recall),
    ('weighted avg f1 difference:         ', e2_w_f1 - e1_w_f1),
)

for _label, _delta in _macro_deltas:
    print(_label + format(_delta, '+.3f'))
print()  # blank line between the macro and weighted groups
for _label, _delta in _weighted_deltas:
    print(_label + format(_delta, '+.3f'))

macro avg precision difference:     -0.204
macro avg recall difference:        -0.022
macro avg f1 difference:            -0.136

weighted avg precision difference:  -0.012
weighted avg recall difference:     -0.029
weighted avg f1 difference:         -0.023


In [74]:
# build a dict with the difference in metrics (experiment 2 minus experiment 1)
diff_dict = {}
for key, e1_metrics in experiment_1_dict.items():
    # Raises KeyError if a row of experiment 1 is missing from experiment 2.
    e2_metrics = experiment_2_dict[key]
    row_diff = {
        metric: e2_metrics[metric] - e1_value
        for metric, e1_value in e1_metrics.items()
        if metric != 'support'
    }
    # support is copied from experiment 1 instead of being differenced
    row_diff['support'] = e1_metrics['support']
    diff_dict[key] = row_diff


def _f1_sort_key(item):
    """Sort key for (name, metrics) pairs: f1-score difference, NaN-safe.

    NaN compares False against everything, which makes the result of sorting
    on it unreliable. Numeric values get the key (True, value) and NaN gets
    (False, 0.0), so with reverse=True the numeric entries come first in
    descending order and NaN entries are grouped at the end.
    """
    f1_diff = item[1]['f1-score']
    is_number = f1_diff == f1_diff  # False only for NaN
    return (is_number, f1_diff if is_number else 0.0)


# sort by f1-score (largest improvement first; ties keep their original order)
sorted_diff_dict = dict(sorted(diff_dict.items(), key=_f1_sort_key, reverse=True))

In [75]:
# print sorted dict
print('Sorted by f1-score difference:')
for _name, _diff in sorted_diff_dict.items():
    # Row name padded to a fixed 17-character column, then the metric fields.
    _name_column = (_name + ":").ljust(17)
    _fields = ', '.join([
        'precision: ' + format_float(_diff["precision"]),
        'recall: ' + format_float(_diff["recall"]),
        'f1-score: ' + format_float(_diff["f1-score"]),
        f'support: {_diff["support"]}',
    ])
    print(_name_column + _fields)

Sorted by f1-score difference:
adjkerntz:       precision:    NaN, recall: +1.000, f1-score: +0.500, support: 11.0
basename:        precision:    NaN, recall: +0.500, f1-score: +0.143, support: 2.0
wget:            precision: +0.162, recall: +0.054, f1-score: +0.126, support: 74.0
anvil:           precision: +0.117, recall: -0.042, f1-score: +0.090, support: 378.0
pkg:             precision: -0.131, recall: +0.141, f1-score: +0.031, support: 1701.0
grep:            precision: +0.016, recall: +0.200, f1-score: +0.030, support: 5.0
imapd:           precision: +0.060, recall: -0.000, f1-score: +0.018, support: 2469.0
sleep:           precision: +0.004, recall: +0.000, f1-score: +0.002, support: 4889.0
find:            precision: -0.000, recall: +0.004, f1-score: +0.002, support: 4006.0
expr:            precision:    NaN, recall: +0.000, f1-score: +0.000, support: 2.0
master:          precision: +0.000, recall: +0.000, f1-score: +0.000, support: 580.0
python2.7:       precision: +0.000, re

In [76]:
# check which classes were never predicted in each experiment (precision NaN)
# NaN is detected with the "not equal to itself" comparison. Both lists walk
# the keys of experiment 1, so they share the same ordering.
e1_never_predicted = [
    label
    for label, scores in experiment_1_dict.items()
    if scores['precision'] != scores['precision']
]
e2_never_predicted = [
    label
    for label in experiment_1_dict
    if experiment_2_dict[label]['precision'] != experiment_2_dict[label]['precision']
]

print('Classes never predicted in experiment 1:', e1_never_predicted, sep='\n')
print('Classes never predicted in experiment 2:', e2_never_predicted, sep='\n')


Classes never predicted in experiment 1:
['adjkerntz', 'basename', 'expr', 'tty']
Classes never predicted in experiment 2:
[]
