In [86]:
import json
import os
import copy
import itertools
import numpy as np
import statistics as s

## Read data

In [151]:
# NOTE: machine-specific absolute path; adjust when running on another machine
root_dir = 'C:\\Users\\janek\\Development\\Git\\master-thesis\\data\\logs_server\\three_seeds\\'


def parse_run_params(sub_dir):
    """Extract the varied hyper-parameters from a run directory name.

    The directory name ends in a comma-separated ``key=value`` list, e.g.
    ``...-bs=8,ds=0.1,e=3,eh=False,f=5,l=all,olw=0.1,r=cv,s=43,sw=False``.
    Values are looked up by key instead of by character offset, so the result
    does not depend on the length of the path or of the other values.

    Returns ``[label set, other label weight, seed]`` as strings.
    Raises ``ValueError`` if one of the keys ``l``, ``olw``, ``s`` is missing.
    """
    # last path component, accepting both Windows and POSIX separators
    run_name = sub_dir.replace("\\", "/").rstrip("/").rsplit("/", 1)[-1]
    found = {}
    for token in run_name.split(","):
        key, sep, value = token.partition("=")
        if sep:
            # the first token still carries the "<script>-<timestamp>-" prefix
            found[key.rsplit("-", 1)[-1]] = value
    missing = [key for key in ("l", "olw", "s") if key not in found]
    if missing:
        raise ValueError(f"run directory {sub_dir!r} lacks parameter(s) {missing}")
    # 0: label set; 1: other label weights; 2: seeds
    return [found["l"], found["olw"], found["s"]]


def round_metrics(cv_metrics, ndigits=4):
    """Return a copy of ``cv_metrics`` with every number rounded to ``ndigits``.

    Scalar values (floats, and ints such as a JSON ``0``) are rounded directly;
    every other value is treated as a list of numbers.
    """
    return {
        key: round(value, ndigits) if isinstance(value, (int, float))
        else [round(x, ndigits) for x in value]
        for key, value in cv_metrics.items()
    }


run_subdirs = [x[0] for x in os.walk(root_dir)]
# show one example directory; guarded so an empty/missing root does not raise
if len(run_subdirs) > 1:
    print(run_subdirs[1])

# list of (param values, rounded cv_metrics dict) tuples, one per run directory
runs = []
for sub_dir in run_subdirs:
    if sub_dir.endswith("False"):
        param_tuple = parse_run_params(sub_dir)
        # read cv_metrics.json
        with open(os.path.join(sub_dir, "cv_metrics.json"), "r", encoding="utf-8") as file:
            cv_metrics = round_metrics(json.load(file))
        runs.append((param_tuple, cv_metrics))


C:\Users\janek\Development\Git\master-thesis\data\logs_server\three_seeds\GatewayTokenClassifier.py-2022-11-01_191511-bs=8,ds=0.1,e=3,eh=False,f=5,l=all,olw=0.1,r=cv,s=43,sw=False


In [152]:
# Dump every parsed run: its parameter values followed by its metrics dict
for run_params, run_metrics in runs:
    print(run_params, run_metrics)

['all', '0.1', '43'] {'avg_loss': 0.1426, 'avg_xor_precision': 0.273, 'avg_xor_recall': 0.5678, 'avg_xor_f1': 0.3071, 'avg_and_recall': 0.003, 'avg_and_precision': 0.0012, 'avg_and_f1': 0.0017, 'avg_overall_accuracy': 0.465, 'loss': [0.2007, 0.1394, 0.1613, 0.094, 0.1174], 'xor_precision': [0.0411, 0.1376, 0.2765, 0.4426, 0.4669], 'xor_recall': [0.6364, 0.4227, 0.4524, 0.6273, 0.7002], 'xor_f1': [0.0746, 0.1951, 0.296, 0.4502, 0.5196], 'and_recall': [0.0152, 0.0, 0.0, 0.0, 0.0], 'and_precision': [0.0061, 0.0, 0.0, 0.0, 0.0], 'and_f1': [0.0087, 0.0, 0.0, 0.0, 0.0], 'overall_accuracy': [0.3349, 0.4901, 0.4398, 0.5272, 0.5331]}
['all', '0.2', '43'] {'avg_loss': 0.2657, 'avg_xor_precision': 0.2028, 'avg_xor_recall': 0.5333, 'avg_xor_f1': 0.2499, 'avg_and_recall': 0.0, 'avg_and_precision': 0.0, 'avg_and_f1': 0.0, 'avg_overall_accuracy': 0.4395, 'loss': [0.365, 0.2608, 0.3346, 0.2303, 0.1375], 'xor_precision': [0.1253, 0.4061, 0.0765, 0.2075, 0.1985], 'xor_recall': [0.4091, 0.4227, 0.5498, 0

## Analyze Data

In [122]:
def fitler_runs(runs, labels=None, weights=None, seed=None):
    """Return deep copies of the runs that match all given parameter filters.

    Each run is a ``(params, metrics)`` pair whose ``params`` sequence holds
    the label set at index 0, the other-label weight at index 1 and the seed
    at index 2, all as strings.

    A filter left at ``None`` is not applied. Any other value is compared via
    ``str(value)``, so falsy values such as ``seed=0`` or ``weights=0`` are
    valid filters and are no longer silently ignored.

    The input list is never modified; the result can be mutated freely.
    (The misspelled function name is kept because callers use it.)
    """
    requested = ((0, labels), (1, weights), (2, seed))
    criteria = [(position, str(value)) for position, value in requested if value is not None]
    matching = [
        run for run in runs
        if all(run[0][position] == expected for position, expected in criteria)
    ]
    # copy only the matching runs instead of the whole input
    return copy.deepcopy(matching)

In [154]:
# One selection of runs per seed; the size of the first one is printed as a sanity check
s43_runs, s44_runs, s45_runs = (
    fitler_runs(runs, seed=seed_value) for seed_value in (43, 44, 45)
)
print(len(s43_runs))

16


In [160]:
# Names of the list-valued metrics the analysis focuses on
metrics_interest = ['overall_accuracy', 'xor_precision', 'xor_recall', 'xor_f1', 'and_f1']
# The same metrics under their 'avg_'-prefixed keys
avg_metrics_interest = [f"avg_{name}" for name in metrics_interest]
# Every 'avg_' key
avg_metrics = [
    'avg_overall_accuracy',
    'avg_xor_precision',
    'avg_xor_recall',
    'avg_xor_f1',
    'avg_and_recall',
    'avg_and_precision',
    'avg_and_f1',
    'avg_loss',
]

### A) Seeds Difference

In [155]:
# Parameter grid: position 0 holds the label sets, position 1 the weights
param_lists = [
    ['all', 'filtered'],
    ['0.05', '0.1', '0.2', '0.3', '0.4', '0.5', '0.75', '1.0'],
]
# Cartesian product -> every (label set, weight) pair, with the label set varying slowest
label_weights_combinations = [
    pair for pair in itertools.product(param_lists[0], param_lists[1])
]
# one pair per line
print(*label_weights_combinations, sep='\n')

('all', '0.05')
('all', '0.1')
('all', '0.2')
('all', '0.3')
('all', '0.4')
('all', '0.5')
('all', '0.75')
('all', '1.0')
('filtered', '0.05')
('filtered', '0.1')
('filtered', '0.2')
('filtered', '0.3')
('filtered', '0.4')
('filtered', '0.5')
('filtered', '0.75')
('filtered', '1.0')


In [168]:
# For every (label set, weight) combination: mean / variance / stdev of each averaged
# metric over all runs of that combination (the seed is left unfiltered).
# Result: list of ((labels, weight), {metric name: mean}) tuples.
label_weights_combinations_stats = []
for (labels, weight) in label_weights_combinations:
    print(f" {labels, weight} ".center(100, '-'))
    label_weights_combination_runs = fitler_runs(runs, labels=labels, weights=weight)
    if not label_weights_combination_runs:
        # No run was found for this combination: there is nothing to aggregate, and an
        # entry without metric keys would break the metric-based sorting of this list
        print("no runs found")
        print()
        continue
    label_weights_combination_metrics = {}

    for avg_metric in avg_metrics_interest:
        # one value per matching run
        values = [r[1][avg_metric] for r in label_weights_combination_runs]
        # stored as a plain float so the dict prints bare numbers regardless of how the
        # installed NumPy version represents its scalars
        mean = float(round(np.mean(values), 4))
        if len(values) > 1:
            variance = round(s.variance(values), 4)
            stdev = round(s.stdev(values), 4)
        else:
            # sample variance/stdev need at least two values (statistics raises otherwise)
            variance = stdev = float('nan')
        print(avg_metric.ljust(20, ' '), end=' ')
        print(f"mean: {mean}".ljust(12, ' '), end=' ')
        print(f"| var: {variance}".ljust(15, ' '), end=' ')
        print(f"| stdev: {stdev}".ljust(16, ' '), end=' ')
        print(f"| values: {values}")
        label_weights_combination_metrics[avg_metric] = mean

    label_weights_combinations_stats.append(((labels, weight), label_weights_combination_metrics))
    print()

----------------------------------------- ('all', '0.05') ------------------------------------------
avg_overall_accuracy mean: 0.4322 | var: 0.0043   | stdev: 0.0654  | values: [0.3708, 0.501, 0.4247]
avg_xor_precision    mean: 0.2208 | var: 0.0019   | stdev: 0.0439  | values: [0.18, 0.2151, 0.2672]
avg_xor_recall       mean: 0.5419 | var: 0.0052   | stdev: 0.0721  | values: [0.4774, 0.6197, 0.5286]
avg_xor_f1           mean: 0.2615 | var: 0.0027   | stdev: 0.0523  | values: [0.2022, 0.2813, 0.301]
avg_and_f1           mean: 0.0007 | var: 0.0      | stdev: 0.0012  | values: [0.0, 0.0, 0.002]

------------------------------------------ ('all', '0.1') ------------------------------------------
avg_overall_accuracy mean: 0.4631 | var: 0.0004   | stdev: 0.0198  | values: [0.465, 0.4819, 0.4425]
avg_xor_precision    mean: 0.245  | var: 0.0012   | stdev: 0.035   | values: [0.273, 0.2562, 0.2058]
avg_xor_recall       mean: 0.5912 | var: 0.0004   | stdev: 0.0211  | values: [0.5678, 0.6086, 0.

### B) Best Models w.r.t. Metrics

##### 1) XOR Precision
Models with a medium-to-high weight for the other labels perform well, but for the top-ranked ones recall is much lower than precision.

In [169]:
# 1) XOR Precision
# Reorder the stats in place, highest mean XOR precision first, and show the top four
label_weights_combinations_stats.sort(
    key=lambda entry: entry[1]['avg_xor_precision'],
    reverse=True,
)
for top_entry in label_weights_combinations_stats[:4]:
    print(top_entry)

(('all', '0.4'), {'avg_overall_accuracy': 0.517, 'avg_xor_precision': 0.3731, 'avg_xor_recall': 0.2655, 'avg_xor_f1': 0.2783, 'avg_and_f1': 0.0})
(('all', '0.5'), {'avg_overall_accuracy': 0.5051, 'avg_xor_precision': 0.3499, 'avg_xor_recall': 0.2616, 'avg_xor_f1': 0.26, 'avg_and_f1': 0.0})
(('all', '0.3'), {'avg_overall_accuracy': 0.4704, 'avg_xor_precision': 0.3089, 'avg_xor_recall': 0.3612, 'avg_xor_f1': 0.2789, 'avg_and_f1': 0.0})
(('all', '0.2'), {'avg_overall_accuracy': 0.4389, 'avg_xor_precision': 0.2602, 'avg_xor_recall': 0.5118, 'avg_xor_f1': 0.2876, 'avg_and_f1': 0.0001})


##### 2) XOR Recall
+ Models with a low weight for the other labels perform well -> this makes sense, because more XOR is predicted overall and recall therefore gets higher
+ But for the top-ranked ones precision is much lower than recall

In [163]:
# Reorder the stats in place, highest mean XOR recall first, and show the top four
label_weights_combinations_stats.sort(
    key=lambda entry: entry[1]['avg_xor_recall'],
    reverse=True,
)
for top_entry in label_weights_combinations_stats[:4]:
    print(top_entry)

(('all', '0.1'), {'avg_overall_accuracy': 0.4631, 'avg_xor_precision': 0.245, 'avg_xor_recall': 0.5912, 'avg_xor_f1': 0.289, 'avg_and_f1': 0.0016})
(('all', '0.05'), {'avg_overall_accuracy': 0.4322, 'avg_xor_precision': 0.2208, 'avg_xor_recall': 0.5419, 'avg_xor_f1': 0.2615, 'avg_and_f1': 0.0007})
(('all', '0.2'), {'avg_overall_accuracy': 0.4389, 'avg_xor_precision': 0.2602, 'avg_xor_recall': 0.5118, 'avg_xor_f1': 0.2876, 'avg_and_f1': 0.0001})
(('filtered', '0.05'), {'avg_overall_accuracy': 0.7839, 'avg_xor_precision': 0.2579, 'avg_xor_recall': 0.4328, 'avg_xor_f1': 0.268, 'avg_and_f1': 0.0})


##### 3) XOR Recall + XOR Precision
+ ?

In [174]:
# Reorder the stats in place by the sum of mean XOR precision and mean XOR recall
# (largest first) and show the top four; the key stays a one-element tuple
label_weights_combinations_stats.sort(
    reverse=True,
    key=lambda entry: (
        sum(entry[1][metric_name] for metric_name in ('avg_xor_precision', 'avg_xor_recall')),
    ),
)
for top_entry in label_weights_combinations_stats[:4]:
    print(top_entry)

(('all', '0.1'), {'avg_overall_accuracy': 0.4631, 'avg_xor_precision': 0.245, 'avg_xor_recall': 0.5912, 'avg_xor_f1': 0.289, 'avg_and_f1': 0.0016})
(('all', '0.2'), {'avg_overall_accuracy': 0.4389, 'avg_xor_precision': 0.2602, 'avg_xor_recall': 0.5118, 'avg_xor_f1': 0.2876, 'avg_and_f1': 0.0001})
(('all', '0.05'), {'avg_overall_accuracy': 0.4322, 'avg_xor_precision': 0.2208, 'avg_xor_recall': 0.5419, 'avg_xor_f1': 0.2615, 'avg_and_f1': 0.0007})
(('filtered', '0.05'), {'avg_overall_accuracy': 0.7839, 'avg_xor_precision': 0.2579, 'avg_xor_recall': 0.4328, 'avg_xor_f1': 0.268, 'avg_and_f1': 0.0})


##### 4a) Overall Accuracy
+ Obviously, the filtered label set performs best because of the high share of the "Other" class

In [175]:
# Reorder the stats in place, highest mean overall accuracy first, and show the top four
label_weights_combinations_stats.sort(
    key=lambda entry: entry[1]['avg_overall_accuracy'],
    reverse=True,
)
for top_entry in label_weights_combinations_stats[:4]:
    print(top_entry)

(('filtered', '1.0'), {'avg_overall_accuracy': 0.9542, 'avg_xor_precision': 0.0209, 'avg_xor_recall': 0.0541, 'avg_xor_f1': 0.0225, 'avg_and_f1': 0.0})
(('filtered', '0.75'), {'avg_overall_accuracy': 0.9244, 'avg_xor_precision': 0.0356, 'avg_xor_recall': 0.0471, 'avg_xor_f1': 0.0227, 'avg_and_f1': 0.0})
(('filtered', '0.4'), {'avg_overall_accuracy': 0.9099, 'avg_xor_precision': 0.0759, 'avg_xor_recall': 0.0599, 'avg_xor_f1': 0.0462, 'avg_and_f1': 0.0})
(('filtered', '0.5'), {'avg_overall_accuracy': 0.9034, 'avg_xor_precision': 0.04, 'avg_xor_recall': 0.0518, 'avg_xor_f1': 0.0252, 'avg_and_f1': 0.0})


##### 4b) Overall Accuracy - only all labels
+ Performance is about the same because the majority of labels get the same weight; only the minority gateway labels are weighted higher

In [178]:
# Keep only the entries of the 'all' label set (as deep copies, so the full stats list
# is left untouched), rank them by mean overall accuracy and show the top four
all_weights_combinations_stats = copy.deepcopy(
    [entry for entry in label_weights_combinations_stats if entry[0][0] == 'all']
)
all_weights_combinations_stats.sort(
    key=lambda entry: entry[1]['avg_overall_accuracy'],
    reverse=True,
)
for top_entry in all_weights_combinations_stats[:4]:
    print(top_entry)

(('all', '0.4'), {'avg_overall_accuracy': 0.517, 'avg_xor_precision': 0.3731, 'avg_xor_recall': 0.2655, 'avg_xor_f1': 0.2783, 'avg_and_f1': 0.0})
(('all', '1.0'), {'avg_overall_accuracy': 0.5116, 'avg_xor_precision': 0.2151, 'avg_xor_recall': 0.1599, 'avg_xor_f1': 0.16, 'avg_and_f1': 0.0})
(('all', '0.5'), {'avg_overall_accuracy': 0.5051, 'avg_xor_precision': 0.3499, 'avg_xor_recall': 0.2616, 'avg_xor_f1': 0.26, 'avg_and_f1': 0.0})
(('all', '0.75'), {'avg_overall_accuracy': 0.5022, 'avg_xor_precision': 0.2076, 'avg_xor_recall': 0.1321, 'avg_xor_f1': 0.1436, 'avg_and_f1': 0.0})


### C) Fold Difference

In [150]:
folds = ['1', '2', '3', '4', '5']

# ONLY EXAMPLE, CHOOSE EVERY OTHER PARAM FILTER -> SAME EFFECT
# The selection does not depend on the fold, so it is made once instead of once per fold
filtered_runs = fitler_runs(runs, labels="all", seed=45)

# Per fold: statistics of each metric's value in that fold across the selected runs
for fold_idx, fold in enumerate(folds):
    print(f" Fold {fold} ".center(100, '-'))

    for metric in metrics_interest:
        # value of this metric in the current fold, one per selected run
        values = [r[1][metric][fold_idx] for r in filtered_runs]
        mean = round(np.mean(values), 4) if values else float('nan')
        if len(values) > 1:
            variance = round(s.variance(values), 4)
            stdev = round(s.stdev(values), 4)
        else:
            # sample variance/stdev need at least two values (statistics raises otherwise)
            variance = stdev = float('nan')
        print(metric.ljust(16, ' '), end=' ')
        print(f"mean: {mean}".ljust(12, ' '), end=' ')
        print(f"| var: {variance}".ljust(15, ' '), end=' ')
        print(f"| stdev: {stdev}".ljust(16, ' '), end=' ')
        # only the first four values are shown to keep the line short
        print(f"| values ({len(values)}): {values[:4]} ...")

    print()

---------------------------------------------- Fold 1 ----------------------------------------------
overall_accuracy mean: 0.4225 | var: 0.0022   | stdev: 0.0471  | values (7): [0.318, 0.4318, 0.4243, 0.4461] ...
xor_precision    mean: 0.1377 | var: 0.0044   | stdev: 0.0662  | values (7): [0.0374, 0.1282, 0.0909, 0.1364] ...
xor_recall       mean: 0.2857 | var: 0.0353   | stdev: 0.1878  | values (7): [0.6364, 0.4091, 0.1591, 0.1591] ...
xor_f1           mean: 0.147  | var: 0.003    | stdev: 0.0551  | values (7): [0.0685, 0.1793, 0.097, 0.1364] ...
and_f1           mean: 0.001  | var: 0.0      | stdev: 0.0026  | values (7): [0.007, 0.0, 0.0, 0.0] ...

---------------------------------------------- Fold 2 ----------------------------------------------
overall_accuracy mean: 0.4646 | var: 0.0011   | stdev: 0.0329  | values (7): [0.442, 0.5194, 0.4493, 0.4547] ...
xor_precision    mean: 0.1334 | var: 0.023    | stdev: 0.1518  | values (7): [0.0708, 0.4379, 0.1979, 0.1364] ...
xor_recall  