In [1]:
import matplotlib.pyplot as plt
import scipy
import numpy as np
import time

# Experiment 1

In [2]:
# Integer index of each model group, keyed by the short name that also
# appears in the "trained{...}/eval{...}" result folders read below.
groups = {
    name: position
    for position, name in enumerate(
        ["SVC", "Trees", "NB", "Neighbors", "Linear", "DA", "nn", "Dummy"]
    )
}
# One label per group, in index order (the "nn" group is labelled "Network").
labels = [
    "SVC",
    "Trees",
    "NB",
    "Neighbors",
    "Linear",
    "DA",
    "Network",
    "Dummy",
]
# Not referenced in the visible cells; presumably the figure output folder.
# NOTE: this name shadows the built-in dir() for the rest of the session.
dir = "experiment1/finalgraphs"

def create_MAE_hist(mean_absolute_errors, train_label=None, eval_label=None):
    """Draw a density histogram of a sample with a Gaussian KDE on top.

    The plot is added to the current pyplot figure; the caller decides when
    to call ``plt.show()``.

    Parameters
    ----------
    mean_absolute_errors : array-like
        One-dimensional sample to plot.
    train_label, eval_label : str, optional
        Names inserted into the title. When left as ``None`` the module-level
        variables ``trained_on`` / ``eval_on`` are read instead, which is how
        the one-argument call has always behaved.

    Notes
    -----
    Log-spaced bins and a log x-axis are only used when every value is
    strictly positive. Taking ``log10`` of zero or negative values (e.g. a
    z-scored sample) yields ``-inf``/``nan`` bin edges, which either makes
    ``plt.hist`` fail or silently drops part of the data, so such input is
    plotted on a linear axis with equally spaced bins instead.
    """
    values = np.asarray(mean_absolute_errors, dtype=float)
    if train_label is None:
        train_label = trained_on
    if eval_label is None:
        eval_label = eval_on

    n_bins = 40
    lowest, highest = values.min(), values.max()
    use_log_axis = lowest > 0
    if use_log_axis:
        # Equal-width bins in log10 space, converted back to data units.
        bins = 10 ** np.linspace(np.log10(lowest), np.log10(highest), n_bins + 1)
    else:
        bins = np.linspace(lowest, highest, n_bins + 1)

    plt.hist(values, bins=bins, density=True)
    plt.title(f"Distribution of the Mean absolute error. Train:{train_label}, Eval:{eval_label}")

    kde = scipy.stats.gaussian_kde(values)
    x_vals = np.linspace(lowest, highest, 500)
    if use_log_axis:
        plt.xscale("log")
    plt.plot(x_vals, kde(x_vals), 'r-', linewidth=2, label='KDE')
    # Without a legend the 'KDE' label above is never displayed.
    plt.legend()

# Significance threshold for the Kolmogorov-Smirnov normality check.
KS_ALPHA = 0.05
# Values substituted into the "state{i}" folder names
# (presumably the random states of the runs - TODO confirm).
STATES = (1, 8, 42)

# The loop variables `trained_on` and `eval_on` must keep these names:
# create_MAE_hist() reads them as globals to build the plot title.
for trained_on in groups:
    for eval_on in groups:
        for i in STATES:
            file_dir = f"experiment1/trained{trained_on}/eval{eval_on}/state{i}/data_summary.txt"
            with open(file_dir, 'r') as file:
                lines = file.readlines()

            # SECURITY: eval() executes whatever the first line contains.
            # Only acceptable for files generated by this project itself;
            # if the line is a plain literal, ast.literal_eval or json would
            # be the safe replacement - TODO confirm the file format.
            population_raw = eval(lines[0])
            population_2 = scipy.stats.zscore(population_raw)
            # Sanity check of the standardisation: expect ~0 and ~1.
            print(f"{np.average(population_2)} {np.std(population_2)}")

            # NOTE(review): the sample was standardised with its own mean and
            # std, so the plain KS p-value against N(0, 1) is only approximate
            # (it tends to be too large); a Lilliefors-type test would account
            # for the estimated parameters.
            ks_test = scipy.stats.kstest(population_2, 'norm')
            if ks_test.pvalue > KS_ALPHA:
                # Plot the raw values: the histogram helper log-bins its
                # input, which is undefined for the negative z-scores.
                create_MAE_hist(population_raw)
                plt.show()
            else:
                print(f"{trained_on}, {eval_on} state{i} is not normally distributed. pvalue:{ks_test.pvalue}")
        
        

-1.5983287086790765e-16 1.0
SVC, SVC state1 is not normally distributed. pvalue:1.5902372990363732e-90
1.1844622228999586e-16 1.0
SVC, SVC state8 is not normally distributed. pvalue:1.1259573928139247e-56
8.200146979343338e-17 1.0
SVC, SVC state42 is not normally distributed. pvalue:1.4141489301094115e-98
1.63866358022829e-16 1.0
SVC, Trees state1 is not normally distributed. pvalue:7.695598606274047e-83
-1.8207373113647665e-17 1.0
SVC, Trees state8 is not normally distributed. pvalue:5.657347216360432e-153
-2.73110596704715e-17 1.0
SVC, Trees state42 is not normally distributed. pvalue:1.550111199687418e-163
8.094585734336981e-17 1.0
SVC, NB state1 is not normally distributed. pvalue:2.222792182520586e-99
3.5616177231082714e-17 1.0
SVC, NB state8 is not normally distributed. pvalue:2.259278319320787e-121
-2.5902674349878338e-17 1.0
SVC, NB state42 is not normally distributed. pvalue:2.1083227661845513e-136
1.1396034254372096e-16 1.0
SVC, Neighbors state1 is not normally distributed. p

# Experiment 2

In [3]:
# Integer index of each model group, keyed by the name that also appears
# in the "trained{...}/eval{...}" result folders read below.
groups = {
    name: position
    for position, name in enumerate(["LDA", "QDA", "Trees"])
}
# One label per group, in index order.
labels = [
    "LDA",
    "QDA",
    "Trees",
]
# Not referenced in the visible cells; presumably the figure output folder.
# NOTE: this name shadows the built-in dir() for the rest of the session.
dir = "experiment2/finalgraphs"

def create_MAE_hist(mean_absolute_errors, train_label=None, eval_label=None):
    """Draw a density histogram of a sample with a Gaussian KDE on top.

    The plot is added to the current pyplot figure; the caller decides when
    to call ``plt.show()``.

    Parameters
    ----------
    mean_absolute_errors : array-like
        One-dimensional sample to plot.
    train_label, eval_label : str, optional
        Names inserted into the title. When left as ``None`` the module-level
        variables ``trained_on`` / ``eval_on`` are read instead, which is how
        the one-argument call has always behaved.

    Notes
    -----
    Log-spaced bins and a log x-axis are only used when every value is
    strictly positive. Taking ``log10`` of zero or negative values (e.g. a
    z-scored sample) yields ``-inf``/``nan`` bin edges, which either makes
    ``plt.hist`` fail or silently drops part of the data, so such input is
    plotted on a linear axis with equally spaced bins instead.
    """
    values = np.asarray(mean_absolute_errors, dtype=float)
    if train_label is None:
        train_label = trained_on
    if eval_label is None:
        eval_label = eval_on

    n_bins = 40
    lowest, highest = values.min(), values.max()
    use_log_axis = lowest > 0
    if use_log_axis:
        # Equal-width bins in log10 space, converted back to data units.
        bins = 10 ** np.linspace(np.log10(lowest), np.log10(highest), n_bins + 1)
    else:
        bins = np.linspace(lowest, highest, n_bins + 1)

    plt.hist(values, bins=bins, density=True)
    plt.title(f"Distribution of the Mean absolute error. Train:{train_label}, Eval:{eval_label}")

    kde = scipy.stats.gaussian_kde(values)
    x_vals = np.linspace(lowest, highest, 500)
    if use_log_axis:
        plt.xscale("log")
    plt.plot(x_vals, kde(x_vals), 'r-', linewidth=2, label='KDE')
    # Without a legend the 'KDE' label above is never displayed.
    plt.legend()

# Significance threshold for the Kolmogorov-Smirnov normality check.
KS_ALPHA = 0.05
# Values substituted into the "state{i}augment" folder names
# (presumably the random states of the runs - TODO confirm).
STATES = (1, 8, 42)

# The loop variables `trained_on` and `eval_on` must keep these names:
# create_MAE_hist() reads them as globals to build the plot title.
for trained_on in groups:
    for eval_on in groups:
        for i in STATES:
            file_dir = f"experiment2/trained{trained_on}/eval{eval_on}/state{i}augment/data_summary.txt"
            with open(file_dir, 'r') as file:
                lines = file.readlines()

            # SECURITY: eval() executes whatever the first line contains.
            # Only acceptable for files generated by this project itself;
            # if the line is a plain literal, ast.literal_eval or json would
            # be the safe replacement - TODO confirm the file format.
            population_raw = eval(lines[0])
            population_2 = scipy.stats.zscore(population_raw)
            # Sanity check of the standardisation: expect avg ~0 and std ~1.
            print(f"transformed distribution stats: avg:{np.average(population_2)} std:{np.std(population_2)}")

            # NOTE(review): the sample was standardised with its own mean and
            # std, so the plain KS p-value against N(0, 1) is only approximate
            # (it tends to be too large); a Lilliefors-type test would account
            # for the estimated parameters.
            ks_test = scipy.stats.kstest(population_2, 'norm')
            if ks_test.pvalue > KS_ALPHA:
                # Plot the raw values: the histogram helper log-bins its
                # input, which is undefined for the negative z-scores.
                create_MAE_hist(population_raw)
                plt.show()
            else:
                print(f"{trained_on}, {eval_on} state{i} is not normally distributed. pvalue:{ks_test.pvalue}")

transformed distribution stats: avg:-5.973337650505246e-17 std:1.0
LDA, LDA state1 is not normally distributed. pvalue:2.3403341704536187e-69
transformed distribution stats: avg:5.366636977040032e-17 std:1.0
LDA, LDA state8 is not normally distributed. pvalue:9.836493699744073e-66
transformed distribution stats: avg:1.093984196705312e-17 std:1.0
LDA, LDA state42 is not normally distributed. pvalue:2.3612219785627784e-69
transformed distribution stats: avg:1.1705811132785835e-17 std:1.0
LDA, QDA state1 is not normally distributed. pvalue:8.277486654739258e-55
transformed distribution stats: avg:2.341162226557167e-17 std:1.0
LDA, QDA state8 is not normally distributed. pvalue:1.2193134821386554e-51
transformed distribution stats: avg:-5.852905566392917e-17 std:1.0
LDA, QDA state42 is not normally distributed. pvalue:2.7889132390329203e-46
transformed distribution stats: avg:2.0483294752853626e-16 std:1.0
LDA, Trees state1 is not normally distributed. pvalue:0.0
transformed distribution s

# Supplementary experiment

In [4]:
# Integer index of each model group, keyed by the name that also appears
# in the "trained{...}/eval{...}" result folders read below.
groups = {
    name: position
    for position, name in enumerate(["LDA", "QDA", "Trees"])
}
# One label per group, in index order.
labels = [
    "LDA",
    "QDA",
    "Trees",
]
# Not referenced in the visible cells; presumably the figure output folder.
# NOTE: this name shadows the built-in dir() for the rest of the session.
dir = "supplementary-exp/finalgraphs"

def create_MAE_hist(mean_absolute_errors, train_label=None, eval_label=None):
    """Draw a density histogram of a sample with a Gaussian KDE on top.

    The plot is added to the current pyplot figure; the caller decides when
    to call ``plt.show()``.

    Parameters
    ----------
    mean_absolute_errors : array-like
        One-dimensional sample to plot.
    train_label, eval_label : str, optional
        Names inserted into the title. When left as ``None`` the module-level
        variables ``trained_on`` / ``eval_on`` are read instead, which is how
        the one-argument call has always behaved.

    Notes
    -----
    Log-spaced bins and a log x-axis are only used when every value is
    strictly positive. Taking ``log10`` of zero or negative values (e.g. a
    z-scored sample) yields ``-inf``/``nan`` bin edges, which either makes
    ``plt.hist`` fail or silently drops part of the data, so such input is
    plotted on a linear axis with equally spaced bins instead.
    """
    values = np.asarray(mean_absolute_errors, dtype=float)
    if train_label is None:
        train_label = trained_on
    if eval_label is None:
        eval_label = eval_on

    n_bins = 40
    lowest, highest = values.min(), values.max()
    use_log_axis = lowest > 0
    if use_log_axis:
        # Equal-width bins in log10 space, converted back to data units.
        bins = 10 ** np.linspace(np.log10(lowest), np.log10(highest), n_bins + 1)
    else:
        bins = np.linspace(lowest, highest, n_bins + 1)

    plt.hist(values, bins=bins, density=True)
    plt.title(f"Distribution of the Mean absolute error. Train:{train_label}, Eval:{eval_label}")

    kde = scipy.stats.gaussian_kde(values)
    x_vals = np.linspace(lowest, highest, 500)
    if use_log_axis:
        plt.xscale("log")
    plt.plot(x_vals, kde(x_vals), 'r-', linewidth=2, label='KDE')
    # Without a legend the 'KDE' label above is never displayed.
    plt.legend()

# Significance threshold for the Kolmogorov-Smirnov normality check.
KS_ALPHA = 0.05
# Values substituted into the "state{i}{variant}" folder names
# (presumably the random states of the runs - TODO confirm).
STATES = (1, 8, 42)
# Folder-name suffixes; both variants are checked for every state.
VARIANTS = ("augment", "no_augment")

# The loop variables `trained_on` and `eval_on` must keep these names:
# create_MAE_hist() reads them as globals to build the plot title.
for trained_on in groups:
    for eval_on in groups:
        for i in STATES:
            for string in VARIANTS:
                file_dir = f"supplementary-exp/trained{trained_on}/eval{eval_on}/state{i}{string}/data_summary.txt"
                with open(file_dir, 'r') as file:
                    lines = file.readlines()

                # SECURITY: eval() executes whatever the first line contains.
                # Only acceptable for files generated by this project itself;
                # if the line is a plain literal, ast.literal_eval or json
                # would be the safe replacement - TODO confirm the format.
                population_raw = eval(lines[0])
                population_2 = scipy.stats.zscore(population_raw)
                # Sanity check of the standardisation: expect avg ~0, std ~1.
                print(f"transformed distribution stats: avg:{np.average(population_2)} std:{np.std(population_2)}")

                # NOTE(review): the sample was standardised with its own mean
                # and std, so the plain KS p-value against N(0, 1) is only
                # approximate (it tends to be too large); a Lilliefors-type
                # test would account for the estimated parameters.
                ks_test = scipy.stats.kstest(population_2, 'norm')
                if ks_test.pvalue > KS_ALPHA:
                    # Plot the raw values: the histogram helper log-bins its
                    # input, which is undefined for the negative z-scores.
                    create_MAE_hist(population_raw)
                    plt.show()
                else:
                    # Include the variant; otherwise the augment and
                    # no_augment runs print indistinguishable lines.
                    print(f"{trained_on}, {eval_on} state{i} ({string}) is not normally distributed. pvalue:{ks_test.pvalue}")

transformed distribution stats: avg:-5.973337650505246e-17 std:1.0
LDA, LDA state1 is not normally distributed. pvalue:2.3403341704536187e-69
transformed distribution stats: avg:6.257782300529305e-17 std:1.0
LDA, LDA state1 is not normally distributed. pvalue:7.022760793248524e-44
transformed distribution stats: avg:5.366636977040032e-17 std:1.0
LDA, LDA state8 is not normally distributed. pvalue:9.836493699744073e-66
transformed distribution stats: avg:3.219982186224019e-17 std:1.0
LDA, LDA state8 is not normally distributed. pvalue:3.323012211304682e-34
transformed distribution stats: avg:1.093984196705312e-17 std:1.0
LDA, LDA state42 is not normally distributed. pvalue:2.3612219785627784e-69
transformed distribution stats: avg:-5.4699209835265604e-17 std:0.9999999999999998
LDA, LDA state42 is not normally distributed. pvalue:5.448706194779753e-29
transformed distribution stats: avg:7.0234866796715e-17 std:1.0000000000000002
LDA, QDA state1 is not normally distributed. pvalue:3.74390

# Experiment 3

In [5]:
# Integer index of each model group, keyed by the name that also appears
# in the "trained{...}/eval{...}" result folders read below.
groups = {
    name: position
    for position, name in enumerate(
        ["LDA", "Sigmoid", "Centroid", "Gradient", "ExtraTree"]
    )
}
# One label per group, in index order.
labels = [
    "LDA",
    "Sigmoid",
    "Centroid",
    "Gradient",
    "ExtraTree",
]
# Not referenced in the visible cells; presumably the figure output folder.
# NOTE: this name shadows the built-in dir() for the rest of the session.
dir = "experiment3/finalgraphs"

def create_MAE_hist(mean_absolute_errors, train_label=None, eval_label=None):
    """Draw a density histogram of a sample with a Gaussian KDE on top.

    The plot is added to the current pyplot figure; the caller decides when
    to call ``plt.show()``.

    Parameters
    ----------
    mean_absolute_errors : array-like
        One-dimensional sample to plot.
    train_label, eval_label : str, optional
        Names inserted into the title. When left as ``None`` the module-level
        variables ``trained_on`` / ``eval_on`` are read instead, which is how
        the one-argument call has always behaved.

    Notes
    -----
    Log-spaced bins and a log x-axis are only used when every value is
    strictly positive. Taking ``log10`` of zero or negative values (e.g. a
    z-scored sample) yields ``-inf``/``nan`` bin edges, which either makes
    ``plt.hist`` fail or silently drops part of the data, so such input is
    plotted on a linear axis with equally spaced bins instead.
    """
    values = np.asarray(mean_absolute_errors, dtype=float)
    if train_label is None:
        train_label = trained_on
    if eval_label is None:
        eval_label = eval_on

    n_bins = 40
    lowest, highest = values.min(), values.max()
    use_log_axis = lowest > 0
    if use_log_axis:
        # Equal-width bins in log10 space, converted back to data units.
        bins = 10 ** np.linspace(np.log10(lowest), np.log10(highest), n_bins + 1)
    else:
        bins = np.linspace(lowest, highest, n_bins + 1)

    plt.hist(values, bins=bins, density=True)
    plt.title(f"Distribution of the Mean absolute error. Train:{train_label}, Eval:{eval_label}")

    kde = scipy.stats.gaussian_kde(values)
    x_vals = np.linspace(lowest, highest, 500)
    if use_log_axis:
        plt.xscale("log")
    plt.plot(x_vals, kde(x_vals), 'r-', linewidth=2, label='KDE')
    # Without a legend the 'KDE' label above is never displayed.
    plt.legend()

# Significance threshold for the Kolmogorov-Smirnov normality check.
KS_ALPHA = 0.05
# Values substituted into the "state{i}augment" folder names
# (presumably the random states of the runs - TODO confirm).
STATES = (1, 8, 42)

# The loop variables `trained_on` and `eval_on` must keep these names:
# create_MAE_hist() reads them as globals to build the plot title.
for trained_on in groups:
    for eval_on in groups:
        for i in STATES:
            file_dir = f"experiment3/trained{trained_on}/eval{eval_on}/state{i}augment/data_summary.txt"
            with open(file_dir, 'r') as file:
                lines = file.readlines()

            # SECURITY: eval() executes whatever the first line contains.
            # Only acceptable for files generated by this project itself;
            # if the line is a plain literal, ast.literal_eval or json would
            # be the safe replacement - TODO confirm the file format.
            population_raw = eval(lines[0])
            population_2 = scipy.stats.zscore(population_raw)
            # Sanity check of the standardisation: expect avg ~0 and std ~1.
            print(f"transformed distribution stats: avg:{np.average(population_2)} std:{np.std(population_2)}")

            # NOTE(review): the sample was standardised with its own mean and
            # std, so the plain KS p-value against N(0, 1) is only approximate
            # (it tends to be too large); a Lilliefors-type test would account
            # for the estimated parameters.
            ks_test = scipy.stats.kstest(population_2, 'norm')
            if ks_test.pvalue > KS_ALPHA:
                # Plot the raw values: the histogram helper log-bins its
                # input, which is undefined for the negative z-scores.
                create_MAE_hist(population_raw)
                plt.show()
            else:
                print(f"{trained_on}, {eval_on} state{i} is not normally distributed. pvalue:{ks_test.pvalue}")

transformed distribution stats: avg:-5.973337650505246e-17 std:1.0
LDA, LDA state1 is not normally distributed. pvalue:2.3403341704536187e-69
transformed distribution stats: avg:5.366636977040032e-17 std:1.0
LDA, LDA state8 is not normally distributed. pvalue:9.836493699744073e-66
transformed distribution stats: avg:1.093984196705312e-17 std:1.0
LDA, LDA state42 is not normally distributed. pvalue:2.3612219785627784e-69
transformed distribution stats: avg:1.1423516652091643e-17 std:0.9999999999999998
LDA, Sigmoid state1 is not normally distributed. pvalue:4.929749495913789e-55
transformed distribution stats: avg:-1.1423516652091642e-16 std:1.0
LDA, Sigmoid state8 is not normally distributed. pvalue:2.848597512404257e-48
transformed distribution stats: avg:3.427054995627493e-17 std:1.0
LDA, Sigmoid state42 is not normally distributed. pvalue:2.0462931197135983e-46
transformed distribution stats: avg:-1.1994692484696226e-16 std:1.0
LDA, Centroid state1 is not normally distributed. pvalue