In [53]:
import pandas as pd
import numpy as np
# Load the per-fold results keyed by the training script that produced them,
# and leave out the fold identifier so the aggregations below do not include it.
df = pd.read_csv('fold_results.csv', index_col="script").drop(columns=['fold'])

# Mean and sample standard deviation of every remaining column for each
# (script, model) pair, scaled by 100 (presumably fraction -> percent; confirm
# against how the accuracies were written to the CSV).
means = df.groupby(["script", "model"]).mean().mul(100)
stds = df.groupby(["script", "model"]).std().mul(100)

In [54]:
# Give the two aggregate tables distinct column names so they can be placed
# side by side without clashing.
means_renamed = means.rename(
    columns={'valid_accuracy': 'valid_mean', 'test_accuracy': 'test_mean'}
)
stds_renamed = stds.rename(
    columns={'valid_accuracy': 'valid_std', 'test_accuracy': 'test_std'}
)

# Align the tables on their shared (script, model) index and display the result.
merged = pd.merge(means_renamed, stds_renamed, left_index=True, right_index=True)
merged

Unnamed: 0_level_0,Unnamed: 1_level_0,valid_mean,test_mean,valid_std,test_std
script,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
train.py,KONet,96.170982,94.694377,1.178732,0.475919
train.py,conv_next,96.68736,94.718826,1.007166,0.693465
train.py,dense,96.035923,94.841076,1.248884,0.85049
train.py,efficient,96.334173,94.889976,1.027667,0.46747
train.py,mobilenet,93.618475,93.251834,2.375152,1.686864
train_distillation.py,conv_next,96.442721,94.449878,1.198579,0.823509
train_distillation.py,mobilenet,94.053037,93.740831,1.82846,1.076288
train_distillation_other.py,conv_next,99.511164,99.608802,0.540336,0.478007
train_distillation_other.py,mobilenet,98.832921,99.022005,1.23281,1.411614
train_incremental_early_stop.py,conv_next,90.746021,89.90146,1.605424,1.777613


In [55]:
from scipy.stats import ttest_ind

# Pairwise t-tests on validation accuracy between every pair of models that
# were trained by the same script. equal_var=False selects Welch's t-test,
# which does not assume equal variances in the two groups.
# NOTE(review): the fold column has been dropped, so samples are treated as
# independent, and no multiple-comparison correction is applied - confirm that
# this is the intended analysis.

# Dictionary to store t-test results, keyed by (script, model1, model2)
ttest_results = {}

# Significance threshold
alpha = 0.05

# Iterate over each script
for script in df.index.unique():
    # Slice this script's rows once instead of on every model pair. The list
    # key makes .loc return a DataFrame even when the script has a single row
    # (a scalar key would return a scalar for 'model' and break .unique()).
    script_rows = df.loc[[script]]
    # Models for this script, in order of first appearance
    models = script_rows['model'].unique()
    # Iterate over all unordered pairs of models
    for i, model1 in enumerate(models):
        # valid_accuracy of the first model does not depend on model2
        acc1 = script_rows.loc[script_rows['model'] == model1, 'valid_accuracy']
        for model2 in models[i+1:]:
            acc2 = script_rows.loc[script_rows['model'] == model2, 'valid_accuracy']
            # Perform t-test; t_stat > 0 means model1's mean is the higher one
            t_stat, p_val = ttest_ind(acc1, acc2, equal_var=False)
            # Check statistical significance
            significant = p_val < alpha
            # Store result
            ttest_results[(script, model1, model2)] = {
                't_stat': t_stat,
                'p_val': p_val,
                'significant': significant
            }

ttest_results

{('train.py', 'mobilenet', 'efficient'): {'t_stat': -3.318384459796801,
  'p_val': 0.005968273141131318,
  'significant': True},
 ('train.py', 'mobilenet', 'dense'): {'t_stat': -2.8487789977549793,
  'p_val': 0.013169196817909238,
  'significant': True},
 ('train.py', 'mobilenet', 'conv_next'): {'t_stat': -3.7616869082229716,
  'p_val': 0.002661947704820441,
  'significant': True},
 ('train.py', 'mobilenet', 'KONet'): {'t_stat': -3.0441486928748667,
  'p_val': 0.009278742354672494,
  'significant': True},
 ('train.py', 'efficient', 'dense'): {'t_stat': 0.5831463670334186,
  'p_val': 0.5672953147625032,
  'significant': False},
 ('train.py', 'efficient', 'conv_next'): {'t_stat': -0.7761918984858358,
  'p_val': 0.44771944341727654,
  'significant': False},
 ('train.py', 'efficient', 'KONet'): {'t_stat': 0.32999780654078703,
  'p_val': 0.745281346167938,
  'significant': False},
 ('train.py', 'dense', 'conv_next'): {'t_stat': -1.2839862722892514,
  'p_val': 0.21614802004750477,
  'signifi

In [None]:
# Compare validation accuracy between train.py and train_distillation.py for
# every model that was trained by both scripts.
scripts_of_interest = ['train.py', 'train_distillation.py']

# Slice each script's rows once. The list key makes .loc return a DataFrame
# even if a script has only a single row.
rows_train = df.loc[[scripts_of_interest[0]]]
rows_distill = df.loc[[scripts_of_interest[1]]]

models_train = set(rows_train['model'].unique())
models_distill = set(rows_distill['model'].unique())
common_models = models_train.intersection(models_distill)

# Dictionary to store t-test results for common models, keyed by model name
ttest_common = {}

# Iterate in sorted order so the result order does not depend on set
# iteration order, which can differ between kernel sessions.
for model in sorted(common_models):
    acc_train = rows_train.loc[rows_train['model'] == model, 'valid_accuracy']
    acc_distill = rows_distill.loc[rows_distill['model'] == model, 'valid_accuracy']
    # Welch's t-test (equal_var=False); t_stat > 0 means the first script's
    # mean validation accuracy is the higher one.
    t_stat, p_val = ttest_ind(acc_train, acc_distill, equal_var=False)
    significant = p_val < alpha
    ttest_common[model] = {
        't_stat': t_stat,
        'p_val': p_val,
        'significant': significant
    }
ttest_common

{'conv_next': {'t_stat': 0.494146825549851,
  'p_val': 0.6273543360528215,
  'significant': False},
 'mobilenet': {'t_stat': -0.4584602571433014,
  'p_val': 0.6524642423571612,
  'significant': False}}

In [None]:
# Compare validation accuracy between train_other.py and
# train_distillation_other.py for every model that was trained by both scripts.
scripts_of_interest = ['train_other.py', 'train_distillation_other.py']

# Slice each script's rows once. The list key makes .loc return a DataFrame
# even if a script has only a single row.
rows_train = df.loc[[scripts_of_interest[0]]]
rows_distill = df.loc[[scripts_of_interest[1]]]

models_train = set(rows_train['model'].unique())
models_distill = set(rows_distill['model'].unique())
common_models = models_train.intersection(models_distill)

# Dictionary to store t-test results for common models, keyed by model name
ttest_common = {}

# Iterate in sorted order so the result order does not depend on set
# iteration order, which can differ between kernel sessions.
for model in sorted(common_models):
    acc_train = rows_train.loc[rows_train['model'] == model, 'valid_accuracy']
    acc_distill = rows_distill.loc[rows_distill['model'] == model, 'valid_accuracy']
    # Welch's t-test (equal_var=False); t_stat > 0 means the first script's
    # mean validation accuracy is the higher one.
    t_stat, p_val = ttest_ind(acc_train, acc_distill, equal_var=False)
    significant = p_val < alpha
    ttest_common[model] = {
        't_stat': t_stat,
        'p_val': p_val,
        'significant': significant
    }
ttest_common

{'conv_next': {'t_stat': -0.6443232522043958,
  'p_val': 0.5283584265214332,
  'significant': False},
 'mobilenet': {'t_stat': 2.0344056249231786,
  'p_val': 0.06936747379891674,
  'significant': False}}

In [None]:
# Compare validation accuracy between train_incremental_ewc.py and
# train_incremental_early_stop.py for every model trained by both scripts.
# The *_train / *_distill variable names are kept from the other comparison
# cells: here *_train refers to the first script (train_incremental_ewc.py)
# and *_distill to the second (train_incremental_early_stop.py).
scripts_of_interest = ['train_incremental_ewc.py', 'train_incremental_early_stop.py']

# Slice each script's rows once. The list key makes .loc return a DataFrame
# even if a script has only a single row.
rows_first = df.loc[[scripts_of_interest[0]]]
rows_second = df.loc[[scripts_of_interest[1]]]

models_train = set(rows_first['model'].unique())
models_distill = set(rows_second['model'].unique())
common_models = models_train.intersection(models_distill)

# Dictionary to store t-test results for common models, keyed by model name
ttest_common = {}

# Iterate in sorted order so the result order does not depend on set
# iteration order, which can differ between kernel sessions.
for model in sorted(common_models):
    acc_train = rows_first.loc[rows_first['model'] == model, 'valid_accuracy']
    acc_distill = rows_second.loc[rows_second['model'] == model, 'valid_accuracy']
    # Welch's t-test (equal_var=False); t_stat > 0 means the first script's
    # mean validation accuracy is the higher one.
    t_stat, p_val = ttest_ind(acc_train, acc_distill, equal_var=False)
    significant = p_val < alpha
    ttest_common[model] = {
        't_stat': t_stat,
        'p_val': p_val,
        'significant': significant
    }
ttest_common

{'dense': {'t_stat': -5.4013757277156484,
  'p_val': 0.00011832536373437486,
  'significant': True},
 'conv_next': {'t_stat': -5.678392235061869,
  'p_val': 2.3406923342070926e-05,
  'significant': True},
 'mobilenet': {'t_stat': -3.6137356144056594,
  'p_val': 0.002034171917254026,
  'significant': True}}