# ANOVA Testing

## Variables:
- Size of model (tiny or base)
- Use of perspective transformation (prePT or PT)
- Use of HHA encoding
- Use of simulated data

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
import statsmodels.api as sm
from statsmodels.formula.api import ols
from glob import glob
import os

## Data Preparation

In [2]:
files = glob('./*.csv')
print(files)
df = pd.concat([pd.read_csv(fp).assign(Vars=os.path.basename(fp)[:-4]) for fp in files], ignore_index=True)
df.sample(5)

['.\\ViT_Base_HHA.csv', '.\\ViT_Base_HHA_PT.csv', '.\\ViT_Base_prePT.csv', '.\\ViT_Base_PT.csv', '.\\ViT_Base_Sim_HHA.csv', '.\\ViT_Base_Sim_HHA_PT.csv', '.\\ViT_Base_Sim_prePT.csv', '.\\ViT_Base_Sim_PT.csv', '.\\ViT_Tiny_HHA.csv', '.\\ViT_Tiny_HHA_PT.csv', '.\\ViT_Tiny_prePT.csv', '.\\ViT_Tiny_PT.csv', '.\\ViT_Tiny_Sim_HHA.csv', '.\\ViT_Tiny_Sim_HHA_PT.csv', '.\\ViT_Tiny_Sim_prePT.csv', '.\\ViT_Tiny_Sim_PT.csv']


Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC,Vars
250,172,12,21,944,0.987448,0.891192,0.934783,0.971279,0.912467,0.895665,ViT_Tiny_prePT
260,170,16,23,2421,0.993435,0.880829,0.913978,0.985171,0.897098,0.889287,ViT_Tiny_prePT
384,206,85,38,2763,0.970154,0.844262,0.707904,0.96022,0.770093,0.751984,ViT_Tiny_Sim_PT
64,211,10,33,2838,0.996489,0.864754,0.954751,0.986093,0.907527,0.901318,ViT_Base_prePT
173,157,25,48,3239,0.992341,0.765854,0.862637,0.978956,0.81137,0.801857,ViT_Base_Sim_prePT


In [3]:
df.describe()

Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,206.48,42.215,45.27,2753.4025,0.98415,0.819516,0.85449,0.970581,0.829584,0.817741
std,49.073007,51.627777,26.963302,1518.248402,0.019744,0.101534,0.102361,0.020183,0.088897,0.088936
min,31.0,0.0,5.0,944.0,0.841158,0.098101,0.361552,0.840181,0.178161,0.291292
25%,168.0,17.0,28.0,2435.75,0.982315,0.771325,0.835424,0.966723,0.794814,0.777987
50%,204.0,27.0,41.0,2504.0,0.990169,0.844082,0.883444,0.975209,0.851907,0.836716
75%,252.0,45.0,56.0,2833.25,0.994031,0.885246,0.916065,0.982516,0.885285,0.875028
max,300.0,406.0,285.0,32350.0,1.0,0.974093,1.0,0.997974,0.939086,0.934594


In [4]:
df.loc[df['Vars'].str.contains('Base'), 'model_size'] = 'Base'
df.loc[df['Vars'].str.contains('Tiny'), 'model_size'] = 'Tiny'
df.loc[df['Vars'].str.contains('HHA'), 'Encoding'] = 'HHA'
df.loc[df['Vars'].str.contains('HHA') == False, 'Encoding'] = 'Depth'
df.loc[df['Vars'].str.contains('prePT') | (df['Vars'].str.contains('PT') == False), 'Transformation'] = 'None'
df.loc[(df['Vars'].str.contains('prePT') == False) & df['Vars'].str.contains('PT'), 'Transformation'] = 'PT'
df.loc[df['Vars'].str.contains('Sim'), 'Simulated_Data'] = 'Sim'
df.loc[df['Vars'].str.contains('Sim') == False, 'Simulated_Data'] = 'None'

del df['Vars']

In [5]:
df.sample(5)

Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC,model_size,Encoding,Transformation,Simulated_Data
319,219,22,25,2826,0.992275,0.897541,0.908714,0.984799,0.903093,0.894866,Tiny,HHA,,Sim
320,177,15,16,2422,0.993845,0.917098,0.921875,0.988213,0.919481,0.913125,Tiny,HHA,,Sim
230,180,18,13,2419,0.992614,0.932642,0.909091,0.988213,0.920716,0.914438,Tiny,HHA,PT,
374,209,30,35,2818,0.989466,0.856557,0.874477,0.978978,0.865424,0.854078,Tiny,Depth,,Sim
361,263,44,53,2512,0.982786,0.832278,0.856678,0.966226,0.844302,0.825473,Tiny,Depth,,Sim


In [6]:
metrics = ['Specificity', 'Sensitivity', 'Precision', 'Accuracy', 'F1', 'MCC']
vars = ['model_size', 'Encoding', 'Transformation', 'Simulated_Data']
var_pairs = list(itertools.combinations(vars, 2))

for metric in metrics:
    print(f'Two-way ANOVA tests for {metric}:\n')
    for (var1, var2) in var_pairs:
        model = ols(f'{metric} ~ C({var1}) + C({var2}) + C({var1}):C({var2})', data=df).fit()

        result = sm.stats.anova_lm(model, type=2)

        print(result, '\n')

Two-way ANOVA tests for Specificity:

                              df    sum_sq   mean_sq         F    PR(>F)
C(model_size)                1.0  0.001828  0.001828  4.781173  0.029357
C(Encoding)                  1.0  0.002199  0.002199  5.752307  0.016929
C(model_size):C(Encoding)    1.0  0.000104  0.000104  0.273229  0.601466
Residual                   396.0  0.151406  0.000382       NaN       NaN 

                                    df    sum_sq   mean_sq         F    PR(>F)
C(model_size)                      1.0  0.001828  0.001828  4.758571  0.029740
C(Transformation)                  1.0  0.001578  0.001578  4.108112  0.043348
C(model_size):C(Transformation)    1.0  0.000006  0.000006  0.016916  0.896584
Residual                         396.0  0.152125  0.000384       NaN       NaN 

                                    df    sum_sq   mean_sq         F    PR(>F)
C(model_size)                      1.0  0.001828  0.001828  4.783325  0.029321
C(Simulated_Data)                  1.0  

In [7]:
metrics = ['Specificity', 'Sensitivity', 'Precision', 'Accuracy', 'F1', 'MCC']
vars = ['model_size', 'Encoding', 'Transformation', 'Simulated_Data']
var_pairs = list(itertools.combinations(vars, 2))
var_trips = list(itertools.combinations(vars, 3))

modelString = ''
for var in vars:
    modelString = modelString + f' + C({var})'

for (var1, var2) in var_pairs:
    modelString = modelString + f' + C({var1}):C({var2})'

for (var1, var2, var3) in var_trips:
    modelString = modelString + f' + C({var1}):C({var2}):C({var3})'

modelString = modelString + f' + C({vars[0]}):C({vars[1]}):C({vars[2]}):C({vars[3]})'

for metric in metrics:
    print(f'Multivariate ANOVA tests for {metric}:\n')
    model = ols(f'{metric} ~ {modelString}', data=df).fit()

    result = sm.stats.anova_lm(model, type=2)

    print(result, '\n')

Multivariate ANOVA tests for Specificity:

                                                       df    sum_sq   mean_sq  \
C(model_size)                                         1.0  0.001828  0.001828   
C(Encoding)                                           1.0  0.002199  0.002199   
C(Transformation)                                     1.0  0.001578  0.001578   
C(Simulated_Data)                                     1.0  0.001618  0.001618   
C(model_size):C(Encoding)                             1.0  0.000104  0.000104   
C(model_size):C(Transformation)                       1.0  0.000006  0.000006   
C(model_size):C(Simulated_Data)                       1.0  0.000754  0.000754   
C(Encoding):C(Transformation)                         1.0  0.000209  0.000209   
C(Encoding):C(Simulated_Data)                         1.0  0.000985  0.000985   
C(Transformation):C(Simulated_Data)                   1.0  0.000009  0.000009   
C(model_size):C(Encoding):C(Transformation)           1.0  0.00119