# ANOVA Testing

## Independant Variables:

- Size of model (tiny or base)
- Use of perspective transformation (prePT or PT)
- Use of HHA encoding
- Use of simulated data

## Dependant Variables (Metrics):

- Specificity
- Sensitivity
- Precision
- Accuracy
- F1-Score
- MCC

In [1]:
import pandas as pd
from tabulate import tabulate
import itertools
import statsmodels.api as sm
from statsmodels.multivariate.manova import MANOVA
from statsmodels.formula.api import ols
from glob import glob
import os

## Data Preparation

In [2]:
files = glob('./*.csv')
print(files)
df = pd.concat([pd.read_csv(fp).assign(Vars=os.path.basename(fp)[:-4]) for fp in files], ignore_index=True)
df.sample(5)

['.\\ViT_Base_HHA.csv', '.\\ViT_Base_HHA_PT.csv', '.\\ViT_Base_prePT.csv', '.\\ViT_Base_PT.csv', '.\\ViT_Base_Sim_HHA.csv', '.\\ViT_Base_Sim_HHA_PT.csv', '.\\ViT_Base_Sim_prePT.csv', '.\\ViT_Base_Sim_PT.csv', '.\\ViT_Tiny_HHA.csv', '.\\ViT_Tiny_HHA_PT.csv', '.\\ViT_Tiny_prePT.csv', '.\\ViT_Tiny_PT.csv', '.\\ViT_Tiny_Sim_HHA.csv', '.\\ViT_Tiny_Sim_HHA_PT.csv', '.\\ViT_Tiny_Sim_prePT.csv', '.\\ViT_Tiny_Sim_PT.csv']


Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC,Vars
297,265,55,37,2472,0.978235,0.877483,0.828125,0.96748,0.85209,0.83428,ViT_Tiny_PT
311,291,211,25,2345,0.917449,0.920886,0.579681,0.917827,0.711491,0.690742,ViT_Tiny_Sim_HHA
67,269,178,33,2349,0.929561,0.890728,0.60179,0.925415,0.718291,0.694462,ViT_Base_prePT
114,205,11,39,2837,0.996138,0.840164,0.949074,0.983829,0.891304,0.884518,ViT_Base_Sim_HHA
340,170,16,23,2421,0.993435,0.880829,0.913978,0.985171,0.897098,0.889287,ViT_Tiny_Sim_HHA_PT


In [3]:
df.describe()

Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,206.48,42.215,45.27,2753.4025,0.98415,0.819516,0.85449,0.970581,0.829584,0.817741
std,49.073007,51.627777,26.963302,1518.248402,0.019744,0.101534,0.102361,0.020183,0.088897,0.088936
min,31.0,0.0,5.0,944.0,0.841158,0.098101,0.361552,0.840181,0.178161,0.291292
25%,168.0,17.0,28.0,2435.75,0.982315,0.771325,0.835424,0.966723,0.794814,0.777987
50%,204.0,27.0,41.0,2504.0,0.990169,0.844082,0.883444,0.975209,0.851907,0.836716
75%,252.0,45.0,56.0,2833.25,0.994031,0.885246,0.916065,0.982516,0.885285,0.875028
max,300.0,406.0,285.0,32350.0,1.0,0.974093,1.0,0.997974,0.939086,0.934594


In [4]:
df.loc[df['Vars'].str.contains('Base'), 'model_size'] = 'Base'
df.loc[df['Vars'].str.contains('Tiny'), 'model_size'] = 'Tiny'
df.loc[df['Vars'].str.contains('HHA'), 'Encoding'] = 'HHA'
df.loc[df['Vars'].str.contains('HHA') == False, 'Encoding'] = 'Depth'
df.loc[df['Vars'].str.contains('prePT') | (df['Vars'].str.contains('PT') == False), 'Transformation'] = 'None'
df.loc[(df['Vars'].str.contains('prePT') == False) & df['Vars'].str.contains('PT'), 'Transformation'] = 'PT'
df.loc[df['Vars'].str.contains('Sim'), 'Simulated_Data'] = 'Sim'
df.loc[df['Vars'].str.contains('Sim') == False, 'Simulated_Data'] = 'None'

del df['Vars']

In [5]:
df.sample(5)

Unnamed: 0,TP,FP,FN,TN,Specificity,Sensitivity,Precision,Accuracy,F1,MCC,model_size,Encoding,Transformation,Simulated_Data
82,253,32,49,2495,0.987337,0.837748,0.887719,0.971368,0.86201,0.846494,Base,Depth,PT,
81,272,57,44,2499,0.9777,0.860759,0.826748,0.964833,0.843411,0.823824,Base,Depth,PT,
189,185,28,59,2820,0.990169,0.758197,0.868545,0.971863,0.809628,0.796651,Base,Depth,PT,Sim
262,235,108,67,2419,0.957262,0.778146,0.685131,0.938141,0.728682,0.695723,Tiny,Depth,,
256,229,43,87,2513,0.983177,0.724684,0.841912,0.954735,0.778912,0.756485,Tiny,Depth,,


## N-Way ANOVA

Where N is the number of independent variables.

We test each combination of the 4 independent variables on each of the 6 dependant variables separately

In [6]:
metrics = ['Specificity', 'Sensitivity', 'Precision', 'Accuracy', 'F1', 'MCC']
vars = ['model_size', 'Encoding', 'Transformation', 'Simulated_Data']

# Build right half of formula
independantString = ''
for i in range(1, len(vars) + 1):
    combs = list(itertools.combinations(vars, i))
    for comb in combs:
        if independantString != '':
            independantString += ' + '
        independantString_sub = ''
        for var in comb:
            if independantString_sub == '':
                independantString_sub = f'C({var})'
            else:
                independantString_sub += f':C({var})'
        independantString += independantString_sub

for metric in metrics:
    print(f'N-Way ANOVA tests for {metric}:\n')
    model = ols(f'{metric} ~ {independantString}', data=df).fit()

    result = sm.stats.anova_lm(model, type=2)

    print(tabulate(result, headers='keys', tablefmt='rst'), '\n')

N-Way ANOVA tests for Specificity:

..                                                               df       sum_sq      mean_sq            F       PR(>F)
C(model_size)                                                     1  0.00182802   0.00182802     4.91697      0.027179
C(Encoding)                                                       1  0.00219932   0.00219932     5.91568      0.0154628
C(Transformation)                                                 1  0.00157815   0.00157815     4.24486      0.0400426
C(Simulated_Data)                                                 1  0.00161818   0.00161818     4.35255      0.037613
C(model_size):C(Encoding)                                         1  0.000104466  0.000104466    0.28099      0.59636
C(model_size):C(Transformation)                                   1  6.49833e-06  6.49833e-06    0.0174791    0.894889
C(model_size):C(Simulated_Data)                                   1  0.00075372   0.00075372     2.02734      0.155303
C(Encoding

## Multivariate ANOVA (MANOVA)

We test each of the 4 independent variables separately on each combination of the 6 dependant variables (MANOVA)

In [7]:
metrics = ['Specificity', 'Sensitivity', 'Precision', 'Accuracy', 'F1', 'MCC']
vars = ['model_size', 'Encoding', 'Transformation', 'Simulated_Data']

for i in range(2, len(metrics) + 1):
    metric_combs = list(itertools.combinations(metrics, i))
    for comb in metric_combs:
        dependantString = ''
        for metric in comb:
            if dependantString != '':
                dependantString += ' + '
            dependantString += metric
        
        for var in vars:
            independantString = f'C({var})'
            maov = MANOVA.from_formula(f'{dependantString} ~ {independantString}', data=df)
            print(f'MANOVAs for {dependantString}:')
            result = maov.mv_test().summary_frame
            result.drop('Pillai\'s trace', level=1, axis=0, inplace=True)
            result.drop('Hotelling-Lawley trace', level=1, axis=0, inplace=True)
            result.drop('Roy\'s greatest root', level=1, axis=0, inplace=True)
            result = result.reset_index(level=[1])
            del result['Statistic']
            # result = result.iloc[:,0]
            # print(result)
            print(tabulate(result, headers='keys', tablefmt='rst'), '\n')
            # print(maov.mv_test(), '\n')


MANOVAs for Specificity + Sensitivity:
Effect               Value    Num DF    Den DF       F Value       Pr > F
Intercept      0.000726545         2       397  273012        0
C(model_size)  0.962463            2       397       7.74174  0.000503167

MANOVAs for Specificity + Sensitivity:
Effect           Value    Num DF    Den DF       F Value      Pr > F
Intercept    0.0007356         2       397  269649        0
C(Encoding)  0.973347          2       397       5.43547  0.00468973

MANOVAs for Specificity + Sensitivity:
Effect                   Value    Num DF    Den DF       F Value    Pr > F
Intercept          0.000743592         2       397  266749         0
C(Transformation)  0.989132            2       397       2.18108   0.11427

MANOVAs for Specificity + Sensitivity:
Effect                   Value    Num DF    Den DF      F Value     Pr > F
Intercept          0.000734683         2       397  269986       0
C(Simulated_Data)  0.987224            2       397       2.5689  0.077

## N-Way Multivariate ANOVA (N-Way MANOVA)

Where N is the number of independent variables.

We test each combination of the 4 independent variables on each combination of the 6 dependant variables (MANOVA)

In [8]:
metrics = ['Specificity', 'Sensitivity', 'Precision', 'Accuracy', 'F1', 'MCC']
vars = ['model_size', 'Encoding', 'Transformation', 'Simulated_Data']

# Build right half of formula
independantString = ''
for i in range(1, len(vars) + 1):
    var_combs = list(itertools.combinations(vars, i))
    for comb in var_combs:
        if independantString != '':
            independantString += ' + '
        independantString_sub = ''
        for var in comb:
            if independantString_sub == '':
                independantString_sub = f'C({var})'
            else:
                independantString_sub += f':C({var})'
        independantString += independantString_sub

for i in range(2, len(metrics) + 1):
    metric_combs = list(itertools.combinations(metrics, i))
    for comb in metric_combs:
        dependantString = ''
        for metric in comb:
            if dependantString != '':
                dependantString += ' + '
            dependantString += metric
        maov = MANOVA.from_formula(f'{dependantString} ~ {independantString}', data=df)
        print(f'MANOVAs for {dependantString}:')
        result = maov.mv_test().summary_frame
        result.drop('Pillai\'s trace', level=1, axis=0, inplace=True)
        result.drop('Hotelling-Lawley trace', level=1, axis=0, inplace=True)
        result.drop('Roy\'s greatest root', level=1, axis=0, inplace=True)
        result = result.reset_index(level=[1])
        del result['Statistic']
        # result = result.iloc[:,0]
        # print(result)
        print(tabulate(result, headers='keys', tablefmt='rst'), '\n')
        # print(maov.mv_test(), '\n')


MANOVAs for Specificity + Sensitivity:
Effect                                                              Value    Num DF    Den DF        F Value    Pr > F
Intercept                                                      0.00535939         2       383  35540.2        0
C(model_size)                                                  0.994931           2       383      0.975567   0.377913
C(Encoding)                                                    0.999747           2       383      0.0484159  0.952743
C(Transformation)                                              0.996837           2       383      0.607679   0.545138
C(Simulated_Data)                                              0.994282           2       383      1.10135    0.333474
C(model_size):C(Encoding)                                      0.999783           2       383      0.0415465  0.959309
C(model_size):C(Transformation)                                0.998975           2       383      0.196444   0.82173
C(model_size):C(S