In [1]:
# statsmodels is not compatible with allensdk, so these statistical tests are run in a separate environment without allensdk.
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
# Load the two result tables used by every cell below: one from 'f1.csv'
# and one from 'acy.csv', both read with pandas' default CSV settings.
df_f1, df_acy = (pd.read_csv(csv_name) for csv_name in ('f1.csv', 'acy.csv'))

In [3]:
# Single-factor OLS fit of `value` on `variable`, using only the rows of
# df_f1 where feature_n equals 40; the Type II ANOVA table is printed.
mod_one_f1 = ols(
    formula='value ~ variable',
    data=df_f1.loc[df_f1['feature_n'].eq(40)],
).fit()
print(sm.stats.anova_lm(mod_one_f1, typ=2))

            sum_sq     df          F        PR(>F)
variable  0.096958    2.0  97.454674  2.880042e-33
Residual  0.147743  297.0        NaN           NaN


In [4]:
# Single-factor OLS fit of `value` on `variable`, using only the rows of
# df_acy where feature_n equals 40; the Type II ANOVA table is printed.
mod_one_acy = ols(
    formula='value ~ variable',
    data=df_acy.loc[df_acy['feature_n'].eq(40)],
).fit()
print(sm.stats.anova_lm(mod_one_acy, typ=2))

            sum_sq     df          F        PR(>F)
variable  0.051947    2.0  64.175435  6.844706e-24
Residual  0.120204  297.0        NaN           NaN


In [5]:
# Two-factor OLS fit on the full df_f1 table: feature_n and variable are both
# wrapped in C(...) so they are treated as categorical, and their interaction
# term is included. The Type II ANOVA table is stored in table_f1 and printed.
model_f1 = ols(
    formula='value ~ C(feature_n) + C(variable) + C(feature_n):C(variable)',
    data=df_f1,
).fit()
table_f1 = sm.stats.anova_lm(model_f1, typ=2)
print(table_f1)

# Tukey HSD post-hoc comparison at alpha=0.05. Each row is assigned a group
# label of the form "<feature_n>-<variable>", so every feature_n/variable
# combination is compared against every other.
tukey_groups_f1 = (df_f1['feature_n'].map(str) + '-' + df_f1['variable']).to_numpy()
print(pairwise_tukeyhsd(endog=df_f1['value'], groups=tukey_groups_f1, alpha=0.05))

                            sum_sq     df           F        PR(>F)
C(feature_n)              0.004694    2.0    4.957415  7.226248e-03
C(variable)               0.238116    2.0  251.495351  2.527795e-87
C(feature_n):C(variable)  0.056458    4.0   29.814949  2.658727e-23
Residual                  0.421799  891.0         NaN           NaN
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2  meandiff p-adj   lower   upper  reject
--------------------------------------------------------
10.0-knn  10.0-rf   0.0311  0.001  0.0215  0.0406   True
10.0-knn 10.0-svm  -0.0166  0.001 -0.0261  -0.007   True
10.0-knn 20.0-knn  -0.0195  0.001  -0.029 -0.0099   True
10.0-knn  20.0-rf   0.0205  0.001  0.0109  0.0301   True
10.0-knn 20.0-svm  -0.0032    0.9 -0.0128  0.0064  False
10.0-knn 40.0-knn  -0.0212  0.001 -0.0307 -0.0116   True
10.0-knn  40.0-rf   0.0223  0.001  0.0127  0.0319   True
10.0-knn 40.0-svm   0.0067 0.4333 -0.0029  0.0162  False
 10.0-rf 10.0-svm  -0.0476  0.001

In [6]:
# Two-factor OLS fit on the full df_acy table: feature_n and variable are both
# wrapped in C(...) so they are treated as categorical, and their interaction
# term is included. The Type II ANOVA table is stored in table_acy and printed.
model_acy = ols(
    formula='value ~ C(feature_n) + C(variable) + C(feature_n):C(variable)',
    data=df_acy,
).fit()
table_acy = sm.stats.anova_lm(model_acy, typ=2)
print(table_acy)

# Tukey HSD post-hoc comparison at alpha=0.05. Each row is assigned a group
# label of the form "<feature_n>-<variable>", so every feature_n/variable
# combination is compared against every other.
tukey_groups_acy = (df_acy['feature_n'].map(str) + '-' + df_acy['variable']).to_numpy()
print(pairwise_tukeyhsd(endog=df_acy['value'], groups=tukey_groups_acy, alpha=0.05))

                            sum_sq     df           F        PR(>F)
C(feature_n)              0.005193    2.0    6.420760  1.703739e-03
C(variable)               0.114395    2.0  141.438987  4.502890e-54
C(feature_n):C(variable)  0.043930    4.0   27.157517  2.728613e-21
Residual                  0.360319  891.0         NaN           NaN
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2  meandiff p-adj   lower   upper  reject
--------------------------------------------------------
10.0-knn  10.0-rf   0.0187  0.001  0.0099  0.0276   True
10.0-knn 10.0-svm  -0.0177  0.001 -0.0265 -0.0088   True
10.0-knn 20.0-knn  -0.0181  0.001  -0.027 -0.0093   True
10.0-knn  20.0-rf     0.01 0.0136  0.0012  0.0188   True
10.0-knn 20.0-svm  -0.0071 0.2355 -0.0159  0.0017  False
10.0-knn 40.0-knn  -0.0182  0.001 -0.0271 -0.0094   True
10.0-knn  40.0-rf   0.0131  0.001  0.0043   0.022   True
10.0-knn 40.0-svm   0.0039    0.9  -0.005  0.0127  False
 10.0-rf 10.0-svm  -0.0364  0.001