In [28]:
import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel
import pingouin as pg
from constants import (ACCURACIES_PATH, ACC_UNSHIFTED_PATH)
from utils.utils_statistical_analysis import stats_preprocessing

# Importing data and formatting it
Imports all accuracies and arranges them in a table that can be used for statistical analysis. The optional segmentation of the data into pre, treatment and post runs is currently commented out.

In [13]:
# Read the raw accuracies. The CSV has no header row, so header=None makes
# pandas treat the first line as data instead of column names.
accuracies_data = pd.read_csv(ACCURACIES_PATH, header=None)
# Convert the raw table into the analysis frame. Later cells read its
# 'Subject', 'Run', 'Treatment' and 'Accuracy' columns.
stats_data = stats_preprocessing(accuracies_data)

# Optional segmentation by run (disabled): runs 1-2 -> pre, runs 3-4 -> treatment, runs 5-6 -> post.
#stats_data_pre = stats_data.loc[(stats_data['Run'] == 1) | (stats_data['Run'] == 2)]
#stats_data_treatment = stats_data.loc[(stats_data['Run'] == 3) | (stats_data['Run'] == 4)]
#stats_data_post = stats_data.loc[(stats_data['Run'] == 5) | (stats_data['Run'] == 6)]

# Uncomment to inspect column dtypes and non-null counts.
#stats_data.info()

# Output
Exports the data for use in other programs (the export is currently disabled).

In [None]:
# CSV export, currently disabled.
# NOTE: stats_data_treatment, stats_data_pre and stats_data_post only exist if the
# corresponding (also commented-out) segmentation lines in the loading cell are re-enabled.
# np.savetxt("data/stats_data.csv", stats_data, delimiter=",")
# np.savetxt("data/stats_data_treatment.csv", stats_data_treatment, delimiter=",")
# np.savetxt("data/stats_data_pre.csv", stats_data_pre, delimiter=",")
# np.savetxt("data/stats_data_post.csv", stats_data_post, delimiter=",")

# ANOVA
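Calculates a within-subjects (repeated-measures) ANOVA on accuracy with the factors Run and Treatment, once with statsmodels (Model 1) and once with pingouin (Model 2).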

In [None]:
# Model 1: two-way repeated-measures ANOVA (Run x Treatment) on Accuracy,
# fitted with statsmodels' AnovaRM.
model_1 = AnovaRM(
    data=stats_data,
    depvar='Accuracy',
    subject='Subject',
    within=['Run', 'Treatment'],
).fit()
print("\033[4mModel 1\033[0m")  # ANSI escape codes: underline on / reset
print(model_1)
print()

# Model 2: the same design fitted with pingouin's rm_anova.
model_2 = pg.rm_anova(
    data=stats_data,
    dv='Accuracy',
    within=['Run', 'Treatment'],
    subject='Subject',
)
print("\033[4mModel 2\033[0m")
print(model_2)
print()

# T-Test
Calculates paired t-tests comparing the two treatment conditions within a single run (run 6).

In [None]:
# Accuracy values of run 6 for Treatment 1 and Treatment 2.
# NOTE(review): the variable names carry the suffix "_4" although the filter
# selects Run == 6, and they imply Treatment 1 = sham / 2 = stimulation -- confirm both.
sham_4 = stats_data.loc[(stats_data['Treatment'] == 1) & (stats_data['Run'] == 6), 'Accuracy']
stim_4 = stats_data.loc[(stats_data['Treatment'] == 2) & (stats_data['Run'] == 6), 'Accuracy']
print(sham_4.mean())
print(stim_4.mean())
print()

# Paired t-test with pingouin; no alternative is passed, so the library default applies.
# NOTE(review): both paired tests match observations by position, which presumably
# requires the two selections to list the subjects in the same order -- verify.
t_test_1 = pg.ttest(sham_4, stim_4, paired=True)
print("\033[4mT-Test 1\033[0m")
print(t_test_1)

# Paired t-test with scipy, one-sided: the alternative hypothesis is that the
# first sample (sham_4) has a smaller mean than the second (stim_4).
t_test_2 = ttest_rel(sham_4, stim_4, alternative='less')
print("\033[4mT-Test 2\033[0m")
print(t_test_2)

# Performance Split
Splits the subjects into low and high performers by their mean accuracy (median split) and calculates a repeated-measures ANOVA for each group.

In [33]:
# Mean accuracy per subject. Grouping on the Subject column (instead of
# reshaping the flat Accuracy column into rows of 12 and adding 1 to the row
# index) keeps the split correct even if the rows are not sorted by subject,
# a subject has a different number of rows, or the ids are not exactly 1..N.
subject_means = stats_data.groupby('Subject')['Accuracy'].mean()
# Kept as a plain array, ordered by subject id, as before.
subject_performance = subject_means.to_numpy()

# Median split on the actual subject ids, ordered from lowest to highest mean
# accuracy: performance_index[0] holds the low performers, performance_index[1]
# the high performers. With an odd number of subjects np.array_split puts the
# extra subject into the first (low) half.
performance_index = np.array_split(
    subject_means.sort_values(kind='stable').index.to_numpy(), 2
)

# Select each group directly by membership, so every subject ends up in exactly one group.
low_performers = stats_data[stats_data.Subject.isin(performance_index[0])]
high_performers = stats_data[stats_data.Subject.isin(performance_index[1])]

# Two-way repeated-measures ANOVA (Run x Treatment) within each performance group.
anova_low_performers = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=low_performers)
anova_high_performers = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=high_performers)

# Report the group mean accuracy followed by the ANOVA table for each group.
print("\033[4m" + "Low Performers" + "\033[0m")
print(np.mean(low_performers.Accuracy))
print(anova_low_performers)
print("")
print("\033[4m" + "High Performers" + "\033[0m")
print(np.mean(high_performers.Accuracy))
print(anova_high_performers)

[4mLow Performers[0m
0.5751168272397891
            Source        SS  ddof1  ddof2        MS         F     p-unc  \
0              Run  0.052037      5     75  0.010407  1.544898  0.186236   
1        Treatment  0.001240      1     15  0.001240  0.122413  0.731296   
2  Run * Treatment  0.019499      5     75  0.003900  0.322364  0.898046   

   p-GG-corr       ng2       eps  
0   0.208021  0.028173  0.704322  
1   0.731296  0.000690  1.000000  
2   0.846930  0.010746  0.734675  

[4mHigh Performers[0m
0.6768251468489238
            Source        SS  ddof1  ddof2        MS         F     p-unc  \
0              Run  0.011203      5     75  0.002241  0.298189  0.912426   
1        Treatment  0.013792      1     15  0.013792  1.778969  0.202176   
2  Run * Treatment  0.066757      5     75  0.013351  1.742729  0.135251   

   p-GG-corr       ng2       eps  
0   0.878480  0.007314  0.802277  
1   0.202176  0.008989  1.000000  
2   0.158664  0.042056  0.733412  


# Comparing day 1 to day 2
Uses the unshifted data to test whether performance differs between day 1 and day 2 and whether there is a learning effect.

In [None]:
# Load the unshifted accuracies (CSV without header row) and bring them into
# the same analysis layout as stats_data.
accuracies_unshifted_data = pd.read_csv(ACC_UNSHIFTED_PATH, header=None)
stats_data_unshifted = stats_preprocessing(accuracies_unshifted_data)

# Per-day subsets, taken from the UNSHIFTED frame. They were previously sliced
# from stats_data, so the "day 1" / "day 2" ANOVAs below were actually run on
# the two treatment conditions of the shifted data.
# NOTE(review): presumably the Treatment column of the unshifted frame encodes
# the day (1 = day 1, 2 = day 2) -- confirm against stats_preprocessing.
stats_data_day1 = stats_data_unshifted.loc[stats_data_unshifted['Treatment'] == 1]
stats_data_day2 = stats_data_unshifted.loc[stats_data_unshifted['Treatment'] == 2]

# Two-way ANOVA across both days, then a one-way ANOVA over Run within each day.
unshifted_anova = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data_unshifted)
anova_day1 = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_day1)
anova_day2 = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_day2)

print("\033[4m" + "ANOVA comparing day 1 and 2" + "\033[0m")
print(unshifted_anova)
print("")
print("\033[4m" + "ANOVA of day 1" + "\033[0m")
print(anova_day1)
print("")
print("\033[4m" + "ANOVA of day 2" + "\033[0m")
print(anova_day2)

# Comparing Baseline to Stimulation Phase
Calculates the ANOVA comparing the baseline runs (runs 1 and 2, pooled into one level) with runs 3 and 4. First on both days, then only on the day of the actual stimulation.

In [32]:
# Keep runs 1-4 and pool the two baseline runs by relabelling run 2 as run 1.
# .assign builds a new, independent frame, which avoids the
# SettingWithCopyWarning raised by assigning to a column of a .loc slice.
stats_data_short = (
    stats_data
    .loc[stats_data['Run'].isin([1, 2, 3, 4])]
    .assign(Run=lambda frame: frame['Run'].replace(2, 1))
)
# Two-way repeated-measures ANOVA (Run x Treatment) over baseline, run 3 and run 4.
anova_short = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data_short)

# Same comparison restricted to Treatment == 2 (one-way ANOVA over Run).
stats_data_short_treatment = stats_data_short.loc[(stats_data_short['Treatment'] == 2)]
anova_short_treatment = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_short_treatment)

# Pairwise follow-up between the run levels.
# NOTE(review): Tukey HSD compares the groups as independent samples and is fed
# the raw rows, so the pooled baseline level contributes the rows of both run 1
# and run 2 -- confirm this is the intended post-hoc test for a
# repeated-measures design.
tukey = pairwise_tukeyhsd(endog=stats_data_short_treatment['Accuracy'],
                          groups=stats_data_short_treatment['Run'],
                          alpha=0.05)

print("\033[4m" + "ANOVA comparing Baseline to run 3 and 4" + "\033[0m")
print(anova_short)
print("")
print("\033[4m" + "ANOVA comparing Baseline to run 3 and 4 in treatment condition" + "\033[0m")
print(anova_short_treatment)
print("")
print("\033[4m" + "Pairwise comparisons" + "\033[0m")
print(tukey)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_data_short.Run = stats_data_short.Run.replace(2, 1)


[4mANOVA comparing Baseline to run 3 and 4[0m
            Source        SS  ddof1  ddof2        MS         F     p-unc  \
0              Run  0.019604      2     62  0.009802  1.875857  0.161817   
1        Treatment  0.006703      1     31  0.006703  0.747643  0.393865   
2  Run * Treatment  0.034661      2     62  0.017331  2.250362  0.113900   

   p-GG-corr       ng2       eps  
0   0.163623  0.010137  0.960784  
1   0.393865  0.003490  1.000000  
2   0.117997  0.017785  0.931006  

[4mANOVA comparing Baseline to run 3 and 4 in treatment condition[0m
  Source  ddof1  ddof2         F     p-unc       ng2       eps
0    Run      2     62  3.244521  0.045697  0.051021  0.894647

[4mPairwise comparisons[0m
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     1      3  -0.0126   0.85 -0.0674 0.0423  False
     1      4   0.0405   0.19 -0.0143 0.0954  False
     3      4   0.0