In [43]:
import matplotlib.cm
import numpy as np
import pandas as pd
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import ttest_rel
from scipy.stats import sem
import pingouin as pg
from constants import (ACCURACIES_PATH, ACC_UNSHIFTED_PATH, COND_ACC_PATH, RTs_PATH)
from utils.utils_statistical_analysis import stats_preprocessing

# Importing data and formatting it
Imports all the accuracies and brings them into a table that can be used for statistical analysis. Also segments the data to analyse only certain blocks.

In [44]:
accuracies_data = pd.read_csv(ACCURACIES_PATH, header=None)
stats_data = stats_preprocessing(accuracies_data)
cond_acc = pd.read_csv(COND_ACC_PATH, header=None)

stats_data_pre = stats_data.loc[(stats_data['Run'] == 1) | (stats_data['Run'] == 2)]
stats_data_treatment = stats_data.loc[(stats_data['Run'] == 3) | (stats_data['Run'] == 4)]
stats_data_post = stats_data.loc[(stats_data['Run'] == 5) | (stats_data['Run'] == 6)]

stats_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Subject    360 non-null    category
 1   Treatment  360 non-null    category
 2   Run        360 non-null    category
 3   Accuracy   360 non-null    float64 
dtypes: category(3), float64(1)
memory usage: 5.6 KB


# ANOVA
calculates within-subjects ANOVA

In [46]:
model_1 = AnovaRM(data=stats_data, depvar='Accuracy', subject='Subject', within=['Run', 'Treatment']).fit()
print("\033[4m" + "Model 1" + "\033[0m")
print(model_1)
print("")

model_2 = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data)
print("\033[4m" + "Model 2" + "\033[0m")
print(model_2)
print("")

[4mModel 1[0m
                   Anova
              F Value Num DF  Den DF  Pr > F
--------------------------------------------
Run            1.1026 5.0000 145.0000 0.3617
Treatment      0.0274 1.0000  29.0000 0.8697
Run:Treatment  1.5101 5.0000 145.0000 0.1902


[4mModel 2[0m
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.039426      5    145  0.007885  1.102560  0.361671  \
1        Treatment  0.000278      1     29  0.000278  0.027378  0.869728   
2  Run * Treatment  0.073737      5    145  0.014747  1.510086  0.190185   

   p-GG-corr       ng2       eps  
0   0.360734  0.009930  0.915380  
1   0.869728  0.000071  1.000000  
2   0.205774  0.018413  0.771206  



# T-Test
Calculates a t-test comparing individual runs to each other

In [47]:
sham_4 = stats_data.query('Treatment == 1 and Run == 4')['Accuracy']
stim_4 = stats_data.query('Treatment == 2 and Run == 4')['Accuracy']
print(np.mean(sham_4))
print(np.mean(stim_4))
print("")

sham_4 = sham_4 * 100
stim_4 = stim_4 * 100

t_test_1 = pg.ttest(sham_4, stim_4, paired=True, alternative='less')
print("\033[4m" + "T-Test 1" + "\033[0m")
print(t_test_1)

t_test_2 = ttest_rel(sham_4, stim_4, alternative='less')
print("\033[4m" + "T-Test 2" + "\033[0m")
print(t_test_2)

0.6219806763285024
0.6737318840579711

[4mT-Test 1[0m
               T  dof alternative     p-val          CI95%   cohen-d   BF10   
T-test -1.883567   29        less  0.034843  [-inf, -0.51]  0.491072  1.843  \

           power  
T-test  0.836767  
[4mT-Test 2[0m
TtestResult(statistic=-1.8835673089658451, pvalue=0.03484343549304212, df=29)


# Performance Split
Splitting dataset into high and low performers and calculating ANOVA

In [48]:
subject_performance = stats_data.Accuracy.to_numpy()
subject_performance = np.mean(subject_performance.reshape(-1, 12), axis=1)
performance_index = np.argpartition(subject_performance, int(len(subject_performance)/2))
performance_index = performance_index + 1
performance_index = np.array_split(performance_index,2)

low_performers = stats_data[~(stats_data.Subject.isin(performance_index[1]))]
high_performers = stats_data[~(stats_data.Subject.isin(performance_index[0]))]

anova_low_performers = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=low_performers)
anova_high_performers = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=high_performers)

print("\033[4m" + "Low Performers" + "\033[0m")
print(np.mean(low_performers.Accuracy))
print(anova_low_performers)
print("")
print("\033[4m" + "High Performers" + "\033[0m")
print(np.mean(high_performers.Accuracy))
print(anova_high_performers)

[4mLow Performers[0m
0.5869382228077881
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.087998      5     70  0.017600  2.687691  0.027977  \
1        Treatment  0.004865      1     14  0.004865  0.396308  0.539140   
2  Run * Treatment  0.050712      5     70  0.010142  0.842745  0.524040   

   p-GG-corr       ng2       eps  
0   0.047477  0.050954  0.711995  
1   0.539140  0.002959  1.000000  
2   0.481618  0.030012  0.623106  

[4mHigh Performers[0m
0.6795857121944078
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.014059      5     70  0.002812  0.381437  0.859888  \
1        Treatment  0.008713      1     14  0.008713  1.112359  0.309432   
2  Run * Treatment  0.050929      5     70  0.010186  1.306564  0.271203   

   p-GG-corr       ng2       eps  
0   0.819099  0.009834  0.791784  
1   0.309432  0.006117  1.000000  
2   0.282029  0.034729  0.702572  


# Variance Split
Splitting dataset into subjects with high and low variance and calculating ANOVA

In [49]:
variances = accuracies_data.var(axis=1)
variance_index = np.argpartition(variances, int(len(variances)/2))
variance_index = variance_index + 1
variance_index = np.array_split(variance_index,2)

low_variance = stats_data[~(stats_data.Subject.isin(variance_index[0]))]
high_variance = stats_data[~(stats_data.Subject.isin(variance_index[1]))]

anova_low_variance = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=low_variance)
anova_high_variance = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=high_variance)

print("\033[4m" + "Low Variance" + "\033[0m")
print("Mean Accuracy = " + str(np.mean(low_variance.Accuracy)))
print("Mean Variance = " + str(np.mean(variances[variance_index[0] - 1])))
print(anova_low_variance)
print("")
print("\033[4m" + "High Variance" + "\033[0m")
print("Mean Accuracy = " + str(np.mean(high_variance.Accuracy)))
print("Mean Variance = " + str(np.mean(variances[variance_index[1] - 1])))
print(anova_high_variance)

[4mLow Variance[0m
Mean Accuracy = 0.6183254647928559
Mean Variance = 0.006048358705199442
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.034528      5     70  0.006906  0.858475  0.513453  \
1        Treatment  0.002086      1     14  0.002086  0.152938  0.701631   
2  Run * Treatment  0.124834      5     70  0.024967  1.843896  0.115490   

   p-GG-corr       ng2       eps  
0   0.485713  0.015260  0.720549  
1   0.701631  0.000935  1.000000  
2   0.146311  0.053053  0.670416  

[4mHigh Variance[0m
Mean Accuracy = 0.6481984702093397
Mean Variance = 0.011292980480732235
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.021745      5     70  0.004349  0.666033  0.650437  \
1        Treatment  0.004798      1     14  0.004798  0.690972  0.419786   
2  Run * Treatment  0.013619      5     70  0.002724  0.472503  0.795522   

   p-GG-corr       ng2       eps  
0   0.609315  0.013974  0.7

# Comparing day 1 to day 2
Using the unshifted data to test if there is a difference in the performance between day 1 and day 2 and to see if there is a learning effect

In [50]:
accuracies_unshifted_data = pd.read_csv(ACC_UNSHIFTED_PATH, header=None)
stats_data_unshifted = stats_preprocessing(accuracies_unshifted_data)

stats_data_day1 = stats_data.loc[(stats_data['Treatment'] == 1)]
stats_data_day2 = stats_data.loc[(stats_data['Treatment'] == 2)]

unshifted_anova = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data_unshifted)
anova_day1 = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_day1)
anova_day2 = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_day2)

print("\033[4m" + "ANOVA comparing day 1 and 2" + "\033[0m")
print(unshifted_anova)
print("")
print("\033[4m" + "ANOVA of day 1" + "\033[0m")
print(anova_day1)
print("")
print("\033[4m" + "ANOVA of day 2" + "\033[0m")
print(anova_day2)

[4mANOVA comparing day 1 and 2[0m
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.028988      5    155  0.005798  0.808349  0.545344  \
1        Treatment  0.001393      1     31  0.001393  0.140839  0.710004   
2  Run * Treatment  0.053952      5    155  0.010790  1.105476  0.359779   

   p-GG-corr       ng2       eps  
0   0.534290  0.006554  0.899613  
1   0.710004  0.000317  1.000000  
2   0.356783  0.012130  0.786865  

[4mANOVA of day 1[0m
  Source  ddof1  ddof2         F     p-unc       ng2       eps
0    Run      5    145  0.469302  0.798648  0.010026  0.793371

[4mANOVA of day 2[0m
  Source  ddof1  ddof2         F     p-unc       ng2       eps
0    Run      5    145  2.212851  0.056101  0.045327  0.872518


# Comparing Baseline to Stimulation Phase
Calculating the ANOVA comparing baseline trials with trial 3 and 4. First on both days, then only on the day of the actual stimulation.

In [51]:
#stats_data_short = stats_data.loc[(stats_data['Run'] == 1) | (stats_data['Run'] == 2) | (stats_data['Run'] == 3) | (stats_data['Run'] == 4)]
stats_data.Run = stats_data.Run.replace(2, 1)
anova_short = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data)

stats_data_treatment = stats_data.loc[(stats_data['Treatment'] == 2)]
anova_short_treatment = pg.rm_anova(dv='Accuracy', within=['Run'], subject='Subject', data=stats_data_treatment)

#tukey = pairwise_tukeyhsd(endog=stats_data_treatment['Accuracy'],
#                          groups=stats_data_treatment['Run'],
#                          alpha=0.05)

print("\033[4m" + "ANOVA comparing Baseline to run 3 and 4" + "\033[0m")
print(anova_short)
print("")
print("\033[4m" + "ANOVA comparing Baseline to run 3 and 4 in treatment condition" + "\033[0m")
print(anova_short_treatment)
print("")
#print("\033[4m" + "Pairwise comparisons" + "\033[0m")
#print(tukey)

[4mANOVA comparing Baseline to run 3 and 4[0m
            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.030176      4    116  0.007544  1.218305  0.306869  \
1        Treatment  0.002025      1     29  0.002025  0.232004  0.633657   
2  Run * Treatment  0.045915      4    116  0.011479  1.231386  0.301421   

   p-GG-corr       ng2       eps  
0   0.307684  0.009466  0.901629  
1   0.633657  0.000641  1.000000  
2   0.303237  0.014333  0.760820  

[4mANOVA comparing Baseline to run 3 and 4 in treatment condition[0m
  Source  ddof1  ddof2         F     p-unc       ng2       eps
0    Run      4    116  2.242153  0.068702  0.040615  0.841719



# Comparing Sham to Stimulation Phase
Calculating ANOVA comparing run 3 and 4 of the sham day with run 3 and 4 of the stimulation day

In [52]:
stats_data_sham_stim = stats_data.loc[(stats_data['Run'] == 3) | (stats_data['Run'] == 4)]
stats_data_sham_stim.loc[(stats_data['Treatment'] == 1)].Run = stats_data_sham_stim.loc[(stats_data['Treatment'] == 1)].Run.replace(4, 3)

anova_sham_stim = pg.rm_anova(dv='Accuracy', within=['Run', 'Treatment'], subject='Subject', data=stats_data_sham_stim)
print(anova_sham_stim)

            Source        SS  ddof1  ddof2        MS         F     p-unc   
0              Run  0.025530      1     29  0.025530  4.092442  0.052381  \
1        Treatment  0.010519      1     29  0.010519  0.937197  0.341005   
2  Run * Treatment  0.032721      1     29  0.032721  3.738896  0.062974   

   p-GG-corr       ng2  eps  
0   0.052381  0.018259  1.0  
1   0.341005  0.007605  1.0  
2   0.062974  0.023282  1.0  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stats_data_sham_stim.loc[(stats_data['Treatment'] == 1)].Run = stats_data_sham_stim.loc[(stats_data['Treatment'] == 1)].Run.replace(4, 3)


In [53]:
print(stats_data_sham_stim.loc[(stats_data['Treatment'] == 1)].Run)

2      3
3      4
14     3
15     4
26     3
27     4
38     3
39     4
50     3
51     4
62     3
63     4
74     3
75     4
86     3
87     4
98     3
99     4
110    3
111    4
122    3
123    4
134    3
135    4
146    3
147    4
158    3
159    4
170    3
171    4
182    3
183    4
194    3
195    4
206    3
207    4
218    3
219    4
230    3
231    4
242    3
243    4
254    3
255    4
266    3
267    4
278    3
279    4
290    3
291    4
302    3
303    4
314    3
315    4
326    3
327    4
338    3
339    4
350    3
351    4
Name: Run, dtype: category
Categories (5, int64): [1, 3, 4, 5, 6]


# Descriptives on individual subjects
General statistics of individual subjects needed for the results section

In [54]:
# get mean and SD of single participant
subject = 26
print(accuracies_data.iloc[subject].mean())
print(accuracies_data.iloc[subject].std())

# get mean and SD of all participants
print(accuracies_data.stack().mean())
print(accuracies_data.stack().std())
print(sem(accuracies_data.stack()))

# get mean and SD of day 1 or day 2
# print(stats_data_day1["Accuracy"].mean())
# print(stats_data_day2["Accuracy"].mean())
# print(stats_data_day1["Accuracy"].std())
# print(stats_data_day2["Accuracy"].std())

# age of the participants
# age = [28, 24, 23, 23, 23, 28, 23, 26, 25, 24, 27, 27, 23, 26, 21, 25, 24, 36, 24, 26, 26, 26, 29, 34, 24, 22, 34, 25, 28, 26, 25, 26]
# print(len(age))
# print(np.mean(age))
# print(np.median(age))
# print(np.std(age))
# print(min(age))
# print(max(age))

0.7291666666666665
0.12115292114938993
0.6332619675010978
0.10614001730855123
0.005594070093078611


Comparing accuracy for remember stimulus 1 and remember stimulus 2

In [55]:
print(np.mean(cond_acc.loc[0]))
print(np.mean(cond_acc.loc[1]))
print(np.std(cond_acc.loc[0]))
print(np.std(cond_acc.loc[1]))

cond_ttest = pg.ttest(cond_acc.loc[0], cond_acc.loc[1], paired=False)
print("\033[4m" + "Conditional T-Test" + "\033[0m")
print(t_test_1)

0.6335468219668099
0.6330849320361348
0.06394476211463561
0.06034410459973374
[4mConditional T-Test[0m
               T  dof alternative     p-val          CI95%   cohen-d   BF10   
T-test -1.883567   29        less  0.034843  [-inf, -0.51]  0.491072  1.843  \

           power  
T-test  0.836767  
