In [142]:
import pandas as pd
import plotly.express as px
from scipy.stats import ttest_ind, linregress, f_oneway
import numpy as np

In [61]:
dfs = []
per_training = []
participants = ['esther', 'aaron', 'tianyi', 'vilem', 'stephanie', 'loran', 'klemen', 'liv', 'maxine', 'davidh', 'hidde', 'kyriakos', 'tim', 'ethelbert', 'davidk']
for i in range(15):
    df_train = pd.read_csv(f'subject-{i}-{participants[i]}.csv')
    df_train.fact_id = df_train.fact_id.apply(lambda x: str(x)+f'_{i}')
    df_train = df_train.groupby('fact_id').max('trial').reset_index().loc[:,['fact_id', 'alpha']]
    df_test = pd.read_csv(f'subject-{i}-test.csv')
    df_test.fact_id = df_test.fact_id.apply(lambda x: str(x)+f'_{i}')
    df_test = df_test.loc[:,['fact_id', 'correct', 'answer']]
    df = pd.merge(df_test, df_train, on='fact_id', how='outer')
    df['unseen'] = df.alpha.apply(lambda x: not x==x)
    df['participant'] = i
    dfs.append(df)
df = pd.concat(dfs)
df

Unnamed: 0,fact_id,correct,answer,alpha,unseen,participant
0,17_0,False,mozart,,True,0
1,13_0,False,chopin,,True,0
2,3_0,False,bach,,True,0
3,21_0,True,vivaldi,0.355469,False,0
4,11_0,False,chopin,,True,0
...,...,...,...,...,...,...
20,11_14,False,vivaldi,,True,14
21,1_14,False,mozart,,True,14
22,8_14,True,chopin,0.808984,False,14
23,6_14,False,vivaldi,,True,14


In [84]:
df_seen = df[df.unseen==False]
df_seen

Unnamed: 0,fact_id,correct,answer,alpha,unseen,participant
3,21_0,True,vivaldi,0.355469,False,0
5,14_0,True,chopin,0.510547,False,0
11,7_0,False,beethoven,0.816406,False,0
17,1_0,True,bach,0.907813,False,0
19,4_0,True,bach,0.501172,False,0
...,...,...,...,...,...,...
12,21_14,True,mozart,0.583203,False,14
15,20_14,False,vivaldi,0.393750,False,14
16,2_14,True,bach,0.317578,False,14
18,5_14,True,bach,0.349609,False,14


In [63]:
fig = px.box(x=df_seen.correct, y=df_seen.alpha, width=400, height=400)
fig.show()

In [116]:
ttest_ind(df_seen.alpha[df_seen.correct == False], df_seen.alpha[df_seen.correct == True])

Ttest_indResult(statistic=-0.836596854565186, pvalue=0.40417713471104344)

In [141]:
for name, group in df_seen.groupby('answer'):
    print(name)
    print(ttest_ind(group.alpha[group.correct == False], group.alpha[group.correct == True]))

bach
Ttest_indResult(statistic=-0.7474410692960464, pvalue=0.46042977765794313)
beethoven
Ttest_indResult(statistic=0.9564976987970177, pvalue=0.3473074460880724)
chopin
Ttest_indResult(statistic=-1.2814052956896231, pvalue=0.20955381828133854)
mozart
Ttest_indResult(statistic=-0.4074378388991237, pvalue=0.6874538414149385)
vivaldi
Ttest_indResult(statistic=0.9305730670409019, pvalue=0.360320788107538)


In [166]:
df_composer_frequency = (df_seen.groupby(['participant', 'answer']).count()/5).iloc[:,0]
#df_composer_frequency = pd.pivot(df_composer_frequency, 'answe)
df_composer_frequency = df_composer_frequency.unstack().fillna(0).reset_index(drop=True,axis=2)
df_composer_frequency

TypeError: reset_index() got an unexpected keyword argument 'axis'

In [135]:
correct_unseen_facts_same_composer = []
for fact_id, correct, answer, alpha, unseen, participant in df_seen.values:
    correct_rate = df[(df.participant == participant) & (df.fact_id != fact_id) & (df.answer == answer) & (df.unseen == True)].groupby('participant').mean('correct').correct.values
    if len(correct_rate) > 0:
        correct_rate = correct_rate[0]
    else:
        correct_rate = np.nan
    correct_unseen_facts_same_composer.append(correct_rate)
df_generalized = df_seen.copy()
df_generalized['generalized'] = correct_unseen_facts_same_composer
df_generalized['generalized'] = df_generalized['generalized'].astype(float)
df_generalized = df_generalized.reset_index(drop=True)
df_generalized = df_generalized.dropna().reset_index(drop=True)
df_generalized = df_generalized.groupby(['participant', 'answer']).mean()
df_generalized

Unnamed: 0_level_0,Unnamed: 1_level_0,correct,alpha,unseen,generalized
participant,answer,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,bach,1.0,0.704492,0.0,0.000000
0,beethoven,0.0,0.816406,0.0,0.500000
0,chopin,1.0,0.510547,0.0,0.000000
0,mozart,1.0,0.300391,0.0,0.500000
0,vivaldi,1.0,0.355469,0.0,0.750000
...,...,...,...,...,...
13,vivaldi,0.0,0.609766,0.0,0.250000
14,bach,1.0,0.333594,0.0,0.333333
14,chopin,1.0,0.799805,0.0,0.333333
14,mozart,1.0,0.583203,0.0,0.500000


In [136]:
fig = px.scatter(x=df_generalized.generalized, y=df_generalized.alpha, width=400, height=400, labels={'x': 'accuracy on unseen facts same composer', 'y': 'alpha'})
fig.show()

In [137]:
linregress(df_generalized.generalized, df_generalized.alpha)

LinregressResult(slope=-0.09145396062540452, intercept=0.5076947201390319, rvalue=-0.206021864956045, pvalue=0.09699363384223988, stderr=0.05429765569985804, intercept_stderr=0.02623062486291233)

In [140]:
for name, group in df_generalized.groupby('answer'):
    print(name)
    print(linregress(group.generalized, group.alpha))

bach
LinregressResult(slope=-0.29329902296228727, intercept=0.6465272233627334, rvalue=-0.714346645482943, pvalue=0.006081959044915277, stderr=0.08663118385090231, intercept_stderr=0.04771945718906409)
beethoven
LinregressResult(slope=0.05333022595725731, intercept=0.4203044416926387, rvalue=0.11463816747243759, pvalue=0.7092085497635632, stderr=0.1393398052019736, intercept_stderr=0.06024989564396476)
chopin
LinregressResult(slope=-0.07766617859435179, intercept=0.5825661237697904, rvalue=-0.21242993310075048, pvalue=0.5074276899898315, stderr=0.11297676906428503, intercept_stderr=0.06545325941311264)
mozart
LinregressResult(slope=-0.12374253862359544, intercept=0.5469821658473784, rvalue=-0.2789537950824907, pvalue=0.3560342292063775, stderr=0.12843972878548574, intercept_stderr=0.051417041614114846)
vivaldi
LinregressResult(slope=-0.005306403130671499, intercept=0.3744778669464609, rvalue=-0.01762384772241919, pvalue=0.9502925410060165, stderr=0.0834949821654045, intercept_stderr=0.