In [89]:
import pandas as pd
import plotly.express as px
from scipy.stats import ttest_ind
import numpy as np

In [61]:
dfs = []
participants = ['esther', 'aaron', 'tianyi', 'vilem', 'stephanie', 'loran', 'klemen', 'liv', 'maxine', 'davidh', 'hidde', 'kyriakos', 'tim', 'ethelbert', 'davidk']
for i in range(15):
    df_train = pd.read_csv(f'subject-{i}-{participants[i]}.csv')
    df_train.fact_id = df_train.fact_id.apply(lambda x: str(x)+f'_{i}')
    df_train = df_train.groupby('fact_id').max('trial').reset_index().loc[:,['fact_id', 'alpha']]
    df_test = pd.read_csv(f'subject-{i}-test.csv')
    df_test.fact_id = df_test.fact_id.apply(lambda x: str(x)+f'_{i}')
    df_test = df_test.loc[:,['fact_id', 'correct', 'answer']]
    df = pd.merge(df_test, df_train, on='fact_id', how='outer')
    df['unseen'] = df.alpha.apply(lambda x: not x==x)
    df['participant'] = i
    dfs.append(df)
df = pd.concat(dfs)
df

Unnamed: 0,fact_id,correct,answer,alpha,unseen,participant
0,17_0,False,mozart,,True,0
1,13_0,False,chopin,,True,0
2,3_0,False,bach,,True,0
3,21_0,True,vivaldi,0.355469,False,0
4,11_0,False,chopin,,True,0
...,...,...,...,...,...,...
20,11_14,False,vivaldi,,True,14
21,1_14,False,mozart,,True,14
22,8_14,True,chopin,0.808984,False,14
23,6_14,False,vivaldi,,True,14


In [84]:
df_seen = df[df.unseen==False]
df_seen

Unnamed: 0,fact_id,correct,answer,alpha,unseen,participant
3,21_0,True,vivaldi,0.355469,False,0
5,14_0,True,chopin,0.510547,False,0
11,7_0,False,beethoven,0.816406,False,0
17,1_0,True,bach,0.907813,False,0
19,4_0,True,bach,0.501172,False,0
...,...,...,...,...,...,...
12,21_14,True,mozart,0.583203,False,14
15,20_14,False,vivaldi,0.393750,False,14
16,2_14,True,bach,0.317578,False,14
18,5_14,True,bach,0.349609,False,14


In [63]:
fig = px.box(x=df_seen.correct, y=df_seen.alpha, width=400, height=400)
fig.show()

In [64]:
ttest_ind(df_seen.correct, df_seen.alpha)

Ttest_indResult(statistic=6.756222035493122, pvalue=7.512742745007251e-11)

In [108]:
correct_unseen_facts_same_composer = []
for fact_id, correct, answer, alpha, unseen, participant in df_seen.values:
    correct_rate = df[(df.participant == participant) & (df.fact_id != fact_id) & (df.answer == answer) & (df.unseen == True)].groupby('participant').mean('correct').correct.values
    if len(correct_rate) > 0:
        correct_rate = correct_rate[0]
    else:
        correct_rate = np.nan
    correct_unseen_facts_same_composer.append(correct_rate)
df_generalized = df_seen.copy()
df_generalized['generalized'] = correct_unseen_facts_same_composer
df_generalized['generalized'] = df_generalized['generalized'].astype(float)
df_generalized = df_generalized.reset_index(drop=True)
df_generalized = df_generalized.dropna().reset_index(drop=True)
df_generalized

Unnamed: 0,fact_id,correct,answer,alpha,unseen,participant,generalized
0,21_0,True,vivaldi,0.355469,False,0,0.750000
1,14_0,True,chopin,0.510547,False,0,0.000000
2,7_0,False,beethoven,0.816406,False,0,0.500000
3,1_0,True,bach,0.907813,False,0,0.000000
4,4_0,True,bach,0.501172,False,0,0.000000
...,...,...,...,...,...,...,...
139,21_14,True,mozart,0.583203,False,14,0.500000
140,20_14,False,vivaldi,0.393750,False,14,0.333333
141,2_14,True,bach,0.317578,False,14,0.333333
142,5_14,True,bach,0.349609,False,14,0.333333


In [115]:
fig = px.scatter(x=df_generalized.generalized, y=df_generalized.alpha, width=400, height=400, labels={'x': 'accuracy on unseen facts same composer', 'y': 'alpha'})
fig.show()

In [114]:
ttest_ind(df_generalized.generalized, df_generalized.alpha)

Ttest_indResult(statistic=-2.979293083809806, pvalue=0.0031370329874489)