In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, levene, kruskal, sem, f_oneway
import random
from sklearn.utils import resample
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [13]:
#Reading the dataset and drop NaN values

dataset = 'data_thesis.csv'
data = pd.read_csv(dataset)
data.dropna(inplace=True)

data = data[data['sideMatch'] != 'timedout']

# Filtering out trialno with only one row (timedout trials)
grouped_data = data.groupby(['session', 'trialno'])

data_filtered = grouped_data.filter(lambda x: len(x) > 1)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

print(data_filtered)

pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


       session  trialno difficulty  accuracy period_in_trial  step_reached  \
0     curE002a        8       Easy       0.0     exploration             3   
1     curE002a        8       Easy       0.0        decision             3   
3     curE002a        9       Easy       1.0     exploration             3   
4     curE002a        9       Easy       1.0        decision             3   
6     curE002a       10       Easy       1.0     exploration             3   
7     curE002a       10       Easy       1.0        decision             3   
9     curE002a       11     Medium       1.0     exploration             3   
10    curE002a       11     Medium       1.0        decision             3   
12    curE002a       12     Medium       0.0     exploration             3   
13    curE002a       12     Medium       0.0        decision             3   
15    curE002a       13     Medium       1.0     exploration             3   
16    curE002a       13     Medium       1.0        decision    

In [39]:
# Filter out participants that didn't go above easy difficulty level

grouped = data_filtered.groupby('session')

filtered_sessions = []
for session, data_filtered in grouped:
    if 'Easy' not in data_filtered['difficulty'].values or len(data_filtered['difficulty'].unique()) > 1:
        filtered_sessions.append(data_filtered)

filtered_data = pd.concat(filtered_sessions)
filtered_data

Unnamed: 0,session,trialno,difficulty,accuracy,period_in_trial,step_reached,durPeriodTotal,FixNb,FixNbInL,FixNbInR,...,LookTotDurInR,LookTotDurInT,NbSwitches,objL,objR,objT,sideChosen,sideMatch,timePeriodStarts,timePeriodEnds
0,curE002a,8,Easy,0.0,exploration,3,11270,20,13,5,...,1076,910,6,V-1-G-1.png,F-4-E-1.png,F-4-E-1.png,left,right,1962.0,13230.0
1,curE002a,8,Easy,0.0,decision,3,924,2,1,0,...,0,298,0,V-1-G-1.png,F-4-E-1.png,F-4-E-1.png,left,right,13232.0,14154.0
3,curE002a,9,Easy,1.0,exploration,3,2498,6,3,2,...,634,532,2,F-1-A-3.png,V-2-H-2.png,F-1-A-3.png,left,left,2024.0,4520.0
4,curE002a,9,Easy,1.0,decision,3,2182,4,1,0,...,0,1644,0,F-1-A-3.png,V-2-H-2.png,F-1-A-3.png,left,left,4522.0,6702.0
6,curE002a,10,Easy,1.0,exploration,3,9912,11,3,5,...,1904,1094,3,F-4-E-1.png,V-1-G-1.png,V-1-G-1.png,right,right,2046.0,11956.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5781,curE122a,35,Hard,1.0,decision,3,1432,0,0,0,...,0,0,0,V-4-F-1.png,V-4-A-4.png,V-4-F-1.png,left,left,18390.0,19820.0
5784,curE122a,38,Hard,1.0,exploration,3,2414,5,2,1,...,270,502,2,O-2-J-1.png,O-2-G-4.png,O-2-J-1.png,left,left,2716.0,5128.0
5785,curE122a,38,Hard,1.0,decision,3,582,2,2,0,...,0,0,1,O-2-J-1.png,O-2-G-4.png,O-2-J-1.png,left,left,5130.0,5710.0
5791,curE122a,48,Medium,0.0,exploration,3,2332,2,1,0,...,0,1714,1,V-2-B-3.png,V-3-H-3.png,V-3-H-3.png,left,right,1488.0,17710.0


In [40]:
# Filter out participants that completed 10 or less trials

trial_counts_per_session = filtered_data.groupby('session')['trialno'].nunique()

trial_counts_df = trial_counts_per_session.reset_index(name='unique_trial_count')

filtered_data = filtered_data[filtered_data['session'].isin(trial_counts_df[trial_counts_df['unique_trial_count'] > 10]['session'])]

filtered_data

Unnamed: 0,session,trialno,difficulty,accuracy,period_in_trial,step_reached,durPeriodTotal,FixNb,FixNbInL,FixNbInR,...,LookTotDurInR,LookTotDurInT,NbSwitches,objL,objR,objT,sideChosen,sideMatch,timePeriodStarts,timePeriodEnds
0,curE002a,8,Easy,0.0,exploration,3,11270,20,13,5,...,1076,910,6,V-1-G-1.png,F-4-E-1.png,F-4-E-1.png,left,right,1962.0,13230.0
1,curE002a,8,Easy,0.0,decision,3,924,2,1,0,...,0,298,0,V-1-G-1.png,F-4-E-1.png,F-4-E-1.png,left,right,13232.0,14154.0
3,curE002a,9,Easy,1.0,exploration,3,2498,6,3,2,...,634,532,2,F-1-A-3.png,V-2-H-2.png,F-1-A-3.png,left,left,2024.0,4520.0
4,curE002a,9,Easy,1.0,decision,3,2182,4,1,0,...,0,1644,0,F-1-A-3.png,V-2-H-2.png,F-1-A-3.png,left,left,4522.0,6702.0
6,curE002a,10,Easy,1.0,exploration,3,9912,11,3,5,...,1904,1094,3,F-4-E-1.png,V-1-G-1.png,V-1-G-1.png,right,right,2046.0,11956.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5781,curE122a,35,Hard,1.0,decision,3,1432,0,0,0,...,0,0,0,V-4-F-1.png,V-4-A-4.png,V-4-F-1.png,left,left,18390.0,19820.0
5784,curE122a,38,Hard,1.0,exploration,3,2414,5,2,1,...,270,502,2,O-2-J-1.png,O-2-G-4.png,O-2-J-1.png,left,left,2716.0,5128.0
5785,curE122a,38,Hard,1.0,decision,3,582,2,2,0,...,0,0,1,O-2-J-1.png,O-2-G-4.png,O-2-J-1.png,left,left,5130.0,5710.0
5791,curE122a,48,Medium,0.0,exploration,3,2332,2,1,0,...,0,1714,1,V-2-B-3.png,V-3-H-3.png,V-3-H-3.png,left,right,1488.0,17710.0


In [41]:
# Dividing dataset into decision and exploration 
decision_data = filtered_data[filtered_data['period_in_trial'] == 'decision']
exploration_data = filtered_data[filtered_data['period_in_trial'] == 'exploration']

In [42]:
# DF DATA PER TRIAL

# Exploration period duration
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
durExplorationTotal_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['durPeriodTotal'].sum().reset_index()
durExplorationTotal_df.rename(columns={'durPeriodTotal': 'durExplorationPeriod'}, inplace=True)                                              

# Decision period duration
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
durDecisionTotal_df = decision_df.groupby(['session', 'trialno', 'difficulty']) ['durPeriodTotal'].sum().reset_index()
durDecisionTotal_df.rename(columns={'durPeriodTotal': 'durDecisionPeriod'}, inplace=True)

# Trial duration
trialduration_df=filtered_data.groupby(['session', 'trialno', 'difficulty'])['durPeriodTotal'].sum().reset_index()

# Accuracy
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
accuracy_df = decision_df.groupby(['session', 'trialno', 'difficulty']) ['accuracy'].sum().reset_index()

# Side Chosen
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
sideChosen_df = decision_df.groupby(['session', 'trialno', 'difficulty']) ['sideChosen'].sum().reset_index()

#Number of switches during exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
SwitchesExploration_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['NbSwitches'].sum().reset_index()
SwitchesExploration_df.rename(columns={'NbSwitches': 'SwitchesExplorationPeriod'}, inplace=True)   

#Number of switches during decision period
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
SwitchesDecision_df = decision_df.groupby(['session', 'trialno', 'difficulty'])['NbSwitches'].sum().reset_index()
SwitchesDecision_df.rename(columns={'NbSwitches': 'SwitchesDecisionPeriod'}, inplace=True)   

#Number of switches in total
SwitchesTotal_df=filtered_data.groupby(['session', 'trialno', 'difficulty'])['NbSwitches'].sum().reset_index()

#Fixation number during exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
FixNbExploration_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['FixNb'].sum().reset_index()
FixNbExploration_df.rename(columns={'FixNb': 'FixNbExplorationPeriod'}, inplace=True)   

#Fixation number during decision period
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
FixNbDecision_df = decision_df.groupby(['session', 'trialno', 'difficulty'])['FixNb'].sum().reset_index()
FixNbDecision_df.rename(columns={'FixNb': 'FixNbDecisionPeriod'}, inplace=True)   

#Fixation number in total
FixNbTotal_df=filtered_data.groupby(['session', 'trialno', 'difficulty'])['FixNb'].sum().reset_index()

#Fixation number in top card during decision period
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
FixNbT_df = decision_df.groupby(['session', 'trialno', 'difficulty'])['FixNbInT'].sum().reset_index()
FixNbT_df.rename(columns={'FixNbInT': 'FixNbInTDecision'}, inplace=True)   

#Fixation number in right card durin exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
FixNbR_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['FixNbInR'].sum().reset_index()
FixNbR_df.rename(columns={'FixNbInR': 'FixNbInRExploration'}, inplace=True)

#Fixation number in left card during exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
FixNbL_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['FixNbInL'].sum().reset_index()
FixNbL_df.rename(columns={'FixNbInL': 'FixNbInLExploration'}, inplace=True)

# Look total duration in top card during decision period
decision_df = filtered_data[filtered_data['period_in_trial'] == 'decision']
LookDurT_df = filtered_data.groupby(['session', 'trialno', 'difficulty'])['LookTotDurInT'].sum().reset_index()
LookDurT_df.rename(columns={'LookTotDurInT': 'LookDurInTDecision'}, inplace=True)   

#Look Total Duration in right card during exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
LookDurInL_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['LookTotDurInL'].sum().reset_index()
LookDurInL_df.rename(columns={'LookTotDurInL': 'LookDurInLExploration'}, inplace=True)

#Look Total Duration in left card during exploration period
exploration_df = filtered_data[filtered_data['period_in_trial'] == 'exploration']
LookDurInR_df = exploration_df.groupby(['session', 'trialno', 'difficulty'])['LookTotDurInR'].sum().reset_index()
LookDurInR_df.rename(columns={'LookTotDurInR': 'LookDurInRExploration'}, inplace=True)

merged_filtered_data = accuracy_df.merge(sideChosen_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(durExplorationTotal_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(durDecisionTotal_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(trialduration_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(SwitchesExploration_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(SwitchesDecision_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(SwitchesTotal_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbExploration_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbDecision_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbTotal_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbT_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbL_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(FixNbR_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data['FixNbBottomExploration'] = merged_filtered_data['FixNbInLExploration'] + merged_filtered_data['FixNbInRExploration']  #Fixation number in bottom cards during exploration period
merged_filtered_data = merged_filtered_data.merge(LookDurT_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(LookDurInL_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data = merged_filtered_data.merge(LookDurInR_df, on=['session', 'trialno', 'difficulty'], how='outer')
merged_filtered_data['LookDurBottomExploration'] = merged_filtered_data['LookDurInLExploration'] + merged_filtered_data['LookDurInRExploration']  #Look Total Duration in bottom cards during exploration period

merged_filtered_data

Unnamed: 0,session,trialno,difficulty,accuracy,sideChosen,durExplorationPeriod,durDecisionPeriod,durPeriodTotal,SwitchesExplorationPeriod,SwitchesDecisionPeriod,...,FixNbDecisionPeriod,FixNb,FixNbInTDecision,FixNbInLExploration,FixNbInRExploration,FixNbBottomExploration,LookDurInTDecision,LookDurInLExploration,LookDurInRExploration,LookDurBottomExploration
0,curE002a,8,Easy,0.0,left,11270,924,12194,6,0,...,2,22,1,13,5,18,1208,4404,1076,5480
1,curE002a,9,Easy,1.0,left,2498,2182,4680,2,0,...,4,10,3,3,2,5,2176,1036,634,1670
2,curE002a,10,Easy,1.0,right,9912,2914,12826,3,1,...,6,17,3,3,5,8,2020,754,1904,2658
3,curE002a,11,Medium,1.0,right,2328,1582,3910,2,1,...,5,10,3,2,1,3,1780,584,410,994
4,curE002a,12,Medium,0.0,left,14272,984,15256,5,1,...,3,26,1,14,7,21,1296,4854,2700,7554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025,curE122a,27,Medium,1.0,left,13092,1598,14690,2,0,...,5,25,4,15,3,18,1484,8504,894,9398
1026,curE122a,28,Medium,1.0,right,13796,566,14362,3,1,...,2,24,0,5,16,21,288,1178,5768,6946
1027,curE122a,35,Hard,1.0,left,16894,1432,18326,5,0,...,0,38,0,22,15,37,118,6830,4860,11690
1028,curE122a,38,Hard,1.0,left,2414,582,2996,2,1,...,2,7,0,2,1,3,502,1316,270,1586


In [7]:
# Frequency of side bias occurrence per session
# Output: proportion of trials where the same side was consistently chosen

side_bias_frequency = filtered_data[filtered_data['period_in_trial'] == 'decision']
side_bias_frequency = side_bias_frequency.groupby('session')['sideChosen'].apply(lambda x: (x == x.iloc[0]).mean())

# Ratio of exploration time to decision time

merged_filtered_data['exploration_to_decision_ratio'] = merged_filtered_data['durExplorationPeriod'] / merged_filtered_data['durDecisionPeriod']
merged_filtered_data['exploration_to_decision_ratio'] 

ratios_exploration_decision = merged_filtered_data.groupby('session')['exploration_to_decision_ratio'].apply(lambda x: pd.Series(x).median())

In [8]:
# New df per session

# Median for each variable per session
durExplorationTotal_median = durExplorationTotal_df.groupby('session')['durExplorationPeriod'].median().reset_index()
durDecisionTotal_median = durDecisionTotal_df.groupby('session')['durDecisionPeriod'].median().reset_index()
accuracy_median = accuracy_df.groupby('session')['accuracy'].median().reset_index()
switches_median = SwitchesTotal_df.groupby('session')['NbSwitches'].median().reset_index()
switches_exploration_median = SwitchesExploration_df.groupby('session')['SwitchesExplorationPeriod'].median().reset_index()
switches_decision_median = SwitchesDecision_df.groupby('session')['SwitchesDecisionPeriod'].median().reset_index()
FixNb_median = FixNbTotal_df.groupby('session')['FixNb'].median().reset_index()
FixNbExploration_median = FixNbExploration_df.groupby('session')['FixNbExplorationPeriod'].median().reset_index()
FixNbDecision_median = FixNbDecision_df.groupby('session')['FixNbDecisionPeriod'].median().reset_index()
FixNbInTDecision_median = FixNbT_df.groupby('session')['FixNbInTDecision'].median().reset_index()
FixNbInRExploration_median = FixNbR_df.groupby('session')['FixNbInRExploration'].median().reset_index()
FixNbInLExploration_median = FixNbL_df.groupby('session')['FixNbInLExploration'].median().reset_index()
FixNbBottomExploration_median = merged_filtered_data.groupby('session')['FixNbBottomExploration'].median().reset_index()
LookDurInTDecision_median = LookDurT_df.groupby('session')['LookDurInTDecision'].median().reset_index()
LookDurInRExploration_median = LookDurInR_df.groupby('session')['LookDurInRExploration'].median().reset_index()
LookDurInLExploration_median = LookDurInL_df.groupby('session')['LookDurInLExploration'].median().reset_index()
LookDurBottomExploration_median = merged_filtered_data.groupby('session')['LookDurBottomExploration'].median().reset_index()

merged_median_data = accuracy_median.merge(side_bias_frequency, on='session')
merged_median_data = merged_median_data.merge(ratios_exploration_decision, on='session')
merged_median_data = merged_median_data.merge(durExplorationTotal_median, on='session')
merged_median_data = merged_median_data.merge(durDecisionTotal_median, on='session')
merged_median_data = merged_median_data.merge(switches_median, on='session')
merged_median_data = merged_median_data.merge(switches_exploration_median, on='session')
merged_median_data = merged_median_data.merge(switches_decision_median, on='session')
merged_median_data = merged_median_data.merge(FixNb_median, on='session')
merged_median_data = merged_median_data.merge(FixNbExploration_median, on='session')
merged_median_data = merged_median_data.merge(FixNbDecision_median, on='session')
merged_median_data = merged_median_data.merge(FixNbInTDecision_median, on='session')
merged_median_data = merged_median_data.merge(FixNbInRExploration_median, on='session')
merged_median_data = merged_median_data.merge(FixNbInLExploration_median, on='session')
merged_median_data = merged_median_data.merge(FixNbBottomExploration_median, on='session')
merged_median_data = merged_median_data.merge(LookDurInTDecision_median, on='session')
merged_median_data = merged_median_data.merge(LookDurInRExploration_median, on='session')
merged_median_data = merged_median_data.merge(LookDurInLExploration_median, on='session')
merged_median_data = merged_median_data.merge(LookDurBottomExploration_median, on='session')

merged_median_data.rename(columns={'sideChosen': 'sideBiasFreq'}, inplace=True)

merged_median_data    

Unnamed: 0,session,accuracy,sideBiasFreq,exploration_to_decision_ratio,durExplorationPeriod,durDecisionPeriod,NbSwitches,SwitchesExplorationPeriod,SwitchesDecisionPeriod,FixNb,FixNbExplorationPeriod,FixNbDecisionPeriod,FixNbInTDecision,FixNbInRExploration,FixNbInLExploration,FixNbBottomExploration,LookDurInTDecision,LookDurInRExploration,LookDurInLExploration,LookDurBottomExploration
0,curE002a,1.0,0.615385,2.976599,3816.0,2150.0,4.0,3.0,1.0,13.0,8.0,4.0,2.0,3.0,3.0,7.0,1780.0,1016.0,1036.0,2450.0
1,curE004a,1.0,0.68,3.087059,5248.0,1732.0,3.0,2.0,0.0,17.0,11.0,4.0,3.0,5.0,4.0,9.0,1870.0,1404.0,1152.0,3146.0
2,curE008a,0.0,0.25,1.200196,6705.0,5082.0,3.0,2.0,1.0,21.5,9.5,9.5,8.5,3.0,6.0,8.5,3828.0,1895.0,2131.0,3220.0
3,curE009a,1.0,0.416667,1.945337,4786.0,2932.0,3.0,2.0,0.0,19.0,12.5,6.5,5.0,5.0,4.5,10.5,2518.0,1716.0,1596.0,3424.0
4,curE011a,1.0,0.727273,1.853893,4287.0,2091.0,2.0,2.0,0.0,15.0,11.0,4.5,3.0,5.0,4.0,10.0,1868.0,1882.0,998.0,3254.0
5,curE012a,1.0,0.347826,1.769347,3732.0,1682.0,2.0,2.0,0.0,12.0,7.0,4.0,3.0,2.0,3.0,6.0,1642.0,798.0,1166.0,1970.0
6,curE013a,0.0,0.8,4.410599,6139.0,1242.0,3.0,3.0,0.0,16.5,12.5,3.0,2.0,4.5,5.5,10.0,1128.0,1224.0,1968.0,3802.0
7,curE014a,0.0,0.545455,2.568327,10374.0,2550.0,3.0,3.0,0.0,21.0,15.0,5.0,5.0,5.0,6.0,13.0,2186.0,2948.0,2048.0,5474.0
8,curE017a,1.0,0.571429,1.867953,4224.0,2589.0,2.0,1.0,0.0,13.0,9.0,4.0,3.0,2.5,4.0,7.0,1965.0,795.0,1344.0,2593.0
9,curE018a,1.0,0.457143,1.916667,3762.0,1916.0,2.0,2.0,0.0,14.0,8.0,4.0,3.0,4.0,3.0,7.0,1564.0,1442.0,1102.0,2800.0
