# Imports

In [None]:
import json
import numpy as np
import os
import pandas as pd
import plotnine as gg
# Make plotnine's classic theme the default for the plots created below.
gg.theme_set(gg.theme_classic)
# Figure size that cells assign back to gg.options.figure_size after temporarily overriding it.
default_figure_size = (6.4, 4.8)

In [None]:
# Root folder of the dataset to analyze; swap the commented line to use the other dataset.
# data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/mTurk1'
data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/RPP34'

# Figures are written to a 'figures' subfolder of the data folder. os.path.join inserts
# the path separator; plain string concatenation would instead create a sibling folder
# named '<data_dir>figures'.
plot_dir = os.path.join(data_dir, 'figures')

# exist_ok=True makes the cell safe to re-run when the folder is already there.
os.makedirs(plot_dir, exist_ok=True)

# Get data

In [None]:
# Load the rule table (the first CSV column becomes the index) ...
rule_data = pd.read_csv(
    os.path.join(data_dir, 'rule_data.csv'),
    index_col=0,
)
# ... and preview the first 30 rows belonging to the 'high' phase.
rule_data[rule_data['phase'] == 'high'].head(30)

In [None]:
# Load the trial-level data (the first CSV column becomes the index).
all_data = pd.read_csv(os.path.join(data_dir, 'all_data.csv'), index_col=0)

# Drop every row flagged by at least one of the three exclusion columns.
exclude_mask = all_data['inattentives'] | all_data['psych_disorder'] | all_data['head_trauma']
# .copy() detaches the filtered frame from the one read from disk, so that adding columns
# to all_data later does not trigger pandas' SettingWithCopyWarning.
all_data = all_data.loc[~exclude_mask].copy()
all_data

## Create new dataframes

### first_dat

In [None]:
# For every combination of the id columns, find the smallest trial number among the rows
# where bool_middle_item is true.
id_cols = ['sid', 'trial_type', 'phase', 'block', 'middle_item', 'subtrial']
first_dat = (
    all_data.loc[all_data['bool_middle_item']]
    # Select 'trial' before aggregating: only this column is kept, and taking the minimum
    # of every other column is wasted work that raises on mixed-type columns in pandas >= 2.
    .groupby(id_cols)['trial']
    .min()
    .reset_index()
)
first_dat

### first_dat_high

In [None]:
# For every combination of the id columns, find the smallest trial number among the rows
# where a star was unlocked.
id_cols = ['sid', 'trial_type', 'phase', 'block', 'goal_star']

# A row counts as "unlocked" when unlocked_star is present and greater than -1.
all_data['bool_unlocked_star'] = all_data['unlocked_star'].notna() & (all_data['unlocked_star'] > -1)

first_dat_high = (
    all_data.loc[all_data['bool_unlocked_star']]
    # Select 'trial' before aggregating: only this column is kept, and taking the minimum
    # of every other column is wasted work that raises on mixed-type columns in pandas >= 2.
    .groupby(id_cols)['trial']
    .min()
    .reset_index()
)
first_dat_high

### first_dat_w

In [None]:
# Does discovery transfer? Compare the discovery trial at subtrial 1 with the one at subtrial 3.

# Subtrial labels become strings so they can be glued onto the column names below.
first_dat['subtrial'] = first_dat['subtrial'].astype(int).astype(str)

# Wide format: one row per sid/trial_type/phase/block/middle_item, one 'trial' column per subtrial.
first_dat_w = first_dat.pivot_table(
    values=['trial'],
    index=['sid', 'trial_type', 'phase', 'block', 'middle_item'],
    columns=['subtrial'],
)
first_dat_w = first_dat_w.reset_index()

# Flatten the two-level column labels by concatenating the levels, e.g. ('trial', '1') -> 'trial1'.
first_dat_w.columns = [''.join(map(str, levels)) for levels in first_dat_w.columns.values]

# Absolute distance (in trials) between the subtrial-1 and subtrial-3 values.
first_dat_w['trial_diff_s1s3'] = (first_dat_w['trial1'] - first_dat_w['trial3']).abs()
first_dat_w

### first_dat_extra

In [None]:
# Build a window of trials around every row of first_dat: each row is replicated once per
# shift in trial_shifts, with 'trial' moved by that shift and the shift itself stored in
# 'trial_reltofirst'.
#
# This is done in one vectorized step. Growing a DataFrame row by row with
# DataFrame.append is quadratic in the number of rows, and DataFrame.append no longer
# exists in pandas >= 2.0. Because this runs quickly, no progress output is printed.
trial_shifts = range(-5, 10)
window_cols = ['sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'middle_item']
n_rows = first_dat.shape[0]
n_shifts = len(trial_shifts)

# Row order matches a nested loop over rows (outer) and shifts (inner):
# row 0 with every shift, then row 1 with every shift, and so on.
first_dat_extra = first_dat[window_cols].iloc[np.repeat(np.arange(n_rows), n_shifts)].copy()
first_dat_extra['trial_reltofirst'] = np.tile(np.asarray(trial_shifts), n_rows)
first_dat_extra['trial'] = first_dat_extra['trial'] + first_dat_extra['trial_reltofirst']

In [None]:
# Tidy the window table in one chain:
#   1. mark the item/subtrial columns as describing the sequence the window belongs to,
#   2. drop rows whose shifted trial number is negative,
#   3. renumber the rows from 0.
first_dat_extra = (
    first_dat_extra
    .rename(columns={'middle_item': 'middle_item_sequence', 'subtrial': 'subtrial_sequence'})
    .loc[lambda window: window['trial'] >= 0]
    .reset_index(drop=True)
)
first_dat_extra

### f_dat

In [None]:
# Left-join the trial data onto the window table (pandas joins on the columns both frames share).
f_dat = first_dat_extra.merge(all_data, how='left')

# One 0/1 indicator column per middle item: 1 where the row's middle_item equals that item.
for middle_item_value in set(first_dat['middle_item']):
    is_this_item = f_dat['middle_item'] == middle_item_value
    f_dat['bool_item_{}'.format(middle_item_value)] = is_this_item.astype(int)

# Column subset for inspection; the result is not assigned or displayed.
f_dat[[
    'sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'trial_reltofirst', 'rt',
    'middle_item', 'middle_item_sequence', 'subtrial_sequence', 'bool_item_1.0',
]]
f_dat

In [None]:
# item_after_first is 1 on rows whose indicator column for the window's own item
# (middle_item_sequence) is set, and 0 everywhere else.
f_dat['item_after_first'] = 0
indicator_columns = ('bool_item_0.0', 'bool_item_1.0', 'bool_item_2.0', 'bool_item_3.0')
for item_code, indicator in enumerate(indicator_columns):
    matches_sequence_item = (f_dat['middle_item_sequence'] == item_code) & (f_dat[indicator] == 1)
    f_dat.loc[matches_sequence_item, 'item_after_first'] = 1
f_dat

### perf_dat

In [None]:
# Per-participant averages of the wide discovery table, one row per sid/phase/trial_type.
# The averaged 'block' and 'middle_item' columns are dropped from the result.
subj_dat = (
    first_dat_w
    .groupby(['sid', 'phase', 'trial_type'])
    .mean()
    .reset_index()
    .drop(columns=['block', 'middle_item'])
)
subj_dat

In [None]:
# Per-participant performance summary (mean per sid/phase/trial_type), joined with the
# per-participant discovery measures in subj_dat.
id_cols = ['sid', 'phase', 'trial_type']
perf_cols = ['rt', 'log_rt', 'acc', 'rt_zz_low', 'rt_zz_high']

# Select the measures before averaging: only these columns are kept, and averaging
# every column of all_data raises on non-numeric columns in pandas >= 2.
perf_dat = all_data.groupby(id_cols)[perf_cols].mean().reset_index()

# Join on the id columns explicitly instead of relying on whichever column names overlap.
perf_dat = pd.merge(subj_dat, perf_dat, on=id_cols)
perf_dat

# Results

## Choices around item discovery

In [None]:
# Marker shape code: 1 where acc is present and non-zero, 0 where acc is missing or zero.
f_dat['shape'] = (f_dat['acc'].notna() & (f_dat['acc'] != 0)).astype(int)

# Keep the figure readable: 10 participants, blocks 0 and 1 only.
shown_sids = list(set(f_dat['sid']))[:10]
in_shown_sids = f_dat['sid'].isin(shown_sids)
in_first_blocks = f_dat['block'] <= 1
sub_dat = f_dat.loc[in_shown_sids & in_first_blocks]
sub_dat

In [None]:
# Raw key presses around item discovery: one point per subtrial (x) and trial relative to
# discovery (y), colored by the key pressed, with marker shape taken from the 'shape' column.
gg.options.figure_size = (10, 6)  # wider canvas for the many facets

key_press_mapping = gg.aes(
    x='subtrial',
    y='trial_reltofirst',
    color='factor(key_press)',
    shape='factor(shape)',
)
g = (
    gg.ggplot(sub_dat, key_press_mapping)
    + gg.geom_point()
    + gg.facet_grid('trial_type ~ block + sid', scales='free_x')
)
print(g)
g.save(os.path.join(plot_dir, '104_RawKeyPressesAroundItemDiscover.png'))

# Restore the default size for the following figures.
gg.options.figure_size = default_figure_size

## RTs around item discovery

In [None]:
# Average within participant first, so that each participant contributes one value per
# phase / trial_type / relative trial / subtrial.
# numeric_only=True keeps the mean to numeric columns: f_dat also carries string columns
# (e.g. subtrial_sequence), which make a plain mean raise in pandas >= 2.
subj_dat = (
    f_dat
    .groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'subtrial'])
    .mean(numeric_only=True)
    .reset_index()
)

# Strong decrease in subtrial0 and subtrial1 RT! => Because we're learning this item :)
gg.options.figure_size = (5, 4)
g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'rt', color='subtrial', group='subtrial'))
     # Summary points and a connecting line per subtrial, dodged so they do not overlap.
     + gg.stat_summary(position=gg.position_dodge(width=0.5))
     + gg.stat_summary(geom='line', position=gg.position_dodge(width=0.5))
     # Dotted marker at trial_reltofirst == 0.
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
g.save(os.path.join(plot_dir, '104_subtrialRtsOverTrialreltofirst.png'))

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to log_rt.
g += gg.aes(y='log_rt')
g

In [None]:
# RT zigzag low also seems to go down!
# Reuse the plot bound to g with rt_zz_low on the y-axis, colored by whether the point sits
# at trial_reltofirst == 0. The color mapping must be a quoted expression so that plotnine
# evaluates it against the data; the unquoted form `'trial_reltofirst' == 0` compares a
# Python string with 0 and therefore always passes the constant False.
g += gg.aes(y='rt_zz_low', color='trial_reltofirst == 0')  # + gg.stat_summary(gg.aes(y='rt_zz_high'), color='red')
# Restrict the plotted data to rows with subtrial == 0.
g.data = g.data.loc[g.data['subtrial'] == 0]
g

In [None]:
# But not high zigzag!
g += gg.aes(y='rt_zz_high')
g

## Trials it takes to transfer a sequence from positions 0-1 to positions 2-3

In [None]:
# trial_diff_s1s3 over blocks, one colored summary line per middle item,
# faceted by phase (rows) and trial type (columns).
gg.options.figure_size = (5, 5)

transfer_mapping = gg.aes(x='block', y='trial_diff_s1s3', color='factor(middle_item)')
g = (
    gg.ggplot(first_dat_w, transfer_mapping)
    + gg.stat_summary()
    + gg.stat_summary(geom='line')
    + gg.facet_grid('phase ~ trial_type')
)
g

In [None]:
# Gradual decrease in the number of trials needed to activate middle_items
# In low transfer, 2 & 3 take much longer than 0 & 1 -> because they are new!
# In high transfer, previous knowledge is retained :)
g = (gg.ggplot(first_dat, gg.aes('block', 'trial', color='factor(middle_item)'))
     + gg.stat_summary(position=gg.position_dodge(width=0.5))
     + gg.stat_summary(position=gg.position_dodge(width=0.5), geom='line')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
g.save(os.path.join('104_trialOfSequenceDiscoveryOverBlock.png'))

## Time to discover each goal star

In [None]:
# Takes longer in low than high?
g += gg.aes(color='factor(goal_star)')
g.data = first_dat_high
g

## Frequency of middle-layer item after first discovery

In [None]:
# subj_dat = first_dat_extra.groubpy(['sid', 'phase', 'trial_type', 'trial_reltofirst']).aggregate('mean').reset_index()
# Frequency of the window's own item (item_after_first) per trial relative to discovery.
# Uses the raw rows with subtrial == 1, without averaging within participant first.
subj_dat = f_dat.loc[f_dat['subtrial'] == 1]

item_mapping = gg.aes(
    x='trial_reltofirst',
    y='item_after_first',
    color='middle_item_sequence',
    group='middle_item_sequence',
)
g = (
    gg.ggplot(subj_dat, item_mapping)
    + gg.stat_summary()
    + gg.stat_summary(geom='line')
    + gg.geom_vline(xintercept=0, linetype='dotted')  # marks trial_reltofirst == 0
    + gg.facet_grid('phase + subtrial_sequence ~ trial_type')
)
# Displayed only; this version of the figure is not written to disk.
print(g)

In [None]:
# subj_dat = first_dat_extra.groubpy(['sid', 'phase', 'trial_type', 'trial_reltofirst']).aggregate('mean').reset_index()
# Frequency of item 0 (bool_item_0.0) per trial relative to discovery, averaged within
# participant first. Only rows with subtrial == 1 are used.
# numeric_only=True keeps the mean to numeric columns, so that any non-numeric column
# left in f_dat cannot make the aggregation raise in pandas >= 2.
subj_dat = (
    f_dat.loc[f_dat['subtrial'] == 1]
    .groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'subtrial_sequence', 'middle_item_sequence'])
    .mean(numeric_only=True)
    .reset_index()
)

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'bool_item_0.0', color='middle_item_sequence', group='middle_item_sequence'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     # Dotted marker at trial_reltofirst == 0.
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.facet_grid('phase + subtrial_sequence ~ trial_type')
    )
print(g)
g.save(os.path.join(plot_dir, '105_itemFrequencyOverTrialsreltofirst.png'))

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to bool_item_1.0.
g += gg.aes(y='bool_item_1.0')
g

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to bool_item_2.0.
g += gg.aes(y='bool_item_2.0')
g

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to bool_item_3.0.
g += gg.aes(y='bool_item_3.0')
g

## Relationship between sequence transfer (`trial_diff_s1s3`) and overall performance?

In [None]:
# Per-participant accuracy against trial_diff_s1s3, with a linear fit per phase,
# one panel per trial type.
gg.options.figure_size = (8, 4)
g = (gg.ggplot(perf_dat, gg.aes('trial_diff_s1s3', 'acc', color='phase'))
     + gg.geom_point()
     + gg.geom_smooth(method='lm')
     + gg.facet_grid('~ trial_type')
    )
print(g)
# Save into plot_dir like the other figures; joining only the file name would write the
# image into the current working directory instead.
g.save(os.path.join(plot_dir, '104_accOverTrialDiffSequenceDiscorvery.png'))

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to rt.
g += gg.aes(y='rt')
g

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to rt_zz_low.
g += gg.aes(y='rt_zz_low')
g

In [None]:
# Reuse the plot currently bound to g, switching its y aesthetic to rt_zz_high.
g += gg.aes(y='rt_zz_high')
g