# Imports

In [None]:
import json
import numpy as np
import os
import pandas as pd
import plotnine as gg
import statsmodels.formula.api as smf
from Functions import *
gg.theme_set(gg.theme_classic)
default_figure_size = (6.4, 4.8)

In [None]:
# data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/mTurk1'
# When True, `first_dat_extra` is rebuilt and written to CSV further down; when False the CSV is read.
make_firstdatextra = False
data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/RPP34'
# Build the path with os.path.join: plain concatenation gave '.../RPP34figures',
# i.e. a sibling of the data folder instead of a 'figures' folder inside it.
plot_dir = os.path.join(data_dir, 'figures')
os.makedirs(plot_dir, exist_ok=True)

# Get data

In [None]:
# Load the rule table (first CSV column is the index) and preview the first 30 rows of the 'high' phase.
rule_data = pd.read_csv(os.path.join(data_dir, 'rule_data.csv'), index_col=0)
rule_data.loc[rule_data.phase == 'high'][:30]

In [None]:
# Load the main data table (first CSV column is the index) and force `subtrial` to integer dtype.
all_data = pd.read_csv(os.path.join(data_dir, 'all_data.csv'), index_col=0)
all_data['subtrial'] = all_data['subtrial'].astype(int)
all_data

# Results

## Prepare data

In [None]:
# Get the trial in which an item was first discovered, for each block, each item, each participant
# (minimum `trial` within each combination of the id columns).
id_cols = ['sid', 'trial_type', 'phase', 'block', 'middle_item_both', 'subtrial']
first_dat = all_data.groupby(id_cols).aggregate('min').reset_index()[id_cols + ['trial']]
first_dat = first_dat.loc[first_dat['middle_item_both'] < 8]  # Keep codes 0-7 (later cells treat codes < 4 as items that exist and 4-7 as four non-items)
first_dat['middle_item_both'] = first_dat['middle_item_both'].astype(int)
first_dat = first_dat.reset_index(drop=True)  # Consecutive 0..n-1 index; `make_first_dat_extra` looks rows up by these labels
first_dat

In [None]:
def make_first_dat_extra(first_dat, trial_shifts=range(-5, 10)):
    
    """
    Select 5 trials before and 10 trials after the first discover of an item and put them in a dataframe.
    """

    first_dat_extra = pd.DataFrame()

    for row_i in range(first_dat.shape[0]):

        if (row_i % 50) == 1:
            print('{} of {} rows ({}%)                       '.format(
                row_i+1, first_dat.shape[0], 100 * np.round((row_i+1) / first_dat.shape[0], 3)), end='\r')

        for trial_shift in trial_shifts:
            row = first_dat.loc[row_i, ['sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'middle_item_both']]
            row['trial'] += trial_shift
            row['trial_reltofirst'] = trial_shift
            first_dat_extra = first_dat_extra.append(row)
            
    first_dat_extra = first_dat_extra.rename(columns={'middle_item_both': 'middle_item_both_sequence', 'subtrial': 'subtrial_sequence'})
    first_dat_extra = first_dat_extra.loc[first_dat_extra['trial'] >= 0]
    first_dat_extra['trial_reltofirst'] = first_dat_extra['trial_reltofirst'].astype(int)
    first_dat_extra = first_dat_extra.reset_index(drop=True)
            
    return first_dat_extra

In [None]:
# Either rebuild the trial windows around each discovery and cache them as CSV,
# or read the previously cached CSV, depending on the `make_firstdatextra` flag.
if make_firstdatextra:
    
    first_dat_extra = make_first_dat_extra(first_dat, trial_shifts=range(-5, 10))
    first_dat_extra.to_csv(os.path.join(data_dir, 'first_dat_extra.csv'), index=False)
else:

    first_dat_extra = pd.read_csv(os.path.join(data_dir, 'first_dat_extra.csv'))

In [None]:
first_dat_extra.loc[first_dat_extra['middle_item_both_sequence'] > 4][:30]

In [None]:
# Get info on actions, rts, etc. on these selected trials and add them to the dataframe
# No `on=` is given, so pandas joins on every column name the two frames share. The left join keeps
# window rows without a match in `all_data`; their `all_data` columns are NaN.
f_dat = pd.merge(first_dat_extra, all_data, how='left')

# `sequence_item_bool` indicates whether the current item is the one that this sequence is about, i.e., 
# the one that occurs for the first time at `trial_reltofirst` == 0 and because of which we collected all
# the trials before and after.
f_dat['sequence_item_bool'] = f_dat['middle_item_both_sequence'] == f_dat['middle_item_both']
f_dat['sequence_item_bool'] = f_dat['sequence_item_bool'].astype(int)

# exists indicates whether a 2-key sequence "exists" (leads to an item on the screen) or not (any made-up sequence
# with no consequence for the game).
# Rows where `middle_item_both` is NaN compare as False here.
f_dat['exists'] = f_dat['middle_item_both'] < 4

# Check it out
f_dat[
    ['sid', 'phase', 'trial_type', 'block', 'trial', 'subtrial', 'action_id', 'middle_item_both_sequence', 'middle_item_both', 'subtrial_sequence', 'trial_reltofirst', ]
][40:70]

## Choices around item discovery

In [None]:
# Small subset for the raw key-press figure below.
sub_dat = f_dat.loc[
    (f_dat['sid'].isin(list(set(f_dat['sid']))[:5])) &  # Display 5 participants to keep figure reasonable (which 5 depends on set ordering)
    (f_dat['block'] == 0) &  # Display only block 0
    (f_dat['middle_item_both_sequence'] < 4)  # Only windows built around item codes 0-3
     ]
sub_dat

In [None]:
# Raw key presses per subtrial (x) and trial relative to discovery (y), one panel column per
# discovery subtrial and participant. The figure size is enlarged for this plot and restored afterwards.
gg.options.figure_size = (7, 5)
g = (gg.ggplot(sub_dat, gg.aes('subtrial', 'trial_reltofirst', color='factor(key_press)', shape='factor(bool_middle_item)'))
     + gg.geom_point()
     + gg.coord_cartesian(xlim=(-0.5, 3.1))
     + gg.facet_grid('trial_type ~ subtrial_sequence + sid', scales='free_x')#, labeller='label_both')
    )
print(g)
g.save(os.path.join(plot_dir, '104_RawKeyPressesAroundItemDiscover.png'))
gg.options.figure_size = default_figure_size

## RTs around item discovery

In [None]:
# Per-participant means for windows whose discovery happened at subtrial 1,
# split by phase, trial type, trial relative to discovery, and subtrial.
subj_dat = f_dat.loc[(f_dat['subtrial_sequence'] == 1)].groupby(
    ['sid', 'phase', 'trial_type', 'trial_reltofirst', 'subtrial']).aggregate('mean').reset_index()

# Strong decrease in subtrial0 and subtrial1 RT! => Because we're learning this item :)
gg.options.figure_size = (5, 4)
g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'rt', color='subtrial', group='subtrial'))
     + gg.stat_summary(position=gg.position_dodge(width=0.1))
     + gg.stat_summary(geom='line', position=gg.position_dodge(width=0.1))
     + gg.geom_vline(xintercept=0, linetype='dotted')  # Marks the discovery trial
     + gg.labs(x='Trial (aligned to first item discovery)', y='Response time (msec)')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
g.save(os.path.join(plot_dir, '104_subtrialRtsOverTrialreltofirst.png'))

In [None]:
g += gg.aes(y='log_rt')
g

In [None]:
# Keep only the trial before (-1) and the trial of (0) discovery, for windows discovered at subtrial 1
# and built around item codes 0-3.
sub_dat_st2 = f_dat.loc[
    f_dat['subtrial_sequence'].isin([1]) & (f_dat['trial_reltofirst'].isin([-1, 0])) & (f_dat['middle_item_both_sequence'] < 4)
]

subj_dat = sub_dat_st2.groupby(['sid', 'phase', 'trial_type', 'subtrial', 'subtrial_sequence', 'trial_reltofirst', 'middle_item_both_sequence']).aggregate('mean').reset_index()

# log RT before vs. at discovery, one panel row per subtrial.
g = (gg.ggplot(subj_dat, gg.aes('factor(trial_reltofirst)', 'log_rt', color='phase', group='phase'))
     + gg.stat_summary(position=gg.position_dodge(width=0.3))
     + gg.stat_summary(position=gg.position_dodge(width=0.3), geom='line')
     + gg.facet_grid('subtrial ~ trial_type', scales='free')
    )
g

In [None]:
def get_diff_dat(sub_dat, id_cols=None):
    """
    Pair the trial before discovery with the discovery trial and compute the RT change.

    Rows with `trial_reltofirst == -1` are merged with rows with `trial_reltofirst == 0`
    on `id_cols`; the RT columns become `rt_pre` and `rt_at`.

    NOTE: a second function with the same name is defined further down in this notebook
    (the block-comparison version taking `(all_data, blocks)`); once that cell has run
    it shadows this one.

    Parameters
    ----------
    sub_dat : pandas.DataFrame
        Needs 'trial_reltofirst', 'rt', 'trial', 'trial_', 'middle_item_both_sequence'
        and every column named in `id_cols`.
    id_cols : sequence of str, optional
        Merge keys. Defaults to ['sid', 'phase', 'trial_type', 'block',
        'middle_item_both_sequence', 'subtrial_sequence', 'subtrial'].
        Must include 'middle_item_both_sequence', which is read after the merge.

    Returns
    -------
    pandas.DataFrame
        The merged rows plus `rt_at_minus_pre` (= `rt_at` - `rt_pre`) and the boolean
        `changed_item` (item code is 2 or 3); rows containing any NaN are dropped.
    """
    # None sentinel instead of a mutable list default; copy so any sequence type works.
    if id_cols is None:
        id_cols = ['sid', 'phase', 'trial_type', 'block', 'middle_item_both_sequence', 'subtrial_sequence', 'subtrial']
    else:
        id_cols = list(id_cols)

    pre_rows = sub_dat.loc[sub_dat['trial_reltofirst'] == -1, id_cols + ['rt']]
    # NOTE(review): 'trial_' is assumed to be a column of the merged trial data; a KeyError follows if it is absent.
    at_rows = sub_dat.loc[sub_dat['trial_reltofirst'] == 0, id_cols + ['trial', 'trial_', 'rt']]

    diff_dat = pd.merge(pre_rows, at_rows, on=id_cols, suffixes=('_pre', '_at'))
    diff_dat['rt_at_minus_pre'] = diff_dat['rt_at'] - diff_dat['rt_pre']
    diff_dat['changed_item'] = diff_dat['middle_item_both_sequence'].isin([2, 3])  # Which items were modified in low transfer?
    diff_dat = diff_dat.dropna()

    return diff_dat

# # Example use
# get_diff_dat(sub_dat)

In [None]:
# RT change from the trial before discovery to the discovery trial, over blocks, one line per item code.
gg.options.figure_size = (5, 4)
diff_dat = get_diff_dat(sub_dat_st2)
g = (gg.ggplot(diff_dat, gg.aes('block', 'rt_at_minus_pre', color='factor(middle_item_both_sequence)'))
     + gg.stat_summary(position=gg.position_dodge(width=0.3))
     + gg.stat_summary(position=gg.position_dodge(width=0.3), geom='line')
     + gg.geom_hline(yintercept=0, linetype='dotted')  # Zero line = no RT change
     + gg.facet_grid('phase ~ subtrial + trial_type')
    )
g

In [None]:
# Per-participant mean RT change, by trial type, phase and subtrial.
subj_dat = diff_dat.groupby(['sid', 'trial_type', 'phase', 'subtrial']).aggregate('mean').reset_index()

gg.options.figure_size = (5, 3)
g = (gg.ggplot(subj_dat, gg.aes('trial_type', 'rt_at_minus_pre', color='factor(phase)', group='factor(phase)'))
     + gg.stat_summary(position=gg.position_dodge(width=0.3))
     + gg.stat_summary(position=gg.position_dodge(width=0.3), geom='line')
     + gg.geom_hline(yintercept=0, linetype='dotted')
     + gg.labs(x='', color='', y='RT slowing after first item')
     + gg.facet_grid('~ subtrial')
    )
print(g)
g.save(os.path.join(plot_dir, 'rt_at_minus_preOverPhase.png'))

In [None]:
# Same figure as the previous cell, now coloured by `changed_item` and with phase as panel rows.
# The existing plot object `g` is reused and its data swapped for the re-aggregated table.
subj_dat = diff_dat.groupby(['sid', 'trial_type', 'phase', 'changed_item', 'subtrial']).aggregate('mean').reset_index()

g = g + gg.aes(color='changed_item', group='changed_item') + gg.facet_grid('phase ~ subtrial')
g.data = subj_dat
g

In [None]:
# Mixed-effects models of the RT change with participant (`sid`) as grouping factor:
# an intercept-only model and one with block, phase, trial type and the phase x trial-type interaction.
predictors = [
    '1',
    'block + phase * trial_type',
]

# Only rows with `subtrial == 2` enter the models.
mod_dat = diff_dat.loc[diff_dat['subtrial'] == 2]
for pred in predictors:
    mod = smf.mixedlm(formula='rt_at_minus_pre ~ {}'.format(pred), data=mod_dat, groups=mod_dat['sid']).fit()
    print(mod.summary())

## "Practicing" items after first discovery

### More repetition of existing than non-existing sequences -> item appearance motivates repetition

In [None]:
# Per-participant frequency with which the discovered sequence is produced again,
# by trial relative to discovery and by whether the sequence leads to an item (`exists`).
id_cols = ['sid', 'phase', 'trial_type', 'trial_reltofirst', 'exists']
subj_dat = f_dat.loc[
    (f_dat['subtrial'] == f_dat['subtrial_sequence']) &  # indicates whether the current `subtrial` (indicating the subtrial of the action, rt, etc.) is the subtrial of initial item discovery (`middle_item_both_sequence`).
    (f_dat['middle_item_both'] < 8) &  # 4 "existing" (lead to item) and 4 "non-existing" items (random 2-key sequences), for comparable numbers
    (f_dat['acc'] == 0)  # Only trials in which NO star was achieved, to control for that
].groupby(id_cols).aggregate('mean').reset_index()

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'sequence_item_bool', color='exists', group='exists'))
#      + gg.geom_point(alpha=0.1, position='jitter')
     + gg.stat_summary(position=gg.position_dodge(width=0.2))
     + gg.stat_summary(position=gg.position_dodge(width=0.2), geom='line')
     + gg.geom_hline(yintercept=0, linetype='dotted')
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.labs(x='Trial (aligned to first item discovery)', y='Frequency of sequence', color='Item appeared')
     + gg.coord_cartesian(xlim=(-2, 9))
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
g.save(os.path.join(plot_dir, '104_repeatItemsAfterDiscoveryByExist.png'))

In [None]:
# Put the `exists` and non-`exists` frequencies side by side (merge on all id columns except
# `exists`, hence `id_cols[:-1]`) and take their difference.
id_cols = ['sid', 'phase', 'trial_type', 'trial_reltofirst', 'exists']
diff_dat = pd.merge(
    subj_dat.loc[subj_dat['exists'], id_cols[:-1] + ['sequence_item_bool']],
    subj_dat.loc[np.invert(subj_dat['exists']), id_cols[:-1] + ['sequence_item_bool']],
    on=id_cols[:-1], suffixes=['_exists', '_existsnot']
)
diff_dat['seq_bool_ex_minus_not'] = diff_dat['sequence_item_bool_exists'] - diff_dat['sequence_item_bool_existsnot']
diff_dat = diff_dat.loc[diff_dat['trial_reltofirst'] > 0]  # Trials AFTER item discovery
diff_dat

In [None]:
# Difference in repetition frequency (existing minus non-existing) over trials after discovery.
# Stored in `m` rather than `g`, so `g` still holds the previous figure for the cells that extend it.
m = (gg.ggplot(diff_dat, gg.aes('trial_reltofirst', 'seq_bool_ex_minus_not'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.geom_hline(yintercept=0, linetype='dotted')
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.facet_grid('phase ~ trial_type')
    )
m

In [None]:
# Mixed-effects models of the frequency difference, grouped by participant:
# intercept only, then trial offset plus phase, trial type and their interaction.
predictors = [
    '1',
    'trial_reltofirst + phase * trial_type',
]

for pred in predictors:
    mod = smf.mixedlm(formula='seq_bool_ex_minus_not ~ {}'.format(pred), data=diff_dat, groups=diff_dat['sid']).fit()
    print(mod.summary())

### Split up by individual items

In [None]:
# Re-aggregate per individual item code (0-7) instead of per `exists`.
subj_dat = f_dat.loc[
    (f_dat['subtrial'] == f_dat['subtrial_sequence']) & (f_dat['middle_item_both'] < 8)
].groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'middle_item_both']).aggregate('mean').reset_index()

# `g` is the repetition-frequency figure built above; recolour it by item and swap in the new data.
g += gg.aes(color='factor(middle_item_both)', group='factor(middle_item_both)')
g.data = subj_dat
print(g)
g.save(os.path.join(plot_dir, '104_repeatItemsAfterDiscoveryByItem.png'))

### Differences during transfer -> modified items are practiced more (?)

In [None]:
# Transfer trials only, item codes 0-3, additionally split by the subtrial at which the item was discovered.
subj_dat = f_dat.loc[
    (f_dat['subtrial'] == f_dat['subtrial_sequence']) & (f_dat['middle_item_both'] < 4) & (f_dat['trial_type'] == 'transfer')
].groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'middle_item_both', 'subtrial_sequence']).aggregate('mean').reset_index()

g += gg.facet_grid('phase ~ trial_type + subtrial_sequence')
g.data = subj_dat
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_repeatItemsAfterDiscoveryByItemSubtrial.png'))

## Trials it takes to transfer a sequence between positions 0-1 and 2-3

In [None]:
# Look at difference in discovery between subtrial 1 and 3 -> does it transfer?
first_dat['subtrial'] = first_dat['subtrial'].astype(int)
# Wide format: one row per participant/block/item, one `trial` column per subtrial.
diff_dat = first_dat.pivot_table(
    index=['sid', 'trial_type', 'phase', 'block', 'middle_item_both'],
    columns=['subtrial'],
    values=['trial']
).reset_index()
# Flatten the two-level column names by concatenating the levels, e.g. ('trial', 1) -> 'trial1', ('sid', '') -> 'sid'.
diff_dat.columns = [''.join([str(e) for e in f]) for f in diff_dat.columns.values]
diff_dat['trial_diff_s1s3'] = np.abs(diff_dat['trial1'] - diff_dat['trial3'])
diff_dat['exists'] = diff_dat['middle_item_both'] < 4
diff_dat = diff_dat.dropna()  # Drops rows with a NaN in any column
diff_dat

In [None]:
# Absolute difference between first-discovery trials at subtrial 1 and 3, for item codes 0-3, over blocks.
subj_dat = diff_dat.loc[diff_dat['exists']].groupby(['sid', 'phase', 'trial_type', 'block', 'middle_item_both']).aggregate('mean').reset_index()

gg.options.figure_size = (5, 5)
g = (gg.ggplot(subj_dat, gg.aes('block', 'trial_diff_s1s3', color='factor(middle_item_both)'))
     + gg.stat_summary(position=gg.position_dodge(width=0.1))
     + gg.stat_summary(position=gg.position_dodge(width=0.1), geom='line')
     + gg.facet_grid('phase ~ trial_type')
    )
g

In [None]:
# Per-participant mean of the subtrial-1 vs. subtrial-3 discovery gap, split by `exists`.
subj_dat = diff_dat.groupby(['sid', 'phase', 'trial_type', 'block', 'exists']).aggregate('mean').reset_index()

gg.options.figure_size = (3, 4)
g = (gg.ggplot(subj_dat, gg.aes('1', 'trial_diff_s1s3', fill='exists'))
     + gg.stat_summary(position=gg.position_dodge(width=0.9), geom='bar')
     + gg.stat_summary(position=gg.position_dodge(width=0.9))
     + gg.scale_x_continuous(breaks=[])
     + gg.labs(x='', y='Trials to transfer sequence between positions', fill='Item appeared')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_trialsToTransferByExists.png'))

In [None]:
subj_dat.groupby('exists').aggregate('mean').reset_index()[['exists', 'trial_diff_s1s3']]

In [None]:
# Pair the `exists` and non-`exists` rows of each participant/phase/trial type/block
# and subtract the two discovery gaps.
id_cols = ['sid', 'phase', 'trial_type', 'block']
diffdiff_dat = pd.merge(
    subj_dat.loc[subj_dat['exists'], id_cols + ['trial_diff_s1s3']],
    subj_dat.loc[np.invert(subj_dat['exists']), id_cols + ['trial_diff_s1s3']],
    on=id_cols, suffixes=['_exists', '_existsnot']
)
diffdiff_dat['trial_diff_s1s3_exists_minus_not'] = diffdiff_dat['trial_diff_s1s3_exists'] - diffdiff_dat['trial_diff_s1s3_existsnot']
diffdiff_dat

In [None]:
# Discovery-gap difference (existing minus non-existing) over blocks; the dotted line marks zero.
g = (gg.ggplot(diffdiff_dat, gg.aes('block', 'trial_diff_s1s3_exists_minus_not'))
     + gg.stat_summary()
     + gg.geom_hline(yintercept=0, linetype='dotted')
     + gg.facet_grid('phase ~ trial_type')
    )
g

In [None]:
# Mixed-effects model of the discovery-gap difference with block as predictor, grouped by participant.
mod = smf.mixedlm(
    formula='trial_diff_s1s3_exists_minus_not ~ block',
    data=diffdiff_dat, groups=diffdiff_dat['sid']
).fit()
mod.summary()

In [None]:
# Gradual decrease in the number of trials needed to activate middle_items
# In low transfer, 2 & 3 take much longer than 0 & 1 -> because they are new!
# In high transfer, previous knowledge is retained :)
g = (gg.ggplot(first_dat, gg.aes('block', 'trial', color='factor(middle_item_both)'))
     + gg.stat_summary(position=gg.position_dodge(width=0.5))
     + gg.stat_summary(position=gg.position_dodge(width=0.5), geom='line')
     + gg.labs(y='Trial of first activation in block')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_trialOfSequenceDiscoveryOverBlock.png'))

## Change in RT patterns with practice

In [None]:
# Select the columns needed for the practice analysis, separately for subtrial 1 and subtrial 3,
# keeping only rows with item codes 0-7.
cols = ['sid', 'phase', 'trial_type', 'block', 'middle_item_both', 'subtrial', 'trial', 'rt']

# sub_dat1 = all_data.loc[(all_data['subtrial'] == 1), cols]
# sub_dat3 = all_data.loc[(all_data['subtrial'] == 3), cols]

sub_dat1 = all_data.loc[(all_data['middle_item_both'] < 8) & (all_data['subtrial'] == 1), cols]
sub_dat3 = all_data.loc[(all_data['middle_item_both'] < 8) & (all_data['subtrial'] == 3), cols]

# Integer item codes, so the pivoted column labels in the next cell are integers.
sub_dat1['middle_item_both'] = sub_dat1['middle_item_both'].astype(int)
sub_dat3['middle_item_both'] = sub_dat3['middle_item_both'].astype(int)

sub_dat3[:30]

In [None]:
id_cols = ['sid', 'phase', 'trial_type', 'block', 'trial']

# Wide tables: one row per trial, one RT column per item code. Integer column labels are renamed
# to 'rt<code>'; the id columns keep their names.
# Each frame is renamed from its OWN columns. The original iterated `d.columns`, but `d` is not
# defined at this point in the notebook.
d1 = sub_dat1.pivot_table(index=id_cols, columns='middle_item_both', values='rt').reset_index()
d1.columns = ['rt{}'.format(c) if isinstance(c, (int, np.integer)) else c for c in d1.columns]

d3 = sub_dat3.pivot_table(index=id_cols, columns='middle_item_both', values='rt').reset_index()
d3.columns = ['rt{}'.format(c) if isinstance(c, (int, np.integer)) else c for c in d3.columns]

d3

In [None]:
def get_rt_dat(d, subtrial, middle_items=range(8)):
    """
    Convert a wide RT table into long format and count how often each item was produced.

    For every item code in `middle_items`, the rows of `d` with a non-missing
    'rt<code>' value are kept and numbered 1, 2, 3, ... in row order within each
    participant/phase/trial type/block (`times_got_item`).

    Parameters
    ----------
    d : pandas.DataFrame
        Wide table with 'sid', 'phase', 'trial_type', 'block', 'trial' and one
        'rt<code>' column per item code.
    subtrial : int
        Value written to the 'subtrial' column of the result.
    middle_items : iterable of int
        Item codes to extract. Codes without an 'rt<code>' column in `d` are skipped
        and contribute no rows.

    Returns
    -------
    pandas.DataFrame
        Columns 'sid', 'phase', 'trial_type', 'block', 'trial', 'rt', 'times_got_item',
        'middle_item_both', 'subtrial', 'exists' (`middle_item_both < 4`). Row labels
        are those of `d`, so they repeat across items.
    """
    id_cols = ['sid', 'phase', 'trial_type', 'block']
    out_cols = id_cols + ['trial', 'rt', 'times_got_item', 'middle_item_both', 'subtrial', 'exists']

    # Collect the per-item frames and concatenate once; `DataFrame.append` in a loop is
    # quadratic and no longer exists in pandas 2.0.
    frames = []
    for middle_item in middle_items:
        rt_col = 'rt{}'.format(middle_item)
        if rt_col not in d.columns:
            # No such column in the wide table: nothing to extract for this code.
            continue

        item_dat = d[id_cols + ['trial', rt_col]].dropna()
        item_dat = item_dat.rename(columns={rt_col: 'rt'})

        # Running 1-based count of rows within each group, in row order.
        item_dat['times_got_item'] = item_dat.groupby(id_cols).cumcount() + 1
        item_dat['middle_item_both'] = middle_item
        item_dat['subtrial'] = subtrial
        frames.append(item_dat)

    if not frames:
        return pd.DataFrame(columns=out_cols)

    rt_dat = pd.concat(frames)
    rt_dat['exists'] = rt_dat['middle_item_both'] < 4

    return rt_dat

In [None]:
# Stack the subtrial-1 and subtrial-3 tables. `pd.concat` replaces `DataFrame.append`,
# which was removed in pandas 2.0.
rt_dat = pd.concat([get_rt_dat(d1, 1), get_rt_dat(d3, 3)])
subj_dat = rt_dat.groupby(['sid', 'phase', 'trial_type', 'times_got_item', 'middle_item_both', 'subtrial']).aggregate('mean').reset_index()

gg.options.figure_size = (6, 4)
g = (gg.ggplot(subj_dat, gg.aes('times_got_item', 'rt', color='middle_item_both', group='middle_item_both'))
     + gg.stat_summary(position=gg.position_dodge(width=0.1))
     + gg.stat_summary(position=gg.position_dodge(width=0.1), geom='line')
     # The labeller is called 'label_both' (as used elsewhere in this notebook); 'labeller_both' is not a labeller name.
     + gg.facet_grid('phase ~ trial_type + subtrial', labeller='label_both')
     + gg.coord_cartesian(ylim=(0, 450))
    )
print(g)

In [None]:
# Same figure restricted to item codes 0-3, pooled over subtrials, with a narrower y range.
subj_dat = rt_dat.loc[rt_dat['exists']
                     ].groupby(['sid', 'phase', 'trial_type', 'times_got_item', 'middle_item_both']).aggregate('mean').reset_index()

g = g + gg.facet_grid('phase ~ trial_type') + gg.coord_cartesian(ylim=(120, 280))
g.data = subj_dat
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_rt2OverTimesSameSequenceExecuted.png'))

In [None]:
# Per-participant mean RT by number of executions of the same sequence, split by `exists`.
subj_dat = rt_dat.groupby(['sid', 'times_got_item', 'exists']).aggregate('mean').reset_index()

gg.options.figure_size = (3, 3)
g = (gg.ggplot(subj_dat, gg.aes('times_got_item', 'rt', color='exists', group='exists'))
     + gg.stat_summary(position=gg.position_dodge(width=0.1))
     + gg.stat_summary(position=gg.position_dodge(width=0.1), geom='line')
     + gg.labs(x='# Sequence executions', y='RT at sequence completion', color='Item appeared')
     + gg.coord_cartesian(xlim=(0, 15), ylim=(120, 280))
    )
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_rt2OverTimesSameSequenceExecutedByExists.png'))

In [None]:
# subj_dat = rt_dat.loc[rt_dat['exists']].groupby(['sid', 'trial_type', 'times_got_item', 'subtrial', 'block']).aggregate('mean').reset_index()
subj_dat = rt_dat.groupby(['sid', 'trial_type', 'times_got_item', 'exists', 'block']).aggregate('mean').reset_index()

# Mixed-effects model of RT: execution count, block and trial type, each interacting with `exists`;
# participant is the grouping factor.
mod = smf.mixedlm('rt ~ times_got_item * exists + block * exists + trial_type * exists', subj_dat, groups=subj_dat['sid']).fit()
mod.summary()

## Time to discover each goal star

In [None]:
id_cols = ['sid', 'trial_type', 'phase', 'block', 'goal_star']
# A star counts as unlocked when `unlocked_star` is not NaN and greater than -1.
all_data['bool_unlocked_star'] = (np.invert(np.isnan(all_data['unlocked_star'])) & (all_data['unlocked_star'] > -1))
# First (minimum) trial with an unlocked star, per participant/trial type/phase/block/goal star.
first_dat_high = all_data.loc[all_data['bool_unlocked_star']].groupby(id_cols).aggregate('min').reset_index()[id_cols + ['trial']]
# NOTE(review): `get_n_unique_items` is not defined in this notebook; presumably it comes from `from Functions import *`.
first_dat_high['n_unique_items'] = get_n_unique_items(first_dat_high)
first_dat_high

In [None]:
subj_dat = first_dat_high#diff_dat.loc[diff_dat['exists']].groupby(['sid', 'phase', 'trial_type', 'block', 'middle_item_both']).aggregate('mean').reset_index()

# Trial of the first unlocked star over blocks, one line per goal star.
gg.options.figure_size = (5, 5)
g = (gg.ggplot(subj_dat, gg.aes('block', 'trial', color='goal_star', group='goal_star'))
     + gg.stat_summary(position=gg.position_dodge(width=0.1))
     + gg.stat_summary(position=gg.position_dodge(width=0.1), geom='line')
     + gg.facet_grid('phase ~ trial_type')
    )
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_trialtofirstOverBlockForStars.png'))

In [None]:
# Same figure, coloured by `n_unique_items` instead of goal star.
g += gg.aes(color='n_unique_items')
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_trialtofirstOverBlockForNuniqueitems.png'))

## Relationship between sequence transfer (`trial_diff_s1s3`) and overall performance?

### perf_dat

In [None]:
# NOTE(review): the original referenced `first_dat_w`, which is not defined anywhere in this notebook.
# The wide first-discovery table built earlier as `diff_dat` (pivot of `first_dat`, with
# `trial_diff_s1s3`, `block` and `middle_item_both`) is the only frame with the columns used here
# and in the plots below, so it is used instead. Confirm that this is the intended table and
# that `diff_dat` has not been reassigned by a cell run in between.
subj_dat = diff_dat.groupby(['sid', 'phase', 'trial_type']).aggregate('mean').reset_index().drop(columns=['block', 'middle_item_both'])
subj_dat

In [None]:
# Mean RT, log RT and accuracy per participant/phase/trial type, merged with the table from the
# previous cell. No `on=` is given, so the merge uses every column name the two frames share.
id_cols = ['sid', 'phase', 'trial_type']
perf_dat = all_data.groupby(id_cols).aggregate('mean').reset_index()[id_cols + ['rt', 'log_rt', 'acc']]
perf_dat = pd.merge(subj_dat, perf_dat)
perf_dat

In [None]:
# Accuracy against the subtrial-1 vs. subtrial-3 discovery gap, with a linear fit per phase.
gg.options.figure_size = (8, 4)
g = (gg.ggplot(perf_dat, gg.aes('trial_diff_s1s3', 'acc', color='phase'))
     + gg.geom_point()
     + gg.geom_smooth(method='lm')
     + gg.facet_grid('~ trial_type')
    )
print(g)
# Save into `plot_dir` like the other figures; the bare file name wrote to the current working directory.
g.save(os.path.join(plot_dir, '104_accOverTrialDiffSequenceDiscorvery.png'))

In [None]:
g += gg.aes(y='rt')
g

## Difference in learning curve between first presentation of each star

(First presentation of second star minus first presentation of first star -> is there a difference?)

In [None]:
def get_diff_dat(all_data, blocks):
    """
    Line up two blocks of transfer trials and compute their accuracy difference.

    Transfer rows of `blocks[0]` and `blocks[1]` are matched on participant, trial type,
    phase, trial and subtrial. 'acc', 'rt' and 'goal_star' get the suffixes
    '_block<first>' / '_block<second>', and the extra column
    'acc_block<second>_minus_<first>' holds second-block minus first-block accuracy.

    NOTE: this definition reuses the name of the RT-difference helper defined earlier in
    the notebook and replaces it once this cell has run.
    """
    key_cols = ['sid', 'trial_type', 'phase', 'trial', 'subtrial']
    wanted = key_cols + ['acc', 'rt', 'goal_star']
    first_block = blocks[0]
    second_block = blocks[1]

    is_transfer = all_data['trial_type'] == 'transfer'
    in_blocks = all_data['block'].isin(blocks)
    transfer_dat = all_data.loc[is_transfer & in_blocks]

    first_rows = transfer_dat.loc[transfer_dat['block'] == first_block, wanted]
    second_rows = transfer_dat.loc[transfer_dat['block'] == second_block, wanted]

    merged = pd.merge(
        first_rows, second_rows, on=key_cols,
        suffixes=['_block{}'.format(first_block), '_block{}'.format(second_block)],
    )

    acc_second = merged['acc_block{}'.format(second_block)]
    acc_first = merged['acc_block{}'.format(first_block)]
    merged['acc_block{}_minus_{}'.format(second_block, first_block)] = acc_second - acc_first

    return merged

# # Example use
# get_diff_dat(all_data, [0, 1])

In [None]:
# Difference of differences: (block 1 - block 0 accuracy) in the 'high' phase minus the same
# quantity in the 'low' phase, matched on participant, trial type, trial and subtrial.
blocks = [0, 1]
d = get_diff_dat(all_data, blocks)
id_cols = ['sid', 'trial_type', 'trial', 'subtrial']
# Derive every column name from `blocks`; the last line used to hard-code
# 'acc_block1_minus_0_*' and broke as soon as `blocks` was changed.
acc_diff_col = 'acc_block{}_minus_{}'.format(blocks[1], blocks[0])
dd_dat = pd.merge(d.loc[d['phase'] == 'high', id_cols + [acc_diff_col]],
         d.loc[d['phase'] == 'low', id_cols + [acc_diff_col]],
         on=id_cols, suffixes=['_high', '_low']
        )
dd_dat['{}minus{}_highminuslow'.format(blocks[1], blocks[0])] = dd_dat[acc_diff_col + '_high'] - dd_dat[acc_diff_col + '_low']
dd_dat

In [None]:
# High-minus-low difference of the block-1-minus-block-0 accuracy change, over trials.
# The column name '1minus0_highminuslow' matches `blocks = [0, 1]` from the previous cell.
g = (gg.ggplot(dd_dat, gg.aes('trial', '1minus0_highminuslow'))
     + gg.stat_summary()
     + gg.geom_hline(yintercept=0, linetype='dotted')
    )
g

In [None]:
# Make sure we're comparing the same / different stars
d = get_diff_dat(all_data, [0, 2])
a = d.loc[np.invert(np.isnan(d['goal_star_block0'])), ['goal_star_block0', 'goal_star_block2']]
assert np.mean(a['goal_star_block0'] == a['goal_star_block2']) == 1

In [None]:
# One figure per pair of blocks: per-participant accuracy difference (second minus first block
# of the pair) over trials, coloured by phase. Note that this loop rebinds the global `blocks`.
for blocks in ([0, 1], [1, 2], [2, 3], [0, 2], [1, 3]):
    
    subj_dat = get_diff_dat(all_data, blocks).groupby(['sid', 'phase', 'trial']).aggregate('mean').reset_index()
    gg.options.figure_size = (3, 3)
    g = (gg.ggplot(subj_dat, gg.aes('trial', 'acc_block{}_minus_{}'.format(blocks[1], blocks[0]), color='phase'))
         + gg.stat_summary()
         + gg.stat_summary(geom='line')
         + gg.geom_hline(yintercept=0, linetype='dotted')
         + gg.labs(y='Acc. diff. 1st pres. each star (blocks {})'.format(blocks), color='')
        )
    print(g)

# OLD AND BROKEN

## Frequency of middle-layer item after first discovery

In [None]:
first_dat

In [None]:
# Split the first-discovery table by the subtrial of discovery (1 vs. 3).
# `reset_index()` without `drop=True` keeps the old row labels in an 'index' column.
first_dat1 = first_dat.loc[first_dat['subtrial'] == 1].reset_index()
first_dat3 = first_dat.loc[first_dat['subtrial'] == 3].reset_index()
first_dat1

In [None]:
def make_first_dat_extra(first_dat, trial_shifts=range(-5, 10), subtrial=None):
    """
    Build a window of trials around every first discovery of an item (older variant).

    Works on a 'middle_item' column instead of 'middle_item_both' and can overwrite
    the subtrial of every window row.

    NOTE: this definition reuses the name of the function defined near the top of the
    notebook and replaces it once this cell has run.

    Parameters
    ----------
    first_dat : pandas.DataFrame
        Needs the columns 'sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial'
        and 'middle_item'.
    trial_shifts : iterable of int
        Offsets relative to the discovery trial; each becomes `trial_reltofirst`.
    subtrial : int, optional
        If given, written to the subtrial column of every output row. `None` (or the
        former default, an empty list) keeps the subtrials of `first_dat`. Unlike the
        previous truthiness test, `subtrial=0` is honoured.

    Returns
    -------
    pandas.DataFrame
        Columns 'sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial_sequence',
        'middle_item_sequence', 'trial_reltofirst'; rows with a negative shifted trial
        are dropped and the index is reset.
    """
    cols = ['sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'middle_item']
    shifts = np.asarray(list(trial_shifts))
    base = first_dat[cols]
    n_rows = len(base)

    # Find trials around item discovery: repeat every row once per offset in a single step
    # (replaces row-by-row `DataFrame.append`, which is quadratic and gone in pandas 2.0).
    row_positions = np.repeat(np.arange(n_rows), len(shifts))
    first_dat_extra = base.iloc[row_positions].reset_index(drop=True)
    offsets = np.tile(shifts, n_rows)
    first_dat_extra['trial'] = first_dat_extra['trial'] + offsets

    override_subtrial = subtrial is not None and not (isinstance(subtrial, (list, tuple)) and len(subtrial) == 0)
    if override_subtrial:
        first_dat_extra['subtrial'] = subtrial
    first_dat_extra['trial_reltofirst'] = offsets

    # Beautify dataframe
    first_dat_extra = first_dat_extra.rename(columns={'middle_item': 'middle_item_sequence', 'subtrial': 'subtrial_sequence'})
    first_dat_extra = first_dat_extra.loc[first_dat_extra['trial'] >= 0]
    first_dat_extra['trial_reltofirst'] = first_dat_extra['trial_reltofirst'].astype(int)
    first_dat_extra = first_dat_extra.reset_index(drop=True)

    return first_dat_extra

# # Example use
# make_first_dat_extra(first_dat3[:100], subtrial=1)

In [None]:
# Trial windows for the first 100 rows of the "no sequence" table.
# NOTE(review): `first_dat_not` (and its 'middle_item1_not' column) is not defined anywhere in this
# notebook; this cell is in the "OLD AND BROKEN" section and will not run as-is.
first_datnot_extra = make_first_dat_extra(first_dat_not[:100].rename(columns={'middle_item1_not': 'middle_item'}))
first_datnot_extra

In [None]:
# Trial windows for the first 100 discoveries at subtrial 1 and at subtrial 3. Both calls pass
# `subtrial=1`, so the subtrial column of every window row is set to 1 in both tables.
# NOTE(review): `first_dat1` / `first_dat3` as built above have a 'middle_item_both' column, while this
# `make_first_dat_extra` reads 'middle_item' - confirm before reviving this cell.
first_dat1_extra = make_first_dat_extra(first_dat1[:100], subtrial=1)
first_dat3_extra = make_first_dat_extra(first_dat3[:100], subtrial=1)

In [None]:
first_dat1_extra['discovery_trial'] = 1
first_dat3_extra['discovery_trial'] = 3
first_datnot_extra['discovery_trial'] = 'no sequence'

In [None]:
# Stack the three window tables (discovery at subtrial 1, at subtrial 3, and "no sequence")
# and renumber the rows.
first_dat13_extra = pd.concat([
    first_dat1_extra,
    first_dat3_extra,
    first_datnot_extra
]).reset_index(drop=True)
first_dat13_extra

In [None]:
# Attach the trial data to the windows (left join on all shared column names) and flag rows in which
# the item produced (`middle_item`) equals the item the window was built around (`middle_item_sequence`).
f_dat13 = pd.merge(first_dat13_extra, all_data, how='left')
f_dat13['same_item_again'] = 0

for item in range(4):
    # 0/1 indicator for "this row produced item `item`"
    f_dat13['bool_item_{}'.format(item)] = (f_dat13['middle_item'] == item).astype(int)
    f_dat13.loc[(f_dat13['middle_item_sequence'] == item) & (f_dat13['bool_item_{}'.format(item)] == 1), 'same_item_again'] = 1

f_dat13[['sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'trial_reltofirst', 'rt', 'middle_item',
       'middle_item_sequence', 'subtrial_sequence', 'bool_item_1']]

In [None]:
f_dat13.columns

In [None]:
# Per-participant frequency of producing the same item again at subtrial 1, over trials relative to
# discovery, one line per `discovery_trial` label.
subj_dat = f_dat13.loc[f_dat13['subtrial'] == 1].groupby([
    'sid', 'phase', 'trial_type', 'trial_reltofirst', 'discovery_trial'
]).aggregate('mean').reset_index()

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'same_item_again', color='factor(discovery_trial)'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.facet_grid('phase ~ trial_type')
    )
g

In [None]:
f_dat['true_item'] = f_dat['middle_item_both'] < 4

In [None]:
f_dat

In [None]:
first_dat[:30]

In [None]:
f_dat.loc[(f_dat['middle_item_both_sequence'] == 0) & (f_dat['trial_reltofirst'] <= 0) & (f_dat['bool_item_0'] == 1),
          ['sid', 'phase', 'trial_type', 'trial', 'block', 'trial_reltofirst', 'subtrial', 'middle_item_both_sequence', 'subtrial_sequence', 'action_id', 'bool_item_0']
         ][:30]

In [None]:
subj_dat = f_dat.loc[(f_dat['middle_item_both_sequence'] == 0) & (f_dat['subtrial_sequence'] == 1)].groupby([
    'sid', 'phase', 'trial_type', 'trial_reltofirst', 'true_item', 'middle_item_both_sequence', 'block', 'subtrial_sequence',
]).aggregate('mean').reset_index()

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'bool_item_0', color='true_item'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.facet_grid('phase + subtrial_sequence ~ trial_type + middle_item_both_sequence')
    )
g

In [None]:
subj_dat = f_dat.loc[f_dat['subtrial'] == 1].groupby([
    'sid', 'phase', 'trial_type', 'trial_reltofirst', 'true_item', 'middle_item_both_sequence', 'block', 'subtrial_sequence',
]).aggregate('mean').reset_index()

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'same_item_again', color='true_item'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.facet_grid('phase + subtrial_sequence ~ trial_type + middle_item_both_sequence')
    )
g

In [None]:
# NOTE: this overwrites the `f_dat` used in the main analysis above with the old window table.
f_dat = pd.merge(first_dat13_extra, all_data, how='left')
f_dat['same_item_again'] = 0

# Loop over the non-NaN item codes found in the data. The codes are formatted as floats in the
# column names, e.g. 'bool_item_1.0' (see the selection below).
for item in np.unique(f_dat.loc[np.invert(np.isnan(f_dat['middle_item_both'])), 'middle_item_both']):
    f_dat['bool_item_{}'.format(item)] = (f_dat['middle_item'] == item)#.astype(int)
    f_dat.loc[(f_dat['middle_item_sequence'] == item) & (f_dat['bool_item_{}'.format(item)] == 1), 'same_item_again'] = 1

f_dat[['sid', 'trial_type', 'phase', 'block', 'trial', 'subtrial', 'trial_reltofirst', 'rt', 'middle_item',
       'middle_item_sequence', 'subtrial_sequence', 'bool_item_1.0']]

In [None]:
# Recompute `same_item_again` explicitly for item codes 0-3 only: 1 where the produced item equals
# the item the window was built around, 0 otherwise.
f_dat['same_item_again'] = 0
f_dat.loc[(f_dat['middle_item_sequence'] == 0) & (f_dat['bool_item_0.0'] == 1), 'same_item_again'] = 1
f_dat.loc[(f_dat['middle_item_sequence'] == 1) & (f_dat['bool_item_1.0'] == 1), 'same_item_again'] = 1
f_dat.loc[(f_dat['middle_item_sequence'] == 2) & (f_dat['bool_item_2.0'] == 1), 'same_item_again'] = 1
f_dat.loc[(f_dat['middle_item_sequence'] == 3) & (f_dat['bool_item_3.0'] == 1), 'same_item_again'] = 1
f_dat

In [None]:
np.unique(f_dat.loc[f_dat['subtrial'] == f_dat['subtrial_sequence'], ['subtrial']])

In [None]:
np.unique(subj_dat['subtrial'])

In [None]:
np.mean(f_dat['bool_item_3.0'])

In [None]:
g = (gg.ggplot(f_dat, gg.aes('trial_reltofirst', 'same_item_again'))
     + gg.stat_summary()
    )
g

In [None]:
# subj_dat = first_dat_extra.groubpy(['sid', 'phase', 'trial_type', 'trial_reltofirst']).aggregate('mean').reset_index()
# Only rows whose subtrial equals the subtrial of the window; per-participant means additionally split by `acc`.
subj_dat = f_dat.loc[f_dat['subtrial'] == f_dat['subtrial_sequence']
                    ].groupby(['sid', 'phase', 'trial_type', 'acc', 'trial_reltofirst', 'subtrial_sequence', 'middle_item_sequence']).aggregate('mean').reset_index()

gg.options.figure_size = (8, 6)
g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'same_item_again', color='middle_item_sequence', group='middle_item_sequence'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.coord_cartesian(xlim=(-1, 9))
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.geom_hline(yintercept=0, linetype='dotted')
     + gg.facet_grid('acc + phase ~ trial_type + subtrial_sequence', labeller='label_both')
    )
print(g)
# NOTE(review): a later cell saves a different figure under this same file name and overwrites it.
g.save(os.path.join(plot_dir, '105_itemFrequencyOverTrialsreltofirst.png'))

In [None]:
# same_item_again ~ star_performance + modified_item + trial_reltofirst + item_necessary_for_goal_star_at_this_position

In [None]:
# Learning trials only: mixed-effects model of `same_item_again` on trial offset, grouped by participant.
# Rows with a NaN in any selected column are dropped first.
mod_dat = f_dat.loc[f_dat['trial_type'] == 'learning',
                    ['sid', 'same_item_again', 'trial_reltofirst', 'trial_type', 'subtrial_sequence', 'phase']].dropna()
mod = smf.mixedlm(
    formula='same_item_again ~ trial_reltofirst',
    data=mod_dat, groups=mod_dat['sid']
).fit()
mod.summary()

In [None]:
# Transfer trials only: adds the categorical window subtrial, phase, and their interaction.
mod_dat = f_dat.loc[f_dat['trial_type'] == 'transfer',
                    ['sid', 'same_item_again', 'trial_reltofirst', 'trial_type', 'subtrial_sequence', 'phase']].dropna()
mod = smf.mixedlm(
    formula='same_item_again ~ trial_reltofirst + C(subtrial_sequence) * phase',
    data=mod_dat, groups=mod_dat['sid']
).fit()
mod.summary()

In [None]:
# Same model as for transfer, fitted on the learning trials.
mod_dat = f_dat.loc[f_dat['trial_type'] == 'learning',
                    ['sid', 'same_item_again', 'trial_reltofirst', 'trial_type', 'subtrial_sequence', 'phase']].dropna()
mod = smf.mixedlm(
    formula='same_item_again ~ trial_reltofirst + C(subtrial_sequence) * phase',
    data=mod_dat, groups=mod_dat['sid']
).fit()
mod.summary()

In [None]:
# Reuse the current plot object `g` with coarser panels (phase x trial type) and data
# re-aggregated over all rows of `f_dat` (no subtrial filter).
gg.options.figure_size = (5, 4)
subj_dat = f_dat.groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'middle_item_sequence']).aggregate('mean').reset_index()
g = g + gg.facet_grid('phase ~ trial_type')
g.data = subj_dat
g

In [None]:
# subj_dat = first_dat_extra.groubpy(['sid', 'phase', 'trial_type', 'trial_reltofirst']).aggregate('mean').reset_index()
# Rows at subtrial 1 only: frequency of item code 0 ('bool_item_0.0') over trials relative to discovery,
# one line per item the window was built around.
subj_dat = f_dat.loc[f_dat['subtrial'] == 1].groupby(['sid', 'phase', 'trial_type', 'trial_reltofirst', 'subtrial_sequence', 'middle_item_sequence']).aggregate('mean').reset_index()

g = (gg.ggplot(subj_dat, gg.aes('trial_reltofirst', 'bool_item_0.0', color='middle_item_sequence', group='middle_item_sequence'))
     + gg.stat_summary()
     + gg.stat_summary(geom='line')
     + gg.geom_vline(xintercept=0, linetype='dotted')
     + gg.facet_grid('phase + subtrial_sequence ~ trial_type')
    )
print(g)
# NOTE(review): same file name as the `same_item_again` figure saved a few cells earlier, which this call overwrites.
g.save(os.path.join(plot_dir, '105_itemFrequencyOverTrialsreltofirst.png'))

In [None]:
g += gg.aes(y='bool_item_1.0')
g

In [None]:
g += gg.aes(y='bool_item_2.0')
g

In [None]:
g += gg.aes(y='bool_item_3.0')
g