# Imports etc.

In [1]:
import json
import numpy as np
import os
import pandas as pd
import plotnine as gg
gg.theme_set(gg.theme_classic)

In [2]:
# data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/mTurk1'
data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/RPP34'  # RPP3 is spring 2020; RPP4 is fall 2020
plot_dir = data_dir + 'figures'
if not os.path.exists(plot_dir):
    os.makedirs(plot_dir)

# Get data

## mTurk

In [3]:
def raw_data_row2pd(data_row):
    
    return pd.DataFrame(json.loads(data_row['trialdata']))

# # Use
# pd.concat([raw_data_row2pd(task_data_raw[i[0]]) for i in trial_data_rows])

In [4]:
def raw_data2rules_data_mturk(rule_row):
    
    # Get rules data
    rule_cols = ['middleRules', 'lowTransferRules', 'highTransferRules', 'highRules', 'taskVer', 'phaseNum']
    rules_dat = pd.DataFrame({rule_col: rule_row['trialdata'][rule_col] for rule_col in rule_cols})
    
    # Clean it up
    rules_dat = rules_dat.reset_index().rename(columns={'index': 'goal_id'})
    rules_dat['taskVer'] = rules_dat['taskVer'].replace({'A': 'high', 'B': 'low'})
    rules_dat = rules_dat.rename(columns={'middleRules': 'lowRules', 'taskVer': 'phase'})
    
    return rules_dat

# # Use
# pd.concat([raw_data2rules_data_mturk(rule_row) for rule_row in rule_rows])
# # raw_data2rules_data_mturk(rule_rows[0])

In [5]:
def get_all_mTurk_data(data_dir):
    
    # Read in and process data
    file_names = [f for f in os.listdir(data_dir) if '.json' in f]
    all_data_raw = pd.DataFrame()
    all_rules = pd.DataFrame()

    for sid, file_name in enumerate(file_names):

        # Read in raw data
        with open(os.path.join(data_dir, file_name)) as f:
            raw_data = json.load(f)

        # Get actual task data
        subj_data_raw = raw_data['data']

        trial_data_len = [len(subj_data_raw[i]['trialdata']) for i in range(len(subj_data_raw))]
        trial_data_rows = np.argwhere(np.array(trial_data_len) > 10000).flatten()  # rows with less than 10000 entries contain other stuff (e.g., rules)

        subj_data = pd.concat([raw_data_row2pd(subj_data_raw[i]) for i in trial_data_rows])

        # Add basic info
        subj_data['sid'] = sid
        subj_data['workerId'] = raw_data['workerId']
        subj_data['counterbalance'] = raw_data['counterbalance']
        subj_data['start_dateTime'] = raw_data['data'][0]['dateTime']
        subj_data['end_dateTime'] = raw_data['data'][-1]['dateTime']

        # Combine all subjects
        all_data_raw = pd.concat([all_data_raw, subj_data])

        # Get rule data
        rule_row_idx = np.argwhere(np.array(trial_data_len) == 21).flatten()  # trials with length 21 contain the rules
        rule_rows = [raw_data['data'][i] for i in rule_row_idx]

        subj_rules = pd.concat([raw_data2rules_data_mturk(rule_row) for rule_row in rule_rows])
        subj_rules['sid'] = sid

        all_rules = pd.concat([all_rules, subj_rules])

    return all_data_raw, all_rules

# # Use
# get_all_mTurk_data(data_dir)

In [6]:
def get_all_RPP_data(data_dir):

    file_names = [f for f in os.listdir(data_dir) if '.json' in f]

    all_data_raw = pd.DataFrame()
    for sid, file_name in enumerate(file_names):

        print("Reading in {}".format(file_name))

        # Get raw json
        subj_dat = pd.read_json(os.path.join(data_dir, file_name))
        
        # Add columns for sid, start and end time, phaseNum (which phase came first, which came second?)
        subj_dat['sid'] = int(file_name.split('_')[1][4:])
        subj_dat['testing_date'] = file_name.split('_')[2]
        subj_dat['start_dateTime'] = subj_dat.time_elapsed[subj_dat.time_elapsed.index[0]]
        subj_dat['end_dateTime'] = subj_dat.time_elapsed[subj_dat.time_elapsed.index[-1]]

        # Combine
        all_data_raw = pd.concat([all_data_raw, subj_dat])

    return all_data_raw
    
# # Use
# all_data_raw = get_all_RPP_data(data_dir)
# all_data_raw

In [7]:
def get_rule_data_rpp(all_data_raw):
    
    # Get raw rules data
    rule_rows = all_data_raw.highTransferRules.apply(lambda x : type(x) == dict)
    rule_cols = ['permKeys', 'permKeysOrder', 'permMiddleItems', 'permStars', 'middleRules', 'highRules',
                 'lowTransferRules', 'highTransferRules', 'taskVer', 'phaseNum', 'keys', 'letters',
                 'learnGoals', 'transferGoals']
    rule_data = all_data_raw.loc[rule_rows, ['sid'] + rule_cols].reset_index(drop=True)
    rule_data['taskVer'] = pd.Categorical(rule_data['taskVer']).rename_categories({'A': 'high', 'B': 'low'})
    rule_data = rule_data.rename(columns={'middleRules': 'lowRules', 'taskVer': 'phase'})
    
    return rule_data

# # Example use
# get_rule_data_rpp(all_data_raw)

In [8]:
def get_one_rule_rpp(rule_data, rule_col):
    
    r = rule_data[rule_col].apply(pd.Series)  # Split each cell (containing a dict) into several cells
    r[['sid', 'phase']] = rule_data[['sid', 'phase']]  # Get sid and phase for each row
    r = r.melt(id_vars=['sid', 'phase'], var_name='goal_id', value_name=rule_col)  # Melt
    
    return r

# # Use
# get_one_rule_rpp(rule_data, 'lowRules')

In [9]:
def clean_rule_data_rpp(rule_data_raw):
    
    # Select rule columns
    rule_cols_sub = ['sid', 'lowRules', 'highRules', 'lowTransferRules', 'highTransferRules', 'phase']
    rule_data_raw = rule_data_raw[rule_cols_sub]

    # Melt to split the dictionary across columns
    rule_cols = ['lowRules', 'highRules', 'lowTransferRules', 'highTransferRules']
    rule_data = get_one_rule_rpp(rule_data_raw, rule_cols[0])

    for rule_col in rule_cols[1:]:

        r = rule_data_raw[rule_col].apply(pd.Series)  # Split each cell (containing a dict) into several cells
        r[['sid', 'phase']] = rule_data_raw[['sid', 'phase']]  # Get sid and phase for each row
        r = r.melt(id_vars=['sid', 'phase'], var_name='goal_id', value_name=rule_col)  # Melt

        rule_data = pd.merge(rule_data, r, on=['sid', 'phase', 'goal_id'])

    return rule_data

# # Use
# clean_rule_data_rpp(rule_data_raw)

In [10]:
def msec2min(msec):
    
    sec = msec / 1000
    minutes = sec / 60
    return minutes

# Example use
s19 = 1806710 - 145800
s67 = 1913718 - 56652
msec2min(s19), msec2min(s67)

(27.681833333333334, 30.9511)

In [11]:
def clean_all_data(all_data_raw):

    # Add phaseNum column
    phasenum = all_data_raw.loc[np.invert(np.isnan(all_data_raw.phaseNum)), ['sid', 'taskVer', 'phaseNum']]
    all_data = pd.merge(
        all_data_raw.drop(columns=['taskVer', 'phaseNum'], axis=1), phasenum,
        left_on=['sid', 'phase'], right_on=['sid', 'taskVer'], how='outer'
    ).drop(columns=['taskVer'], axis=1)
    
    # Subset rows
    all_data = all_data.loc[all_data.trial_type.isin(['learning', 'transfer'])]  # remove instructions
    all_data = all_data.loc[all_data.subtrial.isin(range(4))]  # remove 5th (feedback) trial

    # Add more columns
    all_data['phase'] = all_data['phase'].replace({'A': 'high', 'B': 'low'})
    all_data['trial_'] = all_data['block'] * 25 + all_data['trial']  # get continuous trials over blocks
    all_data['duration'] = msec2min(all_data['end_dateTime'] - all_data['start_dateTime'])  # task duration

    # Add star iteration column
    all_data.loc[
        all_data.trial_type == 'learning', 'star_iteration'
    ] = all_data.loc[
        all_data.trial_type == 'learning', 'block'
    ].apply(lambda x : 0 if x <= 3 else (1 if x <= 7 else 2))
    all_data.loc[
        all_data.trial_type == 'transfer', 'star_iteration'
    ] = all_data.loc[
        all_data.trial_type == 'transfer', 'block'
    ].apply(lambda x : 0 if x <= 1 else (1 if x <= 3 else 2))
    
    # Add acc column (`correct` is only for successful star trials)
    acc_mask = all_data['subtrial'] == 3
    all_data.loc[acc_mask, 'acc'] = (all_data.loc[acc_mask, 'goal_star'] == all_data.loc[acc_mask, 'unlocked_star']).astype(int)
    all_data = all_data.reset_index(drop=True)
    
    return all_data

# # Use
# clean_all_data(all_data_raw)

In [12]:
if 'RPP' in data_dir:
    all_data_raw = get_all_RPP_data(data_dir)
    rule_data_raw = get_rule_data_rpp(all_data_raw)
    rule_data = clean_rule_data_rpp(rule_data_raw)
    
elif 'mTurk' in data_dir:
    all_data_raw, rule_data = get_all_mTurk_data(data_dir)

else:
    raise ValueError('Must be RPP or mTurk.')
    
all_data = clean_all_data(all_data_raw)
rule_data

Reading in CHaRLy_subj13885_2020-9-6_18_31.json
Reading in CHaRLy_subj14239_2020-7-22_22_51.json
Reading in CHaRLy_subj16849_2020-9-14_0_45.json
Reading in CHaRLy_subj23638_2020-9-6_7_32.json
Reading in CHaRLy_subj24817_2020-9-5_19_32.json
Reading in CHaRLy_subj24964_2020-7-26_0_35.json
Reading in CHaRLy_subj24979_2020-7-24_15_5.json
Reading in CHaRLy_subj26230_2020-7-23_18_32.json
Reading in CHaRLy_subj26641_2020-9-24_19_30.json
Reading in CHaRLy_subj26770_2020-9-6_16_5.json
Reading in CHaRLy_subj26899_2020-9-25_14_56.json
Reading in CHaRLy_subj27517_2020-9-25_13_21.json
Reading in CHaRLy_subj28033_2020-9-24_22_9.json
Reading in CHaRLy_subj28525_2020-7-24_13_39.json
Reading in CHaRLy_subj31306_2020-7-24_23_51.json
Reading in CHaRLy_subj31552_2020-9-5_15_57.json
Reading in CHaRLy_subj31684_2020-9-25_11_17.json
Reading in CHaRLy_subj31951_2020-9-10_9_52.json
Reading in CHaRLy_subj32713_2020-9-10_12_39.json
Reading in CHaRLy_subj32833_2020-7-25_17_0.json
Reading in CHaRLy_subj33406_2020-

Unnamed: 0,sid,phase,goal_id,lowRules,highRules,lowTransferRules,highTransferRules
0,13885,high,0,"[0, 1]","[3, 0]","[0, 1]","[3, 0]"
1,13885,low,0,"[1, 2]","[3, 2]","[0, 2]","[3, 2]"
2,14239,low,0,"[3, 0]","[1, 2]","[3, 1]","[1, 2]"
3,14239,high,0,"[3, 0]","[1, 0]","[3, 1]","[1, 3]"
4,16849,high,0,"[1, 2]","[3, 0]","[0, 2]","[2, 0]"
...,...,...,...,...,...,...,...
475,39508,low,3,"[2, 3]","[0, 2]","[2, 3]","[0, 3]"
476,39514,low,3,"[1, 2]","[3, 2]","[0, 2]","[3, 0]"
477,39514,high,3,"[1, 2]","[3, 0]","[0, 2]","[3, 2]"
478,39520,high,3,"[3, 0]","[0, 2]","[3, 1]","[3, 2]"


In [13]:
for level_a, level_b in zip(('low', 'high'), ('high', 'low')):
    
    # Add columns for rules that appear only in learning, only in transfer, and in both
    rule_data['{}RulesBoth'.format(level_a)] = rule_data.apply(
        lambda dat: dat['{}Rules'.format(level_a)]
            if dat['{}Rules'.format(level_a)] == dat['{}TransferRules'.format(level_a)]
            else np.nan, axis=1)
    rule_data['{}RulesLearnOnly'.format(level_a)] = rule_data.apply(
        lambda dat: dat['{}Rules'.format(level_a)]
            if dat['{}Rules'.format(level_a)] != dat['{}TransferRules'.format(level_a)]
            else np.nan, axis=1)
    rule_data['{}RulesTransferOnly'.format(level_a)] = rule_data.apply(
        lambda dat: dat['{}TransferRules'.format(level_a)]
            if dat['{}Rules'.format(level_a)] != dat['{}TransferRules'.format(level_a)]
            else np.nan, axis=1)
    
    # Get RulesBoth manually
    rule_data.loc[
        rule_data.phase == level_b,
        '{}RulesBoth'.format(level_a)
    ] = rule_data.loc[
        rule_data.phase == level_b,
        '{}Rules'.format(level_a)
    ]
    
    # Remove hypothetical, never-used transfer rules
    rule_data.loc[
        rule_data.phase == level_b,
        ['{}TransferRules'.format(level_a), '{}RulesTransferOnly'.format(level_a), '{}RulesLearnOnly'.format(level_a)]
    ] = np.nan
    
rule_data.loc[rule_data.phase == 'high']

Unnamed: 0,sid,phase,goal_id,lowRules,highRules,lowTransferRules,highTransferRules,lowRulesBoth,lowRulesLearnOnly,lowRulesTransferOnly,highRulesBoth,highRulesLearnOnly,highRulesTransferOnly
0,13885,high,0,"[0, 1]","[3, 0]",,"[3, 0]","[0, 1]",,,"[3, 0]",,
3,14239,high,0,"[3, 0]","[1, 0]",,"[1, 3]","[3, 0]",,,,"[1, 0]","[1, 3]"
4,16849,high,0,"[1, 2]","[3, 0]",,"[2, 0]","[1, 2]",,,,"[3, 0]","[2, 0]"
7,23638,high,0,"[0, 1]","[0, 2]",,"[0, 2]","[0, 1]",,,"[0, 2]",,
8,24817,high,0,"[3, 0]","[2, 0]",,"[2, 3]","[3, 0]",,,,"[2, 0]","[2, 3]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
471,39139,high,3,"[0, 1]","[1, 3]",,"[1, 3]","[0, 1]",,,"[1, 3]",,
472,39304,high,3,"[0, 1]","[2, 3]",,"[2, 3]","[0, 1]",,,"[2, 3]",,
474,39508,high,3,"[0, 1]","[3, 1]",,"[3, 1]","[0, 1]",,,"[3, 1]",,
477,39514,high,3,"[1, 2]","[3, 0]",,"[3, 2]","[1, 2]",,,,"[3, 0]","[3, 2]"


In [14]:
rule_data.loc[rule_data.phase == 'low']

Unnamed: 0,sid,phase,goal_id,lowRules,highRules,lowTransferRules,highTransferRules,lowRulesBoth,lowRulesLearnOnly,lowRulesTransferOnly,highRulesBoth,highRulesLearnOnly,highRulesTransferOnly
1,13885,low,0,"[1, 2]","[3, 2]","[0, 2]",,,"[1, 2]","[0, 2]","[3, 2]",,
2,14239,low,0,"[3, 0]","[1, 2]","[3, 1]",,,"[3, 0]","[3, 1]","[1, 2]",,
5,16849,low,0,"[3, 0]","[1, 3]","[3, 1]",,,"[3, 0]","[3, 1]","[1, 3]",,
6,23638,low,0,"[1, 2]","[0, 1]","[0, 2]",,,"[1, 2]","[0, 2]","[0, 1]",,
9,24817,low,0,"[2, 3]","[1, 2]","[2, 3]",,"[2, 3]",,,"[1, 2]",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
470,39139,low,3,"[1, 2]","[0, 3]","[0, 2]",,,"[1, 2]","[0, 2]","[0, 3]",,
473,39304,low,3,"[3, 0]","[0, 1]","[3, 1]",,,"[3, 0]","[3, 1]","[0, 1]",,
475,39508,low,3,"[2, 3]","[0, 2]","[2, 3]",,"[2, 3]",,,"[0, 2]",,
476,39514,low,3,"[1, 2]","[3, 2]","[0, 2]",,,"[1, 2]","[0, 2]","[3, 2]",,


In [15]:
rule_data.to_csv(os.path.join(data_dir, 'rule_data.csv'))

## Add hypothetical middle-level items and stars to all_data

In [16]:
def get_goal_rule(trial_rules, goal_id, rule_name):
    
    goal_rule = trial_rules.loc[trial_rules['goal_id'] == goal_id, rule_name]
    return goal_rule[goal_rule.index[0]]

# # Use
# get_goal_rule(trial_rules, goal_id, 'lowRules')

In [17]:
def actions2items(trial_rules, actions, low_rule_name):
    
    items = 4 * [np.nan]

    for goal_id in trial_rules.goal_id:
        goal_rule = get_goal_rule(trial_rules, goal_id, low_rule_name)
        
        for i, action_tuple in enumerate([np.array(actions[:2]), np.array(actions[2:])]):
            if np.all(action_tuple == goal_rule):
                items[2 * i + 1] = int(goal_id)

    return items
        
# # Use
# actions = [3, 0, 2, 3]
# actions = [3, 1, 2, 3]
# actions2items(trial_rules, actions, 'lowRules')

In [18]:
def items2stars(trial_rules, items, high_rule_name):

    for goal_id in trial_rules.goal_id:

        goal_rule = get_goal_rule(trial_rules, goal_id, high_rule_name)
        comp_items = np.array([items[1], items[3]])

        # Valid sequence of two items
        if np.all(goal_rule == comp_items):
            return 3 * [np.nan,] + [int(goal_id)]

    # No valid sequence of two items
    return 4 * [np.nan,]
        
# # Use
# items = [np.nan, 1, np.nan, 2]
# items2stars(trial_rules, items, high_rule_name)

In [None]:
# Add a column for each rule (takes a LONG time)
for i, sid in enumerate(set(all_data.sid)):
    print(' sid: {}/{}'.format(i, len(set(all_data.sid))))
    
    for trial_type in set(all_data.trial_type):
        print('  trial_type: {}'.format(trial_type))
        
        for phase in set(all_data.phase):
            print('   phase: {}'.format(phase))

            # Get rules for this sid and this phase
            trial_rules = rule_data.loc[
                (rule_data.sid == sid) & (rule_data.phase == phase)
            ]
            
            if trial_rules.shape[0] > 0:

                for block in set(all_data.block):

                    for trial in set(all_data.trial):

                        all_data_mask = (all_data.sid == sid) & (all_data.phase == phase) & (
                            all_data.trial_type == trial_type) & (all_data.block == block) & (all_data.trial == trial)

                        # Get actions and items for this sid, this phase, this trial_type, this block, and this trial
                        trial_dat = all_data.loc[all_data_mask]
                        actions = trial_dat.action_id
                        items = list(trial_dat.middle_item_name)

                        if trial_dat.shape[0] == 4:  # Make sure we have 4 valid trials

                            for rule_name in [c for c in rule_data.columns if 'Rules' in c]:
                                if 'low' in rule_name:

                                    pred_middle_items = actions2items(trial_rules, actions, rule_name)
                                    all_data.loc[all_data_mask, 'middle_item_' + rule_name] = pred_middle_items

                                if 'high' in rule_name:

                                    pred_star = items2stars(trial_rules, items, rule_name)
                                    all_data.loc[all_data_mask, 'unlocked_star_' + rule_name] = pred_star

 sid: 0/60
  trial_type: transfer
   phase: low
   phase: high
  trial_type: learning
   phase: low
   phase: high
 sid: 1/60
  trial_type: transfer
   phase: low


In [None]:
# Verifying that assigning theoretical items and stars was accurate
learn_data = all_data.loc[(all_data.trial_type == 'learning') & np.invert(np.isnan(all_data.middle_item_name))]
np.mean(learn_data.middle_item_name == learn_data.middle_item_lowRules)

cols = ['sid', 'phase', 'trial_type', 'trial_', 'trial', 'subtrial', 'block', 'middle_item_name', 'middle_item_lowRules', 'unlocked_star_name', 'unlocked_star_highRules']
learn_data.loc[learn_data.middle_item_name != learn_data.middle_item_lowRules, cols]

learn_data = all_data.loc[(all_data.trial_type == 'learning') & np.invert(np.isnan(all_data.unlocked_star_name))
                          & np.invert(all_data.unlocked_star_name == -1)
                         ]
np.mean(learn_data.unlocked_star_name == learn_data.unlocked_star_highRules)
learn_data.loc[learn_data.unlocked_star_name != learn_data.unlocked_star_highRules, cols]

In [None]:
# Boolean indicator for when items / stars would have appeared for each set of rules (lowRules, lowTransferRules, highRules, highTransferRules)
goals_cols = [
    'middle_item_lowRules', 'middle_item_lowTransferRules',
    'unlocked_star_highRules', 'unlocked_star_highTransferRules',
    'middle_item_lowRulesBoth', 'middle_item_lowRulesLearnOnly', 'middle_item_lowRulesTransferOnly',
    'unlocked_star_highRulesBoth', 'unlocked_star_highRulesLearnOnly', 'unlocked_star_highRulesTransferOnly',
    
]

for col in goals_cols:
    all_data['bool_{}'.format(col)] = np.invert(np.isnan(all_data[col])).astype(int)

# Add rt_zigzag etc.

In [None]:
def zscore(values):
    
    return (values - np.mean(values)) / np.std(values)

# # Example use
# zscore(np.arange(6))

In [None]:
def zigzag(rts):
    
    assert len(rts) == 4
    return rts[0] - rts[1] + rts[2] - rts[3]

# # Example use
# zigzag([1, -1, 1, -1])

In [None]:
def add_zrt_and_rtzigzag(all_data, verbose=True):
    
    for i, sid in enumerate(np.unique(all_data['sid'])):
        print('sid {} / {}'.format(i + 1, len(np.unique(all_data['sid']))))

        for phase in np.unique(all_data['phase']):
            for trial_type in np.unique(all_data['trial_type']):
                for block in np.unique(all_data['block']):

                    # Get indixes for this sid, this phase, this trial_type, and this block
                    sub_idx = (
                        all_data['sid'] == sid) & (
                        all_data['phase'] == phase) & (
                        all_data['trial_type'] == trial_type) & (
                        all_data['block'] == block
                    )

                    if np.sum(sub_idx) > 0:

                        # Add z-scored RTs (z-scored with this subset)
                        all_data.loc[sub_idx, 'z_rt'] = zscore(all_data.loc[sub_idx, 'rt'])
#                         all_data.loc[np.invert(np.isnan(all_data['z_rt']))]

                        # Add RT zigzag
                        for trial in np.unique(all_data['trial']):
                            sub_idxx = sub_idx & (all_data['trial'] == trial)
                            if np.sum(sub_idxx) == 4:
                                all_data.loc[sub_idxx, 'rt_zigzag'] = zigzag(all_data.loc[sub_idxx, 'z_rt'].values)
                                
                    # Print out which parts of the data were missing
                            else:
                                if verbose:
                                    print('\t\tsid {}, phase {}, trial_type {}, block {}, trial {} has {} subtrials.'
                                          .format(sid, phase, trial_type, block, trial, np.sum(sub_idxx)))
                    else:
                        if verbose:
                            print('\tsid {}, phase {}, trial_type {}, block {} does not exist.'.format(sid, phase, trial_type, block))
                        
# Use
add_zrt_and_rtzigzag(all_data, verbose=False)

## Save to csv

In [None]:
all_data.to_csv(os.path.join(data_dir, 'all_data_raw.csv'))