# Prep

In [None]:
import json
import numpy as np
import os
import pandas as pd
import plotnine as gg
# Use plotnine's classic theme for every figure created in this notebook.
gg.theme_set(gg.theme_classic)
# Default figure size as a (first, second) pair; it is not referenced by the cells visible here.
default_figure_size = (6.4, 4.8)

In [None]:
# Root folder of the dataset to process; swap the commented line to switch datasets.
# data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/mTurk1'
data_dir = 'C:/Users/maria/MEGAsync/Berkeley/CHaRLy/data/RPP34'
# Figures are written to a `figures` sub-folder of the dataset folder. Plain string
# concatenation (data_dir + 'figures') would instead create a sibling folder whose
# name is the dataset name with 'figures' glued onto it.
plot_dir = os.path.join(data_dir, 'figures')
# exist_ok=True makes the cell safe to re-run.
os.makedirs(plot_dir, exist_ok=True)

## Get all_data

In [None]:
# Read the raw data file, using the first CSV column as the row index.
raw_data_file = os.path.join(data_dir, 'all_data_raw.csv')
all_data = pd.read_csv(raw_data_file, index_col=0)
all_data

## Get demographics questionnaire

In [None]:
# Load the demographics export; `header=1` takes the column names from the second line of the file.
demographics = (
    pd.read_csv(os.path.join(data_dir, 'Demographics.csv'), header=1)
    .rename(columns={'id': 'sid'})  # same participant-id column name as in all_data
    .astype({'sid': 'int'})
)
demographics

In [None]:
# Keep only the demographics rows whose participant id occurs in all_data.
in_current_dataset = demographics['sid'].isin(np.unique(all_data['sid']))
demo = demographics.loc[in_current_dataset].reset_index(drop=True)
demo

In [None]:
# Remove duplicate entries: keep only the first row of each participant.
# `duplicated(keep='first')` marks every repeated `sid` after its first occurrence,
# so no manual list of already-seen ids is needed and the result does not depend
# on the row labels matching the row positions.
repeated_entry = demo['sid'].duplicated(keep='first')

# Report each dropped row, in row order.
for sid in demo.loc[repeated_entry, 'sid']:
    print("Removing duplicate entry for participant {}.".format(sid))

demo = demo.loc[~repeated_entry].reset_index(drop=True)
demo.shape

## Get strategy questionnaire

In [None]:
# Load the strategy questionnaire export; `header=1` takes the column names from the second line of the file.
strategies = (
    pd.read_csv(os.path.join(data_dir, 'StrategyQuestionnaire.csv'), header=1)
    .iloc[1:]  # Remove first row with "ImportId":"startDate","timeZone":"Ameri, ...
    .rename(columns={'id': 'sid'})
    .astype({'sid': 'int'})
)
strategies

In [None]:
# Keep only the questionnaire rows of participants that occur in all_data.
answered_in_dataset = strategies['sid'].isin(np.unique(all_data['sid']))
strat = strategies.loc[answered_in_dataset].reset_index(drop=True)
strat

In [None]:
# Show the questionnaire column names; the cells below select columns by their full question text.
strat.columns

In [None]:
# The question text is used as the column name and the answer texts are compared
# with `==`, so these strings have to match the questionnaire data exactly.
pen_paper_q = 'Did you use pen and paper or any other external device (e.g., cell phone) to help in this task? E.g., some people write down keys to help them memorize and perform better.'
yes_ans = 'Yes, I used an external device to help (pen & paper, cell phone, etc.).'
no_ans = 'No, I did not use any external device to help with the task.'

# Ids of the participants who gave the "yes" answer.
used_external_aid = strat[pen_paper_q] == yes_ans
pen_paper_sids = strat.loc[used_external_aid, 'sid'].values
pen_paper_sids

In [None]:
pen_paper_q2 = 'How did you use the device? Describe your strategy.'
# Select the free-text answers once instead of re-filtering `strat` on every iteration.
# The same condition produced `pen_paper_sids`, so ids and answers line up row by row.
aid_descriptions = strat.loc[strat[pen_paper_q] == yes_ans, pen_paper_q2]
for sid, description in zip(pen_paper_sids, aid_descriptions):
    print(sid, description)

# Exclude Participants

In [None]:
# Exclusion thresholds used by the cells below.
# A 'learning' score strictly below this value marks a participant as a chance performer.
min_points_chance_performer = 25
# A summed `timeout` strictly above this value marks a participant as inattentive.
max_missed_trials_inattentive = 40

## Psychological disorder, head trauma

In [None]:
# Participants who answered 'Yes' to either of the two health-related demographics questions.
reported_psych = demo['mental/psychiatric illness diagnosis'] == 'Yes'
reported_trauma = demo['head trauma/neurological disorder'] == 'Yes'

psych_sids = demo.loc[reported_psych, 'sid'].values
trauma_sids = demo.loc[reported_trauma, 'sid'].values

psych_sids, trauma_sids

## Bad performance

In [None]:
# Count how many (phase, trial_type) combinations each participant has data for.
# Only `points` is averaged: aggregating the whole frame with 'mean' raises a TypeError
# on non-numeric columns in pandas >= 2.0 and computes unused means in older versions.
id_cols = ['sid', 'phase', 'trial_type']
chance_performers1 = all_data.groupby(id_cols)['points'].mean().reset_index()
# From here on the `phase` column holds the number of combinations per participant.
chance_performers1 = chance_performers1.groupby(['sid'])['phase'].count().reset_index()
chance_performers1

In [None]:
# Mean accuracy per participant, phase and trial type. `acc` is selected before
# aggregating because 'mean' over the whole frame raises a TypeError on non-numeric
# columns in pandas >= 2.0; the resulting columns are still id_cols + ['acc'].
chance_performers2 = all_data.groupby(id_cols)['acc'].mean().reset_index()
# Scale accuracy by (largest `trial_` value + 1).
# NOTE(review): presumably `trial_` is a zero-based trial counter, making this the trial count — confirm.
chance_performers2['points'] = chance_performers2['acc'] * (np.max(all_data.trial_) + 1)
chance_performers2

In [None]:
# Criterion 1: the per-participant count stored in `phase` equals 1.
single_count_sids = list(chance_performers1.loc[chance_performers1['phase'] == 1, 'sid'])

# Criterion 2: 'learning' rows whose points fall below the threshold.
is_learning = chance_performers2.trial_type == 'learning'
below_threshold = chance_performers2.points < min_points_chance_performer
low_scoring_sids = list(set(chance_performers2.loc[is_learning & below_threshold, 'sid']))

# A participant meeting either criterion counts as a chance performer.
chance_performers = set(single_count_sids + low_scoring_sids)

print("Chance performer ids: {}".format(chance_performers))

## Too many missed trials

In [None]:
# Sum `timeout` per participant over the rows that have no key press.
# `timeout` is selected before aggregating: summing the whole frame would also try to
# sum string columns (concatenating them or raising, depending on the pandas version).
# `.isna()` replaces np.isnan so the check also works if `key_press` is not a float column.
# Participants without any such row do not appear in `d`.
missed_trials = all_data.loc[all_data['key_press'].isna()]
d = missed_trials.groupby(['sid'])['timeout'].sum().reset_index()

inattentives = d.loc[d['timeout'] > max_missed_trials_inattentive].sid.values

print("Inattentive participant ids: {}".format(inattentives))

In [None]:
# Histogram of the summed timeouts per participant, with the exclusion threshold
# drawn as a dotted red vertical line.
g = gg.ggplot(d, gg.aes('timeout'))
g = g + gg.geom_histogram(position=gg.position_dodge(width=4))
g = g + gg.geom_vline(xintercept=max_missed_trials_inattentive, linetype='dotted', color='red')
# NOTE(review): the file name says "ChancePerformers" although the plot shows timeouts — confirm the name is intended.
g.save(os.path.join(plot_dir, '01_ChancePerformers.png'))
print(g)

In [None]:
# One row per (participant, phaseNum) pair among the rows of the 'low' phase.
# `size()` needs only the group keys, whereas 'mean' over the whole frame raises a
# TypeError on non-numeric columns in pandas >= 2.0 just to obtain those keys.
low_phase_rows = all_data.loc[all_data['phase'] == 'low']
num_dat = low_phase_rows.groupby(['sid', 'phaseNum']).size().reset_index()[['sid', 'phaseNum']]

print('Before Excluding: High transfer first: n={}; low transfer first: n={}'.format(
    np.sum(num_dat['phaseNum'] == 0), np.sum(num_dat['phaseNum'] == 1)))

# Remove participants

In [None]:
# Add one boolean column per exclusion criterion: True on every row whose
# participant id is in the corresponding id collection.
exclusion_flags = {
    'chance_performer': chance_performers,
    'inattentives': inattentives,
    'psych_disorder': psych_sids,
    'head_trauma': trauma_sids,
    'pen_paper': pen_paper_sids,
}
for flag_name, flagged_sids in exclusion_flags.items():
    all_data[flag_name] = all_data['sid'].isin(flagged_sids)

In [None]:
# Drop every row whose participant is flagged by at least one of the four columns below.
# NOTE(review): `chance_performer` is not among them, so chance performers remain in the
# data and are only flagged — confirm this is intended.
exclusion_columns = ['inattentives', 'psych_disorder', 'head_trauma', 'pen_paper']
all_data = all_data.loc[~all_data[exclusion_columns].any(axis=1)]
all_data

# Demographics

In [None]:
# Count the demographics rows per answer to the `sex` question.
# NOTE(review): `demo` is not filtered by the participant exclusions applied to all_data,
# so these counts cover every de-duplicated demographics row — confirm that is intended.
sex_labels = ("Female", "Male", "Decline to answer")
sex_counts = [np.sum(demo['sex'] == label) for label in sex_labels]
print('Number of Females: {}; Males: {}; Decline to answer: {}'.format(*sex_counts))

In [None]:
# One row per (participant, phaseNum) pair among the remaining rows of the 'low' phase.
# `size()` needs only the group keys, whereas 'mean' over the whole frame raises a
# TypeError on non-numeric columns in pandas >= 2.0 just to obtain those keys.
remaining_low_phase = all_data.loc[all_data['phase'] == 'low']
num_dat = remaining_low_phase.groupby(['sid', 'phaseNum']).size().reset_index()[['sid', 'phaseNum']]

print('After Excluding: High transfer first: n={}; low transfer first: n={}'.format(
    np.sum(num_dat['phaseNum'] == 0), np.sum(num_dat['phaseNum'] == 1)))

## Save data

In [None]:
# Write the current all_data frame to the dataset folder, alongside the raw input file.
cleaned_data_file = os.path.join(data_dir, 'all_data.csv')
all_data.to_csv(cleaned_data_file)