In [97]:
import os
import math
import pandas as pd
import numpy as np

## Step 1: Data processing function
This prepares three lists of arrays that are formatted for pySINDy: `qvalslist`, `truevalslist`, and `control_inputs_training`. It also prepares a dataframe for summarizing participant demographics and compensation requirements, including whether participants completed the task, their performance on attention checks, and bonus compensation. It saves the lists of arrays as .npz files.

In [124]:
def process(root, savename):

    fileslist = []
    for p, s, f in os.walk(root): # this loop creates a list of all paths to each file in the root directory
        for n in f:
            if '.csv' in n:
                fileslist.append(os.path.join(p, n))

    demo = pd.read_csv("rawdata\demographic\demographic.csv", delimiter=',', index_col=False)

    # make a bunch of empty lists. We will iterate over files to populate these lists
    workerlist = []
    qvalslist = []
    truevalslist = []
    control_inputs_training = []
    attentionlist = []
    wordattentionlist = []
    bonuslist = []
    completelist = []
    genderlist = []
    agelist = []
    racelist = {'black': [], 'hispanic': [], 'white': [], 'asian':[], 'other':[]}

    for x in fileslist:
        try:
            temp = pd.read_csv(x, delimiter=',', index_col=False, encoding= 'unicode_escape') # read in file
            qvals = [x/100 for x in list(temp['response']) if math.isnan(x) == False] # take the reported qval estimates from 0 to 1
            truevals = [x/100 for x in list(temp['prob_of_1_percentage']) if math.isnan(x) == False] # take the true probability of reward from 0 to 1
            reward = [0 if 'factory-phone-red' in x else 1 for x in list(temp['stimulus']) if 'factory-phone' in str(x)]
            time = list(range(0, len(qvals))) # make a list of trials for each reported qval. This is our time variable
            responses = [x for x in list(temp['responses']) if pd.isna(x) == False] # for word attention checks
            distance = [abs(x-truevals[i]) for i, x in enumerate(qvals)]

            # next we want to flag reported qvals that were actually attention checks. We don't want SINDy to mistake these for qvals.
            checklist = []
            checkvals = [6, 52, 85, 37, 61] # these were the values used in attention checks - we'll use them later to confirm
            for i, y in enumerate(list(temp['response'])): # go through all of the reported qvals
                if math.isnan(y) == False: # if it isn't NaN (it would be if the event wasn't the slider)
                    if temp['Name'][i] == 'guess-probability-slider': # if it was a guess-probability event, append 0 to indicate that it was NOT a check
                        checklist.append(0)
                    elif temp['Name'][i] == 'attention-slider': # if it was a attention-slider event, append 1 to indicate it WAS a check. We want to ignore these qvals
                        checklist.append(1)

            # now that we know what qvals were actually checks, we can (a) figure out whether the participant passed these checks
            # and (b) we can drop them from the lists that we will train SINDy with
            attention = []
            attentioncount = 0
            for i, y in enumerate(checklist):
                if y == 1: # if we marked the trial as a check
                    if qvals[i] == checkvals[attentioncount]/100: # check whether the reported qval matches the value in the checkvals list
                        attention.append(1) # if it matches, give the trial a 1 - they passed!
                    else:
                        attention.append(0) # otherwise, they failed that check
                    qvals[i] = np.nan # regardless of passing/failing, we will set all values of importance on the check trial for SINDy to NaN
                    truevals[i] = np.nan
                    reward[i] = np.nan
                    time[i] = np.nan
                    attentioncount += 1

            # we also had two word attention checks
            wordscorrect = 0
            for y in responses:
                if 'but' in y or 'rock' in y:
                    wordscorrect += 1
            wordattentionlist.append(wordscorrect)

            # SINDy needs the previous expected value and shown reward to line up on the same row to estimate the change in qval
            # since we show 1 factory result before ever asking participants their expectations, we will assume that their prior
            # expectation was 0.5, or 50% working. That way, SINDy can use 0.5 and the shown feedback to model the updated qval
            # that is reported in the subsequent guess-probability-slider
            qvals.insert(0, 0.5)
            truevals.insert(0, 0.5)
            time.append(-1)
            reward.append(-1)

            # for payment purposes, let's count up the total number of reported qvals that were within 0.05 of the true probability
            bonus = 0
            for j, q in enumerate(qvals):
                if len(truevals) > j:
                    if q < truevals[j] + 0.05 and q > truevals[j] - 0.05:
                        bonus += 1

            # finally, we add the subject-specific variables to the lists that cover all participants
            attentionlist.append(np.mean(attention)) # append the percentage of attention checks passed
            workerlist.append(temp['participant_id'].unique()[0]) # append their worker id, for reference in a payment dataframe
            bonuslist.append(bonus) # append the total number of trials that this participant earned a bonus
            if len(qvals) == 106:
                completelist.append(True) # if there are 106 qvals present (100 trials + 5 checks + 1 initial q of 0.5), this participant completed the task
            else:
                completelist.append(False) # otherwise, they did not complete the task
            if np.mean(attention) > 0.5 and len(qvals) == 106: # if they passed more than 50% of the attention checks and have 106 qvals (completed), we want to include them for SINDy training
                qvalslist.append(np.stack((qvals), axis=-1)) # qvals for SINDy training
                truevalslist.append(np.stack((truevals), axis=-1)) # true probabilities for plotting
                control_inputs_training.append(np.stack((reward, time), axis=-1)) # control variables for SINDy training

            # we also want to gather demographics data for each participant to include in the summary dataframe
            try:
                gender = demo[demo['Q79']==temp['participant_id'][0]]['Q19'].values[0]
            except:
                gender = "3"
            try:
                age = demo[demo['Q79']==temp['participant_id'][0]]['Q17'].values[0]
            except:
                age = "NAN"
            try:
                black = demo[demo['Q79']==temp['participant_id'][0]]['Q31_1'].values[0]
                hispanic = demo[demo['Q79']==temp['participant_id'][0]]['Q31_2'].values[0]
                white = demo[demo['Q79']==temp['participant_id'][0]]['Q31_3'].values[0]
                asian = demo[demo['Q79']==temp['participant_id'][0]]['Q31_4'].values[0]
                other_race = demo[demo['Q79']==temp['participant_id'][0]]['Q31_5'].values[0]
            except:
                black = np.nan
                hispanic = np.nan
                white = np.nan
                asian = np.nan
                other_race = 1

            if str(gender).isdigit():
                genderlist.append(int(gender))
            else:
                genderlist.append(np.nan)
            if str(age).isdigit():
                agelist.append(int(age))
            else:
                agelist.append(np.nan)
            racelist['black'].append(black)
            racelist['hispanic'].append(hispanic)
            racelist['white'].append(white)
            racelist['asian'].append(asian)
            racelist['other'].append(other_race)

        except Exception as e: 
            print(
                x,
                type(e).__name__,          # TypeError
                e.__traceback__.tb_lineno  # 2
            )

    # now that we know which participants completed the study and passed the attention check criteria, let's determine how much we need to pay them
    compensationlist = []
    for i, c in enumerate(bonuslist):
        if completelist[i] == True and attentionlist[i]>0.5: # if they completed the task and passed more than 50% of checks
            compensationlist.append('$%.2f' % (4+bonuslist[i]*0.03)) # award them $4 plus $0.03 for every trial they earned a bonus
        else:
            compensationlist.append('$%.2f' % (0)) # otherwise, they recieve $0 - we reject their submission

    # make a dataframe with information that is relevant for payment on Prolific, and save as csv
    summarydf = pd.DataFrame({'id': workerlist, 'gender': genderlist, 'age': agelist, 'attention%': attentionlist, 'wordscorrect': wordattentionlist, 'within5#': bonuslist, 'complete': completelist, 'compensation': compensationlist,
                            'black': racelist['black'], 'hispanic': racelist['hispanic'], 'white': racelist['white'], 'asian': racelist['asian'], 'other': racelist['other']})

    np.savez('%s_qvals.npz' % savename, *qvalslist)
    np.savez('%s_truevals.npz' % savename, *truevalslist)
    np.savez('%s_control_inputs_training.npz' % savename, *control_inputs_training)

    return qvalslist, truevalslist, control_inputs_training, summarydf


## Step 2: Process data from Study 1
Data from study 1 is processed using the `process()` function, and three .npz files are saved with lists of arrays needed for SINDy. We also use the summarydf to print a few summary statistics about the sample.

In [125]:
foldername = 'rawdata/study1'
savename = 'study1'

qvalslist, __, __, summarydf = process(foldername, savename)

print('N: %s' % len(qvalslist))
print('Mean Age: %s' % np.mean([x for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['age'] if x > 17]))
print('SD Age: %s' % np.std([x for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['age'] if x > 17]))
print('# Male: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[1])
print('# Female: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[2])
print('# Other Gender or No Response: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[3])
print('# Black: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['black'].dropna()]))
print('# Hispanic: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['hispanic'].dropna()]))
print('# White: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['white'].dropna()]))
print('# Asian: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['asian'].dropna()]))
print('# Other Race/Ethnicity or No Response: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['other'].dropna()]))

data/study1\57e825b9a4e40600014ea590.csv KeyError 27
data/study1\58c6a8b353f3e70001bd2076.csv IndexError 32
data/study1\5a1d8489ab721b0001ef95cc.csv KeyError 27
data/study1\5b1d867ac9d3cb000110529a.csv KeyError 27
data/study1\5b293dcbd5035800018d86c0.csv IndexError 32
data/study1\602c533e699b7e536f5f3063.csv KeyError 27
data/study1\610165cac73a5bf882ad0535.csv KeyError 27
N: 455
Mean Age: 36.24663677130045
SD Age: 12.937836016750099
# Male: 216
# Female: 216
# Other Gender or No Response: 14
# Black: 21
# Hispanic: 6
# White: 356
# Asian: 35
# Other Race/Ethnicity or No Response: 45


## Step 3: Process data from Study 2
Data from study 2 is processed using the `process()` function, and three .npz files are saved with lists of arrays needed for SINDy. We also use the summarydf to print a few summary statistics about the sample.

In [126]:
foldername = 'rawdata/study2'
savename = 'study2'

qvalslist, __, __, summarydf = process(foldername, savename)

print('N: %s' % len(qvalslist))
print('Mean Age: %s' % np.mean([x for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['age'] if x > 17]))
print('SD Age: %s' % np.std([x for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['age'] if x > 17]))
print('# Male: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[1])
print('# Female: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[2])
print('# Other Gender or No Response: %s' % summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['gender'].value_counts()[3])
print('# Black: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['black'].dropna()]))
print('# Hispanic: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['hispanic'].dropna()]))
print('# White: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['white'].dropna()]))
print('# Asian: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['asian'].dropna()]))
print('# Other Race/Ethnicity or No Response: %s' % np.sum([int(x) for x in summarydf.loc[(summarydf['complete']==True) & (summarydf['attention%']>.5)]['other'].dropna()]))

data/study2\603eb11e686c57fd1d127694.csv IndexError 32
data/study2\60ddc7abdf6661b9c294691f.csv IndexError 32
data/study2\614eb0a92921d8c1e64f4873.csv IndexError 32
data/study2\614ee5e28a2f0a07d4d639dd.csv KeyError 27
data/study2\62c2da4e711cf6806c5f89a2.csv KeyError 27
N: 177
Mean Age: 37.875
SD Age: 12.05462236880714
# Male: 87
# Female: 85
# Other Gender or No Response: 3
# Black: 21
# Hispanic: 16
# White: 135
# Asian: 8
# Other Race/Ethnicity or No Response: 4
