In [1]:
# common stats analysis stack
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import itertools
import pingouin as pg

#scikit learn
from sklearn.linear_model import LogisticRegression

# formulat interface to statsmodels (standard linear models)
import statsmodels.api as sm
import statsmodels.formula.api as smf
# BAyesian Model-building Interface (BAMBI)
import bambi as bmb
import arviz as az

# for printing pretty tables and plots
from tabulate import tabulate
from  matplotlib.ticker import FuncFormatter, MaxNLocator

# some stuff for making notebooks look nice
from IPython.core.display import HTML, Markdown, display

# seed RNG
np.random.seed(0)

# hate these things
import warnings
warnings.filterwarnings("ignore")

# display all columns of dataframes
pd.set_option('display.max_columns', None)

# display x rows of dataframes
pd.set_option('display.max_rows', 30) # set to None to show all rows

from datetime import datetime
import os.path

  **kwargs


In [2]:
# how to do the analysis

# exclude: if 2, then exclude subjs with yes help and loops>5
# if 1, only exclude subjs with yes help
# also exclude subjs with experiment error (two in exp 1 and one in exp 4, and delete one erroneous recorded trial in exp 4)
# else no exclusion criteria applied
exclude=1

# filename to save to
datestring = datetime.now().strftime("%m-%d-%Y")
folder = "cleandata/"

exp_fname = folder + "exp-" + datestring + ".csv"
q_fname = folder + "q-" + datestring + ".csv"

# utils

# method to save data to filename but first check it if exists and ask if overwriting is intentional
def save_data_to_file(fname,df):
    if os.path.isfile(fname):
        # file already exists
        resp = input("WARNING: file already exists, do you want to overwrite? y/n: ")
        if resp=='y' or resp=='Y':
            #overwrite file
            datafile = open(fname,mode='w',newline='')
            df.to_csv(datafile)
            print("the data has been saved to " + fname)
            return
        else:
            #exit
            print("CANCELED - the file has not been changed.")
            return
    else:
        # file do not exist
        datafile = open(fname,mode='w',newline='')
        df.to_csv(datafile)
        print("the data has been saved to " + fname)
        return


In [3]:
# Grab the experiment data

fnames = ["data/exp_data_shj0.0.csv","data/exp_data_shj1.0.0.csv","data/exp_data_shj2.0.0.csv",
         "data/exp_data_shj3.0.0.csv","category-recognition/data/exp_data_recognition0.0.2.csv",
         "data/exp_data_shj4.0.0.csv"]

# Create dataframes that contain only the relevant task phases (e.g., remove instructions, quiz, and survey sections)
replaceUid_subjid = [] # placeholder
iids={} # dict of all unique ids as keys with values as their iid in this code
ruletypes = {0:"I",1:"II",2:"III",3:"IV",4:"V",5:"VI"}
ruletypeslist = ["I","II","III","IV","V","VI"]
n=0

for i in range(len(fnames)):
    exp_df = pd.read_csv(fnames[i])
    exp_df = exp_df.loc[(exp_df.phase != 'INSTRUCTIONS') & (exp_df.phase != 'INSTRUCTQUIZ') & (exp_df.phase != 'postquestionnaire')]
    exp_df.dropna(axis=1, how='all', inplace=True)
            
    # uniqueid is so long and annoying, replace with a short subj id
    uniqueids = exp_df.uniqueid.unique()
    replaceids = {}
    for j in range(len(uniqueids)):
        replaceids[uniqueids[j]]=j    
        iids[uniqueids[j]]= n + j
        
    exp_df['uniqueid'] = exp_df['uniqueid'].replace(replaceids)
    exp_df.rename(columns={"uniqueid": "subjid"},inplace=True)
    replaceUid_subjid.append(replaceids) # save the replace ids dict for applying to the qdfs as well

    # add integer rule column
    exp_df["ruleInt"]=(exp_df["rule"].dropna()+1).astype(int)
    
    exp_df["ruleStr"]=exp_df["rule"].map(ruletypes)
    
    # add experiment number column (zero indexed)
    exp_df["expNum"]=i
    
    # add subject number column unique across all exps
#     exp_df["iid"]=exp_df["expNum"].astype(str) + exp_df['subjid'].astype(str)
    exp_df["iid"]= n + exp_df['subjid']
    n += exp_df['subjid'].max() + 1
    
    if i==0:
        exp_df0=exp_df
    elif i==1:
        exp_df1=exp_df
    elif i==2:
        exp_df2=exp_df
    elif i==3:
        exp_df3=exp_df
    elif i==4:
        exp_df4=exp_df
    elif i==5:
        exp_df5=exp_df

# exp_dfs = [exp_df0,exp_df1,exp_df2,exp_df3,exp_df4,exp_df5]

In [4]:
# Get subject level data & postquestionnaire responses

fnames = ["data/questiondata_shj0.0.csv","data/questiondata_shj1.0.0.csv","data/questiondata_shj2.0.0.csv",
         "data/questiondata_shj3.0.0.csv","category-recognition/data/questiondata_recognition0.0.2.csv",
         "data/questiondata_shj4.0.0.csv"]
n=0

for i in range(len(fnames)):
    qdf = pd.read_csv(fnames[i])
    
    qdf['subjid'] = qdf['uniqueid'].replace(replaceUid_subjid[i])
    #rearrange for prettiness
    qdf = qdf[['subjid','instructionloops','strategy','helpstrategy','howtoimprove','engagement','difficulty',
              'externalhelp','gender','education','major','age']]
    # add experiment number column (zero indexed)
    qdf["expNum"]=i
    
    # add subject number column unique across all exps
    qdf["iid"]= n + qdf['subjid']
    n+=qdf['subjid'].max() + 1
#     qdf["iid"]=qdf["expNum"].astype(str) + qdf['subjid'].astype(str)
    
    if i==0:
        qdf0=qdf
    elif i==1:
        qdf1=qdf
    elif i==2:
        qdf2=qdf
    elif i==3:
        qdf3=qdf
    elif i==4:
        qdf4=qdf
    elif i==5:
        qdf5=qdf

# qdfs = [qdf0,qdf1,qdf2,qdf3,qdf4, qdf5]

In [5]:
# fix experiment 5 (cat reg) mislabeled data

def get_moreA_CORRECTED(row):
    moreA_CORRECTED = row["moreA"]
    if row["rule"]==1:
        if row["dimvals"] in [1,2,5,6]:
            moreA_CORRECTED = not row["moreA"]
        
    return moreA_CORRECTED

# df = exp_df4.loc[(exp_df["expNum"]==4) & (exp_df["phase"]=="task")] 

# "moreA_CORRECTED" = True if allcards has more A cards than B cards
exp_df4["moreA_CORRECTED"] = exp_df4.apply(lambda row: get_moreA_CORRECTED(row), axis=1)
exp_df4["moreA"] = exp_df4["moreA_CORRECTED"]

# "correct_CORRECTED" = 0 if moreA==True
exp_df4["correct_CORRECTED"] = exp_df4.apply(lambda row: 0 if row["moreA_CORRECTED"] else 1, axis=1)
exp_df4["correct"] = exp_df4["correct_CORRECTED"]

# "hitormiss_CORRECTED" = True if resp == correct_CORRECTED
exp_df4["hitormiss_CORRECTED"] = exp_df4.apply(lambda row: row["resp"]==row["correct_CORRECTED"], axis=1)
exp_df4["hitormiss"] = exp_df4["hitormiss_CORRECTED"]

# df = df[["subjid","phase","ruleInt","trial","points","hitormiss_CORRECTED","rt"]]
# df["hit"] = df["hitormiss_CORRECTED"].astype(int)

# new_df=pd.get_dummies(df["ruleInt"],drop_first=True)
# # newnewdf = df.merge(new_df)
# df[["ruleCondition2","ruleCondition4"]]=new_df
# display(df)

In [6]:
exp_dfs = [exp_df0,exp_df1,exp_df2,exp_df3,exp_df4,exp_df5]
qdfs = [qdf0,qdf1,qdf2,qdf3,qdf4, qdf5]

exp_df = pd.concat(exp_dfs)
qdf = pd.concat(qdfs)

exp_df.reset_index(drop=True,inplace=True)
exp_df=exp_df.loc[:,~exp_df.columns.str.match("Unnamed")]
exp_df=exp_df.loc[:,~exp_df.columns.str.contains("CORRECTED")]
qdf=qdf.loc[:,~qdf.columns.str.match("Unnamed")]
qdf=qdf.loc[:,~qdf.columns.str.contains("CORRECTED")]
# exp_df=exp_df.drop(["moreA_CORRECTED","correct_CORRECTED","hitormiss_CORRECTED"])


print("N SUBJECTS TOTAL: " + str(len(exp_df["iid"].unique())))
for i in range(6):
    print("experiment "+str(i)+": ")
    print("n="+str(len(exp_df.loc[exp_df["expNum"]==i,"iid"].unique())))


N SUBJECTS TOTAL: 1252
experiment 0: 
n=426
experiment 1: 
n=100
experiment 2: 
n=100
experiment 3: 
n=36
experiment 4: 
n=210
experiment 5: 
n=380


In [7]:
# delete row of subject with extra bonus trial

# display to see the index of the row we need to delete - it should be 21473
display(exp_df.loc[(exp_df["iid"]==iids['A2UUQUAO917V8X:3R8YZBNQ9HJV8KTVDR1YM1TWJEY7QZ']) & (exp_df["phase"]=="bonus")])

try:
    exp_df = exp_df.drop([21473])
    print("row 21473 has been deleted.")
except:
    print("this row has already been deleted.")
    
display(exp_df.loc[(exp_df["iid"]==iids['A2UUQUAO917V8X:3R8YZBNQ9HJV8KTVDR1YM1TWJEY7QZ']) & (exp_df["phase"]=="bonus")])

# delete the experiment 6 subjects that had wrong number of trials
babydf = exp_df.query("expNum==5")
mistakes=[]
for sub in babydf['iid'].unique():
    ntrials = len(babydf.loc[babydf["iid"]==sub])
    if ntrials != 164:
        # print("subj " + str(sub) + " had " + str(len(exp_df.loc[exp_df["subjid"]==sub])) + " trials.")
        mistakes.append(sub)


Unnamed: 0,phase,subjid,incentive,rule,dimorder,dimvals,condition,counterbalance,block,trial,nrepeats,theorystim,actualstim,correct,resp,hitormiss,rt,testscore,timestop,lasttwodigits,bonusprob,bonus,ruleInt,ruleStr,expNum,iid,answervalue,game,ticketvalue,ticketcolor,blocktrial,bonustodate,totalbonustodate,ntrials,points,allcards,moreA,scoretodate,score,bonusvalue,win
21349,bonus,15,,3.0,,,,,,,,,,,,,,7.0,,,,0.06,4.0,IV,3,641,,0.0,0.02,blue,,,0.06,,,,,,,,
21390,bonus,15,,1.0,,,,,,,,,,,,,,6.0,,,,0.04,2.0,II,3,641,,1.0,0.02,blue,,,0.1,,,,,,,,
21431,bonus,15,,3.0,,,,,,,,,,,,,,6.0,,,,2.46,4.0,IV,3,641,,2.0,1.23,gold,,,2.56,,,,,,,,
21472,bonus,15,,1.0,,,,,,,,,,,,,,8.0,,,,4.92,2.0,II,3,641,,3.0,1.23,gold,,,7.48,,,,,,,,
21473,bonus,15,,1.0,,,,,,,,,,,,,,8.0,,,,4.92,2.0,II,3,641,,3.0,1.23,gold,,,12.4,,,,,,,,


row 21473 has been deleted.


Unnamed: 0,phase,subjid,incentive,rule,dimorder,dimvals,condition,counterbalance,block,trial,nrepeats,theorystim,actualstim,correct,resp,hitormiss,rt,testscore,timestop,lasttwodigits,bonusprob,bonus,ruleInt,ruleStr,expNum,iid,answervalue,game,ticketvalue,ticketcolor,blocktrial,bonustodate,totalbonustodate,ntrials,points,allcards,moreA,scoretodate,score,bonusvalue,win
21349,bonus,15,,3.0,,,,,,,,,,,,,,7.0,,,,0.06,4.0,IV,3,641,,0.0,0.02,blue,,,0.06,,,,,,,,
21390,bonus,15,,1.0,,,,,,,,,,,,,,6.0,,,,0.04,2.0,II,3,641,,1.0,0.02,blue,,,0.1,,,,,,,,
21431,bonus,15,,3.0,,,,,,,,,,,,,,6.0,,,,2.46,4.0,IV,3,641,,2.0,1.23,gold,,,2.56,,,,,,,,
21472,bonus,15,,1.0,,,,,,,,,,,,,,8.0,,,,4.92,2.0,II,3,641,,3.0,1.23,gold,,,7.48,,,,,,,,


In [8]:

if exclude==2:
    # Exclude subjects who responded that they used help OR did too many instruction loops
    max_instruct_loops = 5
    exclude_subjs = list(qdf.loc[(qdf["externalhelp"]=="yeshelp") | (qdf["instructionloops"]>max_instruct_loops),"iid"])
    
    exclude_subjs.append(iids['AVI7K876BV3QL:39DD6S19JPCPVR9WNVYHJ4XHZ1ZEZH']) # this subject has two bonus phase trials which should not be possible
    exclude_subjs.append(iids['A1PJEESG7NJ9H0:3FTYUGLFSUMMW7B7KCGHOI810DND5P']) # this subject has only 2 test trials and no bonus trial. what?? weird
    exclude_subjs.append(iids['AV5UZ0UDCMIXC:3MD9PLUKKIFTPXM9GU109RRQOWNNZJ']) # the exp 4 subject with multiple bonus trisls
      
    exclude_subjs.extend(mistakes)

    print("total subjs who will be excluded: " + str(len(exclude_subjs)))
    print(exclude_subjs)

    print("subjs who will be excluded due to admitting using help: ")
    display(qdf.loc[qdf["externalhelp"]=="yeshelp"])

    print("subjs who will be excluded due to wrong number of trials: ")
    print("excluding "+str(len(mistakes)) + " subjects")

    print("subjs who will be excluded due to too many instruction loops: ")
    display(qdf.loc[qdf["instructionloops"]>max_instruct_loops])

    for i in exclude_subjs:
        exp_df.drop(exp_df[exp_df['iid']==i].index, inplace=True)
elif exclude==1:
    # exclude only those who admitted to using memory help
    exclude_subjs = list(qdf.loc[(qdf["externalhelp"]=="yeshelp"),"iid"])
    
    exclude_subjs.append(iids['AVI7K876BV3QL:39DD6S19JPCPVR9WNVYHJ4XHZ1ZEZH']) # this subject has two bonus phase trials which should not be possible
    exclude_subjs.append(iids['A1PJEESG7NJ9H0:3FTYUGLFSUMMW7B7KCGHOI810DND5P']) # this subject has only 2 test trials and no bonus trial. what?? weird
    exclude_subjs.append(iids['AV5UZ0UDCMIXC:3MD9PLUKKIFTPXM9GU109RRQOWNNZJ']) # the exp 4 subject with multiple bonus trials, who is yeshelp anyways so already excluded
        
    exclude_subjs.extend(mistakes)

    print("total subjs who will be excluded: " + str(len(exclude_subjs)))
    print(exclude_subjs)

    print("subjs who will be excluded due to admitting using help: ")
    display(qdf.loc[qdf["externalhelp"]=="yeshelp"])

    print("subjs who will be excluded due to wrong number of trials: ")
    print("excluding "+str(len(mistakes)) + " subjects")
    
    for s in exclude_subjs:
#         print("dropping subject "+str(s))
        exp_df = exp_df.loc[exp_df["iid"]!=s]
#         print("now, n="+str(len(exp_df["iid"].unique())))
else:
    print("no exclusion criteria applied")

    
print("AFTER EXCLUSION:")
print("N SUBJECTS TOTAL: " + str(len(exp_df["iid"].unique())))
for i in range(len(fnames)):
    print("experiment "+str(i)+": ")
    print("n="+str(len(exp_df.loc[exp_df["expNum"]==i,"iid"].unique())))

total subjs who will be excluded: 102
[8, 60, 112, 231, 411, 412, 463, 490, 499, 536, 542, 556, 575, 577, 589, 600, 630, 648, 653, 655, 659, 673, 684, 737, 776, 815, 822, 841, 847, 849, 870, 873, 881, 884, 887, 900, 907, 911, 920, 931, 932, 935, 944, 945, 947, 949, 952, 958, 967, 980, 982, 983, 987, 993, 1004, 1013, 1024, 1027, 1039, 1045, 1071, 1073, 1084, 1086, 1096, 1097, 1123, 1124, 1125, 1144, 1145, 1148, 1157, 1179, 1194, 1214, 1215, 1222, 1228, 1231, 1232, 413, 70, 659, 879, 907, 930, 952, 965, 975, 993, 1009, 1014, 1032, 1045, 1096, 1107, 1173, 1177, 1231, 1237, 1249]
subjs who will be excluded due to admitting using help: 


Unnamed: 0,subjid,instructionloops,strategy,helpstrategy,howtoimprove,engagement,difficulty,externalhelp,gender,education,major,age,expNum,iid
8,8,1,no,good,no,8.0,7.0,yeshelp,woman,masters,noresp,35.0,0,8
60,60,5,like,no,nothing,8.0,8.0,yeshelp,man,bachelors,socialscience,30.0,0,60
112,112,8,Angrybird,Yes,Nothing,9.0,8.0,yeshelp,man,bachelors,business,43.0,0,112
231,231,1,,took a couple notes about what I thought the p...,,8.0,4.0,yeshelp,man,bachelors,socialscience,34.0,0,231
411,411,4,very nice difficult game,yes,no,8.0,8.0,yeshelp,man,bachelors,business,35.0,0,411
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,343,9,Good,notes,This task is good.,3.0,2.0,yeshelp,woman,bachelors,noresp,36.0,5,1215
350,350,1,Good,yes,Good study,10.0,8.0,yeshelp,man,masters,mathscience,42.0,5,1222
356,356,1,nothing,no,good,1.0,4.0,yeshelp,man,bachelors,mathscience,1.0,5,1228
359,359,1,it was good,yes,well,6.0,9.0,yeshelp,man,masters,business,30.0,5,1231


subjs who will be excluded due to wrong number of trials: 
excluding 18 subjects
AFTER EXCLUSION:
N SUBJECTS TOTAL: 1157
experiment 0: 
n=418
experiment 1: 
n=97
experiment 2: 
n=93
experiment 3: 
n=31
experiment 4: 
n=200
experiment 5: 
n=318


In [9]:
# save these dataframes as cleaned csvs

save_data_to_file(exp_fname,exp_df)

save_data_to_file(q_fname,qdf)

the data has been saved to cleandata/exp-08-02-2022.csv
the data has been saved to cleandata/q-08-02-2022.csv


In [10]:
%load_ext watermark
%watermark -n -u -v -iv -w

Last updated: Tue Aug 02 2022

Python implementation: CPython
Python version       : 3.7.13
IPython version      : 7.31.1

sys        : 3.7.13 (default, Mar 28 2022, 07:24:34) 
[Clang 12.0.0 ]
pingouin   : 0.5.1
pandas     : 1.3.5
arviz      : 0.12.1
numpy      : 1.21.5
bambi      : 0.9.0
seaborn    : 0.11.2
statsmodels: 0.13.2
scipy      : 1.7.3
matplotlib : 3.5.1

Watermark: 2.3.0

