# Preprocessing and Stan fitting

## Step 1: Import libraries

In [2]:
# NOTE: some of these imports may be unused for preprocessing and Stan modeling - they are leftovers from earlier notebooks
from __future__ import print_function, division
import getpass
import pickle
import pathlib
import os
import sys
import numpy as np
import pandas as pd
import random
from more_itertools import unique_everseen
import matplotlib as mpl
from scipy.stats import ttest_ind
from statistics import mean, stdev, variance
from typing import Sequence, List
from collections import OrderedDict
import arviz as az
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, mixedlm
%matplotlib notebook
import pystan
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Step 2: Load in data and clean for modeling

In [2]:
# Read the raw ratings file from the local clone of the project repository.
# NOTE(review): the location is hard-coded relative to the user's home
# directory, so the repo must live under ~/Documents/GitHub for this to run.
username = getpass.getuser()
home = os.path.expanduser('~')
root = pathlib.Path(home) / 'Documents' / 'GitHub' / 'amplification-race-binary-ddm-model'
raw = pd.read_csv(root / "final_dataset_ratings_with_identity_values.csv")


In [4]:
# Build df with only the columns needed for the DDM, renamed to short names,
# and drop every row that has a missing value in any of them.
df = (
    raw[['Random.ID', 'emo_binary', 'rt', 'valence', 'faces', 'valence_values']]
    .rename(columns={
        'Random.ID': 'ID',
        'emo_binary': 'choice',
        'faces': 'identity',
        'valence_values': 'intensity',
    })
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Recode the nominal variables as integers for Stan:
#   choice : 'Not Emotional' -> 1, anything else -> 2
#   valence: 'Happy'         -> 1, anything else -> 2
df['choice'] = df['choice'].eq('Not Emotional').map({True: 1, False: 2})
df['valence'] = df['valence'].eq('Happy').map({True: 1, False: 2})

# The identity and intensity cells hold lists serialised as strings; turn them
# back into Python lists.
# NOTE(review): eval() executes whatever text is in the CSV. If the cells are
# plain list literals, ast.literal_eval would be a safer drop-in - confirm
# against the data before switching.
df['identity'] = [eval(cell) for cell in df['identity']]
df['intensity'] = [eval(cell) for cell in df['intensity']]

# Reaction times are divided by 1000 (milliseconds -> seconds).
df['rt'] = df['rt'] / 1000

# remove any observations where rt <= 100ms - these are likely false starts.
# This is done BEFORE the variability check below: dropping fast trials can
# remove a subject's only trials of one choice type, and the check has to see
# the trials that will actually be modeled.
min_rt = 0.1  # seconds; same value as the `rtbound` passed to Stan further down
df = df[df['rt'] > min_rt]

# remove any subjects that lack variability in their emo_binary choices - can't use them for DDM
choices_per_sub = df.groupby('ID', sort=False)['choice'].nunique()
dellist = choices_per_sub.index[choices_per_sub < 2].tolist()
print('subjects with no variation: %s' % dellist)

# Reset to a clean 0..n-1 index so that positional and label indexing agree
# in the code that follows.
df = df[~df['ID'].isin(dellist)].reset_index(drop=True)

# collapse face identities into integer codes (E/F -> 1, B/C -> 2) and pad
# every identity list with 'NA' (-> 0) so that all vectors are len 12;
# likewise pad the matching intensity list with 0.0 so it is len 12 too.
# The padded lists are built separately and assigned as whole columns:
# element-wise chained assignment (df['identity'][i] = ...) is not guaranteed
# to write through to df.
identitydict = {'E': 1, 'F': 1, 'B': 2, 'C': 2, 'NA': 0}
nfaces = 12
identity_padded = []
intensity_padded = []
for faces, values in zip(df['identity'], df['intensity']):
    # Both lists describe the same faces, so they must line up and fit in
    # nfaces slots; fail here with a clear message rather than later with a
    # numpy shape error.
    if len(faces) != len(values) or len(faces) > nfaces:
        raise ValueError(
            'identity/intensity lists must have equal length <= %d, got %d and %d'
            % (nfaces, len(faces), len(values))
        )
    n_missing = nfaces - len(faces)
    identity_padded.append([identitydict[e] for e in list(faces) + ['NA'] * n_missing])
    intensity_padded.append(list(values) + [0.0] * n_missing)
df['identity'] = identity_padded
df['intensity'] = intensity_padded
    
# Convert df into the padded arrays and vectors expected by the Stan data block.
# Subjects keep their order of first appearance in df (sort=False).
grouped = df.groupby(['ID'], sort=False)
trials_per = grouped.size()
subs = trials_per.index.tolist()
nsubs = len(subs)
tsubs = trials_per.tolist()
tmax = max(tsubs)

# Every array is pre-filled with -1; for subject s only the first tsubs[s]
# trials are overwritten with real data, the remainder stays -1.
choice = np.full((nsubs, tmax), -1, dtype=int)
rt = np.full((nsubs, tmax), -1, dtype=float)
valence = np.full((nsubs, tmax), -1, dtype=int)
# NOTE(review): intensity is stored as int although it is padded with 0.0
# upstream; any fractional intensity values would be truncated here - confirm
# the values are whole numbers and check the type declared in the Stan code.
intensity = np.full((nsubs, tmax, 12), -1, dtype=int)
identity = np.full((nsubs, tmax, 12), -1, dtype=int)
rtmin = np.full(nsubs, -1, dtype=float)
rtbound = 0.1

# Single pass over the subjects: copy each subject's trials into row s and
# record that subject's fastest reaction time.
for s, (_, sub_data) in enumerate(grouped):
    t = tsubs[s]
    choice[s, :t] = sub_data['choice'].to_numpy()
    rt[s, :t] = sub_data['rt'].to_numpy()
    valence[s, :t] = sub_data['valence'].to_numpy()
    intensity[s, :t] = np.array(sub_data['intensity'].tolist())
    identity[s, :t] = np.array(sub_data['identity'].tolist())
    rtmin[s] = sub_data['rt'].min()

data = dict(
    N=nsubs,
    T=tmax,
    Tsub=tsubs,
    choice=choice,
    valence=valence,
    rt=rt,
    rtmin=rtmin,
    rtbound=rtbound,
    intensity=intensity,
    identity=identity,
)


subjects with no variation: [86435]


## Step 3: Generate the Stan model with code of choice

In [4]:
# Import the Stan program text from facesddm_stan.py. Any previously imported
# copy is dropped from the module cache first, so edits to the Stan code are
# picked up when this cell is re-run.
sys.modules.pop('facesddm_stan', None)
from facesddm_stan import facesddmcode

In [None]:
# Compile the Stan program into a model object named 'DDM'.
ddm_sm = pystan.StanModel(
    model_name='DDM',
    model_code=facesddmcode,
)

INFO:pystan:COMPILING THE C++ CODE FOR MODEL DDM_f45104a379bf3a395fd5160683dc7ea9 NOW.


## Step 4: Start sampling with MCMC

In [20]:
# Fit the model to the data with MCMC.
# More iterations are better, at the cost of compute and memory; the target is
# 100,000 iterations, 50,000 of which are warmup, on at least 4 chains.
# The values below are far smaller than that target.
# thin=5 keeps only every 5th draw in the posterior, to reduce autocorrelation.
# The seed is fixed for reproducibility.
sampling_opts = dict(iter=500, warmup=250, chains=1, thin=5, refresh=1, seed=101)
ddm_fit = ddm_sm.sampling(data=data, **sampling_opts)

To run all diagnostics call pystan.check_hmc_diagnostics(fit)


## Step 5: Save the model and fit object as a pickle file

In [10]:
# Store the compiled model and the fit together in one pickle, using the
# highest pickle protocol available. pickle is already imported in the first
# cell of the notebook.
with open("facesddm.pkl", "wb") as f:
    pickle.dump({'model': ddm_sm, 'fit': ddm_fit}, f, protocol=pickle.HIGHEST_PROTOCOL)