# Empirical aDDM Generation

Model independent analysis confirmed that there are good and bad trends within Liu et al.'s (2025) simulated fixation data sampling from the gamma distribution. Following Tavares et al. (2017), we believe that better data can be generated empirically.

### Generating Empirical Data

In [4]:
import pandas as pd

def rasterize_data(
    df: pd.DataFrame,
    subject_col: str,
    trial_col: str,
    seq_col: str = "fixation",
    fill_codes: set = {0, 4},
    start_col: str = "fix_start",
    end_col: str = "fix_end",
    loc_col: str = "fix_location",
    fixnum_col: str | None = None,
    keep_cols: list[str] | None = None,
) -> pd.DataFrame:
    """
    Expand per-(subject, trial) fixation sequences into fixation-level rows.
    Zero-valued segments are treated as transitions and excluded.
    """

    df = df.copy()

    if keep_cols is None:
        keep_cols = [
            c for c in df.columns
            if c not in {subject_col, trial_col, seq_col}
        ]

    rows = []

    for _, row in df.iterrows():
        seq = np.asarray(row[seq_col])

        changes = np.diff(seq, prepend=seq[0])
        starts = np.where(changes != 0)[0]

        fix_num = 0

        for i, start_idx in enumerate(starts):
            loc = seq[start_idx]

            end_idx = (
                starts[i + 1]
                if i + 1 < len(starts)
                else len(seq)
            )

            # Skip transitions
            if loc in fill_codes:
                continue

            data = {
                subject_col: row[subject_col],
                trial_col: row[trial_col],
                start_col: start_idx,
                end_col: end_idx,
                loc_col: loc,
            }

            if fixnum_col is not None:
                data[fixnum_col] = fix_num
                fix_num += 1

            for col in keep_cols:
                data[col] = row[col]

            rows.append(data)

    return pd.DataFrame(rows)

In [6]:
import os, sys

DDM_dir = os.path.abspath('/Users/bchien37/Desktop/Enkavilab/DDM')
sys.path.append(DDM_dir)

In [52]:
from simulation import get_corrected_empirical_distributions
from ast import literal_eval
import numpy as np

df_raw = pd.read_csv('/Users/bchien37/Desktop/Enkavilab/DDM/1ms_trial_data.csv')
df_raw['choice'] = df_raw['choice'].map({'left':0,'right':1})
df_raw['RT'] = df_raw['RT']*1000 # adjustment for RT
df_raw['fixation'] = df_raw['fixation'].apply(literal_eval)

to_drop = pd.read_csv("/Users/bchien37/Desktop/Enkavilab/DDM/dropped_trials.csv").rename(columns={"parcode": "sub_id"})

df = df_raw[
    ~df_raw.set_index(["sub_id", "trial"]).index.isin(
        to_drop.set_index(["sub_id", "trial"]).index
    )
]

value_diffs = np.unique(df['avgWTP_left'] - df['avgWTP_right'])
legend = {
    "left": {1},
    "right": {2},
    "transition": {0}, 
    "blank_fixation": {4}
}
fixation_col = 'fixation'
left_value_col = 'avgWTP_left'
right_value_col = 'avgWTP_right'

empirical_distributions = get_corrected_empirical_distributions(
    df,
    value_diffs=value_diffs,
    legend=legend,
    fixation_col=fixation_col,
    left_value_col=left_value_col,
    right_value_col=right_value_col,
    cutoff=0.9
)

In [57]:
df.loc[df['RT'] > 15000]

Unnamed: 0,sub_id,trial,hidden,avgWTP_left,avgWTP_right,choice,RT,fixation
10,329,11,True,1.0,1.0,1,16445.0,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6463,301,24,False,3.5,3.75,0,15885.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6466,301,27,False,3.5,2.5,1,21756.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6479,301,40,False,4.0,4.5,1,22630.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6495,301,56,False,5.0,3.0,1,18387.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6500,301,61,False,1.0,2.0,0,20270.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6544,301,105,True,5.0,3.0,1,15743.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6554,301,115,True,3.25,3.25,1,48607.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
6600,301,161,True,4.25,4.5,0,29450.0,"(4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."


### Translating into efficient-fpt data generation

In [10]:
from efficient_fpt.models import DDModel, piecewise_const_func
from efficient_fpt.utils import get_alternating_mu_array

class aDDModel(DDModel):
    """
    One trial of an attentional drift diffusion model with alternating drift mu1 and mu2.
    """
    def __init__(self, mu1, mu2, sacc_array, flag, sigma, a, b, x0):
        super().__init__(x0)
        # drift parameters
        self.mu1 = mu1
        self.mu2 = mu2
        self.sacc_array = sacc_array
        self.flag = flag # indicates whether the process starts with mu1 (flag=0) or mu2 (flag=1)
        self.d = len(sacc_array) # number of stages
        self.mu_array = get_alternating_mu_array(mu1, mu2, self.d, flag)
        # diffusion parameter
        self.sigma = sigma
        # symmetric linear boundary parameters
        self.a = a
        self.b = b

    def drift_coeff(self, X: float, t: float) -> float:
        return piecewise_const_func(t, self.mu_array, self.sacc_array)

    def diffusion_coeff(self, X: float, t: float) -> float:
        return self.sigma

    @property
    def is_update_vectorizable(self) -> bool:
        return True

    def upper_bdy(self, t: float) -> float:
        return self.a - self.b * t

    def lower_bdy(self, t: float) -> float:
        return -self.a + self.b * t

In [17]:
sub_df = df.loc[df['sub_id'] == 304]
num_data, _ = sub_df.shape

The key process we want to change is in this section. For now, I won't worry about parallel processing in cython. I've done some preliminary research and realize that pre-generating data is not useful because we have to "dump" data and share it across parallel threads. Perhaps this is why the researchers wanted to generate things internally. 

On the large scale, of course this will not work, but this is an inherent conflict in the applied math pipeline and the cognitive modeling process.

In [18]:
# Defining constants
a, b = , 0.3
# T = a / b
T = 30 # Max empirical trial not left out is 21
x0 = 0
sigma = 0.7 # As calculated from Eum et al. (2023)

decision_data = np.zeros((num_data, 2))

# r1_data = np.zeros(num_data)
# r2_data = np.zeros(num_data)
r1_data = sub_df['avgWTP_left'].to_numpy()
r2_data = sub_df['avgWTP_left'].to_numpy()

# flag_data = np.zeros(num_data)
p0 = empirical_distributions['probFixLeftFirst']
flag_data = np.random.binomial(n=1, p=1 - p0, size=num_data)

mu_data = [None] * num_data
sacc_data = [None] * num_data

eta = 0.7
kappa = 0.5

# This function is not yet translated into parallel-ready code
def simulate_trial(n, r1_data, r2_data, flag_data, eta, kappa, sigma, a, b, T, x0, shape_param, scale_param):
    rng = np.random.default_rng()
    fixations = rng.gamma(shape_param, scale_param, 1000)
    sacc_array = np.insert(np.cumsum(fixations), 0, 0)
    sacc_array = sacc_array[sacc_array < T]

    # Assigning first fixation location
    # flag = rng.binomial(1, 0.5)
    flag = flag_data[n]

    # Assigning stimuli values
    # r1 = np.random.randint(1, 6)
    # r2 = np.random.randint(1, 6)
    r1 = r1_data[n]
    r2 = r2_data[n]
    
    mu1 = kappa * (r1 - eta * r2)
    mu2 = kappa * (eta * r1 - r2)
    # print(f"r1={r1}, r2={r2}, eta={eta}, kappa={kappa}, mu1={mu1}, mu2={mu2}")

    addm = aDDModel(mu1=mu1, mu2=mu2, sacc_array=sacc_array, flag=flag, sigma=sigma, a=a, b=b, x0=x0)
    decision = addm.simulate_fpt_datum(dt=1e-5)

    sacc_array = sacc_array[sacc_array < decision[0]]
    d = len(sacc_array)
    mu_array = get_alternating_mu_array(mu1, mu2, d, flag)

    return decision, mu_array, sacc_array, r1, r2, flag

print("Available jobs (CPU cores):", cpu_count())

start_time = time.time()
results = Parallel(n_jobs=-1)(
    delayed(simulate_trial)(n, eta, kappa, sigma, a, b, T, x0, shape_param, scale_param)
    for n in range(num_data)
)
print(f"Elapsed time: {time.time() - start_time:.3f} seconds")


# Store results
for n, (decision, mu_array, sacc_array, r1, r2, flag) in enumerate(results):
    decision_data[n] = decision
    mu_data[n] = mu_array
    sacc_data[n] = sacc_array
    r1_data[n] = r1
    r2_data[n] = r2
    flag_data[n] = flag

NameError: name 'cpu_count' is not defined