In [1]:
# Cell 1: Imports and Load Data
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy.stats import iqr

print("--- Notebook 02: Rich Per-Trial Feature Engineering ---")

# Load the per-trial fixation and saccade tables produced upstream.
# Both paths are relative to the notebook's working directory.
df_fix, df_sac = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed_fixations_per_trial.csv',
        '../data/processed_saccades_per_trial.csv',
    )
)

# Cell 2: Define AOIs and Normalize Coordinates
# Scale each fixation coordinate by the largest value observed on that axis.
max_x = df_fix['Fixation X'].max()
max_y = df_fix['Fixation Y'].max()
df_fix['x_norm'] = df_fix['Fixation X'].div(max_x)
df_fix['y_norm'] = df_fix['Fixation Y'].div(max_y)

# Cluster the normalized fixation positions into data-driven areas of
# interest; the fixed random_state keeps the cluster labels reproducible.
n_aois = 8
aoi_model = KMeans(n_clusters=n_aois, random_state=42, n_init=10)
norm_coords = df_fix[['x_norm', 'y_norm']]
df_fix['aoi'] = aoi_model.fit_predict(norm_coords)

# Cell 3: Extract Features Per Trial
# Helper for the per-trial loop below: Shannon entropy of AOI-to-AOI transitions.
def get_transition_entropy(aoi_sequence):
    """Return the Shannon entropy (in bits) of consecutive AOI transitions.

    Every adjacent pair ``(aoi[i], aoi[i + 1])`` in the sequence is treated
    as one transition, including self-transitions. The entropy is computed
    over the relative frequencies of the distinct pairs.

    Args:
        aoi_sequence: Ordered, sliceable sequence of hashable AOI labels
            (e.g. a list or a 1-D NumPy array).

    Returns:
        The entropy as a non-negative Python float. Sequences with fewer
        than two elements have no transitions and yield ``0.0``.
    """
    if len(aoi_sequence) < 2:
        return 0.0

    # With at least two elements there is always at least one pair here.
    transitions = list(zip(aoi_sequence[:-1], aoi_sequence[1:]))
    probs = pd.Series(transitions).value_counts(normalize=True).to_numpy()
    entropy = -np.sum(probs * np.log2(probs))

    # Adding 0.0 turns the IEEE negative zero produced when only one distinct
    # transition exists (p == 1, log2(p) == 0) into a plain 0.0.
    return float(entropy) + 0.0

# Build one feature row per trial. Trials with fewer than two fixations or
# without any saccade rows are skipped.
trial_features = []

# Split both tables by trial once up front instead of re-scanning the full
# frames with a boolean mask for every trial. sort=False keeps trials in order
# of first appearance, which matches iterating over .unique().
sac_by_trial = {
    sac_trial_id: sac_rows
    for sac_trial_id, sac_rows in df_sac.groupby('trial_id', sort=False)
}

for trial_id, t_fix in df_fix.groupby('trial_id', sort=False):
    t_sac = sac_by_trial.get(trial_id)
    if len(t_fix) < 2 or t_sac is None or t_sac.empty:
        continue

    # NOTE(review): the path-based features below use the rows in the order
    # they appear in df_fix; presumably that is temporal order -- confirm.
    aoi_labels = t_fix['aoi']
    step_dx = np.diff(t_fix['x_norm'])
    step_dy = np.diff(t_fix['y_norm'])

    features = {
        # NOTE(review): takes the participant of the first row; assumes a
        # trial_id never spans several participants -- confirm.
        'participant_id': t_fix['participant_id'].iloc[0],
        'trial_id': trial_id,
        'num_fixations': len(t_fix),
        'mean_fix_duration': t_fix['Fixation Duration'].mean(),
        # Computed from the raw (un-normalized) fixation coordinates.
        'spatial_dispersion': np.sqrt(t_fix['Fixation X'].var() + t_fix['Fixation Y'].var()),
        'mean_saccade_amplitude': t_sac['Saccade Amplitude'].mean(),
        'saccade_amplitude_iqr': iqr(t_sac['Saccade Amplitude']),
        # Summed Euclidean step length between consecutive fixations, in
        # normalized coordinate units.
        'scanpath_length': np.sum(np.sqrt(step_dx**2 + step_dy**2)),
        'transition_entropy': get_transition_entropy(aoi_labels.values),
        # Share of fixations that landed in an AOI already visited in this
        # trial; len(t_fix) >= 2 here, so the division is always defined.
        'revisit_ratio': 1 - (aoi_labels.nunique() / len(aoi_labels)),
    }
    trial_features.append(features)

df_trial_features = pd.DataFrame(trial_features)

# Cell 4: Aggregate to Participant Level and Save
# Collapse the per-trial rows to one row per participant by taking the median
# of every feature column (the identifier columns are not aggregated).
id_columns = ['participant_id', 'trial_id']
feature_columns = [name for name in df_trial_features.columns if name not in id_columns]
agg_funcs = dict.fromkeys(feature_columns, 'median')
df_participant_features = df_trial_features.groupby('participant_id').agg(agg_funcs)

# Fill any value that is still missing with that feature's median across
# participants.
column_medians = df_participant_features.median()
df_participant_features.fillna(column_medians, inplace=True)

df_participant_features.to_csv('../data/participant_features.csv')
print("\n✅ Aggregated participant features saved to the 'data' folder.")

--- Notebook 02: Rich Per-Trial Feature Engineering ---

✅ Aggregated participant features saved to the 'data' folder.
