In [9]:
# Cell: Feature Preprocessing + Clustering (Fixation Only)
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import joblib
import os

# Pipeline for this cell:
#   1. load per-trial fixation rows,
#   2. aggregate them into one feature row per participant,
#   3. fill missing values (0 for all-NaN columns, column mean otherwise),
#   4. standardize the features,
#   5. cluster participants into two groups with KMeans and label the groups,
#   6. save the labeled feature table and the fitted KMeans model.
print("--- Notebook 03: Feature Preprocessing + Clustering (Fixation Only) ---")
os.makedirs('../data', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# Load per-trial fixation data
df_fix = pd.read_csv("../data/processed_fixations_per_trial.csv")

# Aggregate trial-level fixation features per participant
summary_stats = ["mean", "std", "median", "max", "min"]
df_features = df_fix.groupby("participant_id").agg({
    "Fixation Duration": summary_stats,
    "Fixation X": summary_stats,
    "Fixation Y": summary_stats,
}).reset_index()

# Flatten MultiIndex column names, e.g. ("Fixation X", "mean") -> "Fixation X_mean"
df_features.columns = ["participant_id"] + [
    "_".join(col).strip() for col in df_features.columns[1:]
]

feature_cols = df_features.columns.drop("participant_id")

# Fill NaN-only columns with 0.
# Only columns that are NaN for *every* participant are zero-filled here.
# Previously the whole frame was zero-filled, which left nothing for the mean
# imputer below to do. Zero-filling the empty columns also stops SimpleImputer
# from discarding them, so the imputed array keeps one column per feature name.
all_nan_cols = [col for col in feature_cols if df_features[col].isna().all()]
if all_nan_cols:
    df_features[all_nan_cols] = df_features[all_nan_cols].fillna(0)

# Impute remaining missing values with the per-feature mean across participants
imputer = SimpleImputer(strategy="mean")
features_imputed = imputer.fit_transform(df_features[feature_cols])
df_features_imputed = pd.DataFrame(
    features_imputed,
    columns=feature_cols,
    index=df_features.index
)

# Scale features to zero mean / unit variance so no single feature dominates
# the Euclidean distances used by KMeans
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df_features_imputed)
df_features_scaled = pd.DataFrame(
    features_scaled,
    columns=df_features_imputed.columns,
    index=df_features_imputed.index
)
df_features_scaled["participant_id"] = df_features["participant_id"]

# ---------- Clustering ----------
# Fit on the scaled feature columns only; participant_id is an identifier.
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(df_features_scaled[feature_cols])
df_features_scaled['cluster'] = kmeans.labels_

# Label the cluster with the highest average (scaled) mean fixation duration
# as 'Piecemeal'; the other cluster becomes 'Holistic'.
# NOTE(review): the original comment described Piecemeal as "more fixations",
# but no fixation-count feature is computed in this cell -- the criterion
# actually applied is mean fixation duration. Confirm this is the intended one.
cluster_summary = df_features_scaled.groupby('cluster')['Fixation Duration_mean'].mean()
piecemeal_cluster_id = cluster_summary.idxmax()
df_features_scaled['strategy'] = np.where(
    df_features_scaled['cluster'] == piecemeal_cluster_id, 'Piecemeal', 'Holistic'
)

# Save clustered features for downstream notebooks
# NOTE: only the KMeans model is persisted; the fitted imputer and scaler
# remain in memory only.
df_features_scaled.to_csv("../data/clustered_features.csv", index=False)
joblib.dump(kmeans, '../models/kmeans_model.pkl')

print("\n✅ Features preprocessed, clustered, and saved to ../data/clustered_features.csv")


--- Notebook 03: Feature Preprocessing + Clustering (Fixation Only) ---





✅ Features preprocessed, clustered, and saved to ../data/clustered_features.csv
