In [2]:
pip install pandas numpy scikit-learn matplotlib seaborn hmmlearn joblib networkx tqdm python-Levenshtein

Collecting hmmlearn
  Downloading hmmlearn-0.3.3-cp312-cp312-win_amd64.whl.metadata (3.1 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Downloading hmmlearn-0.3.3-cp312-cp312-win_amd64.whl (127 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl (100 kB)
Downloading rapidfuzz-3.14.1-cp312-cp312-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.3 MB/s eta 0:00:00
Installing collected packages: rapi

In [5]:
# Cell 1: Imports and Load Data
import pandas as pd
import numpy as np
from hmmlearn import hmm
import joblib

print("--- Notebook 04: Sequence Modeling (HMM) ---")
df_fix = pd.read_csv('../data/processed_fixations_per_trial.csv')
df_clustered = pd.read_csv('../data/clustered_features.csv')

# NOTE: Re-run AOI definition to ensure 'aoi' column is present
# This is a small redundancy for notebook independence
from sklearn.cluster import KMeans
n_aois=8
max_x, max_y = df_fix['Fixation X'].max(), df_fix['Fixation Y'].max()
df_fix['x_norm'] = df_fix['Fixation X'] / max_x
df_fix['y_norm'] = df_fix['Fixation Y'] / max_y
aoi_model = KMeans(n_clusters=n_aois, random_state=42, n_init=10)
df_fix['aoi'] = aoi_model.fit_predict(df_fix[['x_norm', 'y_norm']])

# Cell 2: Prepare HMM Data and Train
# ... (same HMM preparation and training code as the last full script)
def prepare_hmm_data(pids, df_fixations):
    sequences = [df_fixations[df_fixations['participant_id'] == pid]['aoi'].values for pid in pids]
    sequences = [s for s in sequences if len(s) > 0]
    lengths = [len(s) for s in sequences]
    return np.concatenate(sequences).reshape(-1, 1), lengths

piecemeal_pids = df_clustered[df_clustered['strategy'] == 'Piecemeal']['participant_id']
holistic_pids = df_clustered[df_clustered['strategy'] == 'Holistic']['participant_id']

X_piecemeal, lengths_piecemeal = prepare_hmm_data(piecemeal_pids, df_fix)
X_holistic, lengths_holistic = prepare_hmm_data(holistic_pids, df_fix)

n_hidden_states = 3
hmm_piecemeal = hmm.MultinomialHMM(n_components=n_hidden_states, random_state=42, n_iter=150).fit(X_piecemeal, lengths_piecemeal)
hmm_holistic = hmm.MultinomialHMM(n_components=n_hidden_states, random_state=42, n_iter=150).fit(X_holistic, lengths_holistic)

# Cell 3: Save Models
joblib.dump(hmm_piecemeal, '../models/hmm_piecemeal.pkl')
joblib.dump(hmm_holistic, '../models/hmm_holistic.pkl') # Note: saving two specific HMMs
print("\n✅ HMM models saved for each strategy.")

--- Notebook 04: Sequence Modeling (HMM) ---


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340
MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340



✅ HMM models saved for each strategy.


In [6]:
# Cell 1: Imports and Load Data
import pandas as pd
import numpy as np
from hmmlearn import hmm
import joblib
from sklearn.cluster import KMeans

print("--- Notebook 04 (Upgraded): Sequence Modeling (HMM) ---")
df_fix = pd.read_csv('../data/processed_fixations_per_trial.csv')
df_clustered = pd.read_csv('../data/clustered_features.csv')

# NOTE: Re-run AOI definition to ensure consistency
n_aois=8
max_x, max_y = df_fix['Fixation X'].max(), df_fix['Fixation Y'].max()
df_fix['x_norm'] = df_fix['Fixation X'] / max_x
df_fix['y_norm'] = df_fix['Fixation Y'] / max_y
aoi_model = KMeans(n_clusters=n_aois, random_state=42, n_init=10)
df_fix['aoi'] = aoi_model.fit_predict(df_fix[['x_norm', 'y_norm']])

# Cell 2: Prepare HMM Data and Train
def prepare_hmm_data(pids, df_fixations):
    sequences = [df_fixations[df_fixations['participant_id'] == pid]['aoi'].values for pid in pids]
    sequences = [s for s in sequences if len(s) > 0]
    lengths = [len(s) for s in sequences]
    return np.concatenate(sequences).reshape(-1, 1), lengths

piecemeal_pids = df_clustered[df_clustered['strategy'] == 'Piecemeal']['participant_id']
holistic_pids = df_clustered[df_clustered['strategy'] == 'Holistic']['participant_id']

X_piecemeal, lengths_piecemeal = prepare_hmm_data(piecemeal_pids, df_fix)
X_holistic, lengths_holistic = prepare_hmm_data(holistic_pids, df_fix)

# --- Sanity Check (Crucial for Debugging) ---
print("\n--- Data Preparation Sanity Check ---")
print(f"Piecemeal group has {len(piecemeal_pids)} participants, with a total of {X_piecemeal.shape[0]} fixations.")
print(f"Holistic group has {len(holistic_pids)} participants, with a total of {X_holistic.shape[0]} fixations.")
# This check ensures the two datasets are indeed different
if np.array_equal(X_piecemeal, X_holistic):
    print("‼️ WARNING: The datasets for the two groups are identical. Check the clustering step.")
else:
    print("✅ Datasets for the two groups are confirmed to be different.")
# ---------------------------------------------

# --- FIX: Changed n_hidden_states from 3 to 2 for a more robust model ---
n_hidden_states = 2
print(f"\nTraining HMMs with {n_hidden_states} hidden states (Focus vs. Explore)...")
# -------------------------------------------------------------------------

# Increased n_iter for better convergence
hmm_piecemeal = hmm.MultinomialHMM(n_components=n_hidden_states, random_state=42, n_iter=200, tol=1e-3).fit(X_piecemeal, lengths_piecemeal)
hmm_holistic = hmm.MultinomialHMM(n_components=n_hidden_states, random_state=42, n_iter=200, tol=1e-3).fit(X_holistic, lengths_holistic)

print("\nTraining complete.")

# Cell 3: Save Models
joblib.dump(hmm_piecemeal, '../models/hmm_piecemeal.pkl')
joblib.dump(hmm_holistic, '../models/hmm_holistic.pkl')
print("\n✅ HMM models saved for each strategy.")

# Cell 4: (Optional but Recommended) - Check Transition Matrices
print("\n--- Generated Transition Matrices ---")
print("\nPiecemeal HMM Transition Matrix:")
print(pd.DataFrame(hmm_piecemeal.transmat_, columns=[f"To S{i}" for i in range(n_hidden_states)], index=[f"From S{i}" for i in range(n_hidden_states)]))

print("\nHolistic HMM Transition Matrix:")
print(pd.DataFrame(hmm_holistic.transmat_, columns=[f"To S{i}" for i in range(n_hidden_states)], index=[f"From S{i}" for i in range(n_hidden_states)]))

--- Notebook 04 (Upgraded): Sequence Modeling (HMM) ---


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340



--- Data Preparation Sanity Check ---
Piecemeal group has 34 participants, with a total of 1805715 fixations.
Holistic group has 3 participants, with a total of 111 fixations.
✅ Datasets for the two groups are confirmed to be different.

Training HMMs with 2 hidden states (Focus vs. Explore)...


MultinomialHMM has undergone major changes. The previous version was implementing a CategoricalHMM (a special case of MultinomialHMM). This new implementation follows the standard definition for a Multinomial distribution (e.g. as in https://en.wikipedia.org/wiki/Multinomial_distribution). See these issues for details:
https://github.com/hmmlearn/hmmlearn/issues/335
https://github.com/hmmlearn/hmmlearn/issues/340



Training complete.

✅ HMM models saved for each strategy.

--- Generated Transition Matrices ---

Piecemeal HMM Transition Matrix:
            To S0     To S1
From S0  0.878274  0.121726
From S1  0.998874  0.001126

Holistic HMM Transition Matrix:
            To S0     To S1
From S0  0.878274  0.121726
From S1  0.998874  0.001126
