# Part 2: Modeling and Evaluation

In this section, we use the lagged embeddings created in Part 1 (Bag-of-Words, Word2Vec, and GloVe) to build predictive models of brain activity using fMRI voxel data.

We follow these key steps:
1. Load and validate the preprocessed embeddings (X) for each story.
2. Load corresponding fMRI response matrices (Y) for each subject.
3. Fit a ridge regression model for each embedding type to predict Y from X.
4. Compute the mean correlation coefficient (CC) across all voxels to evaluate model performance.
5. Save the trained models and the evaluation metrics for further analysis.

Each embedding is evaluated on the same held-out test stories to ensure a consistent comparison.
This part prepares the ground for more advanced evaluation, such as voxel-level analysis and cross-validation.

Before we start, let's load the data and verify its format by checking that every story has the same number of timepoints across all embeddings.

In [None]:
import joblib
import pickle
from pathlib import Path

# Load X_bow, X_w2v, and X_glove
def load_embeddings(data_dir):
    """Load all three types of lagged embedding data into a dictionary.

    Parameters
    ----------
    data_dir : str or Path
        Directory containing ``X_lagged_BoW.joblib``, ``X_lagged_W2V.pkl``
        and ``X_lagged_GloVe.pkl``.

    Returns
    -------
    dict
        ``{"BoW": ..., "Word2Vec": ..., "GloVe": ...}`` where each value is
        whatever object the corresponding file deserializes to.
    """
    # Coerce first so a plain string path works with the "/" operator below.
    data_dir = Path(data_dir)

    # NOTE: pickle (and joblib) can execute arbitrary code while loading.
    # Only point this at files you produced yourself.
    X_bow = joblib.load(data_dir / "X_lagged_BoW.joblib")
    with open(data_dir / "X_lagged_W2V.pkl", "rb") as f:
        X_w2v = pickle.load(f)
    with open(data_dir / "X_lagged_GloVe.pkl", "rb") as f:
        X_glove = pickle.load(f)
    return {"BoW": X_bow, "Word2Vec": X_w2v, "GloVe": X_glove}

def check_timepoint_consistency(embeddings):
    """Check that timepoint lengths match across BoW, Word2Vec, and GloVe.

    For every story present in all three embeddings, the first-axis length
    (``.shape[0]``) of the three arrays is compared. Stories that are absent
    from at least one embedding cannot be compared; they are listed
    explicitly instead of being dropped silently.

    Parameters
    ----------
    embeddings : dict
        Mapping with the keys ``"BoW"``, ``"Word2Vec"`` and ``"GloVe"``; each
        value maps a story name to an object exposing ``.shape``.

    Returns
    -------
    None
        All findings are printed.
    """
    names = ("BoW", "Word2Vec", "GloVe")
    print("Checking story timepoint consistency across embeddings...\n")

    stories_by_name = {name: set(embeddings[name]) for name in names}
    shared = set.intersection(*stories_by_name.values())
    partial = set.union(*stories_by_name.values()) - shared

    # A story missing from one embedding would otherwise vanish from the
    # report, which hides key mismatches between the embedding files.
    if partial:
        print("Stories missing from at least one embedding (not compared):")
        for story in sorted(partial):
            absent = [n for n in names if story not in stories_by_name[n]]
            print(f"- {story}: missing from {', '.join(absent)}")

    mismatches = []
    for story in sorted(shared):
        lengths = tuple(embeddings[name][story].shape[0] for name in names)
        if len(set(lengths)) > 1:
            mismatches.append((story, *lengths))

    if mismatches:
        print("Found mismatches in the following stories:")
        for story, l_b, l_w, l_g in mismatches:
            print(f"- {story}: BoW={l_b}, Word2Vec={l_w}, GloVe={l_g}")
    elif not shared:
        # Nothing was compared, so do not claim success.
        print("No story is present in all three embeddings; nothing to compare.")
    else:
        print("All stories have consistent timepoint lengths across embeddings.")

# === Run check ===
# Directory that holds the lagged embedding files read by load_embeddings.
DATA_DIR = Path("../data")
# `embeddings` maps "BoW" / "Word2Vec" / "GloVe" to the loaded objects and is
# reused by the cells further down.
embeddings = load_embeddings(DATA_DIR)
# Prints its findings; nothing is returned.
check_timepoint_consistency(embeddings)

Checking story timepoint consistency across embeddings...

All stories have consistent timepoint lengths across embeddings.


In [None]:
import numpy as np
from pathlib import Path

def load_subject_y_data(subject_dir):
    """
    Load all .npy files in a directory and return a dict {story: np.ndarray}.

    Parameters
    ----------
    subject_dir : str or Path
        Directory to scan (non-recursively) for ``*.npy`` files.

    Returns
    -------
    dict
        Maps each file's stem (file name without ``.npy``) to its array,
        opened as a read-only memory map (``mmap_mode='r'``) so the data is
        not read into memory up front. Empty if no ``.npy`` file is found.
    """
    subject_dir = Path(subject_dir)
    # Sorted so the dictionary order does not depend on filesystem order.
    y_files = sorted(subject_dir.glob("*.npy"))
    responses = {f.stem: np.load(f, mmap_mode='r') for f in y_files}
    # Report the count and location: a wrong path yields zero files, which
    # should be visible here rather than surfacing later as missing stories.
    print(f"Loaded {len(responses)} response file(s) from {subject_dir}")
    return responses

# Load Subject Y data
# Each call scans one subject's directory for .npy files and returns
# {file stem: memory-mapped array}; the stems are used as story names below.
subject2_y = load_subject_y_data("../../tmp_ondemand_ocean_mth240012p_symlink/shared/data/subject2")
subject3_y = load_subject_y_data("../../tmp_ondemand_ocean_mth240012p_symlink/shared/data/subject3")

In [None]:
# Print the size of the second axis of the "tildeath" response array for
# subject2 and then subject3 (labelled as the voxel count).
for subject_y in (subject2_y, subject3_y):
    print("Number of voxels:", subject_y["tildeath"].shape[1])

In [None]:
import numpy as np
import joblib
import random
from pathlib import Path

def split_and_save_XY_data(
    X_dicts, subject_y_dicts, save_dir, test_ratio=0.3, seed=42
):
    """
    Split story-level data into training and test sets, and save them with compression.

    Only stories present in every embedding dictionary and every subject
    dictionary are used. Whole stories are assigned to either the training
    or the test set, then concatenated along the first axis in sorted story
    order.

    Parameters
    ----------
    X_dicts : dict
        Dictionary containing embeddings {"BoW": ..., "Word2Vec": ..., "GloVe": ...}
    subject_y_dicts : dict
        Dictionary of Y values per subject, e.g., {"subject2": {...}, "subject3": {...}}
    save_dir : str or Path
        Directory to save the resulting data files (created if missing)
    test_ratio : float
        Fraction of stories to assign to test set (default: 0.3)
    seed : int
        Random seed for reproducibility

    Returns
    -------
    tuple of (list, list)
        ``(train_stories, test_stories)``, each sorted by story name.

    Raises
    ------
    ValueError
        If no story is shared by all inputs, or if ``test_ratio`` would leave
        the training or the test set without any story. Raised before
        anything is written to disk.
    """
    save_dir = Path(save_dir)

    # Find common story names across all X and Y dictionaries
    story_sets = [set(d) for d in X_dicts.values()]
    story_sets += [set(d) for d in subject_y_dicts.values()]
    common_stories = sorted(set.intersection(*story_sets)) if story_sets else []
    if not common_stories:
        raise ValueError("No overlapping stories found!")

    # Validate the split size up front; otherwise the failure only shows up
    # as an opaque "need at least one array to concatenate" error.
    n_test = int(len(common_stories) * test_ratio)
    if not 0 < n_test < len(common_stories):
        raise ValueError(
            f"test_ratio={test_ratio} with {len(common_stories)} common "
            f"stories gives {n_test} test stories; both the training and "
            "the test set need at least one story."
        )

    # Randomly split stories into train and test sets.
    # NOTE: this reseeds the module-level `random` generator.
    random.seed(seed)
    test_stories = sorted(random.sample(common_stories, n_test))
    held_out = set(test_stories)
    train_stories = [s for s in common_stories if s not in held_out]

    save_dir.mkdir(parents=True, exist_ok=True)

    # Save X_train and X_test for each embedding type (with compression)
    for name, Xdict in X_dicts.items():
        X_train = np.concatenate([Xdict[s] for s in train_stories], axis=0)
        X_test = np.concatenate([Xdict[s] for s in test_stories], axis=0)
        joblib.dump(X_train, save_dir / f"X_train_{name}.joblib", compress=3)
        joblib.dump(X_test, save_dir / f"X_test_{name}.joblib", compress=3)

    # Save Y_train and Y_test for each subject (compressed npz).
    # The array is passed positionally, so it is stored under the key "arr_0".
    for subject_name, Ydict in subject_y_dicts.items():
        Y_train = np.concatenate([Ydict[s] for s in train_stories], axis=0)
        Y_test = np.concatenate([Ydict[s] for s in test_stories], axis=0)
        np.savez_compressed(save_dir / f"Y_train_{subject_name}.npz", Y_train)
        np.savez_compressed(save_dir / f"Y_test_{subject_name}.npz", Y_test)

    print("✅ Saved all training and test data.")

    return train_stories, test_stories

# Embedding dictionaries keyed by the names that end up in the saved
# X_train_<name> / X_test_<name> file names.
X_dicts = {name: embeddings[name] for name in ("BoW", "Word2Vec", "GloVe")}

# Response dictionaries keyed by the subject labels used in the saved
# Y_train_<subject> / Y_test_<subject> file names.
subject_y_dicts = dict(subject2=subject2_y, subject3=subject3_y)

# Story-level split: 30% of the common stories are held out, fixed seed.
train_stories, test_stories = split_and_save_XY_data(
    X_dicts, subject_y_dicts, "../data", test_ratio=0.3, seed=42
)

In [None]:
import numpy as np
from pathlib import Path

def print_mismatched_x_y(embeddings, y_dir):
    """Print stories whose BoW embedding and response array differ in length.

    Every ``*.npy`` file in ``y_dir`` is opened as a read-only memory map and
    matched to the BoW embedding with the same story name (the file stem).
    For each shared story the first-axis lengths are compared.

    Parameters
    ----------
    embeddings : dict
        Must contain the key ``"BoW"``, mapping story names to objects that
        expose ``.shape``.
    y_dir : str or Path
        Directory holding one ``.npy`` response file per story.

    Returns
    -------
    None
        Results are printed.
    """
    y_dir = Path(y_dir)
    y_dict = {f.stem: np.load(f, mmap_mode='r') for f in y_dir.glob("*.npy")}
    x_dict = embeddings["BoW"]

    stories = set(x_dict) & set(y_dict)

    # With nothing to compare (e.g. wrong or empty directory) the loop below
    # never runs; reporting "all match" in that case would be misleading.
    if not stories:
        print(f"❌ No overlapping stories between the BoW embeddings and {y_dir}")
        return

    mismatch_found = False

    for story in sorted(stories):
        x_tr = x_dict[story].shape[0]
        y_tr = y_dict[story].shape[0]
        if x_tr != y_tr:
            print(f"❌ {story}: X_TR={x_tr}, Y_TR={y_tr}")
            mismatch_found = True

    if not mismatch_found:
        print("✅ All stories match")

# Sanity check for subject2: compare the first-axis length of each story's BoW
# embedding with the matching response file in the subject's directory.
print_mismatched_x_y(embeddings, "../../tmp_ondemand_ocean_mth240012p_symlink/shared/data/subject2")

## Regression

The code is structured to support efficient and modular execution, primarily due to the large size of the datasets involved. Since memory is a key constraint—especially with high-dimensional BoW features and voxel responses—the script avoids loading all data at once. Instead, it processes each subject and embedding combination one at a time.

To make this manageable, the script is organized into clearly commented blocks, where each block corresponds to one execution of the bootstrap_ridge() function with a specific subject and embedding. These blocks are initially commented out and labeled with instructions so that you can manually uncomment and run them one by one. This helps prevent memory overload and gives you manual control over the execution flow, allowing you to monitor the progress and intermediate results before proceeding to the next computation.

Intermediate outputs such as correlation scores and selected alpha values are saved to disk using compressed .npz files. This ensures that heavy computations do not need to be repeated if the session is interrupted, making the process more fault-tolerant and efficient.

Finally, once all models are run, the script includes a visualization section that loads the saved results and helps compare different embeddings based on metrics like mean, median, and top-percentile correlation coefficients. The BoW model, being the most memory-intensive due to its large feature space, is scheduled to run last.

Overall, the structure of this codebase is designed to be memory-conscious, reproducible, and well-suited for iterative analysis in a constrained computing environment.

In [None]:
import numpy as np
import joblib
import matplotlib.pyplot as plt
from ridge_utils.ridge import bootstrap_ridge
import gc  # For garbage collection

DATA_DIR = "../data"

# Parameters for ridge regression
alphas = np.logspace(1, 3, 10)  # 10 log-spaced candidates from 1e1 to 1e3
nboots = 10
chunklen = 20
nchunks = 5
corrmin = 0.2

# Execution order of the embeddings; BoW is the largest dataset and runs last.
EMBEDDING_NAMES = ("Word2Vec", "GloVe", "BoW")


def _load_npz_array(path, key="arr_0"):
    """Return one array from an .npz archive and close the archive afterwards."""
    with np.load(path) as archive:
        return archive[key]


def run_ridge_and_save(embedding_name, subject_name):
    """Run bootstrap ridge for one embedding/subject pair and save the scores.

    Reads ``X_train_<embedding>.joblib`` / ``X_test_<embedding>.joblib`` and
    ``Y_train_<subject>.npz`` / ``Y_test_<subject>.npz`` from ``DATA_DIR``,
    calls ``bootstrap_ridge`` with the module-level parameters, and writes
    ``ridge_corrs_<embedding>_<subject>.npz`` to ``DATA_DIR``. The saved
    archive holds the correlations under ``"arr_0"`` and the selected alphas
    under ``"arr_1"`` (both are passed positionally to
    ``np.savez_compressed``).

    Parameters
    ----------
    embedding_name : str
        Embedding label used in the file names, e.g. ``"Word2Vec"``.
    subject_name : str
        Subject label used in the file names, e.g. ``"subject2"``.
    """
    X_train = joblib.load(f"{DATA_DIR}/X_train_{embedding_name}.joblib")
    X_test = joblib.load(f"{DATA_DIR}/X_test_{embedding_name}.joblib")
    Y_train = _load_npz_array(f"{DATA_DIR}/Y_train_{subject_name}.npz")
    Y_test = _load_npz_array(f"{DATA_DIR}/Y_test_{subject_name}.npz")

    print(f"Loaded {embedding_name} and {subject_name} data.")

    # Run bootstrap ridge regression (return_wt=False).
    # Only the 2nd and 3rd of the five returned values are kept; they are
    # treated as per-voxel correlations and selected alphas.
    # NOTE(review): confirm the return order against ridge_utils.ridge.
    _, corrs, valphas, _, _ = bootstrap_ridge(
        Rstim=X_train, Rresp=Y_train,
        Pstim=X_test, Presp=Y_test,
        alphas=alphas,
        nboots=nboots, chunklen=chunklen, nchunks=nchunks,
        corrmin=corrmin, single_alpha=False, return_wt=False
    )

    # Save the results
    np.savez_compressed(
        f"{DATA_DIR}/ridge_corrs_{embedding_name}_{subject_name}.npz",
        corrs, valphas
    )

    # Clean up memory before the next (possibly larger) run
    del X_train, X_test, Y_train, Y_test, corrs, valphas
    gc.collect()


# =============== STEP 1: RUN RIDGE ON Word2Vec embedding ===============
run_ridge_and_save("Word2Vec", "subject2")
# Repeat for subject3 with: run_ridge_and_save("Word2Vec", "subject3")

# =============== STEP 2: RUN RIDGE ON GloVe embedding ===============
run_ridge_and_save("GloVe", "subject2")
# Repeat for subject3 with: run_ridge_and_save("GloVe", "subject3")

# =============== STEP 3: RUN RIDGE ON BoW embedding (largest dataset, last) ===============
run_ridge_and_save("BoW", "subject2")
# Repeat for subject3 with: run_ridge_and_save("BoW", "subject3")

# =============== STEP 4: Compare Performance of Embeddings ===============

# Load previously computed correlation results
subject_name = "subject2"
corrs_by_embedding = {
    name: _load_npz_array(f"{DATA_DIR}/ridge_corrs_{name}_{subject_name}.npz")
    for name in EMBEDDING_NAMES
}

# Mean correlation across voxels
mean_cc = {name: np.mean(corrs) for name, corrs in corrs_by_embedding.items()}
for name in EMBEDDING_NAMES:
    print(f"{name} mean CC:", mean_cc[name])

# Choose best embedding based on mean CC (highest)
# (Perform similar analysis for median, top 1%, top 5%, etc.)
best_name = max(mean_cc, key=mean_cc.get)

# =============== STEP 5: Detailed Analysis for Best Embedding ===============

best_corrs = corrs_by_embedding[best_name]

# Plot distribution of CC across voxels
plt.hist(best_corrs, bins=100)
plt.xlabel("Correlation Coefficient (CC)")
plt.ylabel("Number of Voxels")
plt.title(f"Distribution of CC for best embedding ({best_name})")
plt.show()

# Perform stability analysis and interpretation according to PCS
# (Additional analysis as per instructions can be performed here)

# Clean up after everything
del corrs_by_embedding, best_corrs
gc.collect()

Total common stories across embeddings and both subjects: 0
Training stories: 0; Testing stories: 0
Processing subject2 with BoW embeddings...


ValueError: No overlapping stories found!