# Part 2: Modeling and Evaluation

In this section, we use the lagged embeddings created in Part 1 (Bag-of-Words, Word2Vec, and GloVe) to build predictive models of brain activity using fMRI voxel data.

We follow these key steps:
1. Load and validate the preprocessed embeddings (X) for each story.
2. Load corresponding fMRI response matrices (Y) for each subject.
3. Fit a ridge regression model for each embedding type to predict Y from X.
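4. Compute the mean correlation coefficient (CC) across all voxels to evaluate model performance. Note that the CC here is computed on the same data used to fit the model, so it is an in-sample (optimistic) estimate.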
5. Save the trained models and the evaluation metrics for further analysis.

Each embedding is evaluated on the same story to ensure consistency across comparisons.
This part prepares the ground for more advanced evaluation, such as voxel-level analysis and cross-validation.

Before we start, let's load the data and verify that it is in the right format by checking that each story has the same number of timepoints across all three embeddings.

In [13]:
import joblib
import pickle
from pathlib import Path

def load_embeddings(data_dir):
    """Read the three lagged embedding files found in ``data_dir``.

    The Bag-of-Words file is read with joblib; the Word2Vec and GloVe files
    are read with pickle.

    Args:
        data_dir: Directory holding the files. It is combined with each file
            name through the ``/`` operator, so it must support that (for
            example a ``pathlib.Path``).

    Returns:
        A dict with the keys ``"BoW"``, ``"Word2Vec"`` and ``"GloVe"``, each
        mapped to the object stored in the corresponding file.
    """
    loaded = {"BoW": joblib.load(data_dir / "X_lagged_BoW.joblib")}
    pickled_sources = (
        ("Word2Vec", "X_lagged_W2V.pkl"),
        ("GloVe", "X_lagged_GloVe.pkl"),
    )
    for label, file_name in pickled_sources:
        with open(data_dir / file_name, "rb") as handle:
            loaded[label] = pickle.load(handle)
    return loaded

def check_timepoint_consistency(embeddings):
    """Print a report on per-story timepoint counts across the embeddings.

    For every story present in all three embeddings, the first dimension
    (``.shape[0]``) of its array is compared across BoW, Word2Vec and GloVe.
    Stories that are absent from at least one embedding cannot be compared;
    they are listed explicitly rather than being silently skipped.

    Args:
        embeddings: Dict with the keys ``"BoW"``, ``"Word2Vec"`` and
            ``"GloVe"``. Each value maps story name -> object exposing
            ``.shape``.

    Returns:
        None. The report is written to standard output.

    Raises:
        KeyError: If one of the three embedding keys is missing.
    """
    names = ("BoW", "Word2Vec", "GloVe")
    print("Checking story timepoint consistency across embeddings...\n")

    stories_by_name = {name: set(embeddings[name]) for name in names}
    shared = set.intersection(*stories_by_name.values())
    # A story missing from any embedding would otherwise vanish from the
    # check and the report could claim full consistency.
    incomplete = set.union(*stories_by_name.values()) - shared

    mismatches = []
    for story in sorted(shared):
        lengths = tuple(embeddings[name][story].shape[0] for name in names)
        if len(set(lengths)) > 1:
            mismatches.append((story, *lengths))

    if incomplete:
        print("Stories missing from at least one embedding (not compared):")
        for story in sorted(incomplete):
            absent = ", ".join(
                name for name in names if story not in stories_by_name[name]
            )
            print(f"- {story}: missing from {absent}")

    if mismatches:
        print("Found mismatches in the following stories:")
        for story, l_b, l_w, l_g in mismatches:
            print(f"- {story}: BoW={l_b}, Word2Vec={l_w}, GloVe={l_g}")
    elif incomplete:
        print("All shared stories have consistent timepoint lengths across embeddings.")
    else:
        print("All stories have consistent timepoint lengths across embeddings.")

# === Run check ===
# Load the three embedding dictionaries from ../data (a relative path, so it
# is resolved against the current working directory) and print the
# timepoint-consistency report for them.
DATA_DIR = Path("../data")
embeddings = load_embeddings(DATA_DIR)
check_timepoint_consistency(embeddings)

Checking story timepoint consistency across embeddings...

All stories have consistent timepoint lengths across embeddings.


In [None]:
import numpy as np
import pickle
import joblib
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from pathlib import Path

# Define file paths
# Inputs are read from ../data and outputs are written to ../results; both
# are relative paths, resolved against the current working directory.
DATA_DIR = Path("../data")
RESULTS_DIR = Path("../results")
# Create the results directory if it is missing; exist_ok=True means an
# already-existing directory is not an error.
RESULTS_DIR.mkdir(exist_ok=True)

# Load embeddings (X)
def load_embeddings(data_dir=None):
    """Load the three lagged embedding files into a dictionary.

    This definition replaces the earlier one-argument ``load_embeddings``
    once the cell is run, so it accepts the same optional directory argument
    to keep both calling styles (``load_embeddings()`` and
    ``load_embeddings(some_dir)``) working.

    Args:
        data_dir: Directory containing the embedding files, as a ``Path`` or
            string. When ``None`` (the default) the module-level ``DATA_DIR``
            is used.

    Returns:
        A dict with the keys ``"BoW"``, ``"Word2Vec"`` and ``"GloVe"``, each
        mapped to the object stored in the corresponding file.
    """
    base_dir = DATA_DIR if data_dir is None else Path(data_dir)
    X_bow = joblib.load(base_dir / "X_lagged_BoW.joblib")
    # NOTE: pickle/joblib execute code on load; only read files you produced.
    with open(base_dir / "X_lagged_W2V.pkl", "rb") as f:
        X_w2v = pickle.load(f)
    with open(base_dir / "X_lagged_GloVe.pkl", "rb") as f:
        X_glove = pickle.load(f)
    return {"BoW": X_bow, "Word2Vec": X_w2v, "GloVe": X_glove}

# Load fMRI data (Y)
def load_fmri_data(subject_id):
    """Read the saved voxel array for one subject.

    Args:
        subject_id: Subject label; it is interpolated into both the
            directory name and the file name under ``DATA_DIR``.

    Returns:
        The array returned by ``np.load`` for that subject's ``.npy`` file.

    Raises:
        FileNotFoundError: If the expected file does not exist.
    """
    relative_path = f"processed/sub-{subject_id}/func/sub-{subject_id}_task-tasks_bold_space-MNI152NLin2009cAsym_desc-preproc_timeseries_voxels.npy"
    return np.load(DATA_DIR / relative_path)

# Fit ridge regression and calculate mean correlation coefficient (CC)
def fit_ridge_and_eval(X, Y, alpha=1.0):
    """Fit a ridge regression on (X, Y) and score it by mean correlation.

    The model predicts on the same ``X`` it was fitted on, so the score is an
    in-sample measure. The Pearson correlation between each column of ``Y``
    and the matching column of the prediction is computed in one vectorized
    pass, then averaged.

    Args:
        X: Feature matrix passed to ``Ridge.fit`` and ``Ridge.predict``.
        Y: Target matrix; each column is scored separately
            (assumes rows are timepoints and columns are voxels — TODO confirm).
        alpha: Regularisation strength passed to ``Ridge``.

    Returns:
        A ``(model, cc)`` tuple: the fitted ``Ridge`` instance and the mean
        correlation across columns. Columns whose correlation is undefined
        (zero variance in the target or in the prediction) are left out of
        the mean instead of turning the whole result into NaN. If every
        column is undefined the result is NaN.
    """
    model = Ridge(alpha=alpha)
    model.fit(X, Y)
    Y_pred = model.predict(X)

    # Column-wise Pearson r: covariance over the product of the norms of the
    # centered columns.
    y_centered = np.asarray(Y) - np.mean(Y, axis=0)
    p_centered = Y_pred - np.mean(Y_pred, axis=0)
    covariance = np.sum(y_centered * p_centered, axis=0)
    scale = np.sqrt(np.sum(y_centered ** 2, axis=0)) * np.sqrt(
        np.sum(p_centered ** 2, axis=0)
    )
    # Zero-variance columns give 0/0; silence the warning and let them be NaN.
    with np.errstate(divide="ignore", invalid="ignore"):
        per_voxel_cc = np.clip(covariance / scale, -1.0, 1.0)

    cc = np.nanmean(per_voxel_cc)
    return model, cc

# Main pipeline for ridge regression per embedding
def main(subject_id, story_name):
    """Fit and score one ridge model per embedding type for a single story.

    For each embedding returned by ``load_embeddings()``, the features for
    ``story_name`` are paired with the subject's fMRI array, a ridge model is
    fitted via ``fit_ridge_and_eval``, and the model is pickled into
    ``RESULTS_DIR``. The mean CC of every embedding is collected in a dict
    and pickled as well.

    Args:
        subject_id: Subject label forwarded to ``load_fmri_data``.
        story_name: Key used to pick the story from each embedding dict; also
            embedded in the output file names.

    Returns:
        None. Progress is printed and results are written to disk.
    """
    embedding_tables = load_embeddings()
    voxel_series = load_fmri_data(subject_id)

    cc_by_embedding = {}
    for emb_name, per_story in embedding_tables.items():
        print(f"Processing embedding: {emb_name}")
        story_features = per_story[story_name]

        # Keep only the leading rows that both arrays have.
        # NOTE(review): this assumes the first rows of the fMRI array line up
        # with this story's timepoints — confirm against the preprocessing.
        n_common = min(story_features.shape[0], voxel_series.shape[0])
        X, Y = story_features[:n_common, :], voxel_series[:n_common, :]
        print(f"Shapes after matching: X {X.shape}, Y {Y.shape}")

        model, mean_cc = fit_ridge_and_eval(X, Y)
        print(f"Mean CC for {emb_name}: {mean_cc:.4f}")
        cc_by_embedding[emb_name] = mean_cc

        # Persist the fitted model for this embedding/story pair.
        model_path = RESULTS_DIR / f"ridge_model_{emb_name}_{story_name}.pkl"
        with model_path.open("wb") as model_file:
            pickle.dump(model, model_file)
        print(f"Saved model to {model_path}")

    # Persist the per-embedding scores in a single file.
    cc_results_path = RESULTS_DIR / f"mean_cc_{story_name}.pkl"
    with cc_results_path.open("wb") as cc_file:
        pickle.dump(cc_by_embedding, cc_file)
    print(f"Saved CC results to {cc_results_path}")


# Example usage
# Runs the pipeline for one subject/story pair when this code is executed as
# the main module.
if __name__ == "__main__":
    subject_id = "01"  # subject label inserted into the fMRI file path; change as needed
    story_name = "sweetaspie"  # key looked up in each embedding dict; change as needed
    main(subject_id, story_name)



✅ All stories have matching timepoint lengths across embeddings.


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/sub-01/func/sub-01_task-tasks_bold_space-MNI152NLin2009cAsym_desc-preproc_timeseries_voxels.npy'