In [1]:
from pathlib import Path

import numpy as np
import h5py
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.metrics import explained_variance_score

In [2]:
# Root directory of the dataset; the fMRI file paths used in this notebook are
# built relative to it.
# NOTE(review): this is a hardcoded absolute path and looks machine-specific —
# it has to be edited to run the notebook anywhere else.
DATA_ROOT = Path("/ocean/projects/med220004p/clane2/algonauts25/algonauts_2025.competitors")

In [3]:
def load_timeseries(path: Path) -> np.ndarray:
    """Load every top-level dataset of an HDF5 file as one standardized array.

    Each top-level entry of the file is read fully into memory, standardized
    independently with ``sklearn.preprocessing.scale`` (default arguments), and
    the results are concatenated along the first axis in the file's key order.

    Args:
        path: Location of the HDF5 file to read.

    Returns:
        The concatenation of all standardized datasets.

    Raises:
        ValueError: If the file has no top-level entries (``np.concatenate``
            refuses an empty list).
    """
    # NOTE(review): assumes every top-level key is a dataset (not a group) and
    # that all datasets share their trailing dimensions — confirm against the data.
    # The context manager guarantees the HDF5 handle is released even if reading
    # or scaling fails; the file is opened explicitly read-only.
    with h5py.File(path, "r") as file:
        runs = [scale(file[key][:]) for key in file]
    return np.concatenate(runs)

In [4]:
# Recording used to fit the tokenizer: sub-01, task "friends".
train_path = DATA_ROOT.joinpath(
    "fmri/sub-01/func/sub-01_task-friends_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_desc-s123456_bold.h5"
)

# Recording used for validation: sub-01, task "movie10".
val_path = DATA_ROOT.joinpath(
    "fmri/sub-01/func/sub-01_task-movie10_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_bold.h5"
)

In [None]:
# Read both recordings fully into memory. Each call standardizes every dataset
# in the file separately and concatenates them into a single array.
train_timeseries = load_timeseries(train_path)
val_timeseries = load_timeseries(val_path)

In [6]:
class PCATokenizer:
    """Quantize an estimator's continuous embedding into integer token ids.

    The wrapped estimator maps inputs to an embedding; every embedding value is
    clipped to ``[-vmax, vmax]`` and mapped onto ``bins`` evenly spaced levels
    whose first and last levels sit exactly at ``-vmax`` and ``+vmax``.
    ``inverse_transform`` maps ids back to those level values and then through
    the estimator's own inverse.

    Args:
        estimator: Object providing ``fit``, ``transform`` and
            ``inverse_transform`` (a scikit-learn ``PCA`` in this notebook).
        vmax: Positive clipping bound applied to the embedding before
            quantization.
        bins: Number of quantization levels per embedding dimension; ids lie in
            ``[0, bins - 1]``.

    Raises:
        ValueError: If ``bins < 2`` or ``vmax`` is not strictly positive.
    """

    def __init__(self, estimator: "PCA", vmax: float = 2.5, bins: int = 1024):
        # Both values are used as divisors below (2 * vmax and bins - 1), so
        # reject degenerate settings up front instead of producing inf/NaN later.
        if bins < 2:
            raise ValueError(f"bins must be at least 2, got {bins}")
        if not vmax > 0:
            raise ValueError(f"vmax must be positive, got {vmax}")
        self.estimator = estimator
        self.vmax = vmax
        self.bins = bins

    def fit(self, X: np.ndarray):
        """Fit the wrapped estimator on ``X`` and return ``self``."""
        self.estimator.fit(X)
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """Embed ``X`` with the estimator and quantize to ``int64`` token ids.

        Values outside ``[-vmax, vmax]`` saturate at id ``0`` or ``bins - 1``.
        """
        embed = self.estimator.transform(X)
        # Map [-vmax, vmax] linearly onto [0, 1], saturating anything outside.
        scaled = np.clip((embed + self.vmax) / (2 * self.vmax), 0.0, 1.0)
        # Round to the nearest level. inverse_transform decodes id k to the level
        # value itself, so truncating here would bias every reconstruction
        # downward and double the worst-case quantization error.
        ids = np.rint((self.bins - 1) * scaled).astype(np.int64)
        return ids

    def inverse_transform(self, ids: np.ndarray) -> np.ndarray:
        """Map token ids back to level values and invert the estimator."""
        # Accept plain sequences as well as arrays.
        ids = np.asarray(ids)
        # Id k decodes to -vmax + k * (2 * vmax / (bins - 1)).
        embed = 2 * self.vmax * (ids / (self.bins - 1)) - self.vmax
        X = self.estimator.inverse_transform(embed)
        return X

In [7]:
# Fit a 32-component whitened PCA on the training recording and wrap it in the
# tokenizer. With whiten=True the components have unit variance on the training
# data, so vmax=3.0 clips at roughly +/-3 standard deviations.
# random_state is pinned because scikit-learn's automatic solver choice may pick
# the randomized SVD for large inputs, which would make the scores below vary
# from run to run.
proj = PCA(n_components=32, whiten=True, random_state=0)
tokenizer = PCATokenizer(proj, vmax=3.0, bins=1024)
tokenizer.fit(train_timeseries)

# Round-trip the training data through the tokenizer and score the
# reconstruction against the original signal.
train_ids = tokenizer.transform(train_timeseries)
train_recon = tokenizer.inverse_transform(train_ids)
train_var = explained_variance_score(train_timeseries, train_recon)

# Same round trip on the validation recording, which the PCA was not fitted on.
val_ids = tokenizer.transform(val_timeseries)
val_recon = tokenizer.inverse_transform(val_ids)
val_var = explained_variance_score(val_timeseries, val_recon)

print(f"Train: {train_var:.3f}, Val: {val_var:.3f}")

Train: 0.581, Val: 0.506
