In [2]:
from datasets import load_from_disk
import numpy as np
from PIL import Image
import os
from tqdm import tqdm

# === CONFIGURATION ===
# Subject to export; samples are matched on their `subject_id` field.
TARGET_SUBJECT = 0  # Change this to the subject ID you want
# Folder holding the dataset that `load_from_disk` reads.
DATASET_PATH = '/Volumes/Samsung_PSSD_T7_Shield/NSD dataset/Dataset Pointers'
# Output folder for the exported images, captions and fMRI array.
# NOTE(review): the "subj01" suffix is hard-coded and not derived from
# TARGET_SUBJECT -- update it by hand when exporting a different subject,
# otherwise a previous export in this folder gets overwritten.
SAVE_DIR = './prepared_nsd_data_subj01'

# === PREPARE SAVE DIR ===
# Create the output folder up front; an already existing folder is not an error.
os.makedirs(name=SAVE_DIR, exist_ok=True)

# === Load the dataset ===
# Load the dataset saved at DATASET_PATH and take its "train" split.
dataset = load_from_disk(DATASET_PATH)
train_set = dataset["train"]

# === Filter by subject ID ===
# Scan only the `subject_id` column to find the matching row indices.
# Iterating over whole samples instead would materialise the image, activity
# and captions of every row (all subjects) and keep every match in memory
# at the same time.
subject_indices = [
    row_idx
    for row_idx, subject_id in enumerate(train_set['subject_id'])
    if subject_id == TARGET_SUBJECT
]
# `select` returns a Dataset restricted to those rows. It still supports
# `len()` and per-sample iteration, so the export loop below works unchanged.
filtered_set = train_set.select(subject_indices)
print(f"Found {len(filtered_set)} samples for subject {TARGET_SUBJECT}.")

# === Prepare storage ===
# Exports every sample in `filtered_set` into SAVE_DIR:
#   - image_XXXXX.png : one PNG per sample
#   - fmri.npy        : all activity arrays, stacked along a new first axis
#   - image_paths.txt : one image path per line, in sample order
#   - captions.txt    : first caption of each sample, one per line, same order
image_paths = []
captions = []

num_samples = len(filtered_set)
if num_samples == 0:
    # Fail early with a clear message; there is nothing to stack or save.
    raise ValueError(f"No samples found for subject {TARGET_SUBJECT}.")

fmri_path = os.path.join(SAVE_DIR, 'fmri.npy')
# The .npy file is created lazily, once the first sample reveals the
# per-sample shape and dtype. Writing rows straight into a memory-mapped .npy
# avoids holding a Python list of all arrays *plus* the stacked copy in RAM.
# The recorded run of this cell produced an array of shape (27750, 215, 200).
fMRI_array = None

# === Iterate and save ===
for i, sample in enumerate(tqdm(filtered_set, desc=f"Saving samples for {TARGET_SUBJECT}")):
    # --- fMRI activity ---
    activity = np.asarray(sample['activity'])
    if fMRI_array is None:
        fMRI_array = np.lib.format.open_memmap(
            fmri_path,
            mode='w+',
            dtype=activity.dtype,
            shape=(num_samples, *activity.shape),
        )
    elif activity.shape != fMRI_array.shape[1:]:
        # Plain assignment would silently broadcast a smaller array; stacking
        # mismatched shapes is an error, so keep it an error here too.
        raise ValueError(
            f"Sample {i} has activity shape {activity.shape}, "
            f"expected {fMRI_array.shape[1:]}."
        )
    fMRI_array[i] = activity

    # --- Save image ---
    img: Image.Image = sample['image']
    img_path = os.path.join(SAVE_DIR, f'image_{i:05d}.png')
    img.save(img_path)
    image_paths.append(img_path)

    # --- Caption ---
    # Use the first caption. A sample without captions gets an empty line so
    # that line k of captions.txt always belongs to image k.
    sample_captions = sample['captions']
    first_caption = sample_captions[0] if sample_captions else ''
    # Collapse embedded line breaks: the file format is one caption per line.
    captions.append(' '.join(first_caption.splitlines()))

# === Save everything ===
# Push any pending memory-mapped writes to fmri.npy.
fMRI_array.flush()

# Explicit UTF-8 so the output does not depend on the platform's default encoding.
with open(os.path.join(SAVE_DIR, 'image_paths.txt'), 'w', encoding='utf-8') as f:
    f.writelines(p + '\n' for p in image_paths)

with open(os.path.join(SAVE_DIR, 'captions.txt'), 'w', encoding='utf-8') as f:
    f.writelines(c + '\n' for c in captions)

print("✅ Done saving!")
print(" - fMRI shape:", fMRI_array.shape)
print(" - # Captions:", len(captions))
print(" - Images saved in:", SAVE_DIR)


Found 27750 samples for subject 0.


Saving samples for 0: 100%|██████████████| 27750/27750 [03:56<00:00, 117.28it/s]


✅ Done saving!
 - fMRI shape: (27750, 215, 200)
 - # Captions: 27750
 - Images saved in: ./prepared_nsd_data_subj01
