<a href="https://colab.research.google.com/github/Jatin-Khiyani/Visual-Situmlai-Reconstruction-Using-fMRI-and-Deep-Learning/blob/main/CLIP%20Visual%20Feature%20Extraction/Clip_Visual_Feature_extractions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/drive/MyDrive/NSD_Dataset/prepared_nsd_data_subj01.zip

Streaming output truncated to the last 5000 lines.
  inflating: prepared_nsd_data_subj01/image_16164.png  
  inflating: prepared_nsd_data_subj01/image_01418.png  
  inflating: prepared_nsd_data_subj01/image_19257.png  
  inflating: prepared_nsd_data_subj01/image_06377.png  
  inflating: prepared_nsd_data_subj01/image_09044.png  
  inflating: prepared_nsd_data_subj01/image_12302.png  
  inflating: prepared_nsd_data_subj01/image_24868.png  
  inflating: prepared_nsd_data_subj01/image_02111.png  
  inflating: prepared_nsd_data_subj01/image_14773.png  
  inflating: prepared_nsd_data_subj01/image_04560.png  
  inflating: prepared_nsd_data_subj01/image_12316.png  
  inflating: prepared_nsd_data_subj01/image_02105.png  
  inflating: prepared_nsd_data_subj01/image_15479.png  
  inflating: prepared_nsd_data_subj01/image_14767.png  
  inflating: prepared_nsd_data_subj01/image_13008.png  
  inflating: prepared_nsd_data_subj01/image_04574.png  
  inflating: prepared_nsd_data_subj01/i

In [None]:
# === 0. Install & imports ===
# Uncomment and run once if you haven’t installed these already:
# !pip install transformers tqdm joblib

import os
import glob
import numpy as np
from PIL import Image
from tqdm import tqdm
import joblib

import torch
from transformers import CLIPProcessor, CLIPVisionModel
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# === 1. Config ===
# Stimulus images: directory they were unzipped into and the glob that matches them.
IMG_DIR     = "/content/prepared_nsd_data_subj01"
IMG_PATTERN = "image_*.png"

# Indices into the vision model's `hidden_states` tuple whose activations are exported.
LAYERS      = [2, 4, 6, 8, 10, 12]

# Every artefact produced below (feature memmap, scalers, Ridge model) is written here.
OUTPUT_DIR  = "/content/drive/MyDrive/NSD_Dataset/CLIP_Visual_Feature"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# === 2. Init CLIP & infer feature dims ===
# Load the image preprocessor and the vision tower; output_hidden_states=True
# makes the forward pass return the per-layer activations read below.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
vision    = (CLIPVisionModel
               .from_pretrained("openai/clip-vit-base-patch32", output_hidden_states=True)
               .eval()
               .to(DEVICE))

# Peek at one image to compute dims_per_layer and total_dim.
# sorted() makes the probe image deterministic (glob order is arbitrary), and the
# explicit check replaces an opaque IndexError when the dataset was not unzipped.
candidate_paths = sorted(glob.glob(f"{IMG_DIR}/{IMG_PATTERN}"))
if not candidate_paths:
    raise FileNotFoundError(
        f"No images matching '{IMG_PATTERN}' found in '{IMG_DIR}'. "
        "Unzip the dataset before running this cell."
    )
first_path = candidate_paths[0]

# convert() returns an independent in-memory image, so the file handle can be
# closed as soon as the `with` block exits.
with Image.open(first_path) as probe:
    first_img = probe.convert("RGB")
inputs = processor(images=first_img, return_tensors="pt").to(DEVICE)
with torch.no_grad():
    out = vision(**inputs)

# Flattened feature size contributed by each selected layer: every token except
# the first one, times the hidden width.
dims_per_layer = []
for L in LAYERS:
    hs = out.hidden_states[L]           # [1, 1+patches, hidden_dim]
    n_patches  = hs.shape[1] - 1
    hidden_dim = hs.shape[2]
    dims_per_layer.append(n_patches * hidden_dim)
total_dim = sum(dims_per_layer)

# The probe tensors are no longer needed; drop them and release cached GPU memory.
del out, inputs, first_img
torch.cuda.empty_cache()

# === 3. Create or load on-disk memmap for CLIP features ===
feat_path = os.path.join(OUTPUT_DIR, "clip_all_layers.dat")
image_paths = sorted(glob.glob(f"{IMG_DIR}/{IMG_PATTERN}"))
N = len(image_paths)
if N == 0:
    raise FileNotFoundError(
        f"No images matching '{IMG_PATTERN}' found in '{IMG_DIR}'."
    )

# Size the feature file must have for an (N, total_dim) float32 matrix.
expected_bytes = N * total_dim * np.dtype("float32").itemsize

if not os.path.exists(feat_path):
    # Extract once. Features are written to a temporary file that is renamed only
    # after the last image, so an interrupted run never leaves behind a
    # full-size but half-filled file that would later be mistaken for valid data.
    tmp_path = feat_path + ".partial"
    all_feats = np.memmap(tmp_path, mode="w+", dtype="float32", shape=(N, total_dim))
    print(f"Extracting CLIP features for {N} images → '{feat_path}'")
    for idx, path in enumerate(tqdm(image_paths, desc="Images")):
        with Image.open(path) as raw:
            img = raw.convert("RGB")
        inputs = processor(images=img, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            out = vision(**inputs)
        # Concatenate the selected layers side by side in row `idx`; the first
        # token of every layer is dropped and the remaining tokens are flattened.
        ptr = 0
        for L, dim in zip(LAYERS, dims_per_layer):
            patch_feats = out.hidden_states[L][:, 1:, :].reshape(-1).cpu().numpy()
            all_feats[idx, ptr:ptr + dim] = patch_feats
            ptr += dim
        del out, inputs, img
    all_feats.flush()
    del all_feats                     # release the mapping before renaming the file
    os.replace(tmp_path, feat_path)
elif os.path.getsize(feat_path) != expected_bytes:
    # A size mismatch means the file was built for a different image set or
    # layer selection; mapping it anyway would misalign features and images.
    raise ValueError(
        f"'{feat_path}' is {os.path.getsize(feat_path)} bytes but "
        f"{expected_bytes} bytes are expected for shape ({N}, {total_dim}). "
        "Delete the file to re-extract the features."
    )

# Now load the memmapped CLIP features for read-only access
clip_feats_mem = np.memmap(
    feat_path,
    mode="r",
    dtype="float32",
    shape=(N, total_dim),
)

# === 4. Load fMRI data & fit scalers in chunks ===
X_path = "/content/drive/MyDrive/NSD_Dataset/fmri.npy"
# mmap_mode="r" maps the file instead of reading it all into memory up front.
X_mem  = np.load(X_path, mmap_mode="r")
# Flatten everything after the first axis so each row is one sample.
X_mem  = X_mem.reshape(X_mem.shape[0], -1)

# Rows of the fMRI matrix must pair one-to-one with the images. Raise instead
# of assert so the check survives `python -O` and explains what went wrong.
if X_mem.shape[0] != N:
    raise ValueError(
        f"fMRI data has {X_mem.shape[0]} rows but {N} images were found; "
        "the two must match one-to-one."
    )

scaler_X = StandardScaler()
scaler_Y = StandardScaler()
chunk    = 500                      # rows handed to each partial_fit call

# partial_fit accumulates the statistics incrementally, so only `chunk` rows of
# each matrix need to be materialised at a time.
print("Fitting StandardScalers in chunks…")
for i in tqdm(range(0, N, chunk), desc="Scaler fit"):
    scaler_X.partial_fit(X_mem[i : i + chunk])
    scaler_Y.partial_fit(clip_feats_mem[i : i + chunk])

# Persist both scalers so later stages can reuse the exact same normalisation.
joblib.dump(scaler_X, os.path.join(OUTPUT_DIR, "scaler_fmri.joblib"))
joblib.dump(scaler_Y, os.path.join(OUTPUT_DIR, "scaler_clip_all_layers.joblib"))

# === 5. Transform & train sklearn.Ridge on CPU ===
print("Transforming data & training Ridge (CPU)…")

# Standardise inputs (fMRI) and targets (CLIP features) with the fitted scalers.
# Both calls materialise the full matrices in memory.
X_z = scaler_X.transform(X_mem)                # shape (N, fmri_dim)
Y_z = scaler_Y.transform(clip_feats_mem)       # shape (N, total_dim)

# Multi-output ridge regression from fMRI to CLIP features; fit() returns the
# estimator itself, so construction and training can be chained.
ridge_path = os.path.join(OUTPUT_DIR, "ridge_all_layers_cpu.joblib")
model = Ridge(alpha=1.0, solver="auto").fit(X_z, Y_z)
joblib.dump(model, ridge_path)

print("✅ Done. CPU-trained Ridge model and scalers saved to:", OUTPUT_DIR)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Fitting StandardScalers in chunks…



Scaler fit:   0%|          | 0/56 [00:00<?, ?it/s][A
Scaler fit:   2%|▏         | 1/56 [00:04<03:55,  4.28s/it][A
Scaler fit:   4%|▎         | 2/56 [00:07<03:08,  3.49s/it][A
Scaler fit:   5%|▌         | 3/56 [00:10<02:58,  3.36s/it][A
Scaler fit:   7%|▋         | 4/56 [00:13<02:49,  3.25s/it][A
Scaler fit:   9%|▉         | 5/56 [00:16<02:47,  3.28s/it][A
Scaler fit:  11%|█         | 6/56 [00:19<02:34,  3.10s/it][A
Scaler fit:  12%|█▎        | 7/56 [00:22<02:35,  3.17s/it][A
Scaler fit:  14%|█▍        | 8/56 [00:25<02:21,  2.95s/it][A
Scaler fit:  16%|█▌        | 9/56 [00:28<02:23,  3.04s/it][A
Scaler fit:  18%|█▊        | 10/56 [00:31<02:18,  3.02s/it][A
Scaler fit:  20%|█▉        | 11/56 [00:35<02:23,  3.19s/it][A
Scaler fit:  21%|██▏       | 12/56 [00:38<02:21,  3.22s/it][A
Scaler fit:  23%|██▎       | 13/56 [00:41<02:14,  3.13s/it][A
Scaler fit:  25%|██▌       | 14/56 [00:44<02:08,  3.06s/it][A
Scaler fit:  27%|██▋       | 15/56 [00:47<02:07,  3.11s/it][A
Scaler fi

Transforming data & training Ridge (CPU)…
✅ Done. CPU-trained Ridge model and scalers saved to: /content/drive/MyDrive/NSD_Dataset/CLIP_Visual_Feature


In [None]:
#!/usr/bin/env python3
# stage2_from_existing.py

import os
import glob
import joblib
import numpy as np
from tqdm import tqdm
from PIL import Image
import torch
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge
from transformers import CLIPProcessor, CLIPVisionModel

# === CONFIG ===
# --- inputs ---
IMG_DIR     = "/content/prepared_nsd_data_subj01"
IMG_PATTERN = "image_*.png"
FMRI_PATH   = "/content/drive/MyDrive/NSD_Dataset/fmri.npy"
FEATURE_DIR = "/content/drive/MyDrive/NSD_Dataset/CLIP_Visual_Feature"
SCALER_X    = os.path.join(FEATURE_DIR, "scaler_fmri.joblib")
SCALER_Y    = os.path.join(FEATURE_DIR, "scaler_clip_all_layers.joblib")
# NOTE(review): this path lives outside FEATURE_DIR and its file name differs
# from the "ridge_all_layers_cpu.joblib" artefact saved by the training cell —
# confirm it points at the intended full-dimension model.
RIDGE_FULL  = "/content/riedge.joblib"

# --- outputs ---
OUT_MODEL   = os.path.join(FEATURE_DIR, "ridge_clip_structural.joblib")
OUT_IDXS    = os.path.join(FEATURE_DIR, "selected_clip_idxs.npy")

# --- hyper-parameters ---
LAYERS      = [2, 4, 6, 8, 10, 12]   # indices into the model's hidden_states tuple
ALPHA       = 1.0                    # ridge penalty for the reduced model

# Use the GPU when torch can see one.
if torch.cuda.is_available():
    DEVICE  = "cuda"
else:
    DEVICE  = "cpu"

# === 1. Load scalers, data, and your existing full-dims model ===
# fMRI: memory-map the array, then flatten all trailing axes so that each row
# is one sample.
X_mem = np.load(FMRI_PATH, mmap_mode="r")
n_rows = X_mem.shape[0]
X_mem = X_mem.reshape(n_rows, -1)

# Previously fitted standardisers for the fMRI inputs and the CLIP targets.
scaler_X, scaler_Y = joblib.load(SCALER_X), joblib.load(SCALER_Y)

# Ridge model that predicts the full CLIP feature vector from fMRI.
model_full = joblib.load(RIDGE_FULL)

# CLIP features memmap
image_paths = sorted(glob.glob(f"{IMG_DIR}/{IMG_PATTERN}"))
if not image_paths:
    # Fail with a clear message instead of an IndexError on image_paths[0].
    raise FileNotFoundError(
        f"No images matching '{IMG_PATTERN}' found in '{IMG_DIR}'."
    )

# first grab dims_per_layer exactly as before: run one image through the vision
# model and read the shape of each selected hidden state.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
vision = CLIPVisionModel.from_pretrained(
    "openai/clip-vit-base-patch32", output_hidden_states=True
).eval().to(DEVICE)

# convert() yields an independent in-memory image, so the file can be closed
# immediately.
with Image.open(image_paths[0]) as probe:
    first_img = probe.convert("RGB")
with torch.no_grad():
    inp = processor(images=first_img, return_tensors="pt").to(DEVICE)
    out = vision(**inp)

# Per-layer flattened width: (tokens minus the first one) * hidden size.
dims_per_layer = []
for L in LAYERS:
    hs = out.hidden_states[L]
    dims_per_layer.append((hs.shape[1]-1) * hs.shape[2])
total_dim = sum(dims_per_layer)
del out, inp, first_img
torch.cuda.empty_cache()

# Map the previously extracted features read-only; one row per image.
feat_path = os.path.join(FEATURE_DIR, "clip_all_layers.dat")
Y_mem = np.memmap(feat_path, mode="r", dtype="float32",
                 shape=(len(image_paths), total_dim))

# fMRI rows and CLIP rows must pair one-to-one. Checking here fails fast,
# before the expensive transform/predict steps, rather than deep inside the
# correlation code.
if X_mem.shape[0] != Y_mem.shape[0]:
    raise ValueError(
        f"fMRI data has {X_mem.shape[0]} rows but {Y_mem.shape[0]} images "
        "were found; the two must match one-to-one."
    )

# scale into memory
X_z = scaler_X.transform(X_mem)
Y_z = scaler_Y.transform(Y_mem)

# === 2. Use your full-dims model to get predictions and compute Pearson r ===
print("Predicting full-dims outputs with your existing model…")
Y_pred = model_full.predict(X_z)
if Y_pred.shape != Y_z.shape:
    raise ValueError(
        f"Prediction shape {Y_pred.shape} does not match target shape "
        f"{Y_z.shape}; was the full model trained on the same features?"
    )

print("Computing per-feature Pearson r…")
# Vectorised column-wise Pearson correlation, processed in column chunks so the
# float64 temporaries stay small. This replaces one scipy.stats.pearsonr call
# per feature column. A column whose target OR prediction has zero variance
# (or contains non-finite values) gets r = 0.0 instead of NaN, so the
# percentile cut-off computed afterwards cannot be poisoned. Values agree with
# pearsonr up to floating-point rounding.
r = np.zeros(total_dim, dtype=np.float64)
col_chunk = 1024
for start in range(0, total_dim, col_chunk):
    stop = min(start + col_chunk, total_dim)
    tgt = np.asarray(Y_z[:, start:stop], dtype=np.float64)
    prd = np.asarray(Y_pred[:, start:stop], dtype=np.float64)
    tgt = tgt - tgt.mean(axis=0)
    prd = prd - prd.mean(axis=0)
    # einsum("ij,ij->j") is a per-column dot product without an (N, chunk) temporary.
    cov = np.einsum("ij,ij->j", tgt, prd)
    norm = np.sqrt(np.einsum("ij,ij->j", tgt, tgt) * np.einsum("ij,ij->j", prd, prd))
    chunk_r = np.zeros(stop - start, dtype=np.float64)
    # `where` leaves the 0.0 default in place wherever the denominator is 0 or NaN.
    np.divide(cov, norm, out=chunk_r, where=norm > 0)
    # Guard against |r| creeping past 1 through rounding.
    r[start:stop] = np.clip(chunk_r, -1.0, 1.0)

# === 3. Pick the top 25% per layer ===
# `r` is laid out layer after layer, each layer occupying `dim` consecutive
# entries, so `ptr` walks the layer boundaries.
print("Selecting top 25% dims per layer…")
selected = []
ptr = 0
for dim in dims_per_layer:
    layer_r = r[ptr:ptr+dim]
    # nanpercentile ignores NaN correlations. With plain np.percentile a single
    # NaN makes the cut-off NaN, every `>=` comparison False, and the whole
    # layer silently contributes zero dims. Without NaNs the result is identical.
    cutoff = np.nanpercentile(layer_r, 75)
    # NaN entries compare False and are therefore never selected.
    idxs = np.flatnonzero(layer_r >= cutoff) + ptr   # shift to global feature index
    selected.append(idxs)
    ptr += dim

selected_indices = np.concatenate(selected)
print(f"  → total_dim={total_dim}, selected={len(selected_indices)}")
# Saved so later code can map the reduced model's outputs back to CLIP dims.
np.save(OUT_IDXS, selected_indices)

# === 4. Train the new “structural” Ridge on only those dims ===
print("Training new Ridge on selected dims…")

# Keep only the target columns chosen above; the inputs are unchanged.
Y_sel = Y_z[:, selected_indices]

# fit() returns the estimator, so the model can be built and trained in one go.
model_sel = Ridge(alpha=ALPHA).fit(X_z, Y_sel)

joblib.dump(model_sel, OUT_MODEL)
print("Saved structural model to:", OUT_MODEL)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Predicting full-dims outputs with your existing model…
Computing per-feature Pearson r…
Selecting top 25% dims per layer…
  → total_dim=225792, selected=56448
Training new Ridge on selected dims…
Saved structural model to: /content/drive/MyDrive/NSD_Dataset/CLIP_Visual_Feature/ridge_clip_structural.joblib
