# Setup (be sure to check which `cuda:#` GPU you are using!)

In [1]:
import os
import math
import json
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional
from datetime import datetime

import h5py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

# Import utility functions
from utils import (
    is_interactive,
    to_builtin,
    load_fmri, 
    align_features_and_fmri_samples,
    calculate_metrics,
    normalize_fmri,
    check_fmri_stats,
    load_friends_s7_features,
)

# Seed NumPy and PyTorch with the same fixed value for reproducibility
np.random.seed(42)
torch.manual_seed(42)

# When running interactively, enable autoreload so that edits made to
# functions in other files are picked up by this notebook automatically.
if is_interactive():
    for magic_name, magic_arg in (('load_ext', 'autoreload'), ('autoreload', '2')):
        get_ipython().run_line_magic(magic_name, magic_arg)

In [2]:
# CHECK "nvidia-smi" IN TERMINAL AND PICK A GPU THAT IS NOT CURRENTLY ACTIVE
GPU_INDEX = 4  # index of the GPU to run on; edit after checking nvidia-smi

# Validate the requested GPU here so that a wrong index is reported in this
# cell instead of surfacing later when tensors are first moved to `device`.
if torch.cuda.is_available() and GPU_INDEX < torch.cuda.device_count():
    device = torch.device(f'cuda:{GPU_INDEX}')
else:
    print(f"WARNING: cuda:{GPU_INDEX} is not available; falling back to CPU")
    device = torch.device('cpu')

# Defining the dataset, model, and additional utilities

## Dataset and Model

In [3]:
class AlgonautsDataset(Dataset):
    """
    PyTorch Dataset pairing multimodal stimulus features with fMRI responses.

    This dataset handles:
    - Loading visual (InternVL) and audio (Whisper) features from HDF5 files
    - Aligning stimulus features with fMRI responses (HRF delay and stimulus
      window are forwarded to ``align_features_and_fmri_samples``)

    Each item is a dict with keys ``'audio'``, ``'video'`` and ``'fmri'``,
    all converted to float32 tensors.

    NOTE(review): ``self.mean`` / ``self.std`` are stored but not applied to
    the samples returned by ``__getitem__`` — confirm that normalization is
    meant to happen elsewhere.
    """

    # Names of the HDF5 datasets read from each visual / audio feature file.
    VISUAL_LAYER = 'language_model.model.layers.20.post_attention_layernorm'
    AUDIO_LAYER = 'layers.12.fc2'

    def __init__(
        self,
        features_dir: Dict[str, Path],
        fmri_dir: Path,
        movies: List[str],
        subject: int,
        excluded_samples_start: int = 5,
        excluded_samples_end: int = 5,
        hrf_delay: int = 0,
        stimulus_window: int = 12,
        mean: Optional[np.ndarray] = None,
        std: Optional[np.ndarray] = None
    ):
        """
        Initialize the dataset.

        Args:
            features_dir: Dictionary mapping modality names to their feature directories
            fmri_dir: Path to fMRI data directory
            movies: List of movie identifiers to include
            subject: Subject ID
            excluded_samples_start: Number of initial TRs to exclude
            excluded_samples_end: Number of final TRs to exclude
            hrf_delay: Hemodynamic response function delay in TRs
            stimulus_window: Number of feature chunks to use for modeling each TR
            mean: Pre-computed mean for normalization (optional)
            std: Pre-computed std for normalization (optional)

        If either ``mean`` or ``std`` is omitted, the missing statistic is
        computed from this dataset's aligned fMRI data, so neither attribute
        is ever left as ``None``.
        """
        self.features_dir = features_dir
        self.fmri_dir = fmri_dir
        self.movies = movies
        self.subject = subject
        self.excluded_samples_start = excluded_samples_start
        self.excluded_samples_end = excluded_samples_end
        self.hrf_delay = hrf_delay
        self.stimulus_window = stimulus_window

        # Load stimulus features
        print(f"Loading stimulus features for subject {subject}...")
        self.stimuli_features = self._load_all_stimulus_features()

        # Load and align fMRI data
        print(f"Loading fMRI data for subject {subject}...")
        fmri_data = load_fmri(self.fmri_dir, subject)

        # Align features and fMRI
        self.aligned_features, self.aligned_fmri = align_features_and_fmri_samples(
            self.stimuli_features,
            fmri_data,
            self.excluded_samples_start,
            self.excluded_samples_end,
            self.hrf_delay,
            self.stimulus_window,
            self.movies
        )

        # Compute or use provided normalization statistics
        self.mean, self.std = self._resolve_normalization_stats(mean, std)

        # Print data statistics for debugging
        check_fmri_stats(self.aligned_fmri, f"Subject {subject} fMRI data")

        print(f"Dataset created: {len(self)} samples")
        print(f"fMRI shape: {self.aligned_fmri.shape}")

    def _resolve_normalization_stats(
        self,
        mean: Optional[np.ndarray],
        std: Optional[np.ndarray],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Return the (mean, std) pair to store on the dataset.

        Caller-supplied values are always kept. Whichever statistic is
        missing is taken from ``normalize_fmri(self.aligned_fmri)``; the
        previous ``mean is None and std is None`` check left the missing one
        as ``None`` when only a single statistic was passed in.
        """
        if mean is not None and std is not None:
            return mean, std

        print("Computing normalization statistics from training data...")
        _, computed_mean, computed_std = normalize_fmri(self.aligned_fmri)
        return (
            computed_mean if mean is None else mean,
            computed_std if std is None else std,
        )

    def _load_all_stimulus_features(self) -> Dict[str, Dict[str, np.ndarray]]:
        """
        Load all stimulus features for the specified movies.

        Returns:
            A plain dict ``{modality: {key: array}}``; movies whose identifier
            contains ``'friends'`` use the Friends loader, all others use the
            movie10 loader.
        """
        features = defaultdict(dict)

        for movie in self.movies:
            if 'friends' in movie:
                self._load_friends_features(movie, features)
            else:
                self._load_movie10_features(movie, features)

        # Remove language features since we're not using them
        features.pop('language', None)

        return dict(features)

    @staticmethod
    def _read_h5_dataset(path: Path, dataset_name: str) -> np.ndarray:
        """Read one named dataset fully into memory and close the file."""
        with h5py.File(path, 'r') as f:
            return f[dataset_name][:]

    def _load_feature_pair(self, file_base: str, key: str, features: Dict) -> None:
        """
        Load the visual and audio arrays for one stimulus file.

        Args:
            file_base: File name prefix preceding ``_features_<modality>.h5``
            key: Key under which both arrays are stored in ``features``
            features: ``{modality: {key: array}}`` mapping, updated in place
        """
        visual_path = self.features_dir['visual'] / 'visual' / f"{file_base}_features_visual.h5"
        audio_path = self.features_dir['audio'] / 'audio' / f"{file_base}_features_audio.h5"

        # Visual first, then audio, so a missing visual file fails before
        # anything is stored for this key.
        features['visual'][key] = self._read_h5_dataset(visual_path, self.VISUAL_LAYER)
        features['audio'][key] = self._read_h5_dataset(audio_path, self.AUDIO_LAYER)

    def _load_friends_features(self, movie: str, features: Dict):
        """
        Load features for Friends episodes.

        The season token is the second '-'-separated field of ``movie``.
        Episodes are discovered by listing the audio feature directory, and
        each is stored under the second '_'-separated field of its file base.
        """
        season = movie.split('-')[1]

        audio_dir = self.features_dir['audio'] / 'audio'
        episode_files = sorted(
            f for f in os.listdir(audio_dir)
            if f"{season}e" in f and '_features_' in f
        )

        for episode_file in episode_files:
            episode_base = episode_file.split('_features_')[0]
            episode_key = episode_base.split('_')[1]
            self._load_feature_pair(episode_base, episode_key, features)

    def _load_movie10_features(self, movie: str, features: Dict):
        """
        Load features for movie10 clips.

        Partitions are discovered by listing the audio feature directory for
        files containing the movie name (``movie`` without the ``'movie10-'``
        prefix); each is stored under its full file base.
        """
        movie_name = movie.replace('movie10-', '')

        audio_dir = self.features_dir['audio'] / 'audio'
        partitions = sorted(
            f for f in os.listdir(audio_dir)
            if movie_name in f and '_features_' in f
        )

        for partition in partitions:
            partition_base = partition.split('_features_')[0]
            self._load_feature_pair(partition_base, partition_base, features)

    def __len__(self) -> int:
        """Number of aligned samples (first dimension of the fMRI array)."""
        return self.aligned_fmri.shape[0]

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """
        Return sample ``idx`` as float32 tensors.

        Note that the visual features are exposed under the key ``'video'``.
        """
        return {
            'audio': torch.from_numpy(self.aligned_features['audio'][idx]).float(),
            'video': torch.from_numpy(self.aligned_features['visual'][idx]).float(),
            'fmri': torch.from_numpy(self.aligned_fmri[idx]).float()
        }

In [4]:
from algo_mihir_replication import AudioVisualfMRIModel

ckpt = "/home/mihir/projects/algonauts2025/checkpoints/ALLDATAav_sub1_hrf0_sw15_20250624_000323/last.ckpt"

# Fail early with a clear message instead of an opaque loader error.
if not Path(ckpt).is_file():
    raise FileNotFoundError(f"Checkpoint not found: {ckpt}")

# Configuration passed to the model constructor when restoring the checkpoint.
# NOTE(review): these values presumably have to match the ones used for
# training (the checkpoint directory name mentions sw15 / hrf0) — confirm.
model_config = {
        'latent_dim': 1024,
        'vision_proj_dim': 1024,
        'audio_proj_dim': 1024,
        'dropout_prob': 0.4,
        'encoder_dropout_prob': 0.2,
        'num_attn_heads': 8,
        'stimulus_window': 15,
        'learning_rate': 1e-5,
        'weight_decay': 0.04,
        'alpha': 0.8
    }

# map_location keeps the checkpoint tensors off the GPU they were saved from,
# so only the GPU selected in `device` is touched.
# NOTE(review): assumes AudioVisualfMRIModel is a Lightning module whose
# load_from_checkpoint accepts map_location — confirm.
model = AudioVisualfMRIModel.load_from_checkpoint(
    ckpt,
    config=model_config,
    strict=True,
    map_location=device,
)
model = model.to(device)
# Inference mode for dropout etc.; left as the last expression so the cell
# still displays the module summary.
model.eval()

AudioVisualfMRIModel(
  (vision_proj): Sequential(
    (0): Linear(in_features=3584, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (audio_proj): Sequential(
    (0): Linear(in_features=1280, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (av_fusion): AudioVisualFusion(
    (vision_audio_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
    )
    (audio_vision_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
    )
    (av_fusion_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (av_ffn): Sequential(
      (0): Linear(in_features=1024, out_features=1024, bias=True)
      (1):

# Create submission entry for leaderboards

In [5]:
from utils import (
    load_friends_s7_features,
    align_features_and_fmri_samples_friends_s7,
)

In [6]:
mihir_dir = Path('/home/mihir/projects/')
my_root_dir = Path.cwd()

# Feature directories, one per modality (no language features)
features_dir = dict(
    visual=mihir_dir / 'datasets' / "InternVL3_feat",
    audio=mihir_dir / 'datasets' / 'whisper_feat' / 'whisper',
)

fmri_dir = mihir_dir / 'datasets' / 'algonauts_2025.competitors' / 'fmri'

# Stop immediately if any required directory is missing
for name in features_dir:
    path = features_dir[name]
    if path.exists():
        continue
    raise FileNotFoundError(f"{name} features directory not found: {path}")
if not fmri_dir.exists():
    raise FileNotFoundError(f"fMRI directory not found: {fmri_dir}")

# Load the Friends season 7 stimulus features with language features disabled
features_s7 = load_friends_s7_features(features_dir, use_language=False)

In [7]:
# Alignment settings for the season 7 features; stimulus_window is the same
# value (15) that model_config uses.
alignment_kwargs = {'hrf_delay': 0, 'stimulus_window': 15}

# The helper takes the fMRI directory as a string here.
aligned_features_s7 = align_features_and_fmri_samples_friends_s7(
    features_s7, str(fmri_dir), **alignment_kwargs
)

Aligning stimulus and fMRI features of the four subjects:   0%|          | 0/4 [00:00<?, ?it/s]

Aligning stimulus and fMRI features of the four subjects: 100%|██████████| 4/4 [01:06<00:00, 16.71s/it]


In [9]:
def predict_episode(model, episode_features, device, batch_size=32):
    """
    Run the model over one episode in mini-batches.

    Args:
        model: Callable invoked as ``model(video_batch, audio_batch)`` and
            returning a tensor with one row per input sample.
        episode_features: Mapping with ``'visual'`` and ``'audio'`` NumPy
            arrays whose first dimension indexes the samples.
        device: Device the input batches are moved to before the forward pass.
        batch_size: Maximum number of samples per forward pass.

    Returns:
        A float32 NumPy array with the per-batch outputs concatenated along
        axis 0.

    Raises:
        ValueError: If the episode contains no samples.
    """
    num_samples = episode_features['visual'].shape[0]
    if num_samples == 0:
        raise ValueError("Episode has no samples to predict")

    batch_outputs = []
    with torch.no_grad():
        for start in range(0, num_samples, batch_size):
            stop = min(start + batch_size, num_samples)

            video_batch = torch.from_numpy(
                episode_features['visual'][start:stop]
            ).float().to(device)
            audio_batch = torch.from_numpy(
                episode_features['audio'][start:stop]
            ).float().to(device)

            batch_outputs.append(model(video_batch, audio_batch).cpu().numpy())

    # Store as float32 to reduce file size
    return np.concatenate(batch_outputs, axis=0).astype(np.float32)


predictions = {}
subjects = [1, 2, 3, 5]
batch_size = 32

# Do not rely on an earlier cell having left the model in eval mode; with
# dropout active the predictions would not be deterministic.
model.eval()

# NOTE(review): the same `model` is used for every subject, and the checkpoint
# path it was loaded from contains "sub1" — confirm this is intended.
for subject in subjects:
    subject_key = f"sub-0{subject}"
    predictions[subject_key] = {}

    print(f"\nGenerating predictions for {subject_key}")

    subject_features = aligned_features_s7[subject_key]
    for episode in tqdm(subject_features.keys(), desc=f"Episodes for {subject_key}"):
        predictions[subject_key][episode] = predict_episode(
            model, subject_features[episode], device, batch_size
        )


Generating predictions for sub-01


Episodes for sub-01:   0%|          | 0/49 [00:00<?, ?it/s]

Episodes for sub-01: 100%|██████████| 49/49 [00:02<00:00, 23.23it/s]



Generating predictions for sub-02


Episodes for sub-02: 100%|██████████| 49/49 [00:02<00:00, 22.73it/s]



Generating predictions for sub-03


Episodes for sub-03: 100%|██████████| 49/49 [00:02<00:00, 22.90it/s]



Generating predictions for sub-05


Episodes for sub-05: 100%|██████████| 49/49 [00:02<00:00, 21.89it/s]


In [10]:
# Save predictions
exp_name = 'av_sw15_hrf0'
submission_dir = Path('/home/mihir/projects/algonauts2025/saved_preds') / exp_name
submission_path = submission_dir / f"{exp_name}.npy"
submission_npy = str(submission_path)  # kept as a string for later use
submission_dir.mkdir(parents=True, exist_ok=True)

print(f"\nSaving predictions to {submission_dir}")
# The nested dict is stored as a pickled object array inside the .npy file.
np.save(submission_npy, predictions)

# Verify output format
print("\nVerifying output format:")
# allow_pickle is required to read the dict back; this is only acceptable
# because the file was written by this very cell. Do not reuse this call on
# files from untrusted sources.
loaded_predictions = np.load(submission_npy, allow_pickle=True).item()

expected_subjects = ["sub-01", "sub-02", "sub-03", "sub-05"]
missing_subjects = []
total_size = 0
# Collect flat arrays and concatenate once. Extending a Python list element
# by element would create one Python object per predicted value.
flat_chunks = []

for subject in expected_subjects:
    if subject not in loaded_predictions:
        print(f"\nWARNING: {subject} missing from predictions!")
        missing_subjects.append(subject)
        continue

    print(f"\n{subject}:")
    subject_size = 0
    for episode, preds in loaded_predictions[subject].items():
        episode_size = preds.nbytes / (1024 * 1024)  # Size in MB
        subject_size += episode_size
        print(f"  {episode}: shape {preds.shape}, dtype {preds.dtype}, size {episode_size:.2f} MB")
        flat_chunks.append(preds.ravel())
    print(f"  Total for {subject}: {subject_size:.2f} MB")
    total_size += subject_size

# Check prediction statistics
if flat_chunks:
    all_predictions_flat = np.concatenate(flat_chunks)
    print(f"\nPrediction statistics:")
    print(f"  Mean: {np.mean(all_predictions_flat):.4f}")
    print(f"  Std: {np.std(all_predictions_flat):.4f}")
    print(f"  Min: {np.min(all_predictions_flat):.4f}")
    print(f"  Max: {np.max(all_predictions_flat):.4f}")
else:
    all_predictions_flat = np.array([])
    print("\nWARNING: no predictions found; skipping prediction statistics")

# Report the in-memory array total and the real on-disk size separately.
file_size = submission_path.stat().st_size / (1024 * 1024)
print(f"\nTotal array size: {total_size:.2f} MB")
print(f"Total file size: {file_size:.2f} MB")

if missing_subjects:
    print(f"\nWARNING: submission file is incomplete, missing: {', '.join(missing_subjects)}")
else:
    print("\nSubmission file created successfully!")
print(f"\nNext steps:")
print(f"1. Create zip file: zip -j {exp_name}.zip {submission_npy}")
print(f"2. Check zip size is under 15GB")
print(f"3. Upload to Codabench 'My Submissions' page")
print(f"4. Wait for processing (~few minutes)")
print(f"5. Check 'Results' page for leaderboard ranking")
print(f"\nGood luck!")


Saving predictions to /home/mihir/projects/algonauts2025/saved_preds/av_sw15_hrf0

Verifying output format:

sub-01:
  s07e01a: shape (460, 1000), dtype float32, size 1.75 MB
  s07e01b: shape (494, 1000), dtype float32, size 1.88 MB
  s07e02a: shape (492, 1000), dtype float32, size 1.88 MB
  s07e02b: shape (526, 1000), dtype float32, size 2.01 MB
  s07e03a: shape (417, 1000), dtype float32, size 1.59 MB
  s07e03b: shape (452, 1000), dtype float32, size 1.72 MB
  s07e04a: shape (447, 1000), dtype float32, size 1.71 MB
  s07e04b: shape (482, 1000), dtype float32, size 1.84 MB
  s07e05a: shape (454, 1000), dtype float32, size 1.73 MB
  s07e05b: shape (488, 1000), dtype float32, size 1.86 MB
  s07e06a: shape (478, 1000), dtype float32, size 1.82 MB
  s07e06b: shape (513, 1000), dtype float32, size 1.96 MB
  s07e07a: shape (473, 1000), dtype float32, size 1.80 MB
  s07e07b: shape (507, 1000), dtype float32, size 1.93 MB
  s07e08a: shape (474, 1000), dtype float32, size 1.81 MB
  s07e08b: s