# Experiment 1: VideoMAE Baseline (Paper-like Settings)

**Objective**: Replace ViT with VideoMAE following paper settings.

**Changes from Baseline**:
- Model: `vit_small_patch16_224` → `VideoMAE-base-finetuned-kinetics`
- Normalization: `[0.5,0.5,0.5]` → VideoMAE processor stats
- **LR**: Linear scaling rule (paper): `LR = BASE_LR × effective_batch / 256`
- **Scheduler**: Cosine with warmup (paper)
- Everything else: keep the baseline data pipeline; train for 4 epochs as a quick test (the paper uses 50)

In [None]:
## 1. Setup & Imports

import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from transformers import get_cosine_schedule_with_warmup

# Reproducibility
def seed_everything(seed=42):
    """Seed every RNG used in this notebook and pin cuDNN to deterministic mode.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures the cuDNN backend flags.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Only affects interpreters started after this point (e.g. spawned
    # subprocesses); the running interpreter fixed its hash seed at startup.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and possibly pick a different
    # algorithm on each run, so switch it off explicitly for repeatable results.
    torch.backends.cudnn.benchmark = False


seed_everything(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {DEVICE}")

In [None]:
## 2. Configuration (Paper-like Settings)

# Dataset locations (Kaggle input mounts)
PATH_DATA_TRAIN = Path('/kaggle/input/action-video/data/data_train')
PATH_DATA_TEST = Path('/kaggle/input/action-video/data/test')

# Pretrained checkpoint used both for the model weights and the processor stats
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"

# Clip sampling / input geometry
NUM_FRAMES = 16
FRAME_STRIDE = 2
IMAGE_SIZE = 224

# Batching: gradients are accumulated over several micro-batches
BATCH_SIZE = 8
GRAD_ACCUM_STEPS = 4
EFFECTIVE_BATCH = BATCH_SIZE * GRAD_ACCUM_STEPS  # 8 * 4 = 32
EPOCHS = 4  # quick test (paper uses 50)

# Learning rate follows the linear scaling rule relative to a batch of 256:
# 1e-3 * 32 / 256 = 1.25e-4
BASE_LR = 1e-3
LR = BASE_LR * EFFECTIVE_BATCH / 256

# Regularisation and schedule
WEIGHT_DECAY = 0.05
WARMUP_RATIO = 0.1  # 10% of the steps (paper uses ~5 epochs for 50 epochs)

# Normalization statistics are read from the checkpoint's image processor
processor = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)
MEAN, STD = processor.image_mean, processor.image_std

# Echo the resolved hyper-parameters, one per line
print(
    "Paper-like Settings:",
    f"  Effective Batch Size: {EFFECTIVE_BATCH}",
    f"  LR (scaled): {LR:.2e}",
    f"  Warmup Ratio: {WARMUP_RATIO}",
    f"  Weight Decay: {WEIGHT_DECAY}",
    f"  Normalization - Mean: {MEAN}, Std: {STD}",
    sep="\n",
)

In [None]:
## 3. Dataset & Transforms (Same as Baseline)

class VideoTransform:
    """Clip-level augmentation followed by per-frame normalization.

    The normalization statistics come from the module-level ``MEAN`` / ``STD``
    (read from the VideoMAE processor). In training mode the whole clip is
    randomly down-scaled, randomly cropped, resized to ``image_size`` and
    randomly flipped; in eval mode it is only resized.
    """
    def __init__(self, image_size=224, is_train=True):
        self.image_size = image_size
        self.is_train = is_train
        self.mean = MEAN
        self.std = STD

    def _resize(self, clip, size):
        """Bilinear resize of every frame in ``clip`` to ``size`` = [H, W]."""
        return TF.resize(clip, size, interpolation=InterpolationMode.BILINEAR)

    def _augment(self, clip):
        """Apply the training-time random scale / crop / flip to the clip."""
        side = self.image_size
        src_h, src_w = clip.shape[-2:]

        # Random down-scale of the whole clip by a factor in [0.8, 1.0]
        factor = random.uniform(0.8, 1.0)
        scaled_h, scaled_w = int(src_h * factor), int(src_w * factor)
        clip = self._resize(clip, [scaled_h, scaled_w])

        # Random crop window; if the scaled clip is smaller than the target
        # along an axis, the full extent of that axis is kept
        top = random.randint(0, max(0, scaled_h - side))
        left = random.randint(0, max(0, scaled_w - side))
        clip = TF.crop(clip, top, left, min(side, scaled_h), min(side, scaled_w))

        clip = self._resize(clip, [side, side])

        # Horizontal flip with probability 0.5
        if random.random() < 0.5:
            clip = TF.hflip(clip)
        return clip

    def __call__(self, frames):
        # frames: [T, C, H, W]
        if self.is_train:
            clip = self._augment(frames)
        else:
            clip = self._resize(frames, [self.image_size, self.image_size])

        return torch.stack([TF.normalize(frame, self.mean, self.std) for frame in clip])


class VideoDataset(Dataset):
    """Labelled frame-folder dataset laid out as ``root/<class>/<video>/*.jpg``.

    Every sub-directory of ``root`` is a class; every sub-directory of a class
    is one video whose ``*.jpg`` files (sorted by name) are its frames.

    Args:
        root: Directory containing one sub-directory per class.
        transform: Optional callable applied to the stacked frame tensor.
        num_frames: Number of frames returned per clip.
        frame_stride: Sub-sampling step applied to the sampling grid
            (see ``_select_indices``).
    """
    def __init__(self, root, transform=None, num_frames=16, frame_stride=2):
        self.root = Path(root)
        self.transform = transform
        self.num_frames = num_frames
        self.frame_stride = frame_stride

        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}

        self.samples = []
        for class_name in self.classes:
            class_dir = self.root / class_name
            # iterdir() yields entries in arbitrary, filesystem-dependent
            # order; sort so that a sample index always refers to the same
            # video and seeded shuffling is reproducible.
            for video_dir in sorted(class_dir.iterdir()):
                if video_dir.is_dir():
                    self.samples.append((video_dir, self.class_to_idx[class_name]))

    def __len__(self):
        return len(self.samples)

    def _select_indices(self, total):
        """Return ``num_frames`` frame indices spread over ``total`` frames.

        Raises:
            ValueError: If ``total`` is not positive.
        """
        if total <= 0:
            raise ValueError("No frames")
        if total == 1:
            # A single frame is simply repeated for the whole clip
            return torch.zeros(self.num_frames, dtype=torch.long)
        # Build an evenly spaced grid over the whole video, then keep every
        # ``frame_stride``-th grid point
        steps = max(self.num_frames * self.frame_stride, self.num_frames)
        grid = torch.linspace(0, total - 1, steps=steps)
        idxs = grid[::self.frame_stride].long()
        if idxs.numel() < self.num_frames:
            # Pad by repeating the last selected index
            pad = idxs.new_full((self.num_frames - idxs.numel(),), idxs[-1].item())
            idxs = torch.cat([idxs, pad], dim=0)
        return idxs[:self.num_frames]

    def __getitem__(self, idx):
        """Load one clip and return ``(frames, label)``.

        Raises:
            ValueError: If the video directory contains no ``*.jpg`` frames.
        """
        video_dir, label = self.samples[idx]
        frame_files = sorted(video_dir.glob('*.jpg'))

        if not frame_files:
            raise ValueError(f"No frames in {video_dir}")

        # Plain ints are safer list indices / dict keys than 0-d tensors
        indices = self._select_indices(len(frame_files)).tolist()

        # Short videos repeat indices; decode each distinct frame only once
        decoded = {}
        for i in indices:
            if i not in decoded:
                with Image.open(frame_files[i]) as img:
                    decoded[i] = TF.to_tensor(img.convert('RGB'))

        frames = torch.stack([decoded[i] for i in indices])  # [T, C, H, W]

        if self.transform:
            frames = self.transform(frames)

        return frames, label

In [None]:
## 4. Create Datasets & DataLoaders

# One transform per mode: random augmentation for training, resize-only for test
train_transform = VideoTransform(is_train=True, image_size=IMAGE_SIZE)
test_transform = VideoTransform(is_train=False, image_size=IMAGE_SIZE)

train_dataset = VideoDataset(
    PATH_DATA_TRAIN,
    transform=train_transform,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
)

# Shuffled loader; the incomplete last batch of each epoch is dropped
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=2,
    pin_memory=True,
)

print(
    f"Classes: {len(train_dataset.classes)}",
    f"Training samples: {len(train_dataset)}",
    f"Batches per epoch: {len(train_loader)}",
    sep="\n",
)

In [None]:
## 5. Load VideoMAE Model

# Label <-> index maps handed to the model config (id2label is the inverse map)
label2id = train_dataset.class_to_idx
id2label = {index: name for name, index in label2id.items()}

# Load the pretrained checkpoint and move it to the training device.
# ignore_mismatched_sizes lets the classifier head differ in size from the
# checkpoint's head (original note: 400 -> 51 classes).
model = VideoMAEForVideoClassification.from_pretrained(
    MODEL_CKPT,
    num_frames=NUM_FRAMES,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
).to(DEVICE)

# Parameter statistics: all weights vs. those that receive gradients
total_params = sum(w.numel() for w in model.parameters())
trainable_params = sum(w.numel() for w in model.parameters() if w.requires_grad)
print(f"Total params: {total_params:,}")
print(f"Trainable params: {trainable_params:,}")

In [None]:
## 6. Training Loop (with Scheduler)

def train_one_epoch(model, loader, optimizer, scheduler, scaler, device, grad_accum_steps=1):
    """Train ``model`` for one pass over ``loader`` with AMP and gradient accumulation.

    Args:
        model: Module whose forward call returns an object exposing ``.logits``.
        loader: Sized iterable yielding ``(videos, labels)`` batches.
        optimizer: Optimizer stepped once per accumulation group.
        scheduler: LR scheduler stepped once per optimizer step that was
            actually applied.
        scaler: Gradient scaler used to scale the loss before ``backward``.
        device: Device (``torch.device`` or string) the batches are moved to.
        grad_accum_steps: Number of micro-batches accumulated per optimizer step.

    Returns:
        Tuple ``(mean_loss, accuracy)`` computed over every sample of the epoch.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0

    # Autocast has to target the device the tensors live on; reduced-precision
    # autocast is only switched on for CUDA (elsewhere it runs in full precision).
    device_type = torch.device(device).type
    use_amp = device_type == 'cuda'
    num_batches = len(loader)

    optimizer.zero_grad()
    progress = tqdm(loader, desc="Training", leave=False)

    for batch_idx, (videos, labels) in enumerate(progress):
        videos = videos.to(device)  # [B, T, C, H, W]
        labels = labels.to(device)

        with torch.amp.autocast(device_type=device_type, enabled=use_amp):
            outputs = model(videos)
            logits = outputs.logits
            loss = F.cross_entropy(logits, labels)

        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
        total_loss += loss.item() * labels.size(0)

        # Average over the micro-batches that really make up this accumulation
        # group: the last group of the epoch may be shorter than
        # grad_accum_steps and must not be under-weighted.
        group_start = (batch_idx // grad_accum_steps) * grad_accum_steps
        group_size = min(grad_accum_steps, num_batches - group_start)
        loss = loss / group_size
        scaler.scale(loss).backward()

        should_step = ((batch_idx + 1) % grad_accum_steps == 0) or (batch_idx + 1 == num_batches)
        if should_step:
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            # The scaler skips optimizer.step() when it finds inf/NaN gradients
            # and then lowers its scale; only advance the schedule when the
            # weights were really updated.
            if scaler.get_scale() >= scale_before:
                scheduler.step()

        current_lr = scheduler.get_last_lr()[0]
        progress.set_postfix({'loss': total_loss/total, 'acc': correct/total, 'lr': f'{current_lr:.2e}'})

    return total_loss / total, correct / total

In [None]:
## 7. Train Model (with Cosine Schedule + Warmup)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Loss scaling is only needed for CUDA mixed precision
scaler = torch.amp.GradScaler(enabled=(DEVICE.type == 'cuda'))

# Calculate scheduler steps.
# train_one_epoch also steps the optimizer on the trailing partial
# accumulation group of every epoch, so the per-epoch count is a ceiling
# division. Flooring over the whole run under-counts the steps and lets the
# cosine schedule run past its horizon, where the LR starts rising again.
steps_per_epoch = (len(train_loader) + GRAD_ACCUM_STEPS - 1) // GRAD_ACCUM_STEPS
num_training_steps = steps_per_epoch * EPOCHS
num_warmup_steps = int(num_training_steps * WARMUP_RATIO)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

print(f"Training steps: {num_training_steps}")
print(f"Warmup steps: {num_warmup_steps}")

best_acc = 0.0

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    loss, acc = train_one_epoch(model, train_loader, optimizer, scheduler, scaler, DEVICE, GRAD_ACCUM_STEPS)
    print(f"  Loss: {loss:.4f} | Acc: {acc:.4f}")

    # NOTE(review): "best" is judged on *training* accuracy because no
    # validation split exists in this notebook; it will normally just pick the
    # last epoch. Consider holding out a validation set.
    if acc > best_acc:
        best_acc = acc
        model.save_pretrained('./videomae_exp1_best')
        print(f"  >>> Saved Best (Acc: {best_acc:.4f})")

print(f"\nTraining complete. Best acc: {best_acc:.4f}")

In [None]:
## 8. Test Dataset

class TestDataset(Dataset):
    """Unlabelled frame-folder dataset laid out as ``root/<video_id>/*.jpg``.

    Each sub-directory of ``root`` is one video and its name is parsed as the
    integer video id. Items are ``(frames, video_id)`` pairs, ordered by id.
    """
    def __init__(self, root, transform=None, num_frames=16, frame_stride=2):
        self.root = Path(root)
        self.transform = transform
        self.num_frames = num_frames
        self.frame_stride = frame_stride

        # Directory name -> integer id; keep the samples ordered by that id
        found = [(entry, int(entry.name)) for entry in self.root.iterdir() if entry.is_dir()]
        self.samples = sorted(found, key=lambda pair: pair[1])

    def __len__(self):
        return len(self.samples)

    def _select_indices(self, total):
        """Return ``num_frames`` frame indices spread over ``total`` frames."""
        if total <= 0:
            raise ValueError("No frames")

        wanted = self.num_frames
        if total == 1:
            # Only one frame available: repeat it for the whole clip
            return torch.zeros(wanted, dtype=torch.long)

        stride = self.frame_stride
        # Evenly spaced grid over the video, thinned by the stride
        grid = torch.linspace(0, total - 1, steps=max(wanted * stride, wanted))
        picked = grid[::stride].long()

        shortfall = wanted - picked.numel()
        if shortfall > 0:
            # Too few grid points: pad with the last selected index
            tail = picked.new_full((shortfall,), picked[-1].item())
            picked = torch.cat([picked, tail], dim=0)
        return picked[:wanted]

    def __getitem__(self, idx):
        video_dir, video_id = self.samples[idx]
        frame_files = sorted(video_dir.glob('*.jpg'))

        if len(frame_files) == 0:
            raise ValueError(f"No frames in {video_dir}")

        # Decode the selected frames as RGB tensors and stack them into a clip
        clip = torch.stack([
            TF.to_tensor(Image.open(frame_files[i]).convert('RGB'))
            for i in self._select_indices(len(frame_files))
        ])

        if self.transform:
            clip = self.transform(clip)

        return clip, video_id

In [None]:
## 9. Inference

# Restore the checkpoint written during training and switch to eval mode
model = VideoMAEForVideoClassification.from_pretrained(
    './videomae_exp1_best',
    num_frames=NUM_FRAMES
).to(DEVICE)
model.eval()

# Deterministic (unshuffled) loader over the test videos
test_dataset = TestDataset(
    PATH_DATA_TEST,
    transform=test_transform,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

print(f"Test samples: {len(test_dataset)}")

# Collect (video_id, predicted class name) pairs batch by batch
predictions = []
with torch.no_grad():
    for videos, video_ids in tqdm(test_loader, desc="Inference"):
        logits = model(videos.to(DEVICE)).logits
        batch_preds = logits.argmax(dim=1).tolist()
        predictions.extend(
            (vid, id2label[pred]) for vid, pred in zip(video_ids.tolist(), batch_preds)
        )

# Order the results by video id
predictions.sort(key=lambda item: item[0])
print(f"\nPredictions: {len(predictions)}")

In [None]:
## 10. Evaluate on Test Set

# Download test labels from Drive
!gdown "1Xv2CWOqdBj3kt0rkNJKRsodSIEd3-wX_" -O test_labels.csv -q

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Ground truth: map the stringified video id to its class name
gt_df = pd.read_csv("test_labels.csv")
test_labels = dict(zip(gt_df['id'].astype(str), gt_df['class']))

# Pair up predictions with labels; ids without ground truth are left out
y_pred, y_true = [], []
for video_id, pred_class in predictions:
    key = str(video_id)
    if key not in test_labels:
        continue
    y_pred.append(pred_class)
    y_true.append(test_labels[key])

# Overall accuracy on the matched videos
accuracy = accuracy_score(y_true, y_pred)

rule = "=" * 50
print(
    rule,
    "EXP 1: VideoMAE Baseline (Paper Settings) - TEST RESULTS",
    rule,
    f"Total: {len(y_true)} | Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)",
    sep="\n",
)
print()
print(classification_report(y_true, y_pred, zero_division=0))

In [None]:
## 11. Save Submission

# Header row followed by one "id,class" row per prediction, written in one go
submission_rows = ['id,class\n']
submission_rows.extend(f'{video_id},{pred_class}\n' for video_id, pred_class in predictions)

with open('submission_exp1.csv', 'w') as f:
    f.writelines(submission_rows)

print("Submission saved to: submission_exp1.csv")