# VideoMAE Ablation Study - Dual GPU (T4 x2)

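**8 experiments, 10 epochs each, planned for 2 GPUs running in parallel** (the table below shows the intended GPU assignment; the runner cell in section 13 currently executes all experiments sequentially on GPU 0)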

| GPU 0 | GPU 1 |
|-------|-------|
| Exp 0: ViT Baseline | Exp 1: VideoMAE Paper |
| Exp 2: Multi-Seg TTA | Exp 3: Consistent Transform |
| Exp 4: Mixup | Exp 5: Label Smoothing |
| Exp 6: 2-Stage | Exp 7: Flip TTA |

In [None]:
## 1. Imports & Setup

import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from transformers import get_cosine_schedule_with_warmup
import timm
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Report how many CUDA devices PyTorch can see, then name each of them.
NUM_GPUS = torch.cuda.device_count()
print(f"Available GPUs: {NUM_GPUS}")
for gpu_index in range(NUM_GPUS):
    gpu_name = torch.cuda.get_device_name(gpu_index)
    print(f"  GPU {gpu_index}: {gpu_name}")

In [None]:
## 2. Configuration

# Dataset roots. The dataset classes in this file read the training root as
# <class>/<video>/*.jpg and the test root as <numeric video id>/*.jpg.
PATH_DATA_TRAIN = Path('/kaggle/input/action-video/data/data_train')
PATH_DATA_TEST = Path('/kaggle/input/action-video/data/test')

# Pretrained weights: a Hugging Face checkpoint id for VideoMAE and a timm
# model name for the ViT baseline.
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"
VIT_CKPT = "vit_small_patch16_224"

NUM_FRAMES = 16        # frames sampled (evenly spaced) from every clip
IMAGE_SIZE = 224       # side length of the final square crop fed to the models
RESIZE_SIZE = 256      # size frames are resized to before cropping
BATCH_SIZE = 16        # clips per micro-batch
GRAD_ACCUM_STEPS = 2   # micro-batches accumulated per optimizer step
EPOCHS = 10  # main-phase epochs per experiment (increased from 4)
WEIGHT_DECAY = 0.05    # passed to AdamW
WARMUP_RATIO = 0.1     # fraction of scheduler steps used for warmup

# Base learning rate handed to AdamW as-is; no batch-size based scaling is
# applied anywhere in this file.
LR = 5e-5  # Fine-tuning LR

# Per-channel normalisation statistics are taken from the checkpoint's image
# processor so that every transform normalises frames the same way.
processor = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)
MEAN = processor.image_mean
STD = processor.image_std



In [None]:
## 3. ViT Baseline Model

class LightweightViTForAction(nn.Module):
    """Frame-wise ViT classifier with temporal average pooling.

    Every frame is embedded independently by a timm ViT backbone, the
    per-frame embeddings are averaged over time and a linear layer maps the
    clip embedding to class logits. (Per the original author's note this is the
    same model as in baseline-btc.ipynb.)

    Args:
        num_classes: Number of output classes of the linear head.
    """

    def __init__(self, num_classes=51):
        super().__init__()
        # num_classes=0 requests the backbone without a classification head.
        backbone = timm.create_model(VIT_CKPT, pretrained=True, num_classes=0)
        self.vit = backbone
        self.embed_dim = backbone.num_features
        self.head = nn.Linear(self.embed_dim, num_classes)

    def forward(self, video):
        """Return class logits for a batch of clips shaped [B, T, C, H, W]."""
        batch, n_frames, channels, height, width = video.shape
        # Fold time into the batch axis so the image backbone sees single frames.
        flat_frames = video.view(batch * n_frames, channels, height, width)
        per_frame = self.vit(flat_frames)  # [B*T, embed_dim]
        # Unfold and average over the frame axis (temporal pooling).
        clip_embedding = per_frame.view(batch, n_frames, -1).mean(dim=1)
        return self.head(clip_embedding)

In [None]:
## 4. Transform Classes

class VideoTransformBaseline:
    """Baseline augmentation applied to a whole clip tensor at once.

    The clip is expected as a stacked tensor whose last two axes are height
    and width (VideoDataset produces this when ``use_pil=False``).

    Args:
        is_train: When True apply random rescale / crop / horizontal flip;
            otherwise only resize to IMAGE_SIZE x IMAGE_SIZE.
    """

    def __init__(self, is_train=True):
        self.is_train = is_train

    def __call__(self, frames):
        """Return the augmented, per-frame normalised clip as a stacked tensor."""
        target = [IMAGE_SIZE, IMAGE_SIZE]
        if not self.is_train:
            frames = TF.resize(frames, target)
        else:
            src_h, src_w = frames.shape[-2:]
            # Shrink the clip by a random factor in [0.8, 1.0].
            factor = random.uniform(0.8, 1.0)
            scaled_h = int(src_h * factor)
            scaled_w = int(src_w * factor)
            frames = TF.resize(frames, [scaled_h, scaled_w])
            # Random crop origin; clamped to 0 when the clip is already
            # smaller than the crop along that axis.
            top = random.randint(0, max(0, scaled_h - IMAGE_SIZE))
            left = random.randint(0, max(0, scaled_w - IMAGE_SIZE))
            crop_h = min(IMAGE_SIZE, scaled_h)
            crop_w = min(IMAGE_SIZE, scaled_w)
            frames = TF.crop(frames, top, left, crop_h, crop_w)
            frames = TF.resize(frames, target)
            if random.random() < 0.5:
                frames = TF.hflip(frames)
        normalized = [TF.normalize(frame, MEAN, STD) for frame in frames]
        return torch.stack(normalized)


class VideoTransformConsistent:
    """Augmentation that uses one crop window and one flip decision per clip.

    Works on a sequence of PIL images (VideoDataset produces this when
    ``use_pil=True``).

    Args:
        is_train: When True apply a random resized crop and a random
            horizontal flip, shared by all frames; otherwise centre-crop.
    """

    def __init__(self, is_train=True):
        self.is_train = is_train

    def __call__(self, frames):
        """Return the transformed clip as a stacked, normalised tensor."""
        resized = [TF.resize(img, RESIZE_SIZE) for img in frames]

        if not self.is_train:
            eval_frames = []
            for img in resized:
                tensor = TF.to_tensor(TF.center_crop(img, IMAGE_SIZE))
                eval_frames.append(TF.normalize(tensor, MEAN, STD))
            return torch.stack(eval_frames)

        # Sample the crop window once (from the first frame) and the flip
        # decision once, then reuse both for every frame of the clip.
        top, left, crop_h, crop_w = T.RandomResizedCrop.get_params(
            resized[0], scale=(0.8, 1.0), ratio=(0.75, 1.33)
        )
        flip = random.random() > 0.5

        def _augment(img):
            img = TF.resized_crop(img, top, left, crop_h, crop_w, size=(IMAGE_SIZE, IMAGE_SIZE))
            if flip:
                img = TF.hflip(img)
            return TF.normalize(TF.to_tensor(img), MEAN, STD)

        return torch.stack([_augment(img) for img in resized])

In [None]:
## 5. Dataset Classes

class VideoDataset(Dataset):
    """Training clips stored as ``<root>/<class>/<video>/*.jpg``.

    Args:
        root: Directory containing one sub-directory per class.
        transform: Callable applied to the sampled frames of a clip.
        use_pil: When True the transform receives a list of PIL images,
            otherwise a stacked tensor of frames.

    Attributes set on construction: ``classes`` (sorted class names),
    ``class_to_idx`` (class name -> integer label) and ``samples``
    (list of ``(video_dir, label)``).
    """

    def __init__(self, root, transform, use_pil=False):
        self.root = Path(root)
        self.transform = transform
        self.use_pil = use_pil
        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        # iterdir() yields entries in arbitrary order; sorting keeps the sample
        # order (and therefore any seeded shuffle) reproducible across runs
        # and filesystems.
        self.samples = [
            (vid, self.class_to_idx[cls])
            for cls in self.classes
            for vid in sorted((self.root / cls).iterdir())
            if vid.is_dir()
        ]

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _load_rgb(path):
        """Open an image, convert it to RGB and release the file handle."""
        with Image.open(path) as img:
            return img.convert('RGB')

    def __getitem__(self, idx):
        """Return ``(transform(frames), label)`` for the clip at ``idx``.

        Raises:
            FileNotFoundError: If the clip directory contains no ``.jpg`` frames.
        """
        vid_dir, label = self.samples[idx]
        files = sorted(vid_dir.glob('*.jpg'))
        if not files:
            raise FileNotFoundError(f"No .jpg frames found in {vid_dir}")
        # NUM_FRAMES evenly spaced frame positions (repeats when the clip is short).
        indices = torch.linspace(0, len(files) - 1, NUM_FRAMES).long().tolist()
        if self.use_pil:
            frames = [self._load_rgb(files[i]) for i in indices]
        else:
            frames = torch.stack([TF.to_tensor(self._load_rgb(files[i])) for i in indices])
        return self.transform(frames), label

In [None]:
## 6. Test Dataset Classes (same as before)

class TestDatasetSingle(Dataset):
    """Test clips stored as ``<root>/<numeric id>/*.jpg``, one centre-cropped view each.

    ``samples`` holds ``(video_dir, video_id)`` pairs ordered by numeric id.
    """

    def __init__(self, root):
        self.root = Path(root)
        video_dirs = [entry for entry in self.root.iterdir() if entry.is_dir()]
        # Directory names are the video ids; order numerically, not lexically.
        self.samples = sorted(((d, int(d.name)) for d in video_dirs), key=lambda pair: pair[1])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(clip_tensor, video_id)`` for the clip at ``idx``."""
        vid_dir, vid_id = self.samples[idx]
        files = sorted(vid_dir.glob('*.jpg'))
        indices = torch.linspace(0, len(files)-1, NUM_FRAMES).long()
        clip = []
        for i in indices:
            img = Image.open(files[i]).convert('RGB')
            img = TF.resize(img, RESIZE_SIZE)
            img = TF.center_crop(img, IMAGE_SIZE)
            # NOTE(review): the centre crop is already IMAGE_SIZE x IMAGE_SIZE,
            # so this resize looks redundant - confirm before removing.
            img = TF.resize(img, [IMAGE_SIZE, IMAGE_SIZE])
            clip.append(TF.normalize(TF.to_tensor(img), MEAN, STD))
        return torch.stack(clip), vid_id

class TestDatasetMultiSegment(Dataset):
    """Test clips with multi-segment, three-crop test-time augmentation.

    Each video (``<root>/<numeric id>/*.jpg``) is split into ``num_segments``
    temporal segments; NUM_FRAMES frames are sampled from each segment and
    three spatial crops (top-left, centre, bottom-right) are taken, giving
    ``num_segments * 3`` views per video.

    Args:
        root: Directory containing one numerically named folder per video.
        num_segments: Number of temporal segments per video.
    """

    def __init__(self, root, num_segments=2):
        self.root, self.num_segments = Path(root), num_segments
        self.samples = [(d, int(d.name)) for d in self.root.iterdir() if d.is_dir()]
        self.samples.sort(key=lambda x: x[1])

    def __len__(self):
        return len(self.samples)

    @staticmethod
    def _segment_bounds(total, num_segments, seg):
        """Return the half-open frame range ``(start, end)`` of segment ``seg``.

        Segments have ``total // num_segments`` frames each; the last segment
        additionally absorbs the remainder so no trailing frames are dropped.
        """
        seg_len = total // num_segments
        start = seg_len * seg
        end = total if seg == num_segments - 1 else seg_len * (seg + 1)
        return start, end

    def __getitem__(self, idx):
        """Return ``(views, video_id)`` with all views stacked on the first axis.

        Raises:
            FileNotFoundError: If the video directory contains no ``.jpg`` frames.
        """
        vid_dir, vid_id = self.samples[idx]
        files = sorted(vid_dir.glob('*.jpg'))
        total = len(files)
        if total == 0:
            raise FileNotFoundError(f"No .jpg frames found in {vid_dir}")
        views = []
        for seg in range(self.num_segments):
            start, end = self._segment_bounds(total, self.num_segments, seg)
            # max(start, end - 1) keeps the range valid for empty segments
            # (videos with fewer frames than segments).
            indices = torch.linspace(start, max(start, end - 1), NUM_FRAMES).long().tolist()
            frames = [TF.resize(Image.open(files[i]).convert('RGB'), RESIZE_SIZE) for i in indices]
            w, h = frames[0].size
            crop_origins = [
                (0, 0),
                ((h - IMAGE_SIZE) // 2, (w - IMAGE_SIZE) // 2),
                (max(0, h - IMAGE_SIZE), max(0, w - IMAGE_SIZE)),
            ]
            for top, left in crop_origins:
                views.append(torch.stack([
                    TF.normalize(TF.to_tensor(TF.crop(img, top, left, IMAGE_SIZE, IMAGE_SIZE)), MEAN, STD)
                    for img in frames
                ]))
        return torch.stack(views), vid_id

class TestDatasetFlipTTA(Dataset):
    """Test clips with three-crop plus horizontal-flip test-time augmentation.

    Each video (``<root>/<numeric id>/*.jpg``) yields six views: three crops
    (centre, top, bottom - all horizontally centred), each followed by its
    mirrored version.
    """

    def __init__(self, root):
        self.root = Path(root)
        found = []
        for entry in self.root.iterdir():
            if entry.is_dir():
                found.append((entry, int(entry.name)))
        # Order by numeric video id.
        self.samples = sorted(found, key=lambda pair: pair[1])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(views, video_id)`` with the six views stacked on the first axis."""
        vid_dir, vid_id = self.samples[idx]
        files = sorted(vid_dir.glob('*.jpg'))
        indices = torch.linspace(0, len(files)-1, NUM_FRAMES).long()
        frames = []
        for i in indices:
            frames.append(TF.resize(Image.open(files[i]).convert('RGB'), RESIZE_SIZE))
        w, h = frames[0].size
        mid_top = (h - IMAGE_SIZE) // 2
        mid_left = (w - IMAGE_SIZE) // 2
        bottom_top = max(0, h - IMAGE_SIZE)
        views = []
        for top, left in ((mid_top, mid_left), (0, mid_left), (bottom_top, mid_left)):
            plain, mirrored = [], []
            for img in frames:
                patch = TF.crop(img, top, left, IMAGE_SIZE, IMAGE_SIZE)
                plain.append(TF.normalize(TF.to_tensor(patch), MEAN, STD))
                mirrored.append(TF.normalize(TF.to_tensor(TF.hflip(patch)), MEAN, STD))
            # Keep the original view order: crop first, then its flipped twin.
            views.append(torch.stack(plain))
            views.append(torch.stack(mirrored))
        return torch.stack(views), vid_id

In [None]:
## 7. Mixup Collate

class MixupCollate:
    """Collate function that applies Mixup to a whole batch.

    Args:
        num_classes: Size of the one-hot target vectors.
        alpha: Both shape parameters of the Beta distribution that the mixing
            coefficient is drawn from.
    """

    def __init__(self, num_classes, alpha=0.8):
        self.num_classes = num_classes
        self.alpha = alpha

    def __call__(self, batch):
        """Return ``(mixed_inputs, mixed_soft_targets)`` for a list of samples."""
        inputs, targets = torch.utils.data.default_collate(batch)
        # One mixing coefficient per batch; each sample is blended with the
        # sample a random permutation assigns to it.
        lam = np.random.beta(self.alpha, self.alpha)
        partner = torch.randperm(inputs.size(0))
        mixed_inputs = lam * inputs + (1 - lam) * inputs[partner]
        soft = F.one_hot(targets, self.num_classes).float()
        mixed_targets = lam * soft + (1 - lam) * soft[partner]
        return mixed_inputs, mixed_targets

In [None]:
## 8. Training Functions

def train_epoch(model, loader, optimizer, scheduler, scaler, device, use_mixup=False, label_smoothing=0.0, is_vit=False):
    """Train ``model`` for one pass over ``loader`` with AMP and gradient accumulation.

    Args:
        model: Network to train. Returns logits directly when ``is_vit`` is
            True, otherwise an object exposing ``.logits``.
        loader: Iterable of ``(videos, targets)`` batches.
        optimizer: Optimizer updated every GRAD_ACCUM_STEPS micro-batches.
        scheduler: LR scheduler, advanced once per complete accumulation group.
        scaler: Gradient scaler used for mixed-precision training.
        device: Device the batches are moved to.
        use_mixup: When True ``targets`` are soft label distributions and a
            soft-target cross-entropy is used.
        label_smoothing: Label smoothing for the hard-label loss.
        is_vit: Selects how logits are read from the model output.

    Returns:
        ``(mean_loss, accuracy)`` over all samples seen in the epoch.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    pending = 0  # micro-batches accumulated since the last optimizer step
    optimizer.zero_grad()

    def _optimizer_step(grad_rescale=1.0):
        # Unscale first so clipping operates on the true gradient magnitudes.
        scaler.unscale_(optimizer)
        if grad_rescale != 1.0:
            for param in model.parameters():
                if param.grad is not None:
                    param.grad.mul_(grad_rescale)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    for videos, targets in loader:
        videos, targets = videos.to(device), targets.to(device)
        with torch.amp.autocast(device_type='cuda'):
            if is_vit:
                logits = model(videos)
            else:
                logits = model(videos).logits
            if use_mixup:
                # Soft-target cross-entropy; the dominant class counts as the
                # "true" label for the accuracy estimate.
                loss = -torch.sum(targets * F.log_softmax(logits, dim=1), dim=1).mean()
                true_labels = targets.argmax(dim=1)
            else:
                loss = F.cross_entropy(logits, targets, label_smoothing=label_smoothing)
                true_labels = targets
        correct += (logits.argmax(dim=1) == true_labels).sum().item()
        total += true_labels.size(0)
        total_loss += loss.item() * true_labels.size(0)
        scaler.scale(loss / GRAD_ACCUM_STEPS).backward()
        pending += 1
        if pending == GRAD_ACCUM_STEPS:
            _optimizer_step()
            scheduler.step()
            pending = 0

    if pending:
        # The batch count was not a multiple of GRAD_ACCUM_STEPS: apply the
        # leftover gradients instead of discarding them. They were divided by
        # GRAD_ACCUM_STEPS, so rescale to an average over the batches actually
        # accumulated. The scheduler is advanced only once per complete
        # accumulation group, so this flush does not consume a schedule step.
        _optimizer_step(grad_rescale=GRAD_ACCUM_STEPS / pending)

    if total == 0:
        raise ValueError("train_epoch received an empty loader (no batches to train on)")
    return total_loss / total, correct / total

def evaluate(model, loader, device, multi_view=False, id2label=None, is_vit=False):
    """Predict a class for every video yielded by ``loader``.

    Args:
        model: Network to evaluate. Returns logits directly when ``is_vit`` is
            True, otherwise an object exposing ``.logits``.
        loader: Iterable of ``(data, video_ids)`` batches. ``data`` is
            [B, T, C, H, W], or [B, V, T, C, H, W] when ``multi_view`` is True.
        device: Device the batches are moved to.
        multi_view: When True the logits of the V views of each video are
            averaged before taking the argmax.
        id2label: Optional mapping from class index to label. When omitted the
            raw class index is returned.
        is_vit: Selects how logits are read from the model output.

    Returns:
        List of ``(video_id, predicted_label)`` tuples in loader order.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for data, vid_ids in loader:
            if multi_view:
                batch, n_views, n_frames, channels, height, width = data.shape
                # Fold views into the batch axis, run once, then unfold and
                # average the per-view logits.
                data = data.reshape(batch * n_views, n_frames, channels, height, width).to(device)
                logits = model(data) if is_vit else model(data).logits
                logits = logits.reshape(batch, n_views, -1).mean(dim=1)
            else:
                data = data.to(device)
                logits = model(data) if is_vit else model(data).logits
            for vid, pred in zip(vid_ids.tolist(), logits.argmax(dim=1).tolist()):
                label = pred if id2label is None else id2label[pred]
                predictions.append((vid, label))
    return predictions

In [None]:
## 9. Load Test Labels

# IPython shell escape (not plain Python): runs the gdown command-line tool to
# download the file with the given id and store it as test_labels.csv
# (-O sets the output name; -q presumably silences progress output).
!gdown "1Xv2CWOqdBj3kt0rkNJKRsodSIEd3-wX_" -O test_labels.csv -q
# The CSV is read for its 'id' and 'class' columns.
gt_df = pd.read_csv("test_labels.csv")
# Ground truth lookup keyed by the video id as a string -> class label.
GT_LABELS = dict(zip(gt_df['id'].astype(str), gt_df['class']))

def calc_accuracy(predictions):
    """Return the accuracy of ``predictions`` against the loaded test labels.

    Args:
        predictions: Iterable of ``(video_id, predicted_class)`` pairs.

    Predictions whose video id has no entry in GT_LABELS are ignored.
    """
    matched = [
        (GT_LABELS[str(vid_id)], pred_cls)
        for vid_id, pred_cls in predictions
        if str(vid_id) in GT_LABELS
    ]
    y_true = [truth for truth, _ in matched]
    y_pred = [guess for _, guess in matched]
    return accuracy_score(y_true, y_pred)

# Sanity check: report how many ground-truth entries GT_LABELS holds.
print(f"Loaded {len(GT_LABELS)} test labels")

In [None]:
## 10. Single GPU Experiment Runner (with per-epoch test eval)

def _build_experiment_model(exp_config, label2id, id2label, device):
    """Create the model for one experiment and move it to ``device``.

    Returns:
        ``(model, is_vit)`` where ``is_vit`` is True for the ViT baseline
        (forward returns logits) and False for VideoMAE (forward returns an
        object exposing ``.logits``).
    """
    if exp_config.get('is_vit', False):
        # Size the head from the dataset rather than a hard-coded class count,
        # matching how the VideoMAE head and the Mixup targets are sized.
        model = LightweightViTForAction(num_classes=len(label2id))
        return model.to(device), True
    model = VideoMAEForVideoClassification.from_pretrained(
        MODEL_CKPT, label2id=label2id, id2label=id2label,
        ignore_mismatched_sizes=True, num_frames=NUM_FRAMES
    )
    return model.to(device), False


def _plot_experiment_history(history, exp_name, gpu_id):
    """Plot training loss and train/test accuracy per epoch for one experiment."""
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    ax1 = axes[0]
    ax1.plot(history['epoch'], history['loss_train'], 'b-o', markersize=4)
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.set_title(f'{exp_name} - Training Loss')
    ax1.grid(True, alpha=0.3)

    ax2 = axes[1]
    ax2.plot(history['epoch'], [a*100 for a in history['acc_train']], 'b-o', label='Train', markersize=4)
    ax2.plot(history['epoch'], [a*100 for a in history['acc_test']], 'r-s', label='Test', markersize=4)
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy (%)')
    ax2.set_title(f'{exp_name} - Accuracy')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.suptitle(f'{exp_name} Training Curves (GPU {gpu_id})', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()


def run_single_experiment(exp_config, gpu_id, results_queue):
    """Run a single experiment on specified GPU with per-epoch test eval.

    Args:
        exp_config: Dict describing the experiment. Required keys: 'name',
            'train_ds', 'test_ds'. Optional keys: 'is_vit', 'multi_view',
            'use_mixup', 'mixup_collate', 'label_smoothing', 'two_stage'.
        gpu_id: Index of the CUDA device to train on.
        results_queue: Object with a ``put`` method; receives one dict with
            the keys 'exp', 'train_acc', 'test_acc', 'gpu' and 'history'.

    The checkpoint with the best training accuracy of the main phase is saved
    as ``<name>_best.pt``. Without 'two_stage' the reported test accuracy is
    the one measured for that checkpoint; with 'two_stage' training continues
    from it for three more epochs and the last epoch's test accuracy is
    reported.
    """
    torch.cuda.set_device(gpu_id)
    device = torch.device(f'cuda:{gpu_id}')

    exp_name = exp_config['name']
    print(f"[GPU {gpu_id}] Starting: {exp_name}")

    # Fixed seeds so every experiment starts from the same random state.
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    train_ds = exp_config['train_ds']
    test_ds = exp_config['test_ds']
    label2id = train_ds.class_to_idx
    id2label = {v: k for k, v in label2id.items()}
    multi_view = exp_config.get('multi_view', False)

    model, is_vit = _build_experiment_model(exp_config, label2id, id2label, device)

    # DataLoaders (multi-view test batches hold several views per video, so
    # they use a smaller batch size).
    collate_fn = exp_config.get('mixup_collate', None)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_ds, batch_size=4 if multi_view else BATCH_SIZE, shuffle=False, num_workers=2)

    # Optimizer & Scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    scaler = torch.amp.GradScaler()
    num_steps = len(train_loader) * EPOCHS // GRAD_ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, int(num_steps * WARMUP_RATIO), num_steps)

    # Training history
    history = {'epoch': [], 'loss_train': [], 'acc_train': [], 'acc_test': []}
    checkpoint_path = f'{exp_name}_best.pt'
    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint; otherwise a run stuck at 0.0 would later load a missing or
    # stale file.
    best_acc = float('-inf')
    best_test_acc = None

    for epoch in range(EPOCHS):
        # Train
        loss, acc = train_epoch(
            model, train_loader, optimizer, scheduler, scaler, device,
            use_mixup=exp_config.get('use_mixup', False),
            label_smoothing=exp_config.get('label_smoothing', 0.0),
            is_vit=is_vit
        )

        # Evaluate on test
        predictions = evaluate(model, test_loader, device,
                               multi_view=multi_view,
                               id2label=id2label, is_vit=is_vit)
        test_acc = calc_accuracy(predictions)

        print(f"[GPU {gpu_id}] {exp_name} Epoch {epoch+1}/{EPOCHS}: Loss_train={loss:.4f}, Acc_train={acc:.4f}, Acc_test={test_acc:.4f}")

        history['epoch'].append(epoch + 1)
        history['loss_train'].append(loss)
        history['acc_train'].append(acc)
        history['acc_test'].append(test_acc)

        if acc > best_acc:
            best_acc = acc
            # Remember the test accuracy that belongs to the saved weights.
            best_test_acc = test_acc
            torch.save(model.state_dict(), checkpoint_path)

    # 2-Stage Phase 2: continue from the best checkpoint with a tiny LR, hard
    # labels (no Mixup collate) and label smoothing.
    if exp_config.get('two_stage', False):
        print(f"[GPU {gpu_id}] {exp_name} Phase 2...")
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6, weight_decay=WEIGHT_DECAY)
        p2_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
        scheduler = get_cosine_schedule_with_warmup(optimizer, 0, len(p2_loader) * 3 // GRAD_ACCUM_STEPS)
        for epoch in range(3):
            loss, acc = train_epoch(model, p2_loader, optimizer, scheduler, scaler, device, label_smoothing=0.1, is_vit=is_vit)
            predictions = evaluate(model, test_loader, device, multi_view=multi_view, id2label=id2label, is_vit=is_vit)
            test_acc = calc_accuracy(predictions)
            print(f"[GPU {gpu_id}] {exp_name} P2 Epoch {epoch+1}/3: Loss_train={loss:.4f}, Acc_train={acc:.4f}, Acc_test={test_acc:.4f}")
            history['epoch'].append(EPOCHS + epoch + 1)
            history['loss_train'].append(loss)
            history['acc_train'].append(acc)
            history['acc_test'].append(test_acc)
            best_acc = max(best_acc, acc)
        # The model is in its final phase-2 state, which is what the last
        # history entry measured.
        final_test_acc = history['acc_test'][-1]
    else:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        # Report the accuracy of the restored checkpoint, not of whichever
        # epoch happened to run last.
        final_test_acc = best_test_acc

    # Plot training curves for this experiment
    _plot_experiment_history(history, exp_name, gpu_id)

    print(f"[GPU {gpu_id}] {exp_name} >>> FINAL TEST ACC: {final_test_acc:.4f}")

    results_queue.put({
        'exp': exp_name,
        'train_acc': best_acc,
        'test_acc': final_test_acc,
        'gpu': gpu_id,
        'history': history
    })

    # Free GPU memory before the next experiment starts.
    del model
    torch.cuda.empty_cache()


In [None]:
## 11. Create Datasets

# Training sets: the baseline transform works on a stacked frame tensor, the
# consistent transform on a list of PIL images.
train_ds_baseline = VideoDataset(
    PATH_DATA_TRAIN,
    VideoTransformBaseline(is_train=True),
    use_pil=False,
)
train_ds_consistent = VideoDataset(
    PATH_DATA_TRAIN,
    VideoTransformConsistent(is_train=True),
    use_pil=True,
)

# Test sets: one view per video, multi-segment views, and crop + flip views.
test_ds_single = TestDatasetSingle(PATH_DATA_TEST)
test_ds_multi = TestDatasetMultiSegment(PATH_DATA_TEST, num_segments=2)
test_ds_flip = TestDatasetFlipTTA(PATH_DATA_TEST)

# Mixup collate sized from the number of training classes.
mixup_collate = MixupCollate(
    num_classes=len(train_ds_consistent.classes),
    alpha=0.8,
)

print(f"Train samples: {len(train_ds_baseline)}, Test samples: {len(test_ds_single)}")

In [None]:
## 12. Define All 8 Experiments

# One dict per experiment, listed in execution order. Keys other than 'name',
# 'train_ds' and 'test_ds' are optional switches read by run_single_experiment.
EXPERIMENTS = [
    # Experiments assigned to GPU 0 in the header table
    {
        'name': 'Exp0_ViT_Baseline',
        'train_ds': train_ds_baseline,
        'test_ds': test_ds_single,
        'is_vit': True,
    },
    {
        'name': 'Exp2_MultiSegment_TTA',
        'train_ds': train_ds_baseline,
        'test_ds': test_ds_multi,
        'multi_view': True,
    },
    {
        'name': 'Exp4_Mixup',
        'train_ds': train_ds_consistent,
        'test_ds': test_ds_single,
        'use_mixup': True,
        'mixup_collate': mixup_collate,
    },
    {
        'name': 'Exp6_2Stage',
        'train_ds': train_ds_consistent,
        'test_ds': test_ds_single,
        'use_mixup': True,
        'mixup_collate': mixup_collate,
        'two_stage': True,
    },

    # Experiments assigned to GPU 1 in the header table
    {
        'name': 'Exp1_VideoMAE_Paper',
        'train_ds': train_ds_baseline,
        'test_ds': test_ds_single,
    },
    {
        'name': 'Exp3_Consistent_Transform',
        'train_ds': train_ds_consistent,
        'test_ds': test_ds_single,
    },
    {
        'name': 'Exp5_LabelSmoothing',
        'train_ds': train_ds_consistent,
        'test_ds': test_ds_single,
        'label_smoothing': 0.1,
    },
    {
        'name': 'Exp7_FlipTTA',
        'train_ds': train_ds_consistent,
        'test_ds': test_ds_flip,
        'use_mixup': True,
        'mixup_collate': mixup_collate,
        'two_stage': True,
        'multi_view': True,
    },
]

In [None]:
## 13. Run Experiments (Sequential - more reliable in Jupyter)

# Note: Dual GPU parallel using multiprocessing has issues in Jupyter notebooks
# Running sequentially on GPU 0 for reliability

from queue import Queue

# Collected result dicts, one per finished experiment.
RESULTS = []
# In-process queue that run_single_experiment reports its result to.
q = Queue()

# Every experiment runs on GPU 0, one after another.
# For parallel execution, use a Python script instead of notebook
for exp_config in EXPERIMENTS:
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Running: {exp_config['name']}")
    print(banner)
    run_single_experiment(exp_config, 0, q)
    # Move whatever the experiment reported into RESULTS.
    while True:
        if q.empty():
            break
        RESULTS.append(q.get())


In [None]:
## 14. Results Summary & Plot

import matplotlib.pyplot as plt

print("\n" + "="*70)
print("ALL EXPERIMENTS RESULTS (8 Experiments, 10 Epochs, Dual GPU)")
print("="*70)

# One row per experiment, ordered by experiment name.
results_df = pd.DataFrame(RESULTS)
results_df = results_df.sort_values('exp').reset_index(drop=True)

# Calculate delta from VideoMAE baseline (Exp1), in percentage points
baseline_acc = results_df[results_df['exp'] == 'Exp1_VideoMAE_Paper']['test_acc'].values[0]
results_df['delta'] = (results_df['test_acc'] - baseline_acc) * 100
results_df['delta_str'] = results_df['delta'].apply(lambda x: f"+{x:.2f}%" if x > 0 else f"{x:.2f}%")

print(results_df[['exp', 'train_acc', 'test_acc', 'delta_str', 'gpu']].to_string(index=False))

vit_acc = results_df[results_df['exp'] == 'Exp0_ViT_Baseline']['test_acc'].values[0]
print(f"\nViT Baseline (Exp0): {vit_acc:.4f}")
print(f"VideoMAE Baseline (Exp1): {baseline_acc:.4f}")
print(f"Best: {results_df['test_acc'].max():.4f}")

# Plot: one bar per experiment, ViT baseline in blue, VideoMAE variants in green
fig, ax = plt.subplots(figsize=(12, 6))
exp_names = [e.replace('Exp', '').replace('_', '\n') for e in results_df['exp']]
colors = ['#3498db' if 'ViT' in e else '#2ecc71' for e in results_df['exp']]
bars = ax.bar(exp_names, results_df['test_acc'] * 100, color=colors, edgecolor='black')

# Add baseline lines
ax.axhline(y=baseline_acc * 100, color='red', linestyle='--', label=f'VideoMAE Baseline ({baseline_acc*100:.1f}%)')
ax.axhline(y=vit_acc * 100, color='blue', linestyle=':', label=f'ViT Baseline ({vit_acc*100:.1f}%)')

# Add value labels
for bar, acc in zip(bars, results_df['test_acc']):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5, f'{acc*100:.1f}%',
            ha='center', va='bottom', fontsize=9, fontweight='bold')

ax.set_ylabel('Test Accuracy (%)')
ax.set_xlabel('Experiment')
ax.set_title('VideoMAE Ablation Study Results (10 Epochs, Dual T4 GPU)', fontsize=14, fontweight='bold')
ax.legend()
# Keep the usual 50-100 window when every bar is comfortably inside it;
# otherwise lower the floor so weak experiments are not clipped out of view.
lowest_pct = float(results_df['test_acc'].min()) * 100
y_min = 50 if lowest_pct >= 55 else max(0.0, lowest_pct - 5)
ax.set_ylim([y_min, 100])
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()