# VideoMAE + RandAugment (Option A)

**Strategy:**
1. VideoRandAugment for strong data augmentation
2. Standard Cross Entropy (NO Mixup, NO Focal Loss)
3. 2-Stage Training: RandAugment → Label Smoothing
4. Test evaluation + History tracking + Plots

**Target:** ≥ 85% Test Accuracy

In [None]:
!pip install -q transformers accelerate evaluate gdown

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from transformers import VideoMAEForVideoClassification
from transformers import get_cosine_schedule_with_warmup
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import random
import os
import gc

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {DEVICE}')

# Dataset locations (paths look like a Kaggle input mount -- adjust when running elsewhere).
# Training root: read by VideoDataset as one sub-directory per class, each
# containing one directory of .jpg frames per video.
PATH_DATA_TRAIN = '/kaggle/input/action-video/data/data_train'
# Test root: read by TestDataset as one directory of .jpg frames per video,
# where the directory name is the integer video id.
PATH_DATA_TEST = '/kaggle/input/action-video/data/test'

## 1. Configuration

In [None]:
# Model / input geometry
MODEL_CKPT = 'MCG-NJU/videomae-base-finetuned-kinetics'  # checkpoint id handed to from_pretrained()
NUM_FRAMES = 16    # frames sampled (evenly spaced) from every video
IMG_SIZE = 224     # side length of the final square crop fed to the model
RESIZE_SIZE = 256  # size passed to TF.resize() before cropping

# Phase 1 (RandAugment + plain cross entropy)
EPOCHS_P1 = 30
LR_P1 = 5e-5           # AdamW learning rate; the cosine schedule scales it during training
RANDAUG_NUM_OPS = 2    # augmentation ops sampled per clip
RANDAUG_MAGNITUDE = 9  # upper bound of the per-clip magnitude (sampled uniformly from 0)

# Phase 2 (label-smoothed fine-tuning, RandAugment disabled)
EPOCHS_P2 = 10
LR_P2 = 1e-6           # much smaller than LR_P1: this phase only refines the phase-1 best weights
LABEL_SMOOTHING = 0.1  # forwarded to F.cross_entropy(label_smoothing=...)

# Shared optimisation settings
BATCH_SIZE = 8       # per-step (micro) batch size of every DataLoader
ACCUM_STEPS = 4      # gradient-accumulation factor -> effective batch of BATCH_SIZE * ACCUM_STEPS = 32
WEIGHT_DECAY = 0.05  # AdamW weight decay
WARMUP_RATIO = 0.1   # fraction of scheduler steps spent in linear warmup

# Per-channel normalisation applied after to_tensor().
# These are the widely used ImageNet RGB statistics; presumably they match the
# checkpoint's own preprocessing -- TODO confirm against its image processor config.
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

## 2. VideoRandAugment Class

In [None]:
class VideoRandAugment:
    """RandAugment-style photometric augmentation for video clips.

    A single policy (a set of operations plus one magnitude) is sampled per
    clip and applied identically to every frame, so the augmentation stays
    consistent across time.
    """

    def __init__(self, num_ops=2, magnitude=9):
        """
        Args:
            num_ops: Number of distinct operations sampled per clip (capped at
                the number of available operations).
            magnitude: Upper bound of the per-clip magnitude; the actual value
                is drawn uniformly from ``[0, magnitude]`` on every call.
        """
        self.num_ops = num_ops
        self.magnitude = magnitude

        # (name, fn(img, m)) pairs. AutoContrast and Equalize ignore the
        # magnitude. Solarize lowers its threshold as m grows. The remaining
        # ops use an enhancement factor of 0.5 + m / 10, floored at 0.1.
        self.ops = [
            ('AutoContrast', lambda img, m: TF.autocontrast(img)),
            ('Equalize', lambda img, m: TF.equalize(img)),
            ('Solarize', lambda img, m: TF.solarize(img, int(256 - m * 25.6))),
            ('Color', lambda img, m: TF.adjust_saturation(img, max(0.1, 1 + m / 10 - 0.5))),
            ('Contrast', lambda img, m: TF.adjust_contrast(img, max(0.1, 1 + m / 10 - 0.5))),
            ('Brightness', lambda img, m: TF.adjust_brightness(img, max(0.1, 1 + m / 10 - 0.5))),
            ('Sharpness', lambda img, m: TF.adjust_sharpness(img, max(0.1, 1 + m / 10 - 0.5))),
        ]

    def __call__(self, frames):
        """Apply one randomly sampled policy to every frame of a clip.

        Args:
            frames: List of PIL Images belonging to the same clip.

        Returns:
            A new list with the augmented frames, in the same order.

        An operation that raises is skipped for that frame (best effort) and
        reported through ``warnings.warn``. ``KeyboardInterrupt`` and
        ``SystemExit`` are deliberately NOT swallowed.
        """
        import warnings  # local import: keeps the class self-contained

        # Sample the policy ONCE per clip so all frames share it.
        selected_ops = random.sample(self.ops, k=min(self.num_ops, len(self.ops)))
        magnitude = random.uniform(0, self.magnitude)

        augmented_frames = []
        for frame in frames:
            for op_name, op_func in selected_ops:
                try:
                    frame = op_func(frame, magnitude)
                except Exception as exc:
                    # Keep the frame as-is, but make the failure visible
                    # instead of silently training without the augmentation.
                    warnings.warn(
                        f'VideoRandAugment: {op_name} failed and was skipped ({exc!r})',
                        RuntimeWarning,
                    )
            augmented_frames.append(frame)

        return augmented_frames

print('VideoRandAugment class defined')

## 3. Dataset Classes

In [None]:
class VideoDataset(Dataset):
    """Labelled clip dataset read from a ``root/<class>/<video>/*.jpg`` tree.

    Class names are the sorted sub-directory names of ``root``; the label of a
    video is the index of its class in that sorted list.
    """

    def __init__(self, root, num_frames=16, is_train=True, use_randaug=True):
        """
        Args:
            root: Directory containing one sub-directory per class.
            num_frames: Number of frames sampled from each video.
            is_train: If True, apply random crop/flip (and optionally
                RandAugment); otherwise use a deterministic center crop.
            use_randaug: Enable VideoRandAugment. Only honoured when
                ``is_train`` is True.
        """
        self.root = Path(root)
        self.num_frames = num_frames
        self.is_train = is_train
        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        self.samples = []
        for cls in self.classes:
            cls_dir = self.root / cls
            for video_dir in sorted(d for d in cls_dir.iterdir() if d.is_dir()):
                self.samples.append((video_dir, self.class_to_idx[cls]))

        # RandAugment is only built for training datasets that ask for it.
        self.video_aug = VideoRandAugment(RANDAUG_NUM_OPS, RANDAUG_MAGNITUDE) if (is_train and use_randaug) else None

    def __len__(self):
        """Return the number of videos found under ``root``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one clip.

        Returns:
            ``(clip, label)`` where ``clip`` is a float tensor of shape
            ``(num_frames, 3, IMG_SIZE, IMG_SIZE)`` and ``label`` is the
            integer class index.

        Raises:
            FileNotFoundError: If the video directory contains no ``.jpg``
                frames.
        """
        video_dir, label = self.samples[idx]
        frame_paths = sorted(video_dir.glob('*.jpg'))
        if not frame_paths:
            # Without this guard the failure surfaces as an opaque IndexError
            # inside the frame-loading comprehension below.
            raise FileNotFoundError(f'No .jpg frames found in {video_dir}')

        # Evenly spaced frame indices; videos shorter than num_frames simply
        # repeat some frames.
        indices = np.linspace(0, len(frame_paths) - 1, self.num_frames, dtype=int)
        frames = [Image.open(frame_paths[i]).convert('RGB') for i in indices]

        if self.is_train:
            # 1. Photometric RandAugment (one policy shared by all frames)
            if self.video_aug:
                frames = self.video_aug(frames)

            # 2. Resize
            frames = [TF.resize(img, RESIZE_SIZE) for img in frames]

            # 3. Sample crop window and flip decision ONCE per clip
            i, j, h, w = T.RandomResizedCrop.get_params(frames[0], (0.8, 1.0), (0.75, 1.33))
            do_flip = random.random() > 0.5

            # 4. Apply the SAME crop and flip to every frame, then normalise
            processed = []
            for img in frames:
                img = TF.resized_crop(img, i, j, h, w, (IMG_SIZE, IMG_SIZE))
                if do_flip:
                    img = TF.hflip(img)
                img = TF.normalize(TF.to_tensor(img), MEAN, STD)
                processed.append(img)
        else:
            # Evaluation: deterministic resize + center crop
            frames = [TF.resize(img, RESIZE_SIZE) for img in frames]
            processed = [TF.normalize(TF.to_tensor(TF.center_crop(img, IMG_SIZE)), MEAN, STD) for img in frames]

        return torch.stack(processed), label

class TestDataset(Dataset):
    """Unlabelled clip dataset read from a ``root/<video_id>/*.jpg`` tree.

    Every sub-directory name of ``root`` must parse as an integer video id
    (``int()`` raises ``ValueError`` otherwise). Samples are ordered by that id.
    """

    def __init__(self, root, num_frames=16):
        """
        Args:
            root: Directory containing one sub-directory of frames per video.
            num_frames: Number of frames sampled from each video.
        """
        self.root = Path(root)
        self.num_frames = num_frames
        self.samples = sorted([(d, int(d.name)) for d in self.root.iterdir() if d.is_dir()], key=lambda x: x[1])

    def __len__(self):
        """Return the number of videos found under ``root``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one clip with deterministic preprocessing.

        Returns:
            ``(clip, video_id)`` where ``clip`` is a float tensor of shape
            ``(num_frames, 3, IMG_SIZE, IMG_SIZE)`` and ``video_id`` is the
            integer parsed from the directory name.

        Raises:
            FileNotFoundError: If the video directory contains no ``.jpg``
                frames.
        """
        video_dir, video_id = self.samples[idx]
        frame_paths = sorted(video_dir.glob('*.jpg'))
        if not frame_paths:
            # Without this guard the failure surfaces as an opaque IndexError
            # inside the frame-loading comprehension below.
            raise FileNotFoundError(f'No .jpg frames found in {video_dir}')

        # Evenly spaced frame indices; short videos repeat some frames.
        indices = np.linspace(0, len(frame_paths) - 1, self.num_frames, dtype=int)
        frames = [TF.resize(Image.open(frame_paths[i]).convert('RGB'), RESIZE_SIZE) for i in indices]
        processed = [TF.normalize(TF.to_tensor(TF.center_crop(img, IMG_SIZE)), MEAN, STD) for img in frames]
        return torch.stack(processed), video_id

print('Dataset classes defined')

## 4. Load Datasets

In [None]:
# Download test labels
!gdown "1Xv2CWOqdBj3kt0rkNJKRsodSIEd3-wX_" -O test_labels.csv -q

# Build the datasets. The training set is instantiated twice so each phase
# gets its own augmentation: phase 1 adds RandAugment, phase 2 disables it
# (the random crop / horizontal flip of the training pipeline still applies).
train_dataset_p1 = VideoDataset(root=PATH_DATA_TRAIN, num_frames=NUM_FRAMES, is_train=True, use_randaug=True)
train_dataset_p2 = VideoDataset(root=PATH_DATA_TRAIN, num_frames=NUM_FRAMES, is_train=True, use_randaug=False)
test_dataset = TestDataset(root=PATH_DATA_TEST, num_frames=NUM_FRAMES)

# Ground truth for the test videos: video id (as string) -> class value from the CSV
gt_df = pd.read_csv('test_labels.csv')
gt_dict = {video_id: cls_name for video_id, cls_name in zip(gt_df['id'].astype(str), gt_df['class'])}

print(f'Train samples: {len(train_dataset_p1)}')
print(f'Test samples: {len(test_dataset)}')
print(f'Classes: {len(train_dataset_p1.classes)}')

# DataLoaders: both training loaders share the same settings (shuffled,
# incomplete last batch dropped); the test loader keeps dataset order and
# every sample.
_train_loader_kwargs = dict(batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
train_loader_p1 = DataLoader(train_dataset_p1, **_train_loader_kwargs)
train_loader_p2 = DataLoader(train_dataset_p2, **_train_loader_kwargs)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

print('DataLoaders created')

In [None]:
# Instantiate the pretrained VideoMAE classifier with an output layer sized
# for this dataset. ignore_mismatched_sizes=True lets from_pretrained() accept
# a classification head whose shape differs from the checkpoint's.
num_classes = len(train_dataset_p1.classes)
model = VideoMAEForVideoClassification.from_pretrained(
    MODEL_CKPT,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
    num_frames=NUM_FRAMES,
)
model = model.to(DEVICE)
print('Model loaded')

## 5. Training & Evaluation Functions

In [None]:
def train_epoch(model, loader, optimizer, scheduler, scaler, label_smoothing=0.0):
    """Train ``model`` for one pass over ``loader`` with AMP and gradient accumulation.

    Args:
        model: Model whose forward output exposes ``.logits``.
        loader: Iterable of ``(inputs, targets)`` batches.
        optimizer: Optimizer updated once every ``ACCUM_STEPS`` batches.
        scheduler: LR scheduler advanced once per *successful* optimizer step.
        scaler: ``torch.amp.GradScaler`` used for loss scaling.
        label_smoothing: Forwarded to ``F.cross_entropy``.

    Returns:
        ``(mean_loss, accuracy)`` -- the unscaled loss averaged over batches
        and the fraction of correctly classified training samples.

    NOTE: batches left over after the last complete group of ``ACCUM_STEPS``
    count towards the returned statistics, but their gradients are never
    applied: the ``zero_grad()`` at the start of the next call discards them.
    """
    model.train()
    total_loss, total_correct, total_samples = 0.0, 0, 0
    pbar = tqdm(loader, desc='Training', leave=False)
    optimizer.zero_grad()

    for step, (inputs, targets) in enumerate(pbar):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

        with torch.amp.autocast('cuda'):
            logits = model(inputs).logits
            loss = F.cross_entropy(logits, targets, label_smoothing=label_smoothing)

        total_correct += (logits.argmax(1) == targets).sum().item()
        total_samples += inputs.size(0)

        # Divide so the accumulated gradient is the mean over the group.
        scaler.scale(loss / ACCUM_STEPS).backward()

        if (step + 1) % ACCUM_STEPS == 0:
            # Unscale first so clipping operates on the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

            # GradScaler skips optimizer.step() when it finds inf/NaN grads
            # and then lowers the loss scale in update(). Only advance the LR
            # schedule when the optimizer really stepped, so warmup/decay
            # stays aligned with the actual number of parameter updates.
            if scaler.get_scale() >= scale_before:
                scheduler.step()

        total_loss += loss.item()
        pbar.set_postfix({'loss': f'{total_loss/(step+1):.4f}', 'acc': f'{total_correct/total_samples:.4f}'})

    return total_loss / len(loader), total_correct / total_samples

@torch.no_grad()
def evaluate(model, loader, classes, gt_dict):
    """Compute accuracy of ``model`` on the videos served by ``loader``.

    Args:
        model: Model whose forward output exposes ``.logits``.
        loader: Iterable of ``(videos, video_ids)`` batches; ``video_ids``
            must support ``.tolist()``.
        classes: Sequence mapping a predicted class index to the value used
            in ``gt_dict``.
        gt_dict: Ground truth keyed by ``str(video_id)``.

    Returns:
        Accuracy as computed by ``accuracy_score``.
    """
    model.eval()
    predicted_indices = []
    seen_ids = []
    for videos, video_ids in tqdm(loader, desc='Evaluating', leave=False):
        batch_logits = model(videos.to(DEVICE)).logits
        predicted_indices.extend(batch_logits.argmax(1).cpu().tolist())
        seen_ids.extend(video_ids.tolist())

    # Translate ids -> ground-truth values and indices -> class names so both
    # lists live in the same label space.
    y_true = [gt_dict[str(vid)] for vid in seen_ids]
    y_pred = [classes[idx] for idx in predicted_indices]
    return accuracy_score(y_true, y_pred)

print('Training functions defined')

## 6. Training Loop

In [None]:
# Shared training state: per-epoch metrics, best test accuracy so far, AMP scaler
history = []
best_acc = 0.0
scaler = torch.amp.GradScaler()

# Phase 1: RandAugment
print('=' * 50)
print(f'PHASE 1: RandAugment Training (Epochs: {EPOCHS_P1}, LR: {LR_P1})')
print('=' * 50)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR_P1, weight_decay=WEIGHT_DECAY)
# train_epoch() steps the optimizer/scheduler once per COMPLETE group of
# ACCUM_STEPS batches and discards the partial group at the end of each epoch,
# so the schedule horizon is (batches // ACCUM_STEPS) per epoch. Flooring after
# multiplying by the epoch count would overestimate it and the cosine decay
# would never reach its end.
total_steps = (len(train_loader_p1) // ACCUM_STEPS) * EPOCHS_P1
scheduler = get_cosine_schedule_with_warmup(optimizer, int(total_steps * WARMUP_RATIO), total_steps)

for epoch in range(1, EPOCHS_P1 + 1):
    loss, train_acc = train_epoch(model, train_loader_p1, optimizer, scheduler, scaler, label_smoothing=0.0)
    test_acc = evaluate(model, test_loader, train_dataset_p1.classes, gt_dict)

    history.append({'epoch': epoch, 'phase': 1, 'loss': loss, 'train_acc': train_acc, 'test_acc': test_acc})

    # Checkpoint whenever the test accuracy improves; phase 2 resumes from this file.
    is_best = test_acc > best_acc
    status = '>>> BEST' if is_best else ''
    if is_best:
        best_acc = test_acc
        torch.save(model.state_dict(), 'best_p1.pt')
    print(f'Ep {epoch}/{EPOCHS_P1}: L={loss:.4f} TrAcc={train_acc:.4f} TeAcc={test_acc:.4f} {status}')

    # Release cached memory between epochs
    gc.collect()
    torch.cuda.empty_cache()

# Phase 2: Label Smoothing
print('\n' + '=' * 50)
print(f'PHASE 2: Label Smoothing (Epochs: {EPOCHS_P2}, LR: {LR_P2})')
print('=' * 50)

# Resume from the best phase-1 weights rather than the last-epoch weights.
model.load_state_dict(torch.load('best_p1.pt'))
# Seed the final checkpoint with those weights: it is only overwritten when
# phase 2 beats best_acc, so without this 'best_final.pt' would not exist at
# all if phase 2 never improves on phase 1.
torch.save(model.state_dict(), 'best_final.pt')

# Fresh scaler / optimizer / schedule for the low-LR fine-tuning phase
scaler = torch.amp.GradScaler()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR_P2, weight_decay=WEIGHT_DECAY)
# train_epoch() steps the optimizer/scheduler once per COMPLETE group of
# ACCUM_STEPS batches and discards the partial group at the end of each epoch,
# so the schedule horizon is (batches // ACCUM_STEPS) per epoch.
total_steps = (len(train_loader_p2) // ACCUM_STEPS) * EPOCHS_P2
scheduler = get_cosine_schedule_with_warmup(optimizer, int(total_steps * WARMUP_RATIO), total_steps)

for epoch in range(1, EPOCHS_P2 + 1):
    loss, train_acc = train_epoch(model, train_loader_p2, optimizer, scheduler, scaler, label_smoothing=LABEL_SMOOTHING)
    test_acc = evaluate(model, test_loader, train_dataset_p1.classes, gt_dict)

    # Epoch numbering continues after phase 1 so the history is one timeline.
    history.append({'epoch': EPOCHS_P1 + epoch, 'phase': 2, 'loss': loss, 'train_acc': train_acc, 'test_acc': test_acc})

    is_best = test_acc > best_acc
    status = '>>> BEST' if is_best else ''
    if is_best:
        best_acc = test_acc
        torch.save(model.state_dict(), 'best_final.pt')
    print(f'P2 Ep {epoch}/{EPOCHS_P2}: L={loss:.4f} TrAcc={train_acc:.4f} TeAcc={test_acc:.4f} {status}')

    # Release cached memory between epochs
    gc.collect()
    torch.cuda.empty_cache()

# Persist the per-epoch metrics of both phases for plotting
df_history = pd.DataFrame(history)
df_history.to_csv('training_history.csv', index=False)
print(f'\nTraining Complete! Best Test Acc: {best_acc:.4f}')

## 7. Plot Training Curves

In [None]:
# Reload the saved history so this cell also works in a fresh session.
df = pd.read_csv('training_history.csv')

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# One entry per panel:
# (history column, line format, legend label, title, y-axis label, draw baseline?)
panel_specs = [
    ('test_acc', 'b-o', 'Test Acc', 'Test Accuracy', 'Accuracy', True),
    ('train_acc', 'g-s', 'Train Acc', 'Train Accuracy', 'Accuracy', False),
    ('loss', 'r-^', 'Loss', 'Loss', 'Loss', False),
]

for ax, (column, line_fmt, legend_label, title, y_label, with_baseline) in zip(axes, panel_specs):
    ax.plot(df['epoch'], df[column], line_fmt, markersize=4, label=legend_label)
    # Vertical marker at the last phase-1 epoch
    ax.axvline(x=EPOCHS_P1, color='gray', linestyle='--', alpha=0.5, label='P1→P2')
    if with_baseline:
        # Reference line for the baseline accuracy (test-accuracy panel only)
        ax.axhline(y=0.851, color='red', linestyle='--', alpha=0.5, label='Baseline (85.1%)')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves.png', dpi=150)
plt.show()

# Full per-epoch table
print('\n' + '=' * 50)
print('TRAINING SUMMARY')
print('=' * 50)
print(df.to_string(index=False))