# Improved Baseline: VideoMAE + Advanced Augmentation + 2-Stage Training

Notebook này nâng cấp baseline từ ViT thuần (ImageNet pretrain) sang **VideoMAE (Kinetics-400 pretrain)**. 
Mục tiêu: **Accuracy > 0.83** trên tập HMDB51.

### Các cải tiến chính:
1. **Model:** `VideoMAE-base` (SOTA cho dataset nhỏ).
2. **Augmentation:** `Mixup` + Consistent Spatial Transform.
3. **Training Strategy:** **2-Stage Fine-tuning**
    * **Phase 1 (20 Epochs):** Mixup + High LR.
    * **Phase 2 (10 Epochs):** **Label Smoothing (0.1)** + Low LR (Tránh overfit ở giai đoạn cuối).
4. **Inference (New):** **6-View TTA** (3 Crops + 3 Flipped Crops) - Tăng độ ổn định dự đoán.


In [None]:
!nvidia-smi

Wed Jan 28 10:43:13 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.172.08             Driver Version: 570.172.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   36C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
!pip install -q transformers accelerate evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from transformers import VideoMAEForVideoClassification, VideoMAEImageProcessor
from transformers import get_cosine_schedule_with_warmup
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import numpy as np
import random
import os

import warnings
warnings.filterwarnings("ignore")

# Pick the compute device: CUDA when a GPU is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f'Using device: {DEVICE}')

# More than one visible GPU turns on the nn.DataParallel code paths used later.
NUM_GPUS = torch.cuda.device_count()
print(f'Number of GPUs available: {NUM_GPUS}')
USE_MULTI_GPU = NUM_GPUS > 1
if USE_MULTI_GPU:
    print('Multi-GPU training ENABLED with DataParallel')

# Dataset locations (Kaggle input mount; kept as in the original baseline).
PATH_DATA_TRAIN = r'/kaggle/input/action-video/data/data_train'
PATH_DATA_TEST = r'/kaggle/input/action-video/data/test'


2026-01-28 10:43:46.046160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769597026.473672      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769597026.605842      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769597027.739583      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769597027.739643      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769597027.739646      55 computation_placer.cc:177] computation placer alr

Using device: cuda
Number of GPUs available: 2
Multi-GPU training ENABLED with DataParallel


## 1. Configuration & Hyperparameters

In [None]:
# Model Config
# Hugging Face checkpoint that both the image processor and the classifier are loaded from.
MODEL_CKPT = "MCG-NJU/videomae-base-finetuned-kinetics"
# Number of frames sampled (evenly spaced) from each video.
NUM_FRAMES = 8
# Side length of the square crop fed to the model.
IMG_SIZE = 224
# Size passed to TF.resize before cropping (an int resizes the shorter edge).
RESIZE_SIZE = 256

# Phase 1 Config (Heavy Augmentation)
EPOCHS_P1 = 20
LR_P1 = 5e-5

# Phase 2 Config (Fine-tuning / Polishing)
EPOCHS_P2 = 10
LR_P2 = 1e-6       # Very small so the weights learned in phase 1 are not disturbed
LABEL_SMOOTHING = 0.1 # Keeps the model from becoming over-confident

# Common Config
# Per-step micro-batch size; with ACCUM_STEPS the effective batch is BATCH_SIZE * ACCUM_STEPS.
BATCH_SIZE = 8
# Number of micro-batches whose gradients are accumulated before one optimizer step.
ACCUM_STEPS = 4
# AdamW weight decay, used in both phases.
WEIGHT_DECAY = 0.05
# Fraction of the phase 1 scheduler steps spent on linear warmup (phase 2 uses no warmup).
WARMUP_RATIO = 0.1

# Augmentation Config
# Both shape parameters of the Beta distribution the mixup coefficient is drawn from.
MIXUP_ALPHA = 0.8
# Probability that a given batch is mixed (1.0 = every batch).
MIXUP_PROB = 1.0

## 2. Prepare Dataset with Consistent Transforms

In [None]:
# Fetch the preprocessor config shipped with the checkpoint and reuse its per-channel
# mean / std for the manual normalisation done inside the datasets below.
image_processor = VideoMAEImageProcessor.from_pretrained(MODEL_CKPT)
MEAN, STD = image_processor.image_mean, image_processor.image_std

# --- Manual Mixup Implementation ---
class MixupCollate:
    """Batch-level Mixup, usable as a ``DataLoader`` ``collate_fn``.

    The batch is first collated with ``default_collate``. With probability
    ``prob`` it is then blended with a random permutation of itself using a
    single coefficient drawn from ``Beta(alpha, alpha)``. Targets are always
    returned as float one-hot (or blended one-hot) rows of width
    ``num_classes``, whether or not mixing was applied.
    """

    def __init__(self, num_classes, alpha=0.8, prob=1.0):
        self.num_classes = num_classes
        self.alpha = alpha
        self.prob = prob

    def __call__(self, batch):
        clips, labels = torch.utils.data.default_collate(batch)
        soft_labels = F.one_hot(labels, num_classes=self.num_classes).float()

        # A single uniform draw decides whether this batch is mixed at all.
        skip_mixup = np.random.rand() > self.prob
        if skip_mixup:
            return clips, soft_labels

        # Pair every sample with a randomly chosen partner from the same batch.
        partner = torch.randperm(clips.size(0))
        lam = np.random.beta(self.alpha, self.alpha)
        rest = 1 - lam

        mixed_clips = lam * clips + rest * clips[partner]
        mixed_labels = lam * soft_labels + rest * soft_labels[partner]
        return mixed_clips, mixed_labels

class VideoDataset(Dataset):
    """Frame-folder video dataset laid out as ``root/<class>/<video>/<frame>``.

    Class names are the sorted sub-directory names of ``root``. Every video is
    a directory of ``.jpg`` / ``.png`` frames (case-insensitive suffix match);
    videos without any frame are skipped at construction time.

    Each item is ``(clip, label)`` where ``clip`` is a float tensor of shape
    ``(num_frames, 3, img_size, img_size)`` normalised with the module-level
    ``MEAN`` / ``STD``.

    Args:
        root: Dataset root directory.
        num_frames: Frames sampled per video, evenly spaced over its length.
        is_train: If True, apply one random resized crop and one random
            horizontal flip shared by all frames of the clip; otherwise centre
            crop.
        img_size: Side length of the square output crop.
        resize_size: Size handed to ``TF.resize`` before cropping. ``None``
            (default) uses the module-level ``RESIZE_SIZE``, as before.
    """

    def __init__(self, root, num_frames=16, is_train=True, img_size=224, resize_size=None):
        self.root = Path(root)
        self.num_frames = num_frames
        self.is_train = is_train
        self.img_size = img_size
        # Resolved lazily in __getitem__ so a None value keeps tracking RESIZE_SIZE.
        self.resize_size = resize_size

        self.classes = sorted(d.name for d in self.root.iterdir() if d.is_dir())
        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}

        self.samples = []
        for cls in self.classes:
            label = self.class_to_idx[cls]
            for video_dir in sorted(d for d in (self.root / cls).iterdir() if d.is_dir()):
                frame_paths = sorted(
                    p for p in video_dir.iterdir() if p.suffix.lower() in {'.jpg', '.png'}
                )
                if frame_paths:
                    self.samples.append((frame_paths, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        frame_paths, label = self.samples[idx]

        # Evenly spaced frame indices covering the whole video. The previous
        # version also computed a random temporal `stride` here that was never
        # used (and divided by zero for num_frames == 1); it has been removed.
        frame_indices = np.linspace(0, len(frame_paths) - 1, self.num_frames, dtype=int)

        frames = []
        for i in frame_indices:
            # convert() returns an independent, fully loaded image, so the file
            # handle can be released immediately.
            with Image.open(frame_paths[i]) as img:
                frames.append(img.convert("RGB"))

        # Resize first, then crop, which preserves the aspect ratio.
        resize_size = RESIZE_SIZE if self.resize_size is None else self.resize_size
        frames = [TF.resize(img, resize_size) for img in frames]
        out_size = (self.img_size, self.img_size)

        if self.is_train:
            # One crop window and one flip decision per clip, applied to every
            # frame so the augmentation is spatially consistent over time.
            top, left, height, width = T.RandomResizedCrop.get_params(
                frames[0], scale=(0.8, 1.0), ratio=(0.75, 1.33)
            )
            is_flip = random.random() > 0.5
            cropped = [
                TF.resized_crop(img, top, left, height, width, size=out_size)
                for img in frames
            ]
            if is_flip:
                cropped = [TF.hflip(img) for img in cropped]
        else:
            # Validation: centre crop.
            cropped = [TF.center_crop(img, out_size) for img in frames]

        clip = torch.stack(
            [TF.normalize(TF.to_tensor(img), mean=MEAN, std=STD) for img in cropped]
        )
        return clip, label

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

In [None]:
# Build the training dataset.
train_dataset = VideoDataset(
    PATH_DATA_TRAIN,
    num_frames=NUM_FRAMES,
    is_train=True,
    img_size=IMG_SIZE,
)

print(f"Classes: {len(train_dataset.classes)}")
# Label maps handed to the model config (index -> name and name -> index).
id2label = dict(enumerate(train_dataset.classes))
label2id = {name: idx for idx, name in id2label.items()}

# Hand-written mixup collator.
mixup_collate = MixupCollate(
    num_classes=len(train_dataset.classes),
    alpha=MIXUP_ALPHA,
    prob=MIXUP_PROB,
)

# Settings shared by both training loaders.
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
    drop_last=True,
    persistent_workers=True,
)

# Loader Phase 1 (with mixup: yields blended clips and soft one-hot targets).
train_loader_p1 = DataLoader(train_dataset, collate_fn=mixup_collate, **_loader_kwargs)

# Loader Phase 2 (no mixup: standard collate, integer class targets).
train_loader_p2 = DataLoader(train_dataset, **_loader_kwargs)

Classes: 51


## 3. Load VideoMAE Model

In [None]:
print("Loading VideoMAE...")
# ignore_mismatched_sizes=True lets the classifier head be re-created for this
# dataset's label set instead of failing on the checkpoint's head shape.
model = VideoMAEForVideoClassification.from_pretrained(
    MODEL_CKPT,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    num_frames=NUM_FRAMES,
)

model.to(DEVICE)
if USE_MULTI_GPU:
    model = nn.DataParallel(model)
    print(f"Model wrapped with DataParallel across {NUM_GPUS} GPUs")
# Previously indented under the multi-GPU branch, so single-GPU / CPU runs never
# reported that loading had finished.
print("Model loaded.")

Loading VideoMAE...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([51]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 768]) in the checkpoint and torch.Size([51, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model wrapped with DataParallel across 2 GPUs
Model loaded.


## 4. Training Loops (Phase 1 & Phase 2)

In [None]:
def train_epoch(model, loader, optimizer, scheduler, scaler, device, accum_steps, use_mixup=True, label_smoothing=0.0):
    """Run one training epoch with mixed precision and gradient accumulation.

    Args:
        model: Module whose forward output exposes ``.logits``.
        loader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
            With ``use_mixup=True`` targets are soft label rows; otherwise
            they are integer class indices.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: LR scheduler, stepped once per *full* accumulation window.
        scaler: ``torch.amp.GradScaler`` used for loss scaling.
        device: Device (``torch.device`` or string) batches are moved to.
        accum_steps: Number of micro-batches accumulated per optimizer step.
        use_mixup: Selects the soft-label cross-entropy (True) or
            ``F.cross_entropy`` with label smoothing (False).
        label_smoothing: Smoothing factor, only used when ``use_mixup`` is False.

    Returns:
        ``(avg_loss, avg_acc)``: mean per-batch loss and the fraction of samples
        whose prediction matches the label. Under mixup the "label" is the
        argmax of the blended target, so the figure is only indicative.
    """
    model.train()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Autocast only when actually running on CUDA; the previous hard-coded
    # device_type='cuda' was wrong for any other `device`.
    device_type = torch.device(device).type
    use_amp = device_type == 'cuda'
    num_batches = len(loader)

    progress = tqdm(loader, desc="Training", leave=False)
    optimizer.zero_grad()

    for step, (inputs, targets) in enumerate(progress):
        inputs = inputs.to(device)
        targets = targets.to(device)

        with torch.amp.autocast(device_type=device_type, enabled=use_amp):
            logits = model(inputs).logits

            if use_mixup:
                # Mixup already produced soft labels: cross-entropy against them.
                log_probs = F.log_softmax(logits, dim=1)
                loss = -torch.sum(targets * log_probs, dim=1).mean()
                true_labels = targets.argmax(dim=1)
            else:
                # Phase 2: label smoothing instead of hard cross-entropy.
                loss = F.cross_entropy(logits, targets, label_smoothing=label_smoothing)
                true_labels = targets

        # Accuracy bookkeeping.
        preds = logits.argmax(dim=1)
        total_correct += (preds == true_labels).sum().item()
        total_samples += inputs.size(0)
        total_loss += loss.item()

        # Scale down so the accumulated gradient is the mean over the window.
        scaler.scale(loss / accum_steps).backward()

        window_full = (step + 1) % accum_steps == 0
        is_last_batch = (step + 1) == num_batches
        if window_full or is_last_batch:
            # The trailing partial window is flushed as well. Before, its
            # gradients were silently discarded by the zero_grad() at the start
            # of the next epoch whenever len(loader) % accum_steps != 0.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if window_full:
                # Advance the schedule on full windows only, so the number of
                # scheduler steps per epoch is unchanged for callers that
                # budgeted their schedule against it.
                scheduler.step()

        current_lr = scheduler.get_last_lr()[0]
        progress.set_postfix(loss=f"{total_loss/(step+1):.4f}", acc=f"{total_correct/total_samples:.4f}", lr=f"{current_lr:.6f}")

    avg_loss = total_loss / len(loader)
    avg_acc = total_correct / total_samples
    return avg_loss, avg_acc

In [None]:
# Helper function to get underlying model (handles DataParallel wrapper)
def get_underlying_model(model):
    """Return the wrapped module for an ``nn.DataParallel`` model, else ``model`` itself."""
    if isinstance(model, nn.DataParallel):
        return model.module
    return model


# ---------------- PHASE 1: TRAINING WITH MIXUP ----------------
# Trains the model loaded above for EPOCHS_P1 epochs on the mixup loader with a
# warmup + cosine LR schedule, checkpointing whenever the epoch accuracy improves.
print("\n" + "=" * 40)
print(f"STARTING PHASE 1 (Mixup Enabled, LR={LR_P1}, Epochs={EPOCHS_P1})")
if USE_MULTI_GPU:
    print(f"Training on {NUM_GPUS} GPUs with DataParallel")
print("=" * 40)

optimizer = torch.optim.AdamW(
    model.parameters(), lr=LR_P1, weight_decay=WEIGHT_DECAY
)
scaler = torch.amp.GradScaler(enabled=True)

# Scheduler budget in optimizer steps.
# NOTE(review): train_epoch steps the scheduler once per full accumulation window
# within an epoch, i.e. EPOCHS * (len(loader) // ACCUM_STEPS) times in total, which
# can be slightly below this figure when len(loader) is not a multiple of ACCUM_STEPS.
num_training_steps = len(train_loader_p1) * EPOCHS_P1 // ACCUM_STEPS
num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps, num_training_steps
)

best_acc_p1 = 0.0

for epoch in range(EPOCHS_P1):
    print(f"\nEpoch {epoch + 1}/{EPOCHS_P1} (Phase 1)")
    loss, acc = train_epoch(
        model,
        train_loader_p1,
        optimizer,
        scheduler,
        scaler,
        DEVICE,
        ACCUM_STEPS,
        use_mixup=True,
    )
    print(f"  Result: Loss = {loss:.4f} | Acc = {acc:.4f}")

    # NOTE(review): `acc` is the accuracy on the (mixup-augmented) training batches;
    # no held-out validation split is used for checkpoint selection in this cell.
    if acc > best_acc_p1:
        best_acc_p1 = acc
        # Save the unwrapped model so the checkpoint loads without DataParallel.
        get_underlying_model(model).save_pretrained("./videomae_phase1_best")
        print(f"  >>> Saved Phase 1 Best (Acc: {best_acc_p1:.4f})")


# ---------------- PHASE 2: FINE-TUNING (NO MIXUP + LABEL SMOOTHING) ----------------
# Restarts from the best phase 1 checkpoint (not the last epoch's weights) with a
# fresh optimizer, scaler and a cosine schedule without warmup.
print("\n" + "=" * 40)
print(
    f"STARTING PHASE 2 (No Mixup, Label Smooth={LABEL_SMOOTHING}, Low LR={LR_P2})"
)
if USE_MULTI_GPU:
    print(f"Training on {NUM_GPUS} GPUs with DataParallel")
print("=" * 40)

print("Loading best model from Phase 1...")
# ignore_mismatched_sizes=False: the saved checkpoint is expected to match the
# model shapes exactly, so any mismatch should surface as an error.
model = VideoMAEForVideoClassification.from_pretrained(
    "./videomae_phase1_best",
    label2id=label2id,
    id2label=id2label,
    num_frames=NUM_FRAMES,
    ignore_mismatched_sizes=False,
).to(DEVICE)

if USE_MULTI_GPU:
    model = nn.DataParallel(model)
    print(f"Model wrapped with DataParallel across {NUM_GPUS} GPUs")

optimizer = torch.optim.AdamW(
    model.parameters(), lr=LR_P2, weight_decay=WEIGHT_DECAY
)
scaler = torch.amp.GradScaler(enabled=True)

num_training_steps = len(train_loader_p2) * EPOCHS_P2 // ACCUM_STEPS
# Zero warmup steps: the LR starts at LR_P2 and only decays.
scheduler = get_cosine_schedule_with_warmup(
    optimizer, 0, num_training_steps
)

# Phase 2 must beat the phase 1 best before "./videomae_final_best" is written.
# NOTE(review): the phase 1 figure is measured against the argmax of mixup targets,
# the phase 2 figure against hard labels, so the two are not strictly comparable.
best_acc_p2 = best_acc_p1

for epoch in range(EPOCHS_P2):
    print(f"\nEpoch {epoch + 1}/{EPOCHS_P2} (Phase 2)")
    loss, acc = train_epoch(
        model,
        train_loader_p2,
        optimizer,
        scheduler,
        scaler,
        DEVICE,
        ACCUM_STEPS,
        use_mixup=False,
        label_smoothing=LABEL_SMOOTHING,
    )
    print(f"  Result: Loss = {loss:.4f} | Acc = {acc:.4f}")

    if acc > best_acc_p2:
        best_acc_p2 = acc
        get_underlying_model(model).save_pretrained("./videomae_final_best")
        print(f"  >>> Saved Phase 2 Best (Acc: {best_acc_p2:.4f})")
    else:
        # The final-epoch weights are saved separately only when that epoch did
        # not itself become the new best.
        if epoch == EPOCHS_P2 - 1:
            get_underlying_model(model).save_pretrained("./videomae_final_last")
            print("  >>> Saved Last Model")



STARTING PHASE 1 (Mixup Enabled, LR=5e-05, Epochs=20)
Training on 2 GPUs with DataParallel

Epoch 1/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 3.5971 | Acc = 0.1560
  >>> Saved Phase 1 Best (Acc: 0.1560)

Epoch 2/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 2.1389 | Acc = 0.6277
  >>> Saved Phase 1 Best (Acc: 0.6277)

Epoch 3/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 1.5182 | Acc = 0.7412
  >>> Saved Phase 1 Best (Acc: 0.7412)

Epoch 4/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 1.2656 | Acc = 0.7910
  >>> Saved Phase 1 Best (Acc: 0.7910)

Epoch 5/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 1.1257 | Acc = 0.8236
  >>> Saved Phase 1 Best (Acc: 0.8236)

Epoch 6/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 1.0573 | Acc = 0.8351
  >>> Saved Phase 1 Best (Acc: 0.8351)

Epoch 7/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.9678 | Acc = 0.8494
  >>> Saved Phase 1 Best (Acc: 0.8494)

Epoch 8/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.9138 | Acc = 0.8649
  >>> Saved Phase 1 Best (Acc: 0.8649)

Epoch 9/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.8762 | Acc = 0.8702
  >>> Saved Phase 1 Best (Acc: 0.8702)

Epoch 10/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.8428 | Acc = 0.8776
  >>> Saved Phase 1 Best (Acc: 0.8776)

Epoch 11/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.8049 | Acc = 0.8734

Epoch 12/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7809 | Acc = 0.8934
  >>> Saved Phase 1 Best (Acc: 0.8934)

Epoch 13/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7625 | Acc = 0.8872

Epoch 14/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7549 | Acc = 0.8836

Epoch 15/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7508 | Acc = 0.8889

Epoch 16/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.6972 | Acc = 0.9024
  >>> Saved Phase 1 Best (Acc: 0.9024)

Epoch 17/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.6796 | Acc = 0.8987

Epoch 18/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.6851 | Acc = 0.8934

Epoch 19/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.6683 | Acc = 0.9075
  >>> Saved Phase 1 Best (Acc: 0.9075)

Epoch 20/20 (Phase 1)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.6873 | Acc = 0.8875

STARTING PHASE 2 (No Mixup, Label Smooth=0.1, Low LR=1e-06)
Training on 2 GPUs with DataParallel
Loading best model from Phase 1...
Model wrapped with DataParallel across 2 GPUs

Epoch 1/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7672 | Acc = 0.9995
  >>> Saved Phase 2 Best (Acc: 0.9995)

Epoch 2/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7345 | Acc = 0.9997
  >>> Saved Phase 2 Best (Acc: 0.9997)

Epoch 3/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7279 | Acc = 0.9998
  >>> Saved Phase 2 Best (Acc: 0.9998)

Epoch 4/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7253 | Acc = 0.9997

Epoch 5/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7239 | Acc = 0.9997

Epoch 6/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7226 | Acc = 0.9998

Epoch 7/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7224 | Acc = 0.9995

Epoch 8/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7220 | Acc = 0.9997

Epoch 9/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7219 | Acc = 0.9997

Epoch 10/10 (Phase 2)


Training:   0%|          | 0/781 [00:00<?, ?it/s]

  Result: Loss = 0.7218 | Acc = 0.9998
  >>> Saved Last Model


## 5. Advanced 6-View Inference

In [12]:
class MultiViewTestDataset(Dataset):
    """Test-time dataset yielding six views (3 crops + their horizontal flips) per video.

    ``root`` must contain one directory per video, named with an integer id,
    each holding ``.jpg`` / ``.png`` frames. Videos are ordered by numeric id.

    Each item is ``(views, video_id)`` where ``views`` has shape
    ``(6, num_frames, 3, img_size, img_size)``: centre crop, origin crop,
    far-edge crop along the longer side, then the flipped version of each.

    Args:
        root: Directory containing the per-video frame folders.
        num_frames: Frames sampled per video, evenly spaced over its length.
        img_size: Side length of each square crop.
        resize_size: Size handed to ``TF.resize`` before cropping.

    Raises:
        ValueError: At construction, if a sub-directory name is not an integer.
        FileNotFoundError: On item access, if a video folder has no frames.
    """

    def __init__(self, root, num_frames=NUM_FRAMES, img_size=IMG_SIZE, resize_size=RESIZE_SIZE):
        self.root = Path(root)
        self.num_frames = num_frames
        self.img_size = img_size
        self.resize_size = resize_size
        self.video_dirs = sorted(
            (d for d in self.root.iterdir() if d.is_dir()), key=lambda x: int(x.name)
        )
        self.video_ids = [int(d.name) for d in self.video_dirs]

    def __len__(self):
        return len(self.video_dirs)

    def __getitem__(self, idx):
        video_dir = self.video_dirs[idx]
        video_id = self.video_ids[idx]
        frame_paths = sorted(
            p for p in video_dir.iterdir() if p.suffix.lower() in {'.jpg', '.png'}
        )
        if not frame_paths:
            # Fail with the offending path instead of a bare IndexError from
            # indexing an empty list further down.
            raise FileNotFoundError(f"No .jpg/.png frames found in {video_dir}")

        frame_indices = np.linspace(0, len(frame_paths) - 1, self.num_frames, dtype=int)

        frames = []
        for i in frame_indices:
            # convert() yields an independent, loaded copy, so the file handle
            # can be closed right away.
            with Image.open(frame_paths[i]) as img:
                frames.append(img.convert("RGB"))

        # 1. Resize (an int size rescales the shorter side).
        frames = [TF.resize(img, self.resize_size) for img in frames]

        w, h = frames[0].size
        crop_size = self.img_size

        # --- STANDARD 3 CROPS ---
        # The far-edge crop slides along the longer side; both side crops stay
        # anchored at offset 0 on the other axis.
        # NOTE(review): the side crops are therefore not centred on the shorter
        # axis, unlike the centre crop — confirm this is intended.
        if w > h:
            far_top, far_left = 0, w - crop_size
        else:
            far_top, far_left = h - crop_size, 0

        views = [
            [TF.center_crop(img, (crop_size, crop_size)) for img in frames],
            [TF.crop(img, 0, 0, crop_size, crop_size) for img in frames],
            [TF.crop(img, far_top, far_left, crop_size, crop_size) for img in frames],
        ]

        # --- FLIPPED 3 CROPS ---
        flipped_views = [[TF.hflip(img) for img in view] for view in views]
        all_views = views + flipped_views  # 6 views in total

        # Convert every view to a normalised tensor; final shape (6, T, C, H, W).
        view_tensors = [
            torch.stack(
                [TF.normalize(TF.to_tensor(img), mean=MEAN, std=STD) for img in view]
            )
            for view in all_views
        ]
        return torch.stack(view_tensors), video_id

test_dataset = MultiViewTestDataset(PATH_DATA_TEST, num_frames=NUM_FRAMES, img_size=IMG_SIZE)
# Each sample expands to 6 clips, so the loader batch is a quarter of the training
# batch. Clamp to 1: BATCH_SIZE < 4 would otherwise give batch_size=0, which
# DataLoader rejects.
test_loader = DataLoader(
    test_dataset,
    batch_size=max(1, BATCH_SIZE // 4),
    shuffle=False,
    num_workers=2,
)

# Pick the checkpoint to evaluate: phase 2 best, else phase 1 best, else whatever
# model object is currently in memory.
if os.path.exists("./videomae_final_best"):
    TEST_MODEL_PATH = "./videomae_final_best"
    print("Using Phase 2 Best Model.")
elif os.path.exists("./videomae_phase1_best"):
    TEST_MODEL_PATH = "./videomae_phase1_best"
    print("Using Phase 1 Best Model.")
else:
    TEST_MODEL_PATH = None
    print("Warning: No saved model found, using current model in memory.")

if TEST_MODEL_PATH:
    print(f"Loading model from {TEST_MODEL_PATH} for inference...")
    model = VideoMAEForVideoClassification.from_pretrained(
        TEST_MODEL_PATH,
        label2id=label2id,
        id2label=id2label,
        num_frames=NUM_FRAMES
    )
    model.to(DEVICE)

print("Running 6-View Inference...")
model.eval()
predictions = []

with torch.no_grad():
    for multi_view_videos, video_ids in tqdm(test_loader):
        # Shape: (B, 6, T, C, H, W)
        B, V, T, C, H, W = multi_view_videos.shape

        # Fold the views into the batch dimension: (B*6, T, C, H, W).
        # reshape() also copes with a non-contiguous batch, unlike view().
        flat_videos = multi_view_videos.reshape(B * V, T, C, H, W).to(DEVICE)

        logits = model(flat_videos).logits  # (B*6, Num_Classes)

        # Back to (B, 6, Num_Classes), then average the logits over the 6 views.
        avg_logits = logits.reshape(B, V, -1).mean(dim=1)  # (B, Num_Classes)
        preds = avg_logits.argmax(dim=1)

        # One host transfer per batch instead of one .item() call per element.
        for vid, pred in zip(video_ids.tolist(), preds.tolist()):
            predictions.append((vid, id2label[pred]))

predictions.sort(key=lambda x: x[0])

# Save submission (explicit encoding so the file does not depend on the locale).
with open('submission_multiview_6crops.csv', 'w', encoding='utf-8') as f:
    f.write('id,class\n')
    f.writelines(f'{vid},{cls}\n' for vid, cls in predictions)

print("Submission saved: submission_multiview_6crops.csv")

Using Phase 2 Best Model.
Loading model from ./videomae_final_best for inference...
Running 6-View Inference...


  0%|          | 0/255 [00:00<?, ?it/s]

Submission saved: submission_multiview_6crops.csv


In [13]:
!zip -r outputs.zip /kaggle/working

In [None]:
# Scratch cell that only prints 1 (presumably a leftover kernel check — TODO confirm it can be removed).
print(1)