In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [50]:
import torch
import os
import wandb
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import InterpolationMode
import torchvision.transforms.functional as TF
from pathlib import Path
from PIL import Image
from tqdm.auto import tqdm
import timm
import random
from torch.utils.data import random_split
import torchvision.models as models
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger


# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)
print(f'Using device: {DEVICE}')

Using device: cuda


In [19]:
# Data paths
PATH_DATA_TRAIN = r'/kaggle/input/action-video/data/data_train'
PATH_DATA_TEST = r'/kaggle/input/action-video/data/test'

# Model parameters
NUM_FRAMES = 16       # frames sampled per video clip
FRAME_STRIDE = 2      # stride used when thinning the frame-sampling grid
IMG_SIZE = 224        # side length (pixels) frames are resized to

# Training parameters
BATCH_SIZE = 16
EPOCHS = 16           # NOTE(review): only printed below; the Trainer cell sets max_epochs itself
BASE_LR = 1e-4
HEAD_LR = 5e-5        # NOTE(review): not referenced by the visible cells
WEIGHT_DECAY = 1e-4
GRAD_ACCUM_STEPS = 4  # NOTE(review): not referenced by the visible cells

# PRETRAINED_NAME = 'vit_small_patch16_224'
PRETRAINED_NAME = 'MCG-NJU/videomae-base'

# Reproducibility: seed every RNG the notebook draws from, not just the split
# generator. The augmentation uses the `random` module, while weight
# initialisation and DataLoader shuffling use torch's global generator.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
generator = torch.Generator().manual_seed(SEED)  # dedicated generator for the train/val split

print(f"Train data: {PATH_DATA_TRAIN}")
print(f"Test data: {PATH_DATA_TEST}")
print(f"Model: {PRETRAINED_NAME}")
print(f"Frames per video: {NUM_FRAMES}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Epochs: {EPOCHS}")

Train data: /kaggle/input/action-video/data/data_train
Test data: /kaggle/input/action-video/data/test
Model: MCG-NJU/videomae-base
Frames per video: 16
Batch size: 16
Epochs: 16


In [4]:
class VideoTransform:
    """Resize, optionally augment, and normalise a stack of video frames.

    With ``is_train=True`` the whole clip gets one random rescale (factor in
    [0.8, 1.0]), one random crop, a resize to ``image_size`` and a 50% chance
    of a horizontal flip. Otherwise the clip is only resized to ``image_size``.
    In both modes every frame is then normalised with ``mean`` / ``std``.
    """

    def __init__(self, image_size=224, is_train=True):
        self.is_train = is_train
        self.image_size = image_size
        # Per-channel normalisation constants.
        self.mean = [0.485, 0.456, 0.406]
        self.std = [0.229, 0.224, 0.225]

    def _augment(self, frames):
        """Apply the training-time rescale / crop / resize / flip to the clip."""
        target = self.image_size
        height, width = frames.shape[-2:]

        # One scale factor for the whole clip keeps the frames spatially aligned.
        factor = random.uniform(0.8, 1.0)
        scaled_h = int(height * factor)
        scaled_w = int(width * factor)
        frames = TF.resize(frames, [scaled_h, scaled_w], interpolation=InterpolationMode.BILINEAR)

        # Crop origin; clamped to 0 when the rescaled clip is smaller than the target.
        top = random.randint(0, max(0, scaled_h - target))
        left = random.randint(0, max(0, scaled_w - target))
        crop_h = min(target, scaled_h)
        crop_w = min(target, scaled_w)
        frames = TF.crop(frames, top, left, crop_h, crop_w)

        frames = TF.resize(frames, [target, target], interpolation=InterpolationMode.BILINEAR)
        if random.random() < 0.5:
            frames = TF.hflip(frames)
        return frames

    def __call__(self, frames):
        """Transform ``frames`` (iterated along dim 0, spatial dims last) and return the stacked result."""
        if not self.is_train:
            frames = TF.resize(
                frames,
                [self.image_size, self.image_size],
                interpolation=InterpolationMode.BILINEAR,
            )
        else:
            frames = self._augment(frames)
        return torch.stack([TF.normalize(frame, self.mean, self.std) for frame in frames])

print("Augmentation defined")

Augmentation defined


In [5]:
# File extensions (lower-case) that count as video frames.
_FRAME_SUFFIXES = {'.jpg', '.jpeg', '.png'}


def _list_frame_paths(video_dir):
    """Return the image files directly inside ``video_dir``, sorted by path."""
    return sorted(p for p in video_dir.iterdir() if p.suffix.lower() in _FRAME_SUFFIXES)


def _uniform_frame_indices(total, num_frames, frame_stride):
    """Pick ``num_frames`` frame indices spread over a video of ``total`` frames.

    A grid of ``num_frames * frame_stride`` points is laid over
    ``[0, total - 1]`` and every ``frame_stride``-th point is kept (truncated
    to an integer index). If fewer than ``num_frames`` indices result, the last
    index is repeated as padding.

    Returns:
        LongTensor of shape ``(num_frames,)``.

    Raises:
        ValueError: if ``total`` is not positive.
    """
    if total <= 0:
        raise ValueError("No frames")
    if total == 1:
        # A single-frame video: repeat frame 0 for the whole clip.
        return torch.zeros(num_frames, dtype=torch.long)
    steps = max(num_frames * frame_stride, num_frames)
    grid = torch.linspace(0, total - 1, steps=steps)
    idxs = grid[::frame_stride].long()
    if idxs.numel() < num_frames:
        pad = idxs.new_full((num_frames - idxs.numel(),), idxs[-1].item())
        idxs = torch.cat([idxs, pad], dim=0)
    return idxs[:num_frames]


def _load_clip(frame_paths, idxs, to_tensor):
    """Load the frames selected by ``idxs`` as RGB tensors and stack them along dim 0."""
    frames = []
    for i in idxs.tolist():
        with Image.open(frame_paths[i]) as img:
            frames.append(to_tensor(img.convert("RGB")))
    return torch.stack(frames)


class VideoDataset(Dataset):
    """Labelled clips read from a ``root/<class>/<video>/<frame images>`` tree.

    Class names are the sorted sub-directory names of ``root``; a class's label
    is its position in that sorted list. Videos without any frame image are
    skipped. Each item is ``(video, label)`` where ``video`` is the transformed
    stack of ``num_frames`` frames.
    """

    def __init__(self, root, num_frames=16, frame_stride=2, image_size=224, is_train=True):
        self.root = Path(root)
        self.num_frames = num_frames
        self.frame_stride = frame_stride
        self.transform = VideoTransform(image_size, is_train)
        self.to_tensor = transforms.ToTensor()
        self.classes = sorted([d.name for d in self.root.iterdir() if d.is_dir()])
        self.class_to_idx = {name: idx for idx, name in enumerate(self.classes)}
        # One entry per video: (sorted frame paths, class index).
        self.samples = []
        for cls in self.classes:
            cls_dir = self.root / cls
            for video_dir in sorted([d for d in cls_dir.iterdir() if d.is_dir()]):
                frame_paths = _list_frame_paths(video_dir)
                if frame_paths:
                    self.samples.append((frame_paths, self.class_to_idx[cls]))

    def __len__(self):
        return len(self.samples)

    def _select_indices(self, total):
        """Return the frame indices to sample from a video with ``total`` frames."""
        return _uniform_frame_indices(total, self.num_frames, self.frame_stride)

    def __getitem__(self, idx):
        frame_paths, label = self.samples[idx]
        idxs = self._select_indices(len(frame_paths))
        video = _load_clip(frame_paths, idxs, self.to_tensor)
        video = self.transform(video)
        return video, label


class TestDataset(Dataset):
    """Unlabelled clips read from a ``root/<numeric id>/<frame images>`` tree.

    Video directories are ordered by the integer value of their name. Each item
    is ``(video, video_id)``; the transform always runs in evaluation mode.
    """

    def __init__(self, root, num_frames=16, frame_stride=2, image_size=224):
        self.root = Path(root)
        self.num_frames = num_frames
        self.frame_stride = frame_stride
        self.transform = VideoTransform(image_size, is_train=False)
        self.to_tensor = transforms.ToTensor()
        self.video_dirs = sorted([d for d in self.root.iterdir() if d.is_dir()], key=lambda x: int(x.name))
        self.video_ids = [int(d.name) for d in self.video_dirs]

    def __len__(self):
        return len(self.video_dirs)

    def _select_indices(self, total):
        """Return the frame indices to sample from a video with ``total`` frames."""
        return _uniform_frame_indices(total, self.num_frames, self.frame_stride)

    def __getitem__(self, idx):
        video_dir = self.video_dirs[idx]
        video_id = self.video_ids[idx]
        # Frames are listed lazily here; an empty directory raises ValueError("No frames").
        frame_paths = _list_frame_paths(video_dir)
        idxs = self._select_indices(len(frame_paths))
        video = _load_clip(frame_paths, idxs, self.to_tensor)
        video = self.transform(video)
        return video, video_id

print("Dataset classes defined")

Dataset classes defined


In [8]:
# Index every labelled training video, with training-time augmentation enabled.
full_dataset = VideoDataset(
    root=PATH_DATA_TRAIN,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
    image_size=IMG_SIZE,
    is_train=True,
)

In [20]:
# Hold out 10% of the labelled videos for validation.
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Rewind the generator to its seed so re-running this cell reproduces the same split.
generator.manual_seed(generator.initial_seed())
train_ds, _val_split = random_split(full_dataset, [train_size, val_size], generator=generator)

# Validation must not be augmented. Both subsets returned by random_split wrap
# the SAME dataset object, and the augmentation switch lives on the dataset's
# `transform` (not on a dataset-level `is_train` attribute), so setting
# `val_ds.dataset.is_train = False` had no effect. Build a second,
# evaluation-mode view over the same directory instead and reuse the held-out
# indices (samples are discovered in sorted order, so indices line up).
val_dataset = VideoDataset(
    PATH_DATA_TRAIN,
    num_frames=NUM_FRAMES,
    frame_stride=FRAME_STRIDE,
    image_size=IMG_SIZE,
    is_train=False,
)
assert len(val_dataset) == len(full_dataset), "train/val dataset views are out of sync"
val_ds = torch.utils.data.Subset(val_dataset, _val_split.indices)

In [29]:
# Options shared by both loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)

In [30]:
# Report the size of the training split and the label vocabulary (first ten names only).
print(
    f"Train samples: {len(train_ds)}",
    f"Classes: {len(train_ds.dataset.classes)}",
    f"Class names: {train_ds.dataset.classes[:10]}...",
    sep="\n",
)

Train samples: 5628
Classes: 51
Class names: ['brush_hair', 'cartwheel', 'catch', 'chew', 'clap', 'climb', 'climb_stairs', 'dive', 'draw_sword', 'dribble']...


In [34]:
# Sanity check: pull one batch from each loader and report shapes, dtype and labels.

# --- training loader ---
video, label = batch = next(iter(train_loader))
print("Train video shape:", video.shape)
print("Train label shape:", label.shape)
print("Train video dtype:", video.dtype)
print("Train labels:", label)

# --- validation loader ---
video, label = batch = next(iter(val_loader))
print("Validation video shape:", video.shape)
print("Validation label shape:", label.shape)
print("Validation video dtype:", video.dtype)
print("Validation labels:", label)

Train video shape: torch.Size([16, 16, 3, 224, 224])
Train label shape: torch.Size([16])
Train video dtype: torch.float32
Train labels: tensor([45, 17, 42,  9, 21, 45, 45, 12, 41, 28,  2, 20, 29, 41, 14, 29])
Validation video shape: torch.Size([16, 16, 3, 224, 224])
Validation label shape: torch.Size([16])
Validation video dtype: torch.float32
Validation labels: tensor([49, 48, 45, 23,  9,  5, 49,  2, 49,  1, 34,  1, 37, 42, 33, 32])


In [70]:
class VideoClassifier(nn.Module):
    """Per-frame ResNet encoder, temporal average pooling, and a two-layer MLP head.

    Args:
        num_classes: size of the output logit vector.
        backbone: ``"resnet18"`` or ``"resnet50"`` (ImageNet-pretrained weights).
        dropout: dropout probability inside the classification head.

    Raises:
        ValueError: if ``backbone`` is not one of the supported names.
    """

    def __init__(self, num_classes, backbone="resnet50", dropout=0.5):
        super().__init__()

        # Supported encoders: (constructor, pretrained weights, feature width).
        specs = {
            "resnet18": (models.resnet18, models.ResNet18_Weights.IMAGENET1K_V1, 512),
            "resnet50": (models.resnet50, models.ResNet50_Weights.IMAGENET1K_V1, 2048),
        }
        if backbone not in specs:
            raise ValueError("Unsupported backbone")
        build, weights, feat_dim = specs[backbone]
        net = build(weights=weights)

        # Keep everything except the network's last child (its final FC layer).
        encoder_layers = list(net.children())[:-1]
        self.backbone = nn.Sequential(*encoder_layers)

        # Collapses the time axis to length 1 by averaging.
        self.temporal_pool = nn.AdaptiveAvgPool1d(1)

        hidden_dim = feat_dim // 2
        self.classifier = nn.Sequential(
            nn.Linear(feat_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Map a clip batch ``x`` of shape [B, T, C, H, W] to logits of shape [B, num_classes]."""
        batch_size, num_frames, channels, height, width = x.shape

        # Fold time into the batch axis so the 2D encoder sees individual frames.
        frame_batch = x.view(batch_size * num_frames, channels, height, width)
        frame_feats = self.backbone(frame_batch).flatten(1)            # [B*T, D]

        # Restore the time axis and move it last for 1D pooling.
        clip_feats = frame_feats.view(batch_size, num_frames, -1)      # [B, T, D]
        clip_feats = clip_feats.transpose(1, 2)                        # [B, D, T]

        pooled = self.temporal_pool(clip_feats).squeeze(-1)            # [B, D]
        return self.classifier(pooled)


In [None]:
# Replace this import with whatever VideoMAE implementation you prefer.
# NOTE(review): this cell has no execution count, and timm does not appear to
# ship a `timm.models.video_mae` module — confirm against the installed timm.
# The import is guarded so that "Restart & Run All" still reaches the ResNet
# pipeline when the factory is unavailable; the error is raised only if this
# optional model is actually instantiated.
try:
    from timm.models.video_mae import video_mae_base_patch16_224
except ImportError:
    video_mae_base_patch16_224 = None


class VideoMAEForAction(nn.Module):
    """VideoMAE backbone followed by a two-layer MLP classification head.

    Args:
        num_classes: size of the output logit vector.
        pretrained: forwarded to the backbone factory.

    Raises:
        ImportError: if the VideoMAE backbone factory could not be imported.
    """

    def __init__(self, num_classes=51, pretrained=True):
        super().__init__()

        if video_mae_base_patch16_224 is None:
            raise ImportError(
                "VideoMAE backbone is unavailable: "
                "`timm.models.video_mae.video_mae_base_patch16_224` could not be imported. "
                "Replace the import at the top of this cell with a VideoMAE implementation "
                "that is installed in this environment."
            )

        # Load the VideoMAE backbone; num_classes=0 asks the factory to drop
        # its own classification head.
        self.backbone = video_mae_base_patch16_224(
            pretrained=pretrained,
            num_classes=0
        )

        # Width of the backbone features (presumably 768 for the base model — TODO confirm).
        embed_dim = self.backbone.embed_dim

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, embed_dim//2),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(embed_dim//2, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch of clips.

        x: [B, T, C, H, W] as produced by the data loaders in this notebook.
        NOTE(review): confirm the axis order the chosen backbone expects.
        """
        # Assumes forward_features returns [B, N, EMBED_DIM] with one row per
        # token/frame — TODO confirm for the chosen implementation.
        feats = self.backbone.forward_features(x)

        # Average over dim 1 -> [B, EMBED_DIM]
        pooled = feats.mean(dim=1)

        # Final classification
        out = self.classifier(pooled)

        return out


In [71]:
class LightningClassifier(pl.LightningModule):
    """Lightning wrapper that trains ``model`` with cross-entropy and tracks accuracy.

    Args:
        model: module mapping a video batch to class logits (excluded from the
            saved hyperparameters).
        num_classes: number of classes for the accuracy metrics.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
    """

    def __init__(self, model, num_classes = 51, lr=BASE_LR, weight_decay=WEIGHT_DECAY):
        super().__init__()
        self.save_hyperparameters(ignore=["model"])

        self.model = model
        self.criterion = nn.CrossEntropyLoss()

        # Separate metric objects so train and validation statistics never mix.
        metric_kwargs = {"task": "multiclass", "num_classes": num_classes}
        self.train_acc = torchmetrics.Accuracy(**metric_kwargs)
        self.val_acc = torchmetrics.Accuracy(**metric_kwargs)

    def forward(self, x):
        return self.model(x)

    def _logits_and_loss(self, batch):
        """Run the model on ``batch`` and return ``(logits, label, loss)``."""
        video, label = batch
        logits = self(video)
        return logits, label, self.criterion(logits, label)

    def training_step(self, batch, batch_idx):
        logits, label, loss = self._logits_and_loss(batch)
        acc = self.train_acc(logits, label)

        self.log("train/loss", loss, prog_bar=True)
        self.log("train/acc", acc, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits, label, loss = self._logits_and_loss(batch)
        acc = self.val_acc(logits, label)

        self.log("val/loss", loss, prog_bar=True)
        self.log("val/acc", acc, prog_bar=True)

    def configure_optimizers(self):
        # One AdamW parameter group covering the wrapped model.
        return torch.optim.AdamW(
            self.parameters(),
            lr=self.hparams.lr,
            weight_decay=self.hparams.weight_decay,
        )


In [72]:
model_name = "ResNet-AvgPool-Classifier" # CHANGE THIS WHEN USE ANOTHER MODEL

checkpoint_dir = f"/kaggle/working/checkpoints/{model_name}"

# Start from scratch unless a previous run left a checkpoint behind.
ckpt_path = None
if os.path.exists(checkpoint_dir):
    candidates = []
    for entry in os.listdir(checkpoint_dir):
        if entry.endswith(".ckpt"):
            candidates.append(os.path.join(checkpoint_dir, entry))
    if candidates:
        # NOTE(review): on Unix, getctime is the last metadata-change time, not
        # the creation time; only the top level of the directory is searched.
        ckpt_path = max(candidates, key=os.path.getctime)

status = "Resuming from checkpoint:" if ckpt_path else "No checkpoint found."
print(status, ckpt_path)

No checkpoint found. None


In [73]:
# Stop training once "val/loss" has gone 5 validation checks without improving.
early_stop = EarlyStopping(monitor="val/loss", mode="min", patience=5, verbose=True)


In [74]:
# Keep the best checkpoint by validation loss, plus `last.ckpt` for resuming.
checkpoint_cb = ModelCheckpoint(
    dirpath=f'/kaggle/working/checkpoints/{model_name}',
    monitor="val/loss",
    mode="min",
    save_top_k=1,
    save_last=True,              # keeps last.ckpt so an interrupted run can be resumed
    filename="epoch{epoch:02d}-val_loss{val/loss:.4f}",
    # The template already spells out the metric labels. Without this flag
    # Lightning inserts "epoch=" / "val/loss=" into the name, and the "/" in
    # the metric key turns the checkpoint path into a nested directory.
    auto_insert_metric_name=False,
)


In [75]:

# Weights & Biases logger for the Trainer; log_model=True also uploads checkpoints.
wandb_logger = WandbLogger(log_model=True, project="video-classification")


In [76]:
student_id = "10423057"  # TODO: replace with your student ID

# SECURITY: never hardcode the API key in the notebook — anything committed or
# shared with the notebook (including its outputs) leaks it. The key that was
# previously embedded here should be treated as compromised and revoked.
# Provide the key through the WANDB_API_KEY environment variable (on Kaggle,
# e.g. populate it from a notebook secret before running this cell).
api_key = os.environ.get("WANDB_API_KEY", "").strip()

if api_key == "":
    raise ValueError("Please set your wandb key in the environment variable WANDB_API_KEY")
else:
    print("WandB API key is set. Proceeding with login...")

wandb.login(key=api_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


WandB API key is set. Proceeding with login...


True

In [77]:
# Single-GPU trainer with mixed precision, early stopping and checkpointing.
# NOTE(review): max_epochs is 100 here while the config cell defines EPOCHS = 16;
# confirm which value is intended.
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    precision="16-mixed",          # optional but recommended
    max_epochs=100,
    log_every_n_steps=10,
    logger=wandb_logger,
    callbacks=[early_stop, checkpoint_cb],
)


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
# Build the network, wrap it for Lightning, and train. When ckpt_path is not
# None, training resumes from that checkpoint.
base_model = VideoClassifier(num_classes=51) # CHANGE THIS IF USE ANOTHER BASE MODEL
lightning_model = LightningClassifier(model=base_model)
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
    ckpt_path=ckpt_path,
)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/utilities/model_summary/model_summary.py:231: Precision 16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.

  | Name      | Type               | Params | Mode 
---------------------------------------------------------
0 | model     | VideoClassifier    | 25.7 M | train
1 | criterion | CrossEntropyLoss   | 0      | train
2 | train_acc | MulticlassAccuracy | 0      | train
3 | val_acc   | MulticlassAccuracy | 0      | train
---------------------------------------------------------
25.7 M    Trainable params
0         Non-trainable params
25.7 M    Total params
102.634   Total estimated model params size (MB)
160       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val/loss improved. New best score: 3.324


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val/loss improved by 0.524 >= min_delta = 0.0. New best score: 2.800


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val/loss improved by 0.539 >= min_delta = 0.0. New best score: 2.261


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val/loss improved by 0.367 >= min_delta = 0.0. New best score: 1.894


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val/loss improved by 0.210 >= min_delta = 0.0. New best score: 1.684


In [None]:
print("\nLoading model for testing...")

# Pick the checkpoint to evaluate. `ckpt_path` is only the checkpoint found
# BEFORE training (None on a fresh run), so prefer the best checkpoint recorded
# by the ModelCheckpoint callback during this session and fall back to
# `ckpt_path` when no training has happened in this kernel.
best_ckpt_path = checkpoint_cb.best_model_path or ckpt_path
if not best_ckpt_path:
    raise FileNotFoundError(
        "No checkpoint available for testing: train the model first or place a "
        f".ckpt file in {checkpoint_dir}"
    )
print(f"Using checkpoint: {best_ckpt_path}")

# Recreate base model
base_testing_model = VideoClassifier(num_classes=51) # CHANGE THIS IF USE ANOTHER BASE MODEL

# Load Lightning model from checkpoint
lightning_testing_model = LightningClassifier.load_from_checkpoint(
    best_ckpt_path,
    model=base_testing_model   # must pass the wrapped model
)

# Evaluate the model restored from the checkpoint, not the in-memory training
# model, whose weights are those of the last epoch rather than the best one.
lightning_testing_model.eval()
lightning_testing_model.to(DEVICE)

# Get the actual PyTorch model
testing_model = lightning_testing_model.model
testing_model.eval()

# Label index -> class name mapping, taken from the training dataset.
classes = train_ds.dataset.classes
print("\nLoading test dataset...")
test_dataset = TestDataset(PATH_DATA_TEST, num_frames=NUM_FRAMES, frame_stride=FRAME_STRIDE, image_size=IMG_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
print(f"Test samples: {len(test_dataset)}")

In [None]:
print("\nRunning inference...")

# Collected as (video id, predicted class name) pairs.
predictions = []

with torch.no_grad():
    for clips, ids in tqdm(test_loader, desc="Inference"):
        scores = testing_model(clips.to(DEVICE))    # [B, num_classes]
        top1 = scores.argmax(dim=1)                 # [B]

        # Translate each predicted index to its class name and pair it with the video id.
        predictions.extend(
            (vid, classes[cls_idx])
            for vid, cls_idx in zip(ids.cpu().numpy(), top1.cpu().numpy())
        )


In [None]:
# Order the rows by video id before writing the submission.
predictions = sorted(predictions, key=lambda row: row[0])
print(f"\nTotal predictions: {len(predictions)}")


In [None]:
submission_path = Path("./submission.csv")

# Header first, then one "id,class" line per prediction.
rows = ["id,class\n"]
rows.extend(f"{video_id},{pred_class}\n" for video_id, pred_class in predictions)

with open(submission_path, "w") as f:
    f.writelines(rows)

print("=" * 40)
print(f"Submission saved to: {submission_path}")
