In [1]:
import sys
import os

# Add the project root (the parent of the current working directory) to
# sys.path so that project-local modules can be imported from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry a second time.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import torch
# Report whether a CUDA device is visible and which device will be used.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
# Only query the device name when CUDA is present; otherwise fall back to CPU.
device_name = torch.cuda.get_device_name(0) if cuda_available else "CPU"
print("Device:", device_name)


CUDA available: True
Device: NVIDIA GeForce GTX 1660


In [3]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset
import torchvision.io as io


# Each sample holds the decoded video frames and the clip's action label.
class UCF50Dataset(Dataset):
    """Map-style dataset that decodes one video per annotation row.

    The annotations CSV must provide the columns ``video_name``, ``action``
    and ``action_label``. Each video is read from
    ``<root_dir>/<action>/<video_name>``.

    Args:
        root_dir: Directory that contains one sub-directory per action.
        annotations_csv: Path to the CSV file listing the clips of this split.
        transform: Optional callable that receives the sample dict and returns
            the (possibly modified) sample.
    """

    def __init__(self, root_dir, annotations_csv, transform=None):
        self.root_dir = root_dir
        self.annotations_df = pd.read_csv(annotations_csv)
        self.transform = transform

    def __len__(self):
        """Return the number of annotated clips."""
        return len(self.annotations_df)

    def __getitem__(self, idx):
        """Decode the clip at position ``idx``.

        Returns:
            A dict with the keys ``"frames"`` (the decoded video, requested in
            ``TCHW`` layout) and ``"action_label"`` (the value of the
            ``action_label`` column), after the optional transform.
        """
        # Samplers may hand over tensor indices; convert them to plain Python.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        record = self.annotations_df.iloc[idx]
        video_name = record["video_name"]
        action = record["action"]
        action_label = record["action_label"]

        video_path = os.path.join(self.root_dir, action, video_name)
        # read_video returns a tuple; its first element holds the video frames.
        decoded = io.read_video(video_path, pts_unit="sec", output_format="TCHW")
        video_frames = decoded[0]

        sample = {"frames": video_frames, "action_label": action_label}
        return self.transform(sample) if self.transform else sample

In [4]:
class DevConfig:
    """Paths and constants shared by the cells of this notebook."""

    # Directory holding one sub-directory of videos per action. The location
    # is machine specific, so it can be overridden with the UCF50_ROOT_DIR
    # environment variable; the original local path remains the default.
    ROOT_DIR = os.environ.get(
        "UCF50_ROOT_DIR",
        r"C:\Users\Jani\.cache\kagglehub\datasets\vineethakkinapalli\ucf50-action-recognition-dataset\versions\1\UCF50",
    )
    # Directory with the annotation CSV files (also read for actions_label.csv).
    ANNOTATIONS_DIR_LOCAL = "annotations"
    FULL_ANNOTATIONS_FILE_LOCAL = "annotations/annotations.csv"
    TRAIN_ANNOTATIONS_FILE_LOCAL = "annotations/train_annotations.csv"
    VAL_ANNOTATIONS_FILE_LOCAL = "annotations/val_annotations.csv"
    TEST_ANNOTATIONS_FILE_LOCAL = "annotations/test_annotations.csv"
    # Output locations for saved model weights and TensorBoard logs.
    MODELS_DIR_LOCAL = "outputs/models"
    LOGS_DIR_LOCAL = "outputs/logs"
    # NOTE(review): not referenced in the visible cells; presumably the seed
    # used when the annotation splits were generated - confirm.
    RANDOM_STATE = 42

In [5]:
import torch.nn as nn

class FixedSizeClipSampler(nn.Module):
    """
    A custom temporal sampler that returns clips with exactly ``num_frames``
    frames:
      1) Short clips (T < num_frames) are padded by repeating the last frame.
      2) Long clips (T > num_frames) are uniformly subsampled.
      3) Clips that already have ``num_frames`` frames are returned unchanged.
    """

    def __init__(self, num_frames=32):
        super().__init__()
        self.num_frames = num_frames

    def forward(self, frames: torch.Tensor) -> torch.Tensor:
        """
        Args:
            frames (torch.Tensor): A 4D tensor of shape (T, C, H, W)
              - T: temporal dimension (# of frames)
              - C: # of channels (3 for RGB)
              - H, W: spatial dimensions

        Returns:
            A 4D tensor of shape (num_frames, C, H, W).

        Raises:
            ValueError: If the clip contains no frames. There is no last frame
                to repeat, so the fixed-length guarantee cannot be met.
        """
        t = frames.shape[0]

        if t == 0:
            # Without this guard the padding branch would silently return a
            # zero-length clip and fail later with an unrelated shape error.
            raise ValueError(
                "FixedSizeClipSampler received an empty clip (0 frames); "
                "cannot pad it to a fixed length."
            )

        if t < self.num_frames:
            # --- Pad short clips ---
            pad_needed = self.num_frames - t
            # repeat() allocates new memory, so no explicit clone is needed.
            pad_frames = frames[-1:].repeat(pad_needed, 1, 1, 1)
            frames = torch.cat([frames, pad_frames], dim=0)

        elif t > self.num_frames:
            # --- Uniform subsampling for long clips ---
            # num_frames indices evenly spaced over [0, t - 1], truncated to int.
            indices = torch.linspace(0, t - 1, self.num_frames).long()
            frames = frames[indices]

        # If t == self.num_frames, the clip is returned as is.
        return frames


In [6]:
class TransformKey:
    """Adapter that applies a transform to a single entry of a sample.

    Args:
        key: Key of the entry the transform is applied to.
        transform: Callable that maps the entry's value to its new value.
    """

    def __init__(self, key, transform):
        self.key, self.transform = key, transform

    def __call__(self, sample):
        """Replace ``sample[key]`` with its transformed value and return ``sample``.

        The sample is modified in place; all other entries are left untouched.
        """
        target_key = self.key
        transformed_value = self.transform(sample[target_key])
        sample[target_key] = transformed_value
        return sample


In [7]:
from torchvision.transforms import Compose, Lambda, CenterCrop, Normalize
import os

def get_x3d_model():
    """Load the pretrained ``x3d_s`` model and prepare it for head-only training.

    All pretrained parameters are frozen. The projection layer of the last
    block is replaced by a new ``nn.Linear`` whose output size equals the
    number of rows in ``actions_label.csv``; that new layer is the only
    trainable part of the returned model.

    Returns:
        The model with a frozen backbone and a trainable classification head.
    """
    model_name = "x3d_s"
    model = torch.hub.load("facebookresearch/pytorchvideo", model_name, pretrained=True)

    # Freeze every pretrained parameter.
    for param in model.parameters():
        param.requires_grad = False

    # One output per row of the actions label file.
    num_classes = len(
        pd.read_csv(os.path.join(DevConfig.ANNOTATIONS_DIR_LOCAL, "actions_label.csv"))
    )

    # Replace the head first and unfreeze it afterwards. Unfreezing the old
    # projection before swapping it out had no effect, because that layer is
    # discarded by the replacement.
    head_block = model.blocks[-1]
    head_block.proj = nn.Linear(
        in_features=head_block.proj.in_features, out_features=num_classes
    )
    for param in head_block.proj.parameters():
        param.requires_grad = True

    return model

def scale_pixels(x):
    """Divide ``x`` by 255.0 (maps values in [0, 255] to [0, 1])."""
    max_pixel_value = 255.0
    return x / max_pixel_value

def permute_tensor(x):
    """Swap the first two dimensions of a 4D tensor, e.g. (T, C, H, W) -> (C, T, H, W)."""
    dim_order = (1, 0, 2, 3)
    return x.permute(*dim_order)

def get_x3d_transform_compose():
    """Build the preprocessing applied to the ``"frames"`` entry of a sample.

    The steps run in this order:
      1. fix the clip length to 32 frames,
      2. divide pixel values by 255,
      3. normalize with the per-channel mean and std,
      4. center-crop to 256 x 256,
      5. swap the first two dimensions ((T, C, H, W) -> (C, T, H, W)).

    Returns:
        A ``TransformKey`` that applies the composed steps to ``sample["frames"]``.
    """
    channel_mean = [0.45, 0.45, 0.45]
    channel_std = [0.225, 0.225, 0.225]
    crop_size = 256

    # Module-level functions are wrapped in Lambda rather than inline lambdas.
    steps = [
        FixedSizeClipSampler(num_frames=32),
        Lambda(scale_pixels),
        Normalize(channel_mean, channel_std),
        CenterCrop(crop_size),
        Lambda(permute_tensor),
    ]

    return TransformKey("frames", Compose(steps))

In [8]:
import os
from datetime import datetime


def create_model_and_log_dir(model_dir, model_log_dir, experiment_name):
    """Create the directories for a new run of an experiment.

    Creates ``<model_log_dir>/<experiment_name>``,
    ``<model_dir>/<experiment_name>`` and a timestamped run directory below
    the experiment's log directory. Existing directories are reused.

    Args:
        model_dir: Base directory for saved model files.
        model_log_dir: Base directory for log files.
        experiment_name: Name of the experiment sub-directory.

    Returns:
        A tuple ``(run_model_name, run_log_dir)``. ``run_model_name`` is the
        model file path relative to ``model_dir``
        (``<experiment_name>/run_<timestamp>.pth``); ``run_log_dir`` is the
        log directory created for this run.
    """
    # Per-experiment directories for logs and for model files.
    log_root = os.path.join(model_log_dir, experiment_name)
    os.makedirs(log_root, exist_ok=True)
    os.makedirs(os.path.join(model_dir, experiment_name), exist_ok=True)

    # The run name is made unique with the current timestamp (1 s resolution).
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    run_name = f"run_{timestamp}"

    run_log_dir = os.path.join(log_root, run_name)
    os.makedirs(run_log_dir, exist_ok=True)

    return os.path.join(experiment_name, run_name + ".pth"), run_log_dir

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
# import multiprocessing
from tqdm import tqdm

def main():
    """Train the classification head of the X3D model on UCF50 and save it.

    Runs 5 epochs of SGD on the training split, evaluates on the validation
    split after every epoch, logs losses and accuracies to TensorBoard and
    finally writes the model's ``state_dict`` below
    ``DevConfig.MODELS_DIR_LOCAL``.
    """
    # Load the pretrained X3D model with a new classification head.
    model = get_x3d_model()

    model_name, log_dir = create_model_and_log_dir(
        DevConfig.MODELS_DIR_LOCAL, DevConfig.LOGS_DIR_LOCAL, "X3D_only_freeze"
    )

    data_transform = get_x3d_transform_compose()
    train_dataset = UCF50Dataset(
        DevConfig.ROOT_DIR,
        DevConfig.TRAIN_ANNOTATIONS_FILE_LOCAL,
        transform=data_transform,
    )
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

    val_dataset = UCF50Dataset(
        DevConfig.ROOT_DIR,
        DevConfig.VAL_ANNOTATIONS_FILE_LOCAL,
        transform=data_transform,
    )
    # Evaluation does not need shuffling; a fixed order keeps the per-batch
    # loss average reproducible between runs.
    val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

    criterion = nn.CrossEntropyLoss()
    # Only parameters that require gradients are handed to the optimizer.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.SGD(trainable_params, lr=0.001, momentum=0.9)

    num_epochs = 5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # TensorBoard writer; closed in the finally block so buffered events are
    # flushed even when training is interrupted.
    writer = SummaryWriter(log_dir)
    try:
        for epoch in range(num_epochs):
            running_loss = 0.0
            correct_predictions = 0
            batch_count = 0
            # NOTE(review): train() switches the whole model to training mode,
            # including the frozen layers. If those contain normalization or
            # dropout layers they still behave in training mode - confirm
            # that this is intended.
            model.train()
            for batch in tqdm(train_dataloader):
                inputs = batch["frames"].to(device)
                labels = batch["action_label"].to(device)
                batch_count += 1

                # Forward pass
                outputs = model(inputs)

                predictions = outputs.argmax(dim=1)
                correct_predictions += (predictions == labels).sum().item()

                loss = criterion(outputs, labels)

                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                global_step = epoch * len(train_dataloader) + batch_count

                writer.add_scalar("Loss/Train_Batch", loss.item(), global_step)

            avg_train_loss = running_loss / len(train_dataloader)
            accuracy = (correct_predictions / len(train_dataset)) * 100
            writer.add_scalar("Loss/Train_Epoch", avg_train_loss, epoch)
            # Logged alongside the validation accuracy so both curves are
            # available in TensorBoard.
            writer.add_scalar("Accuracy/Train_Epoch", accuracy, epoch)

            print(f"Epoch [{epoch + 1}/{num_epochs}], Training Accuracy: {accuracy}, Training Loss: {avg_train_loss:.4f}")

            model.eval()
            valid_loss = 0.0
            correct_predictions = 0
            with torch.no_grad():
                for batch in tqdm(val_dataloader):
                    inputs = batch["frames"].to(device)
                    labels = batch["action_label"].to(device)
                    outputs = model(inputs)

                    predictions = outputs.argmax(dim=1)
                    correct_predictions += (predictions == labels).sum().item()

                    loss = criterion(outputs, labels)
                    valid_loss += loss.item()

            avg_val_loss = valid_loss / len(val_dataloader)
            accuracy = (correct_predictions / len(val_dataset)) * 100
            writer.add_scalar("Loss/Valid_Epoch", avg_val_loss, epoch)
            writer.add_scalar("Accuracy/Valid_Epoch", accuracy, epoch)

            print(f"Validation Accuracy: {accuracy:.4f}")
            print(f"Validation Loss: {avg_val_loss:.4f}")
    finally:
        writer.close()

    # After finishing training. model_name is relative to the models
    # directory; the experiment sub-directory was created above.
    torch.save(
        model.state_dict(),
        os.path.join(DevConfig.MODELS_DIR_LOCAL, model_name),
    )
    print(f"Model saved to {DevConfig.MODELS_DIR_LOCAL}")


# Start training when this cell is executed; the guard keeps main() from
# running if the code is imported as a module instead.
if __name__ == "__main__":
    main()


Using cache found in C:\Users\Jani/.cache\torch\hub\facebookresearch_pytorchvideo_main
100%|██████████| 2338/2338 [15:46<00:00,  2.47it/s]


Epoch [1/5], Training Accuracy: 77.07442258340463, Training Loss: 1.3141


100%|██████████| 501/501 [03:31<00:00,  2.36it/s]


Validation Accuracy: 96.4072
Validation Loss: 0.2146


100%|██████████| 2338/2338 [15:44<00:00,  2.48it/s]


Epoch [2/5], Training Accuracy: 92.42942686056459, Training Loss: 0.4806


100%|██████████| 501/501 [03:21<00:00,  2.49it/s]


Validation Accuracy: 97.3054
Validation Loss: 0.1359


100%|██████████| 2338/2338 [14:28<00:00,  2.69it/s]


Epoch [3/5], Training Accuracy: 94.05474764756202, Training Loss: 0.3458


100%|██████████| 501/501 [03:14<00:00,  2.58it/s]


Validation Accuracy: 96.6068
Validation Loss: 0.1254


100%|██████████| 2338/2338 [16:07<00:00,  2.42it/s]


Epoch [4/5], Training Accuracy: 94.3327630453379, Training Loss: 0.2951


100%|██████████| 501/501 [03:12<00:00,  2.61it/s]


Validation Accuracy: 97.6048
Validation Loss: 0.1012


100%|██████████| 2338/2338 [15:54<00:00,  2.45it/s]


Epoch [5/5], Training Accuracy: 95.48759623609922, Training Loss: 0.2468


100%|██████████| 501/501 [03:15<00:00,  2.57it/s]

Validation Accuracy: 98.3034
Validation Loss: 0.0881
Model saved to outputs/models





In [None]:
import torch
import os
import time

def main():
    """Evaluate the saved X3D weights on the test split.

    Prints the test accuracy and the average model inference time per sample.

    Raises:
        ValueError: If the test dataset contains no samples.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    use_cuda = device == "cuda"

    # Load test dataset
    test_dataset = UCF50Dataset(
        DevConfig.ROOT_DIR,
        DevConfig.TEST_ANNOTATIONS_FILE_LOCAL,
        transform=get_x3d_transform_compose(),
    )
    total_samples = len(test_dataset)
    if total_samples == 0:
        # Both averages below divide by the number of samples.
        raise ValueError("Test dataset is empty; nothing to evaluate.")

    # Evaluation does not depend on sample order, so no shuffling is needed.
    test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

    model = get_x3d_model()
    checkpoint_path = os.path.join(
        DevConfig.MODELS_DIR_LOCAL, "X3D_only_freeze/run_20250422-215343.pth"
    )
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.to(device)

    model.eval()

    # Evaluate on the test dataset
    correct_predictions = 0
    inference_times = []

    with torch.no_grad():
        for batch in test_dataloader:
            inputs = batch["frames"].to(device)
            labels = batch["action_label"].to(device)

            # CUDA kernels are launched asynchronously. Synchronizing before
            # reading the clock makes the interval cover the actual
            # computation instead of only the kernel launches.
            if use_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            outputs = model(inputs)
            predictions = outputs.argmax(dim=1)

            if use_cuda:
                torch.cuda.synchronize()
            end_time = time.perf_counter()
            inference_times.append(end_time - start_time)

            correct_predictions += (predictions == labels).sum().item()

    accuracy = (correct_predictions / total_samples) * 100
    # Batch times are summed and divided by the number of samples.
    avg_inference_time = sum(inference_times) / total_samples

    print("Test Accuracy: {:.2f}%".format(accuracy))
    print(f"Average Inference Time per sample: {avg_inference_time:.4f} seconds")

# Run the evaluation when this cell is executed; the guard keeps main() from
# running if the code is imported as a module instead.
if __name__ == "__main__":
    main()

Using cache found in C:\Users\Jani/.cache\torch\hub\facebookresearch_pytorchvideo_main
