# mount drive and inspect dataset structure

In [1]:

from google.colab import drive
drive.mount('/content/drive')

import os, sys, glob
DATA_ROOT = '/content/drive/MyDrive/Datasets/UCF-101'
video_exts = ('.mp4', '.avi', '.mov', '.mkv')

def find_videos(root):
    vids=[]
    for dp, dn, files in os.walk(root):
        for f in files:
            if f.lower().endswith(video_exts):
                vids.append(os.path.join(dp,f))
    return vids

videos = find_videos(DATA_ROOT)
print("Found videos:", len(videos))
print("Example paths (first 10):")
for v in videos[:10]:
    print(v)

# try detect class folders (common layout: DATA_ROOT/<class_name>/*.avi)
classes = sorted([d for d in os.listdir(DATA_ROOT) if os.path.isdir(os.path.join(DATA_ROOT,d))])
print("Detected class folders count:", len(classes))
print("First 20 class folder names:", classes[:20])


Mounted at /content/drive
Found videos: 11158
Example paths (first 10):
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c01.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c02.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c03.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c04.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c05.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g01_c06.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g02_c01.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g02_c02.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g02_c03.avi
/content/drive/MyDrive/Datasets/UCF-101/Fencing/v_Fencing_g02_c04.avi
Detected class folders count: 88
First 20 class folder names: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking',

 create a small subset metadata

In [33]:

import random
import pandas as pd
import os

random.seed(42)

N_CLASSES = 25
K_PER_CLASS = 200

# read videos list from step 1
video_exts = ('.mp4', '.avi', '.mov', '.mkv')
def find_videos(root):
    vids=[]
    for dp, dn, files in os.walk(root):
        for f in files:
            if f.lower().endswith(video_exts):
                vids.append(os.path.join(dp,f))
    return vids

DATA_ROOT = '/content/drive/MyDrive/Datasets/UCF-101'
videos = find_videos(DATA_ROOT)

# detect classes
classes = sorted([d for d in os.listdir(DATA_ROOT) if os.path.isdir(os.path.join(DATA_ROOT,d))])

if len(classes) >= N_CLASSES:
    sel_classes = classes[:N_CLASSES]
else:
    parent_names = {}
    for v in videos:
        parent = os.path.basename(os.path.dirname(v))
        parent_names.setdefault(parent, []).append(v)
    sel_classes = list(parent_names.keys())[:N_CLASSES]

selected_videos = []
for c in sel_classes:
    cand = []
    folder = os.path.join(DATA_ROOT, c)
    if os.path.isdir(folder):
        for f in os.listdir(folder):
            if f.lower().endswith(video_exts):
                cand.append(os.path.join(folder,f))
    else:
        for v in videos:
            if os.path.basename(os.path.dirname(v)) == c:
                cand.append(v)
    if len(cand) > K_PER_CLASS:
        cand = random.sample(cand, K_PER_CLASS)
    selected_videos += cand

print("Selected videos:", len(selected_videos))
print("Selected classes:", sel_classes)

OUT_META_DIR = '/content/drive/MyDrive/ucf_small'
os.makedirs(OUT_META_DIR, exist_ok=True)
df = pd.DataFrame([{'video_path': v, 'label': os.path.basename(os.path.dirname(v))} for v in selected_videos])
meta_csv = os.path.join(OUT_META_DIR, 'metadata.csv')
df.to_csv(meta_csv, index=False)
print("Wrote metadata to:", meta_csv)
print(df.head())


Selected videos: 3360
Selected classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen']
Wrote metadata to: /content/drive/MyDrive/ucf_small/metadata.csv
                                          video_path           label
0  /content/drive/MyDrive/Datasets/UCF-101/ApplyE...  ApplyEyeMakeup
1  /content/drive/MyDrive/Datasets/UCF-101/ApplyE...  ApplyEyeMakeup
2  /content/drive/MyDrive/Datasets/UCF-101/ApplyE...  ApplyEyeMakeup
3  /content/drive/MyDrive/Datasets/UCF-101/ApplyE...  ApplyEyeMakeup
4  /content/drive/MyDrive/Datasets/UCF-101/ApplyE...  ApplyEyeMakeup


frame sampling helper

In [34]:

import cv2
import numpy as np

def sample_frames(video_path, num_frames=16):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        return None
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        cap.release()
        return None
    indices = np.linspace(0, total-1, num_frames, dtype=int)
    frames = []
    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ret, frame = cap.read()
        if not ret:
            if len(frames) > 0:
                frames.append(frames[-1].copy())
            else:
                frames.append(np.zeros((224,224,3), dtype=np.uint8))
            continue
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)
    cap.release()
    return frames


 extract frame features with ResNet50 and save .npy files

In [35]:

import torch
from torchvision import models, transforms
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# preprocessing pipeline
preprocess = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# load ResNet50 and remove final FC
resnet = models.resnet50(pretrained=True)
modules = list(resnet.children())[:-1]
resnet_feat = torch.nn.Sequential(*modules).to(device)
resnet_feat.eval()

# parameters
NUM_FRAMES = 16
FEATURE_DIR = '/content/drive/MyDrive/ucf_small/features'
os.makedirs(FEATURE_DIR, exist_ok=True)

meta_csv = os.path.join('/content/drive/MyDrive/ucf_small', 'metadata.csv')
meta = pd.read_csv(meta_csv)

with torch.no_grad():
    for i, row in tqdm(meta.iterrows(), total=len(meta)):
        vpath = row['video_path']
        fname = os.path.splitext(os.path.basename(vpath))[0]
        out_file = os.path.join(FEATURE_DIR, fname + '.npy')
        if os.path.exists(out_file):
            continue
        frames = sample_frames(vpath, num_frames=NUM_FRAMES)
        if frames is None:
            print("Skipping (cannot open):", vpath)
            continue
        batch = torch.stack([preprocess(f) for f in frames]).to(device)
        feats = resnet_feat(batch)  # shape: (T, 2048, 1, 1)
        feats = feats.reshape(feats.size(0), -1).cpu().numpy()  # (T, 2048)
        np.save(out_file, feats)


Using device: cuda


100%|██████████| 3360/3360 [46:37<00:00,  1.20it/s]


create PyTorch Dataset that reads .npy features

In [36]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os

FEATURE_DIR = '/content/drive/MyDrive/ucf_small/features'
meta = pd.read_csv('/content/drive/MyDrive/ucf_small/metadata.csv')

# filter meta to keep only videos with a saved feature file
meta['feat_exists'] = meta['video_path'].apply(lambda p: os.path.exists(os.path.join(FEATURE_DIR, os.path.splitext(os.path.basename(p))[0] + '.npy')))
meta = meta[meta['feat_exists']].reset_index(drop=True)
print("Meta rows with features:", len(meta))

# label mapping
labels = sorted(meta['label'].unique())
label2idx = {l:i for i,l in enumerate(labels)}
idx2label = {i:l for l,i in label2idx.items()}
print("Classes:", len(labels), labels)

# === START OF CHANGES ===

# Split into Train (80%) and a temporary set (20% for Val+Test)
train_df, temp_df = train_test_split(meta, test_size=0.2, stratify=meta['label'], random_state=42)

# Split the temporary set (20%) into Validation (10%) and Test (10%)
# We use test_size=0.5 because 0.5 * 20% = 10%
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Reset indices for all dataframes
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"Train/Val/Test sizes: {len(train_df)} / {len(val_df)} / {len(test_df)}")

# save splits
train_df.to_csv('/content/drive/MyDrive/ucf_small/train.csv', index=False)
val_df.to_csv('/content/drive/MyDrive/ucf_small/val.csv', index=False)
test_df.to_csv('/content/drive/MyDrive/ucf_small/test.csv', index=False)

class FeatureDataset(Dataset):
    def __init__(self, df, feature_dir, label2idx):
        self.df = df
        self.feature_dir = feature_dir
        self.label2idx = label2idx
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        fpath = os.path.join(self.feature_dir, os.path.splitext(os.path.basename(row['video_path']))[0] + '.npy')
        feats = np.load(fpath)  # shape (T, feat_dim)
        return torch.tensor(feats, dtype=torch.float32), int(self.label2idx[row['label']])

# dataloaders
BATCH_SIZE = 16
train_ds = FeatureDataset(train_df, FEATURE_DIR, label2idx)
val_ds = FeatureDataset(val_df, FEATURE_DIR, label2idx)
test_ds = FeatureDataset(test_df, FEATURE_DIR, label2idx) # New test dataset

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2) # New test dataloader

# === END OF CHANGES ===

Meta rows with features: 3360
Classes: 25 ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen']
Train/Val/Test sizes: 2688 / 336 / 336


LSTM model and training loop

In [40]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import math

class LSTMClassifier(nn.Module):
    # Corrected the default values in the class definition
    def __init__(self, feat_dim=2048, hidden_dim=512, num_layers=2, num_classes=25, bidirectional=False, dropout=0.7):
        super().__init__()
        self.lstm = nn.LSTM(feat_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=dropout if num_layers>1 else 0.0)
        self.bidirectional = bidirectional
        self.hidden_dim = hidden_dim
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * (2 if bidirectional else 1), 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )
    def forward(self, x):
        # x: (B, T, feat_dim)
        out, (h_n, c_n) = self.lstm(x)
        if self.bidirectional:
            h = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            h = h_n[-1]
        logits = self.classifier(h)
        return logits

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

num_classes = len(labels)

# === THIS IS THE CORRECTED LINE ===
model = LSTMClassifier(num_classes=num_classes, num_layers=2, dropout=0.7).to(device)
# ==================================

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

# training loop with early stopping
EPOCHS = 20
best_val_acc = 0.0
patience = 5
no_improve = 0
save_path = '/content/drive/MyDrive/ucf_small/best_lstm.pth'

for epoch in range(1, EPOCHS+1):
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    for feats, labels_batch in tqdm(train_loader, desc=f"Train E{epoch}"):
        feats = feats.to(device)
        labels_batch = labels_batch.to(device)
        optimizer.zero_grad()
        logits = model(feats)
        loss = criterion(logits, labels_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * feats.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels_batch).sum().item()
        total += feats.size(0)
    train_loss /= total
    train_acc = correct / total

    # validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for feats, labels_batch in val_loader:
            feats = feats.to(device)
            labels_batch = labels_batch.to(device)
            logits = model(feats)
            loss = criterion(logits, labels_batch)
            val_loss += loss.item() * feats.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == labels_batch).sum().item()
            total += feats.size(0)
    val_loss /= total
    val_acc = correct / total
    print(f"Epoch {epoch}: train_loss={train_loss:.4f} train_acc={train_acc:.4f} val_loss={val_loss:.4f} val_acc={val_acc:.4f}")
    scheduler.step(val_acc)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save({'model_state_dict': model.state_dict(), 'label2idx': label2idx}, save_path)
        print("Saved best model.")
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping.")
            break

Train E1: 100%|██████████| 168/168 [00:08<00:00, 19.11it/s]


Epoch 1: train_loss=2.7753 train_acc=0.2522 val_loss=1.5802 val_acc=0.5804
Saved best model.


Train E2: 100%|██████████| 168/168 [00:08<00:00, 20.95it/s]


Epoch 2: train_loss=1.2852 train_acc=0.6287 val_loss=0.6145 val_acc=0.8542
Saved best model.


Train E3: 100%|██████████| 168/168 [00:09<00:00, 17.84it/s]


Epoch 3: train_loss=0.6479 train_acc=0.8237 val_loss=0.3269 val_acc=0.9226
Saved best model.


Train E4: 100%|██████████| 168/168 [00:09<00:00, 18.14it/s]


Epoch 4: train_loss=0.3993 train_acc=0.8943 val_loss=0.1888 val_acc=0.9405
Saved best model.


Train E5: 100%|██████████| 168/168 [00:10<00:00, 16.78it/s]


Epoch 5: train_loss=0.2600 train_acc=0.9360 val_loss=0.2326 val_acc=0.9167


Train E6: 100%|██████████| 168/168 [00:08<00:00, 19.49it/s]


Epoch 6: train_loss=0.1904 train_acc=0.9572 val_loss=0.1848 val_acc=0.9405


Train E7: 100%|██████████| 168/168 [00:08<00:00, 19.67it/s]


Epoch 7: train_loss=0.1642 train_acc=0.9628 val_loss=0.1272 val_acc=0.9583
Saved best model.


Train E8: 100%|██████████| 168/168 [00:09<00:00, 17.44it/s]


Epoch 8: train_loss=0.1198 train_acc=0.9743 val_loss=0.1409 val_acc=0.9524


Train E9: 100%|██████████| 168/168 [00:09<00:00, 18.35it/s]


Epoch 9: train_loss=0.1074 train_acc=0.9777 val_loss=0.0813 val_acc=0.9673
Saved best model.


Train E10: 100%|██████████| 168/168 [00:08<00:00, 20.07it/s]


Epoch 10: train_loss=0.0780 train_acc=0.9847 val_loss=0.1570 val_acc=0.9583


Train E11: 100%|██████████| 168/168 [00:09<00:00, 18.60it/s]


Epoch 11: train_loss=0.0844 train_acc=0.9803 val_loss=0.0538 val_acc=0.9851
Saved best model.


Train E12: 100%|██████████| 168/168 [00:08<00:00, 18.69it/s]


Epoch 12: train_loss=0.1194 train_acc=0.9699 val_loss=0.0936 val_acc=0.9613


Train E13: 100%|██████████| 168/168 [00:09<00:00, 17.51it/s]


Epoch 13: train_loss=0.0549 train_acc=0.9881 val_loss=0.0451 val_acc=0.9881
Saved best model.


Train E14: 100%|██████████| 168/168 [00:09<00:00, 16.95it/s]


Epoch 14: train_loss=0.0674 train_acc=0.9825 val_loss=0.1117 val_acc=0.9702


Train E15: 100%|██████████| 168/168 [00:08<00:00, 20.73it/s]


Epoch 15: train_loss=0.0982 train_acc=0.9754 val_loss=0.0436 val_acc=0.9821


Train E16: 100%|██████████| 168/168 [00:08<00:00, 18.75it/s]


Epoch 16: train_loss=0.0557 train_acc=0.9866 val_loss=0.0677 val_acc=0.9851


Train E17: 100%|██████████| 168/168 [00:09<00:00, 18.42it/s]


Epoch 17: train_loss=0.0429 train_acc=0.9900 val_loss=0.0669 val_acc=0.9643


Train E18: 100%|██████████| 168/168 [00:08<00:00, 19.16it/s]


Epoch 18: train_loss=0.0231 train_acc=0.9974 val_loss=0.0274 val_acc=0.9881
Early stopping.


**Model Accurcy**

In [41]:

model = LSTMClassifier(
    num_classes=len(labels),
    num_layers=2,
    dropout=0.7
).to(device)

save_path = '/content/drive/MyDrive/ucf_small/best_lstm.pth'
checkpoint = torch.load(save_path)
model.load_state_dict(checkpoint['model_state_dict'])

# 2. Evaluate on the test set
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for feats, labels_batch in tqdm(test_loader, desc="Testing"):
        feats = feats.to(device)
        labels_batch = labels_batch.to(device)

        logits = model(feats)
        preds = logits.argmax(dim=1)

        test_correct += (preds == labels_batch).sum().item()
        test_total += feats.size(0)

test_acc = test_correct / test_total
print(f"Final Test Accuracy: {test_acc:.4f}")

Testing: 100%|██████████| 21/21 [00:01<00:00, 19.45it/s]

Final Test Accuracy: 0.9762





Model Testing

In [43]:


import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
from tqdm import tqdm
from decord import VideoReader, cpu
import os

# =================================================================
#  1. Setup
# =================================================================

# We assume your new video has been uploaded to the session
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
save_path = '/content/drive/MyDrive/ucf_small/best_lstm.pth'
video_path = '/content/12433257-uhd_2160_3840_30fps.mp4' # <<< THIS LINE IS UPDATED
NUM_FRAMES = 16

# Check if the video file exists before proceeding
if not os.path.exists(video_path):
    print(f"[ERROR] Video file not found at: {video_path}")
else:
    # =================================================================
    #  2. Model Loading
    # =================================================================

    resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    resnet = torch.nn.Sequential(*(list(resnet.children())[:-1]))
    resnet.eval().to(device)

    checkpoint = torch.load(save_path)
    label2idx = checkpoint['label2idx']
    idx2label = {i: l for l, i in label2idx.items()}

    model = LSTMClassifier(num_classes=len(label2idx)).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    print("Models loaded successfully!")

    # =================================================================
    #  3. Helper Functions
    # =================================================================

    def extract_frames(video_path, num_frames):
        try:
            vr = VideoReader(video_path, ctx=cpu(0))
            total_frames = len(vr)
            frame_indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
            frames = vr.get_batch(frame_indices).asnumpy()
            return [Image.fromarray(frame) for frame in frames]
        except Exception as e:
            print(f"Error reading video {video_path}: {e}")
            return []

    def extract_features(frames, feature_extractor):
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        features = []
        if not frames:
            return np.array([])
        with torch.no_grad():
            for frame in frames:
                img_tensor = preprocess(frame).unsqueeze(0).to(device)
                feat = feature_extractor(img_tensor)
                feat = feat.squeeze(-1).squeeze(-1).cpu().numpy()
                features.append(feat)
        return np.vstack(features)

    # =================================================================
    #  4. Inference Execution
    # =================================================================

    print(f"\n[INFO] Processing video: {video_path}")
    frames = extract_frames(video_path, NUM_FRAMES)
    print(f"[INFO] Extracted {len(frames)} frames.")

    if frames:
        features = extract_features(frames, resnet)
        features_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0).to(device)
        print(f"[INFO] Extracted features with shape: {features_tensor.shape}")

        with torch.no_grad():
            logits = model(features_tensor)
            probabilities = torch.softmax(logits, dim=1)[0]
            pred_idx = torch.argmax(probabilities).item()
            pred_label = idx2label[pred_idx]
            pred_confidence = probabilities[pred_idx].item()

        print("\n" + "="*30)
        print(f"Prediction: {pred_label.upper()}")
        print(f"Confidence: {pred_confidence * 100:.2f}%")
        print("="*30)
    else:
        print("\n[ERROR] Failed to extract frames. Cannot continue.")

Models loaded successfully!

[INFO] Processing video: /content/12433257-uhd_2160_3840_30fps.mp4
[INFO] Extracted 16 frames.
[INFO] Extracted features with shape: torch.Size([1, 16, 2048])

Prediction: BLOWINGCANDLES
Confidence: 72.66%
