# Video Recognition

Project on video recognition whith the dataset HMDB51 (https://serre.lab.brown.edu/hmdb51.html). A special focus is given to the efficiency of the training.

In [102]:
# !pip install opencv-python

In [103]:
import os
import glob
import cv2
import numpy as np
import torch.nn as nn
import torch

dataset_directory = "./dataset"

In [None]:
FRAME_SIZE = 224 
FRAME_RATE_SCALER = 1 # 
REAL_BATCH_SIZE = 1       # Must be 1 for variable lengths
ACCUMULATION_STEPS = 16    # Update weights every 16 videos (Simulated Batch Size = 16)
LEARNING_RATE = 1e-4
EPOCHS = 5
LSTM_HIDDEN_SIZE = 256
LSTM_LAYERS = 1

EMBEDDING_DIM = 512 # output dim of the CNN

my_config = [
    {'out_channels': 16, 'kernel_size': 3, 'stride': 1, 'padding': 1},
    {'out_channels': 32, 'kernel_size': 3, 'stride': 1, 'padding': 1},
    {'out_channels': 64, 'kernel_size': 3, 'stride': 1, 'padding': 1},
    {'out_channels': 128, 'kernel_size': 3, 'stride': 1, 'padding': 1}
]

In [None]:
class VideoLoader:
    def __init__(self, directory):
        self.directory = directory
        self.db = []
        self.classes = [] 
        self.class_to_idx = {} # New: Dictionary to map "catch" -> 0
        
        self.load_dataset()
        
    def load_dataset(self):
        if not os.path.exists(self.directory):
            print(f"Error: Directory '{self.directory}' not found.")
            return

        # 1. Find classes and sort them (sorting ensures 'catch' is always 0, even on different computers)
        self.classes = sorted([d for d in os.listdir(self.directory) if os.path.isdir(os.path.join(self.directory, d))])
        
        # 2. Create the mapping: {'catch': 0, 'dribble': 1, ...}
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}
        
        print(f"Classes found: {self.classes}")
        print(f"Mapping: {self.class_to_idx}")

        # 3. Collect videos
        for label in self.classes:
            folder_path = os.path.join(self.directory, label)
            video_files = glob.glob(os.path.join(folder_path, "*.avi"))
            
            for video_file in video_files:
                self.db.append((video_file, label))
                
        print(f"Database size: {len(self.db)}")

    def load_video(self, video_path, resize=(FRAME_SIZE, FRAME_SIZE), n=FRAME_RATE_SCALER):
        cap = cv2.VideoCapture(video_path)
        frames = []
        frame_count = 0
        
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                
                # Only process if this is the n-th frame
                if frame_count % n == 0:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    if resize:
                        frame = cv2.resize(frame, resize)
                    frames.append(frame)
                
                frame_count += 1
                
        finally:
            cap.release()
            
        return np.array(frames)

    def __len__(self):
        return len(self.db)

    def __getitem__(self, idx):
        video_path, label_str = self.db[idx]
        
        # Load video
        frames = self.load_video(video_path) 
        
        # Handle empty/broken videos
        if len(frames) == 0:
            # Return a dummy tensor if video is broken to prevent crash
            frames = torch.zeros((16, 3, FRAME_SIZE, FRAME_SIZE), dtype=torch.float32)
        else:
            frames = torch.tensor(frames, dtype=torch.float32)
            frames = frames.permute(0, 3, 1, 2) # (T, H, W, C) -> (T, C, H, W)
            frames = frames / 255.0
        
        # --- CRITICAL FIX HERE ---
        # Convert string label ('catch') to integer (0)
        label_idx = self.class_to_idx[label_str]
        
        return frames, label_idx

In [106]:
dataset = VideoLoader(dataset_directory)


Classes found: ['catch', 'dribble', 'fall_floor', 'hit', 'jump', 'kick_ball', 'push', 'run', 'shoot_ball', 'walk']
Mapping: {'catch': 0, 'dribble': 1, 'fall_floor': 2, 'hit': 3, 'jump': 4, 'kick_ball': 5, 'push': 6, 'run': 7, 'shoot_ball': 8, 'walk': 9}
Database size: 1816


Convolutional Neural Network

In [107]:
AVG_POOL = 0
MAX_POOL = 1
# Updated CNN Class
class CNN(nn.Module):
    def __init__(self, layer_config, poolType=MAX_POOL, input_dims=(3, FRAME_SIZE, FRAME_SIZE), embedding_dim=EMBEDDING_DIM):
        super(CNN, self).__init__()
        
        self.layers = nn.ModuleList()
        current_channels = input_dims[0]
        current_h, current_w = input_dims[1], input_dims[2]

        for i, config in enumerate(layer_config):
            out_ch = config['out_channels']
            k = config['kernel_size']
            s = config['stride']
            p = config['padding']
            
            if(poolType == MAX_POOL):
                layer = nn.Sequential(
                nn.Conv2d(in_channels=current_channels, out_channels=out_ch, kernel_size=k, stride=s, padding=p),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2))
            else:
                layer = nn.Sequential(
                nn.Conv2d(in_channels=current_channels, out_channels=out_ch, kernel_size=k, stride=s, padding=p),
                nn.ReLU(),
                nn.AvgPool2d(kernel_size=2, stride=2))
            
            self.layers.append(layer)
            
            # Compute output size
            current_h = int((current_h + 2*p - k) / s) + 1
            current_w = int((current_w + 2*p - k) / s) + 1     
            current_h = int((current_h - 2) / 2) + 1
            current_w = int((current_w - 2) / 2) + 1
            current_channels = out_ch

        self.flatten_size = current_channels * current_h * current_w
        self.fc = nn.Linear(self.flatten_size, embedding_dim)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        
        # --- FIX IS HERE ---
        # Use .reshape() instead of .view()
        x = x.reshape(x.size(0), -1)
        
        x = self.fc(x)
        return x

In [108]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

class CNNLSTM(nn.Module):
    def __init__(self, cnn_model, num_classes, lstm_hidden_size=256, lstm_layers=2):
        super(CNNLSTM, self).__init__()
        
        self.cnn = cnn_model
        # We extract the output size from your FlexibleCNN's last layer
        self.cnn_output_size = cnn_model.fc.out_features 
        
        # The LSTM "input_size" is the number of features from the CNN (e.g., 512)
        # The LSTM "hidden_size" is the memory capacity (e.g., 256)
        # The LSTM automatically handles the "Sequence Length" (Video Length) in forward()
        self.lstm = nn.LSTM(
            input_size=self.cnn_output_size, 
            hidden_size=lstm_hidden_size, 
            num_layers=lstm_layers, 
            batch_first=True
        )
        
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def forward(self, x):
        # x shape: (Batch=1, Time_Steps, Channels, Height, Width)
        # Example: (1, 62, 3, 224, 224)
        batch_size, time_steps, C, H, W = x.size()
        
        cnn_features = []
        
        for t in range(time_steps):
            frame = x[:, t, :, :, :] 
            frame_feature = self.cnn(frame)
            cnn_features.append(frame_feature)
            
        lstm_input = torch.stack(cnn_features, dim=1)
        
        lstm_out, (h_n, c_n) = self.lstm(lstm_input)
        
        last_output = lstm_out[:, -1, :] 
        
        prediction = self.fc(last_output)
        return prediction

In [None]:
# --- SETUP ---
# 1. Dataset & Split (Reuse your existing dataset object)
total_size = len(dataset)
train_size = int(0.8 * total_size)
test_size = total_size - train_size
train_set, test_set = random_split(dataset, [train_size, test_size])

# 2. DataLoaders
# We use batch_size=1 so we don't need a custom collate_fn to pad sequences!
train_loader = DataLoader(train_set, batch_size=REAL_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=REAL_BATCH_SIZE, shuffle=False)

# 3. Model Init
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")

# Assuming 'cnn' is your already defined CNN class instance
# Note: Re-initialize CNN to be sure weights are fresh if you retrain

cnn = CNN(layer_config=my_config, poolType=MAX_POOL) 
model = CNNLSTM(cnn_model=cnn, num_classes=len(dataset.classes)).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- TRAINING LOOP ---
print("Starting Training...")

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    # Reset gradients at the start of epoch
    optimizer.zero_grad()
    
    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)
        
        # 1. Forward Pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # 2. Normalize Loss 
        # (We divide by ACCUMULATION_STEPS because the gradients will sum up)
        loss = loss / ACCUMULATION_STEPS
        
        # 3. Backward Pass (Accumulate Gradients)
        loss.backward()
        
        # 4. Step Optimizer (Only every X steps)
        if (i + 1) % ACCUMULATION_STEPS == 0:
            optimizer.step()
            optimizer.zero_grad()
            
        # Stats (Multiply loss back by steps to see real value)
        running_loss += loss.item() * ACCUMULATION_STEPS
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        if (i+1) % 50 == 0:
            print(f"Step [{i+1}/{len(train_loader)}] Loss: {loss.item() * ACCUMULATION_STEPS:.4f}")

    print(f"Epoch {epoch+1} Acc: {100 * correct / total:.2f}%")

print("Training Complete.")

Running on cpu
Starting Training...
Step [50/1452] Loss: 2.3276
Step [100/1452] Loss: 2.2806
Step [150/1452] Loss: 2.4552
Step [200/1452] Loss: 1.5755
Step [250/1452] Loss: 2.4681
Step [300/1452] Loss: 2.7940
Step [350/1452] Loss: 2.3474
Step [400/1452] Loss: 2.6800
Step [450/1452] Loss: 2.6280
Step [500/1452] Loss: 2.6559
Step [550/1452] Loss: 2.6706
Step [600/1452] Loss: 1.9279
Step [650/1452] Loss: 1.3285
Step [700/1452] Loss: 2.4892
Step [750/1452] Loss: 1.5562
Step [800/1452] Loss: 2.0823
Step [850/1452] Loss: 2.0307
Step [900/1452] Loss: 2.5121


In [None]:
import pickle

def save(predicted, labels):
    with open('predicted.pkl', 'wb') as f:
        pickle.dump(predicted, f)
    with open('labels.pkl', 'wb') as f:
        pickle.dump(labels, f)    
    print("Dataset saved!")

def load():
    with open('predicted.pkl', 'rb') as f:
        predicted = pickle.load(f)
    with open('labels.pkl', 'rb') as f:
        labels = pickle.load(f)

save(predicted, labels)