# Video Recognition

A video recognition project using the HMDB51 dataset (https://serre.lab.brown.edu/hmdb51.html). Special focus is given to the efficiency of training.

In [4]:
# !pip install opencv-python

In [12]:
import os
import glob
import cv2
import numpy as np
import torch.nn as nn

# Root folder of the dataset. VideoLoader treats every sub-folder of this
# directory as one class label and collects the .avi files inside it.
dataset_directory = "./dataset"

In [8]:
class VideoLoader:
    """Index a directory of labelled ``.avi`` videos and decode them on demand.

    The directory is expected to contain one sub-folder per class. The name of
    the sub-folder is the label of every ``.avi`` file located directly inside
    it. Only the file paths are collected at construction time; frames are
    decoded when an item is accessed through ``loader[idx]``.
    """

    def __init__(self, directory):
        """Scan *directory* and build the (video path, label) index."""
        self.directory = directory
        # List of (video_path, label) tuples, filled by load_dataset().
        self.db = []
        # Class names (sub-folder names). The misspelt attribute name is kept
        # so existing callers keep working; see the ``labels`` alias below.
        self.lables = []

        self.load_dataset()

    @property
    def labels(self):
        """Correctly spelt, read-only alias of ``lables``."""
        return self.lables

    def load_dataset(self):
        """(Re)build the label list and the (video path, label) index.

        Prints an error and leaves the current index untouched when the
        dataset directory does not exist (or is not a directory).
        """
        if not os.path.isdir(self.directory):
            print(f"Error: Directory '{self.directory}' not found.")
            return

        # os.listdir returns entries in arbitrary order; sorting makes the
        # label order (and any label -> index mapping derived from it)
        # reproducible across machines and runs.
        self.lables = sorted(
            entry
            for entry in os.listdir(self.directory)
            if os.path.isdir(os.path.join(self.directory, entry))
        )

        print(f"Lables: {self.lables}")

        # Build into a fresh list so that calling load_dataset() again does
        # not append duplicates to an already populated index.
        index = []
        for label in self.lables:
            folder_path = os.path.join(self.directory, label)
            # Escape the folder part so that names containing glob
            # metacharacters ('[', '*', '?') are matched literally.
            pattern = os.path.join(glob.escape(folder_path), "*.avi")
            # glob order is filesystem dependent as well, hence the sort.
            index.extend((video_file, label) for video_file in sorted(glob.glob(pattern)))
        self.db = index

        print(f"Database size: {len(self.db)}")

    def load_video(self, video_path, resize=(224, 224)):
        """Decode every frame of *video_path* into one array.

        Args:
            video_path: Path of the video file to decode.
            resize: Target size passed to ``cv2.resize`` (which takes the
                size as ``(width, height)``), or a falsy value to keep the
                native resolution.

        Returns:
            ``np.array`` built from the list of decoded frames, converted
            from OpenCV's BGR channel order to RGB. The array is empty when
            the file cannot be opened or contains no readable frame.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            if not cap.isOpened():
                # Report the failure instead of silently returning no frames.
                print(f"Error: Could not open video '{video_path}'.")
                return np.array(frames)

            while True:
                ret, frame = cap.read()
                if not ret:
                    # End of stream (or a decoding failure): stop reading.
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                if resize:
                    frame = cv2.resize(frame, resize)
                frames.append(frame)
        finally:
            # Always free the decoder, even if a frame conversion raises.
            cap.release()
        return np.array(frames)

    def __len__(self):
        """Return the number of indexed videos."""
        return len(self.db)

    def __getitem__(self, idx):
        """Return ``(frames, label)`` for the video at position *idx*."""
        video_path, label = self.db[idx]
        video_data = self.load_video(video_path)
        return video_data, label

In [None]:
# Build the (video path, label) index. The constructor only scans the
# directory; frames are decoded when an item is accessed (dataset[i]).
dataset = VideoLoader(dataset_directory)

Lables: ['catch', 'dribble', 'fall_floor', 'hit', 'jump', 'kick_ball', 'push', 'run', 'shoot_ball', 'walk']
Database size: 1816


'catch'

Convolutional Neural Network

In [None]:
# Selectors for the pooling layer appended to every convolutional stage.
AVG_POOL = 0
MAX_POOL = 1


def _window_output_size(size, kernel, stride, padding=0):
    """Return the output length of a conv/pool window along one axis.

    Uses floor division, matching the rounding PyTorch applies to Conv2d and
    (non-ceil-mode) pooling layers with dilation 1.
    """
    return (size + 2 * padding - kernel) // stride + 1


class CNN(nn.Module):
    """Stack of Conv2d -> ReLU -> 2x2 pooling stages followed by a linear layer.

    Args:
        layer_config: Sequence of dicts, one per stage, each providing the
            integer entries ``'out_channels'``, ``'kernel_size'``,
            ``'stride'`` and ``'padding'`` of that stage's convolution.
        poolType: ``MAX_POOL`` selects max pooling; any other value
            (normally ``AVG_POOL``) selects average pooling.
        input_dims: ``(channels, height, width)`` of one input sample.
        embedding_dim: Number of output features of the final linear layer.

    Raises:
        ValueError: If a stage would shrink the feature map to zero (or
            fewer) pixels for the given ``input_dims``.
    """

    def __init__(self, layer_config, poolType=MAX_POOL, input_dims=(3, 224, 224), embedding_dim=512):
        super().__init__()

        # Both pooling variants share the same constructor arguments, so only
        # the class differs between the two modes.
        pool_cls = nn.MaxPool2d if poolType == MAX_POOL else nn.AvgPool2d

        self.layers = nn.ModuleList()
        current_channels = input_dims[0]
        current_h, current_w = input_dims[1], input_dims[2]

        for stage, config in enumerate(layer_config):
            out_ch = config['out_channels']
            k = config['kernel_size']
            s = config['stride']
            p = config['padding']

            self.layers.append(
                nn.Sequential(
                    nn.Conv2d(
                        in_channels=current_channels,
                        out_channels=out_ch,
                        kernel_size=k,
                        stride=s,
                        padding=p,
                    ),
                    nn.ReLU(),
                    pool_cls(kernel_size=2, stride=2),
                )
            )

            # Track the spatial size through the convolution, then through the
            # 2x2 / stride-2 pooling, to size the final linear layer. Floor
            # division is required here: truncating toward zero would report a
            # 1-pixel map surviving the pooling although nothing is left.
            current_h = _window_output_size(current_h, k, s, p)
            current_w = _window_output_size(current_w, k, s, p)
            current_h = _window_output_size(current_h, 2, 2)
            current_w = _window_output_size(current_w, 2, 2)
            if current_h <= 0 or current_w <= 0:
                raise ValueError(
                    f"Layer {stage} reduces the feature map to "
                    f"{current_h}x{current_w}; input_dims={tuple(input_dims)} "
                    "is too small for this layer_config."
                )
            current_channels = out_ch

        # Number of features per sample after the last stage.
        self.flatten_size = current_channels * current_h * current_w
        self.fc = nn.Linear(self.flatten_size, embedding_dim)

    def forward(self, x):
        """Map a batch shaped ``(N, *input_dims)`` to ``(N, embedding_dim)``."""
        for layer in self.layers:
            x = layer(x)

        # flatten() copies only when needed, so unlike view() it also accepts
        # non-contiguous activations and empty batches.
        x = x.flatten(1)
        return self.fc(x)

In [15]:
# Four convolutional stages that differ only in their output width
# (16 -> 32 -> 64 -> 128 channels). Each uses a 3x3 kernel with stride 1 and
# padding 1, so the convolution itself keeps the spatial size unchanged.
cnn_configuration = [
    {'out_channels': width, 'kernel_size': 3, 'stride': 1, 'padding': 1}
    for width in (16, 32, 64, 128)
]

# Max-pooled network for 224x224 RGB frames producing 512-dimensional embeddings.
cnn = CNN(
    layer_config=cnn_configuration,
    poolType=MAX_POOL,
    input_dims=(3, 224, 224),
    embedding_dim=512,
)