# Video Recognition

Project on video recognition with the HMDB51 dataset (https://serre.lab.brown.edu/hmdb51.html). A special focus is given to the efficiency of the training.

Training so far:
- 5 epochs on the whole dataset, lr=1e-3

In [290]:
# !pip install opencv-python

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import pickle

# Import everything from your new file
from video_recognition import (
    VideoLoader, CNN, CNNLSTM, train, save_model, load_model, 
    replace_head_for_finetuning, MAX_POOL, get_persistent_splits
)

# Directory handed to VideoLoader as the dataset root (relative to the working directory).
dataset_directory = "./dataset"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [8]:
# --- Input / batching hyper-parameters ---
# NOTE(review): 224 is the more common image-model input size; confirm 244 is intentional.
FRAME_SIZE = 244        # passed to VideoLoader; also the CNN input height and width
FRAME_RATE_SCALER = 2   # passed to VideoLoader
BATCH_SIZE = 1          # DataLoader batch size
ACCUM_STEPS = 16        # forwarded to train() as accumulation_steps

# --- Model dimensions ---
EMBEDDING_DIM = 256     # last argument of CNN(...)
LSTM_HIDDEN = 128       # forwarded to CNNLSTM(...)
LSTM_LAYERS = 1         # forwarded to CNNLSTM(...)

# Four entries sharing the same 3x3 / stride 1 / padding 1 geometry; only the
# number of output channels changes (presumably one entry per conv layer).
cnn_config = [
    {'out_channels': channels, 'kernel_size': 3, 'stride': 1, 'padding': 1}
    for channels in (16, 32, 64, 128)
]

In [9]:
# --- 1. Dataset: classes_to_use=None, i.e. no class filter is requested ---
full_dataset = VideoLoader(
    dataset_directory,
    FRAME_SIZE,
    FRAME_RATE_SCALER,
    classes_to_use=None,
)

# --- 2. 80/20 split stored under the "pretrain_full" prefix ---
# (later cells read 'pretrain_full_train.pkl' / 'pretrain_full_test.pkl')
full_train, full_test = get_persistent_splits(full_dataset, 0.8, "pretrain_full")
train_loader = DataLoader(full_train, shuffle=True, batch_size=BATCH_SIZE)

# --- 3. Model: the CNN is wrapped by CNNLSTM, with one output per dataset class ---
frame_shape = (3, FRAME_SIZE, FRAME_SIZE)
cnn = CNN(cnn_config, MAX_POOL, frame_shape, EMBEDDING_DIM)
model = CNNLSTM(cnn, len(full_dataset.classes), LSTM_HIDDEN, LSTM_LAYERS)
model = model.to(device)

# --- 4. Pre-train on the training split ---
print(f"Pre-training on {len(full_dataset.classes)} classes...")
train(
    model,
    epochs=5,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=1e-3,
    train_loader=train_loader,
    device=device,
)

# --- 5. Persist the "master" model reused by the evaluation and fine-tuning cells ---
save_model(model, "trained_on_all_classes.pkl")

Classes loaded: ['brush_hair', 'cartwheel', 'catch', 'chew', 'clap', 'climb', 'climb_stairs', 'dive', 'draw_sword', 'dribble', 'drink', 'eat', 'fall_floor', 'fencing', 'flic_flac', 'golf', 'handstand', 'hit', 'hug', 'jump', 'kick', 'kick_ball', 'kiss', 'laugh', 'pick', 'pour', 'pullup', 'punch', 'push', 'pushup', 'ride_bike', 'ride_horse', 'run', 'shake_hands', 'shoot_ball', 'shoot_bow', 'shoot_gun', 'sit', 'situp', 'smile', 'smoke', 'somersault', 'stand', 'swing_baseball', 'sword', 'sword_exercise', 'talk', 'throw', 'turn', 'walk', 'wave']
Database size: 6341
Loading existing split from pretrain_full...
Pre-training on 51 classes...
Epoch 1 Step [20/5072] Loss: 3.9530
Epoch 1 Step [40/5072] Loss: 4.0157
Epoch 1 Step [60/5072] Loss: 3.9837
Epoch 1 Step [80/5072] Loss: 3.9517
Epoch 1 Step [100/5072] Loss: 3.9158
Epoch 1 Step [120/5072] Loss: 3.9606
Epoch 1 Step [140/5072] Loss: 3.9523
Epoch 1 Step [160/5072] Loss: 3.9214
Epoch 1 Step [180/5072] Loss: 3.9013
Epoch 1 Step [200/5072] Loss:

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import torch
from torch.utils.data import DataLoader
from video_recognition import load_model 

# --- 1. CONFIGURATION ---
MODEL_FILE = "trained_on_all_classes.pkl"
TEST_SET_FILE = "pretrain_full_test.pkl"
BATCH_SIZE = 1  # NOTE: reassigns the notebook-wide BATCH_SIZE used by other cells

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Test Set from {TEST_SET_FILE}...")
# NOTE(security): pickle.load can execute arbitrary code; only load split files
# that were produced locally by this project.
with open(TEST_SET_FILE, 'rb') as f:
    test_set = pickle.load(f)

# Class names come from the dataset wrapped by the pickled split, so the report
# rows line up with the label indices the model was trained on.
CLASSES = test_set.dataset.classes
print(f"Evaluating on {len(CLASSES)} classes.")

# shuffle=False keeps the evaluation order deterministic.
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

print(f"Loading Master Model from {MODEL_FILE}...")
model = load_model(MODEL_FILE)
model = model.to(device)
model.eval()  # inference behaviour for layers such as Dropout

# --- 3. RUN INFERENCE ---
all_preds = []
all_labels = []
print("Running Inference...")

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))
        # argmax over the class dimension; replaces the legacy
        # `torch.max(outputs.data, 1)` idiom (`.data` is unnecessary under no_grad).
        predicted = outputs.argmax(dim=1)

        # Labels are only compared on the host, so they never need to visit the device.
        all_preds.extend(predicted.tolist())
        all_labels.extend(labels.tolist())

# --- 4. REPORTING ---
print("\n" + "="*40)
print("MASTER MODEL CLASSIFICATION REPORT")
print("="*40)

# Pass every label index explicitly so classes that never occur in the test
# split or in the predictions still get a row (and target_names stays aligned).
all_possible_labels = list(range(len(CLASSES)))

print(classification_report(
    all_labels,
    all_preds,
    labels=all_possible_labels,
    target_names=CLASSES,
    zero_division=0
))

acc = accuracy_score(all_labels, all_preds)
print(f"Total Accuracy: {acc*100:.2f}%")
print("="*40 + "\n")

Loading Test Set from pretrain_full_test.pkl...
Evaluating on 51 classes.
Loading Master Model from trained_on_all_classes.pkl...
Running Inference...

MASTER MODEL CLASSIFICATION REPORT
                precision    recall  f1-score   support

    brush_hair       0.00      0.00      0.00         0
     cartwheel       0.00      0.00      0.00         0
         catch       0.00      0.00      0.00         0
          chew       0.00      0.00      0.00         0
          clap       0.00      0.00      0.00        23
         climb       0.00      0.00      0.00        27
  climb_stairs       0.00      0.00      0.00        26
          dive       0.00      0.00      0.00        24
    draw_sword       0.00      0.00      0.00        25
       dribble       0.00      0.00      0.00        26
         drink       0.00      0.00      0.00        25
           eat       0.00      0.00      0.00        18
    fall_floor       0.00      0.00      0.00        27
       fencing       0.00   

In [None]:
from video_recognition import load_model, train, save_model

# --- 1. CONFIGURATION ---
TRAIN_SET_FILE = "pretrain_full_train.pkl"  # training half of the persisted full split
MODEL_FILE = "trained_on_all_classes.pkl"   # loaded below and overwritten after training

MORE_EPOCHS = 1
LEARNING_RATE = 1e-4  # lower than the 1e-3 used for the initial pre-training run
ACCUM_STEPS = 16
BATCH_SIZE = 1
USE_WEIGHTED_LOSS = True  # forwarded to train() as use_weighted_loss

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Train Set from {TRAIN_SET_FILE}...")
# NOTE(security): pickle.load can execute arbitrary code; only load locally produced files.
with open(TRAIN_SET_FILE, 'rb') as f:
    train_set = pickle.load(f)
train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)

print(f"Loading Master Model from {MODEL_FILE}...")
model = load_model(MODEL_FILE)
model = model.to(device)

# --- 3. RESUME TRAINING ---
print(f"Resuming Master Model training for {MORE_EPOCHS} more epochs...")
train(
    model,
    epochs=MORE_EPOCHS,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    train_loader=train_loader,
    device=device,
    use_weighted_loss=USE_WEIGHTED_LOSS,
)

# --- 4. SAVE ---
# Only reached when train() returns normally; an interrupted run leaves MODEL_FILE untouched.
save_model(model, MODEL_FILE)
print("Updated Master Model saved.")

Loading Train Set from pretrain_full_train.pkl...
Loading Master Model from trained_on_all_classes.pkl...
Resuming Master Model training for 1 more epochs...
Epoch 1 Step [20/5072] Loss: 3.6202
Epoch 1 Step [40/5072] Loss: 3.8893
Epoch 1 Step [60/5072] Loss: 3.6872
Epoch 1 Step [80/5072] Loss: 3.5116
Epoch 1 Step [100/5072] Loss: 3.8071
Epoch 1 Step [120/5072] Loss: 3.4824
Epoch 1 Step [140/5072] Loss: 3.2944
Epoch 1 Step [160/5072] Loss: 3.6232
Epoch 1 Step [180/5072] Loss: 3.6427
Epoch 1 Step [200/5072] Loss: 3.5332
Epoch 1 Step [220/5072] Loss: 3.6442
Epoch 1 Step [240/5072] Loss: 3.4655
Epoch 1 Step [260/5072] Loss: 3.8467
Epoch 1 Step [280/5072] Loss: 3.5959
Epoch 1 Step [300/5072] Loss: 3.7704
Epoch 1 Step [320/5072] Loss: 3.6100
Epoch 1 Step [340/5072] Loss: 3.2781
Epoch 1 Step [360/5072] Loss: 3.7071
Epoch 1 Step [380/5072] Loss: 3.6183
Epoch 1 Step [400/5072] Loss: 3.5289
Epoch 1 Step [420/5072] Loss: 3.4467
Epoch 1 Step [440/5072] Loss: 3.6112
Epoch 1 Step [460/5072] Loss: 3.

KeyboardInterrupt: 

# Fine Tune

In [None]:
# --- 1. Classes kept for fine-tuning ---
TARGET_CLASSES = ['jump', 'run', 'smile', 'wave']

# --- 2. Dataset built with classes_to_use set to those classes ---
subset_dataset = VideoLoader(
    dataset_directory,
    FRAME_SIZE,
    FRAME_RATE_SCALER,
    classes_to_use=TARGET_CLASSES,
)

# --- 3. 80/20 split stored under the "finetune_subset" prefix ---
# Persisting the split means a kernel restart still evaluates on the same test videos.
sub_train, sub_test = get_persistent_splits(subset_dataset, 0.8, "finetune_subset")

# BATCH_SIZE (and ACCUM_STEPS below) hold whatever value the most recently
# executed cell assigned to them.
sub_loader = DataLoader(sub_train, shuffle=True, batch_size=BATCH_SIZE)

# --- 4. Start from the master model and swap its head for a 4-class one ---
model = load_model("trained_on_all_classes.pkl").to(device)
model = replace_head_for_finetuning(model, new_num_classes=len(TARGET_CLASSES))
# Second transfer so that whatever the head replacement created also lands on `device`.
model = model.to(device)

# --- 5. Fine-tune on the subset ---
print(f"Fine-tuning for {TARGET_CLASSES}...")
train(
    model,
    epochs=5,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=1e-4,
    train_loader=sub_loader,
    device=device,
)

# --- 6. Persist the fine-tuned model ---
save_model(model, "finetuned_model.pkl")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import torch
from torch.utils.data import DataLoader

# --- 1. CONFIGURATION ---
MODEL_FILE = "finetuned_model.pkl"
TEST_SET_FILE = "finetune_subset_test.pkl"
BATCH_SIZE = 1  # one video per batch for video-by-video evaluation

# Class names in label-index order (must match the order used in fine-tuning).
TARGET_CLASSES = ['jump', 'run', 'smile', 'wave']
# Every label index, passed explicitly to the metrics so their rows/columns stay
# aligned with TARGET_CLASSES even when a class is missing from the test split
# and from the predictions.
ALL_LABEL_IDS = list(range(len(TARGET_CLASSES)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Test Set from {TEST_SET_FILE}...")
# NOTE(security): pickle.load can execute arbitrary code; only load split files
# that were produced locally by this project.
with open(TEST_SET_FILE, 'rb') as f:
    test_set = pickle.load(f)

# shuffle=False keeps the evaluation order deterministic.
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

print(f"Loading Model from {MODEL_FILE}...")
from video_recognition import load_model
model = load_model(MODEL_FILE)
model = model.to(device)
model.eval()  # evaluation mode: inference behaviour for layers such as Dropout

# --- 3. RUN INFERENCE ---
all_preds = []
all_labels = []
print("Running Inference...")

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))
        # argmax over the class dimension; replaces the legacy
        # `torch.max(outputs.data, 1)` idiom (`.data` is unnecessary under no_grad).
        predicted = outputs.argmax(dim=1)

        # Labels are only compared on the host, so they never need to visit the device.
        all_preds.extend(predicted.tolist())
        all_labels.extend(labels.tolist())

# --- 4. REPORTING ---
# A. Classification Report (Precision, Recall, F1)
print("\n" + "="*40)
print("FINAL CLASSIFICATION REPORT")
print("="*40)
# Without `labels=`, scikit-learn infers the classes from the data and raises when
# their count differs from len(target_names).
print(classification_report(
    all_labels,
    all_preds,
    labels=ALL_LABEL_IDS,
    target_names=TARGET_CLASSES,
    zero_division=0,
))

# B. Accuracy Score
acc = accuracy_score(all_labels, all_preds)
print(f"Total Accuracy: {acc*100:.2f}%")
print("="*40 + "\n")

# C. Confusion Matrix Plot
# Fixing the label set guarantees a len(TARGET_CLASSES) x len(TARGET_CLASSES)
# matrix, so the tick labels below always describe the right rows and columns.
cm = confusion_matrix(all_labels, all_preds, labels=ALL_LABEL_IDS)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=TARGET_CLASSES,
            yticklabels=TARGET_CLASSES)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from video_recognition import load_model, train, save_model

# --- 1. CONFIGURATION ---
TRAIN_SET_FILE = "finetune_subset_train.pkl"  # training half of the persisted subset split
MODEL_FILE = "finetuned_model.pkl"            # loaded below and overwritten after training

MORE_EPOCHS = 5
LEARNING_RATE = 1e-5  # kept low for fine-tuning/resuming (the fine-tuning cell uses 1e-4)
ACCUM_STEPS = 10      # NOTE: differs from the 16 used by the other training cells
BATCH_SIZE = 1

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Train Set from {TRAIN_SET_FILE}...")
# NOTE(security): pickle.load can execute arbitrary code; only load locally produced files.
with open(TRAIN_SET_FILE, 'rb') as f:
    train_set = pickle.load(f)
train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)

print(f"Loading Model from {MODEL_FILE}...")
model = load_model(MODEL_FILE)
model = model.to(device)

# --- 3. RESUME TRAINING ---
print(f"Resuming training for {MORE_EPOCHS} more epochs...")
train(
    model,
    epochs=MORE_EPOCHS,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    train_loader=train_loader,
    device=device,
)

# --- 4. SAVE ---
# Only reached when train() returns normally; an interrupted run leaves MODEL_FILE untouched.
save_model(model, MODEL_FILE)
print("Updated model saved.")