# Video Recognition

Project on video recognition with the HMDB51 dataset (https://serre.lab.brown.edu/hmdb51.html). A special focus is given to the efficiency of the training.

Training finora (loss bilanciata):
- 2 epoche lr=5e-4 FRAME RATE A 3
- 5 epoche lr=1e-4 FRAME RATE A 3
- 1 epoche lr=5e-5 FRAME RATE A 3
IDEA: FARE UN PO' DI EPOCHE CON FRAME RATE ALTO E POI ABBASSARLO ALLA FINE

In [290]:
# !pip install opencv-python

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
import pickle

# Import everything from your new file
from video_recognition import (
    VideoLoader, CNN, CNNLSTM, train, save_model, load_model, 
    replace_head_for_finetuning, MAX_POOL, get_persistent_splits
)

# Root directory handed to VideoLoader as the dataset location.
dataset_directory = "./dataset"
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
# --- Data pipeline settings ---
FRAME_SIZE = 224          # given to VideoLoader and used as the CNN input height/width
FRAME_RATE_SCALER = 3     # frame-rate setting given to VideoLoader
BATCH_SIZE = 1            # DataLoader batch size
ACCUM_STEPS = 20          # forwarded to train() as accumulation_steps

# --- Model dimensions ---
EMBEDDING_DIM = 256       # last argument of CNN(...)
LSTM_HIDDEN = 128         # forwarded to CNNLSTM(...)
LSTM_LAYERS = 1           # forwarded to CNNLSTM(...)

# --- Optimisation settings ---
USE_WEIGHTED_LOSS = True  # forwarded to train() as use_weighted_loss
LEARNING_RATE = 5e-4

# One entry per convolutional layer: every layer shares the same 3x3 kernel,
# stride 1 and padding 1; only the number of output channels changes.
cnn_config = [
    {'out_channels': channels, 'kernel_size': 3, 'stride': 1, 'padding': 1}
    for channels in (16, 32, 64, 128)
]

In [7]:
# Step 1 - build the dataset without a class filter (classes_to_use=None).
full_dataset = VideoLoader(
    dataset_directory,
    FRAME_SIZE,
    FRAME_RATE_SCALER,
    classes_to_use=None,
)

# Step 2 - fetch the persistent split registered under the "pretrain_full" name
# (expected on disk as 'pretrain_full_train.pkl' / 'pretrain_full_test.pkl').
# NOTE(review): 0.8 is presumably the training fraction - confirm in get_persistent_splits.
full_train, full_test = get_persistent_splits(full_dataset, 0.8, "pretrain_full")
train_loader = DataLoader(full_train, shuffle=True, batch_size=BATCH_SIZE)

# Step 3 - assemble the CNN encoder and wrap it in the CNN+LSTM classifier,
# with one output per class found in the dataset.
cnn = CNN(cnn_config, MAX_POOL, (3, FRAME_SIZE, FRAME_SIZE), EMBEDDING_DIM)
model = CNNLSTM(cnn, len(full_dataset.classes), LSTM_HIDDEN, LSTM_LAYERS)
model = model.to(device)

# Step 4 - run a single pre-training epoch.
print(f"Pre-training on {len(full_dataset.classes)} classes...")
train(
    model,
    epochs=1,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    train_loader=train_loader,
    device=device,
    use_weighted_loss=USE_WEIGHTED_LOSS,
)

# Step 5 - persist the result as the master model used by the later cells.
save_model(model, "trained_on_all_classes.pkl")

Classes loaded: ['clap', 'climb', 'climb_stairs', 'dive', 'draw_sword', 'dribble', 'drink', 'eat', 'fall_floor', 'fencing', 'flic_flac', 'golf', 'handstand', 'hit', 'hug', 'jump', 'kick', 'kick_ball', 'kiss', 'laugh', 'pick', 'pour', 'pullup', 'punch', 'push', 'pushup', 'ride_bike', 'ride_horse', 'run', 'shake_hands', 'shoot_ball', 'shoot_bow', 'shoot_gun', 'sit', 'situp', 'smile', 'smoke', 'somersault', 'stand', 'swing_baseball', 'sword', 'sword_exercise', 'talk', 'throw', 'turn', 'walk', 'wave']
Database size: 6341
Loading existing split from pretrain_full...
Pre-training on 47 classes...
Calculating class weights for Weighted Loss...
Class Weights (Shape torch.Size([47])): tensor([1.0685, 1.1859, 1.1480, 1.1604, 1.1859, 0.9303, 0.8175, 1.3002, 0.9722,
        1.1241, 1.2404, 1.3835, 1.2125, 1.0685, 1.1859, 0.8431, 0.9810, 1.0900,
        1.3323, 1.0477, 1.1991, 1.2548, 1.3660, 1.0685, 1.1241, 1.3835, 1.3160,
        1.1604, 0.5563, 0.8993, 1.0086, 1.3160, 1.2125, 0.9900, 1.2548, 1.3

## Test

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import torch
from torch.utils.data import DataLoader
from video_recognition import load_model 

# --- 1. CONFIGURATION ---
# Files written by the pre-training cells: the saved master model and the
# held-out part of the persistent split.
MODEL_FILE = "trained_on_all_classes.pkl"
TEST_SET_FILE = "pretrain_full_test.pkl"
BATCH_SIZE = 1

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Test Set from {TEST_SET_FILE}...")
with open(TEST_SET_FILE, 'rb') as split_file:
    test_set = pickle.load(split_file)

# Class names come from the dataset wrapped by the pickled split, so the report
# covers exactly the classes that dataset was built with.
CLASSES = test_set.dataset.classes
print(f"Evaluating on {len(CLASSES)} classes.")

# No shuffling: evaluation order does not matter and stays reproducible.
test_loader = DataLoader(test_set, shuffle=False, batch_size=BATCH_SIZE)

print(f"Loading Master Model from {MODEL_FILE}...")
model = load_model(MODEL_FILE).to(device)
model.eval()

# --- 3. RUN INFERENCE ---
all_preds, all_labels = [], []
print("Running Inference...")

with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Predicted class = index of the largest score along dimension 1.
        predicted = model(batch_inputs).argmax(dim=1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

# --- 4. REPORTING ---
print("\n" + "=" * 40)
print("MASTER MODEL CLASSIFICATION REPORT")
print("=" * 40)

# Pass every label id explicitly so that classes missing from the test labels
# or predictions still line up with target_names.
all_possible_labels = range(len(CLASSES))

report = classification_report(
    all_labels,
    all_preds,
    labels=all_possible_labels,
    target_names=CLASSES,
    zero_division=0,
)
print(report)

acc = accuracy_score(all_labels, all_preds)
print(f"Total Accuracy: {acc*100:.2f}%")
print("=" * 40 + "\n")

Loading Test Set from pretrain_full_test.pkl...
Evaluating on 47 classes.
Loading Master Model from trained_on_all_classes.pkl...
Running Inference...

MASTER MODEL CLASSIFICATION REPORT
                precision    recall  f1-score   support

          clap       0.00      0.00      0.00        29
         climb       0.05      0.06      0.05        17
  climb_stairs       0.00      0.00      0.00        18
          dive       0.50      0.03      0.06        34
    draw_sword       0.12      0.17      0.14        12
       dribble       0.25      0.59      0.35        29
         drink       0.00      0.00      0.00        32
           eat       0.00      0.00      0.00        25
    fall_floor       0.00      0.00      0.00        25
       fencing       1.00      0.05      0.10        20
     flic_flac       0.00      0.00      0.00        20
          golf       0.23      0.22      0.23        27
     handstand       0.16      0.17      0.16        24
           hit       0.10   

## Continue Training

In [None]:
# --- 1. CONFIGURATION ---
MORE_EPOCHS = 1
LEARNING_RATE = 5e-5  # reduced learning rate for this resumed pre-training run
ACCUM_STEPS = 20
BATCH_SIZE = 1
FRAME_RATE_SCALER = 3
USE_WEIGHTED_LOSS = True

# The persistent training split and the master model checkpoint, which is
# both read and overwritten by this cell.
TRAIN_SET_FILE = "pretrain_full_train.pkl"
MODEL_FILE = "trained_on_all_classes.pkl"

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Train Set from {TRAIN_SET_FILE}...")
with open(TRAIN_SET_FILE, 'rb') as split_file:
    train_set = pickle.load(split_file)

train_loader = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)

print(f"Loading Master Model from {MODEL_FILE}...")
from video_recognition import load_model, train, save_model
model = load_model(MODEL_FILE).to(device)

# --- 3. RESUME TRAINING ---
print(f"Resuming Master Model training for {MORE_EPOCHS} more epochs...")
train(
    model,
    epochs=MORE_EPOCHS,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    train_loader=train_loader,
    device=device,
    use_weighted_loss=USE_WEIGHTED_LOSS,
)

# --- 4. SAVE ---
# Writes back to the file the model was loaded from.
save_model(model, MODEL_FILE)
print("Updated Master Model saved.")

Loading Train Set from pretrain_full_train.pkl...
Loading Master Model from trained_on_all_classes.pkl...
Resuming Master Model training for 1 more epochs...
Calculating class weights for Weighted Loss...
Class Weights (Shape torch.Size([47])): tensor([1.0685, 1.1859, 1.1480, 1.1604, 1.1859, 0.9303, 0.8175, 1.3002, 0.9722,
        1.1241, 1.2404, 1.3835, 1.2125, 1.0685, 1.1859, 0.8431, 0.9810, 1.0900,
        1.3323, 1.0477, 1.1991, 1.2548, 1.3660, 1.0685, 1.1241, 1.3835, 1.3160,
        1.1604, 0.5563, 0.8993, 1.0086, 1.3160, 1.2125, 0.9900, 1.2548, 1.3323,
        1.3002, 0.9466, 0.8633, 0.9303, 1.0376, 1.1012, 1.0791, 1.4583, 0.5833,
        0.2409, 1.4015])


# Fine Tune

In [None]:
# Step 1 - the classes the fine-tuned model should distinguish.
TARGET_CLASSES = ['jump', 'run', 'smile', 'wave']

# Step 2 - dataset restricted to those classes.
subset_dataset = VideoLoader(
    dataset_directory,
    FRAME_SIZE,
    FRAME_RATE_SCALER,
    classes_to_use=TARGET_CLASSES,
)

# Step 3 - persistent split stored under the "finetune_subset" name
# (expected as 'finetune_subset_train.pkl' / 'finetune_subset_test.pkl'),
# so that a kernel restart still evaluates on the same held-out videos.
sub_train, sub_test = get_persistent_splits(subset_dataset, 0.8, "finetune_subset")

sub_loader = DataLoader(
    sub_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    persistent_workers=True,
)

# Step 4 - start from the master model and swap in a head sized for the subset.
# The model is moved to the device both before and after the head replacement.
model = load_model("trained_on_all_classes.pkl").to(device)
model = replace_head_for_finetuning(model, new_num_classes=len(TARGET_CLASSES))
model = model.to(device)

# Step 5 - fine-tune on the subset.
print(f"Fine-tuning for {TARGET_CLASSES}...")
train(
    model,
    epochs=5,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=1e-4,
    train_loader=sub_loader,
    device=device,
)

# Step 6 - store the fine-tuned model.
save_model(model, "finetuned_model.pkl")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import torch
from torch.utils.data import DataLoader

# --- 1. CONFIGURATION ---
MODEL_FILE = "finetuned_model.pkl"
TEST_SET_FILE = "finetune_subset_test.pkl"
BATCH_SIZE = 1 # Keep at 1 for precise video-by-video evaluation

# Class names for the report, listed in the order used for fine-tuning.
# NOTE(review): assumes label id i corresponds to TARGET_CLASSES[i] - confirm against VideoLoader.
TARGET_CLASSES = ['jump', 'run', 'smile', 'wave']
# Explicit label ids: keeps the report and the confusion matrix at one row per
# target class even when a class is absent from the test labels and predictions.
LABEL_IDS = list(range(len(TARGET_CLASSES)))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Test Set from {TEST_SET_FILE}...")
# pickle.load can execute arbitrary code: only load split files produced by this project.
with open(TEST_SET_FILE, 'rb') as f:
    test_set = pickle.load(f)

test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

print(f"Loading Model from {MODEL_FILE}...")
# Use the load_model function from the project module
from video_recognition import load_model
model = load_model(MODEL_FILE)
model = model.to(device)
model.eval() # Evaluation mode: layers such as Dropout switch to inference behaviour

# --- 3. RUN INFERENCE ---
all_preds = []
all_labels = []
print("Running Inference...")

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        # Predicted class = index of the largest score along dimension 1.
        predicted = outputs.argmax(dim=1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# --- 4. REPORTING ---
# A. Classification Report (Precision, Recall, F1)
print("\n" + "="*40)
print("FINAL CLASSIFICATION REPORT")
print("="*40)
# labels= is passed explicitly; without it a class that never shows up makes the
# number of detected labels differ from len(target_names) and the call fails.
print(classification_report(
    all_labels,
    all_preds,
    labels=LABEL_IDS,
    target_names=TARGET_CLASSES,
    zero_division=0
))

# B. Accuracy Score
acc = accuracy_score(all_labels, all_preds)
print(f"Total Accuracy: {acc*100:.2f}%")
print("="*40 + "\n")

# C. Confusion Matrix Plot
# Same explicit labels, so the matrix is always len(TARGET_CLASSES) square and
# the tick labels below stay aligned with its rows and columns.
cm = confusion_matrix(all_labels, all_preds, labels=LABEL_IDS)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=TARGET_CLASSES, 
            yticklabels=TARGET_CLASSES)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# --- 1. CONFIGURATION ---
MORE_EPOCHS = 5
LEARNING_RATE = 1e-5  # deliberately small for resuming fine-tuning
ACCUM_STEPS = 10
BATCH_SIZE = 1

# The persistent fine-tuning split and the checkpoint, which is both read and
# overwritten by this cell.
TRAIN_SET_FILE = "finetune_subset_train.pkl"
MODEL_FILE = "finetuned_model.pkl"

# --- 2. LOAD DATA & MODEL ---
print(f"Loading Train Set from {TRAIN_SET_FILE}...")
with open(TRAIN_SET_FILE, 'rb') as split_file:
    train_set = pickle.load(split_file)

train_loader = DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)

print(f"Loading Model from {MODEL_FILE}...")
from video_recognition import load_model, train, save_model
model = load_model(MODEL_FILE).to(device)

# --- 3. RESUME TRAINING ---
print(f"Resuming training for {MORE_EPOCHS} more epochs...")
train(
    model,
    epochs=MORE_EPOCHS,
    accumulation_steps=ACCUM_STEPS,
    learning_rate=LEARNING_RATE,
    train_loader=train_loader,
    device=device,
)

# --- 4. SAVE ---
# Writes back to the file the model was loaded from.
save_model(model, MODEL_FILE)
print("Updated model saved.")
print("Updated model saved.")