In [1]:
import torch
# Choose the `slow_r50` model 
model = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained=True)

Using cache found in C:\Users\jyoti/.cache\torch\hub\facebookresearch_pytorchvideo_main


In [3]:
!pip install av

Collecting av
  Downloading av-13.1.0-cp311-cp311-win_amd64.whl.metadata (4.6 kB)
Downloading av-13.1.0-cp311-cp311-win_amd64.whl (25.8 MB)
   ---------------------------------------- 0.0/25.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/25.8 MB 656.4 kB/s eta 0:00:40
   ---------------------------------------- 0.3/25.8 MB 2.2 MB/s eta 0:00:12
   - -------------------------------------- 0.7/25.8 MB 4.2 MB/s eta 0:00:06
   - -------------------------------------- 1.2/25.8 MB 5.6 MB/s eta 0:00:05
   -- ------------------------------------- 1.6/25.8 MB 6.0 MB/s eta 0:00:05
   --- ------------------------------------ 2.1/25.8 MB 6.8 MB/s eta 0:00:04
   --- ------------------------------------ 2.6/25.8 MB 7.1 MB/s eta 0:00:04
   ---- ----------------------------------- 3.1/25.8 MB 7.6 MB/s eta 0:00:03
   ----- ---------------------------------- 3.7/25.8 MB 8.1 MB/s eta 0:00:03
   ------ ---------------

In [4]:
import json
import urllib
from pytorchvideo.data.encoded_video import EncodedVideo

from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)



In [5]:
device = "cpu"
model = model.eval()
model = model.to(device)

In [6]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"
try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [7]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

# Create an id to label name mapping
kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

In [8]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 8
sampling_rate = 8
frames_per_second = 30

# Note that this transform is specific to the slow_R50 model.
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size=(crop_size, crop_size))
        ]
    ),
)

# The duration of the input clip is also specific to the model.
clip_duration = (num_frames * sampling_rate)/frames_per_second

In [9]:
url_link = "https://dl.fbaipublicfiles.com/pytorchvideo/projects/archery.mp4"
video_path = 'archery.mp4'
try: urllib.URLopener().retrieve(url_link, video_path)
except: urllib.request.urlretrieve(url_link, video_path)

In [10]:
# Select the duration of the clip to load by specifying the start and end duration
# The start_sec should correspond to where the action occurs in the video
start_sec = 0
end_sec = start_sec + clip_duration

# Initialize an EncodedVideo helper class and load the video
video = EncodedVideo.from_path(video_path)

# Load the desired clip
video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

# Apply a transform to normalize the video input
video_data = transform(video_data)

# Move the inputs to the desired device
inputs = video_data["video"]
inputs = inputs.to(device)

In [11]:
preds = model(inputs[None, ...])

# Get the predicted classes
post_act = torch.nn.Softmax(dim=1)
preds = post_act(preds)
pred_classes = preds.topk(k=5).indices[0]

# Map the predicted classes to the label names
pred_class_names = [kinetics_id_to_classname[int(i)] for i in pred_classes]
print("Top 5 predicted labels: %s" % ", ".join(pred_class_names))

Top 5 predicted labels: archery, throwing axe, playing paintball, stretching arm, riding or walking with horse


In [17]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample
)
from pytorchvideo.data.encoded_video import EncodedVideo
import numpy as np
from pathlib import Path
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import os

class VideoDataset(Dataset):
    """Custom Dataset for loading video clips from directory structure"""
    def __init__(self, root_dir, clip_duration=10, transform=None):
        self.root_dir = Path(root_dir)
        self.clip_duration = clip_duration
        self.transform = transform
        
        # Get all video paths and their labels
        self.samples = []
        self.class_to_idx = {}
        
        for idx, class_dir in enumerate(sorted(self.root_dir.glob('*'))):
            if class_dir.is_dir():
                class_name = class_dir.name
                self.class_to_idx[class_name] = idx
                
                for video_path in class_dir.glob('*.mp4'):
                    self.samples.append((str(video_path), idx))
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        video_path, label = self.samples[idx]
        
        # Load video
        video = EncodedVideo.from_path(video_path)
        
        # Extract clip
        video_data = video.get_clip(start_sec=0, end_sec=self.clip_duration)
        
        # Apply transform if specified
        if self.transform:
            video_data = self.transform(video_data)
        
        return video_data["video"], label

def create_video_transform(side_size=256, crop_size=256, num_frames=8):
    """Creates video transform pipeline"""
    return ApplyTransformToKey(
        key="video",
        transform=Compose([
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo([0.45, 0.45, 0.45], [0.225, 0.225, 0.225]),
            ShortSideScale(size=side_size),
            CenterCropVideo(crop_size=(crop_size, crop_size))
        ])
    )

def modify_model_head(model, num_classes=3):
    """Modifies the classification head of the model"""
    if hasattr(model, 'blocks') and hasattr(model.blocks[-1], 'proj'):
        in_features = model.blocks[-1].proj.in_features
        model.blocks[-1].proj = nn.Linear(in_features, num_classes)
    else:
        # Generic approach for other model architectures
        for name, module in model.named_children():
            if isinstance(module, nn.Linear):
                in_features = module.in_features
                setattr(model, name, nn.Linear(in_features, num_classes))
    return model

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Trains the model for one epoch"""
    model.train()
    running_loss = 0.0
    predictions = []
    true_labels = []
    
    for inputs, labels in tqdm(dataloader, desc="Training"):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        predictions.extend(outputs.argmax(1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
    
    return running_loss / len(dataloader), predictions, true_labels

def validate(model, dataloader, criterion, device):
    """Validates the model"""
    model.eval()
    running_loss = 0.0
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for inputs, labels in tqdm(dataloader, desc="Validating"):
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            running_loss += loss.item()
            predictions.extend(outputs.argmax(1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    return running_loss / len(dataloader), predictions, true_labels

def plot_confusion_matrix(cm, class_names, fold):
    """Plots and saves confusion matrix"""
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - Fold {fold}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_fold_{fold}.png')
    plt.close()

def train_model(model_class, dataset_path, num_epochs=30, batch_size=8, num_folds=5,
                learning_rate=0.001, device="cuda"):
    """
    Main training function with cross-validation
    
    Args:
        model_class: Class of the model to be trained (e.g., SlowR50)
        dataset_path: Path to dataset directory
        num_epochs: Number of training epochs
        batch_size: Batch size for training
        num_folds: Number of folds for cross-validation
        learning_rate: Learning rate for optimizer
        device: Device to train on ('cuda' or 'cpu')
    """
    # Setup
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    transform = create_video_transform()
    dataset = VideoDataset(dataset_path, transform=transform)
    
    # Cross-validation setup
    kfold = KFold(n_splits=num_folds, shuffle=True)
    fold_results = []
    
    # Training loop for each fold
    for fold, (train_ids, val_ids) in enumerate(kfold.split(dataset)):
        print(f"\nTraining Fold {fold+1}/{num_folds}")
        
        # Create data loaders
        train_loader = DataLoader(dataset, batch_size=batch_size,
                                sampler=SubsetRandomSampler(train_ids))
        val_loader = DataLoader(dataset, batch_size=batch_size,
                              sampler=SubsetRandomSampler(val_ids))
        
        # Initialize model
        model = model_class()
        model = modify_model_head(model, num_classes=3)
        model = model.to(device)
        
        # Setup training
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
        
        best_val_loss = float('inf')
        fold_best_metrics = {}
        
        # Training loop
        for epoch in range(num_epochs):
            # Train
            train_loss, train_preds, train_labels = train_epoch(
                model, train_loader, criterion, optimizer, device
            )
            
            # Validate
            val_loss, val_preds, val_labels = validate(
                model, val_loader, criterion, device
            )
            
            # Print progress
            print(f"Epoch {epoch+1}/{num_epochs}")
            print(f"Train Loss: {train_loss:.4f}")
            print(f"Val Loss: {val_loss:.4f}")
            
            # Update learning rate
            scheduler.step(val_loss)
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), f'best_model_fold_{fold}.pth')
                
                # Calculate metrics
                val_report = classification_report(val_labels, val_preds, 
                                                output_dict=True)
                fold_best_metrics = {
                    'fold': fold,
                    'val_loss': val_loss,
                    'val_report': val_report,
                    'confusion_matrix': confusion_matrix(val_labels, val_preds)
                }
        
        # Save fold results
        fold_results.append(fold_best_metrics)
        
        # Plot confusion matrix
        plot_confusion_matrix(
            fold_best_metrics['confusion_matrix'],
            list(dataset.class_to_idx.keys()),
            fold
        )
    
    # Calculate and print final results
    print("\nFinal Cross-Validation Results:")
    class_metrics = {cls: {'precision': [], 'recall': [], 'f1-score': []}
                    for cls in dataset.class_to_idx.keys()}
    
    for fold_metric in fold_results:
        for cls in dataset.class_to_idx.keys():
            metrics = fold_metric['val_report'][str(dataset.class_to_idx[cls])]
            class_metrics[cls]['precision'].append(metrics['precision'])
            class_metrics[cls]['recall'].append(metrics['recall'])
            class_metrics[cls]['f1-score'].append(metrics['f1-score'])
    
    # Create final report
    final_report = {}
    for cls in class_metrics:
        final_report[cls] = {
            'precision': f"{np.mean(class_metrics[cls]['precision']):.3f} ± {np.std(class_metrics[cls]['precision']):.3f}",
            'recall': f"{np.mean(class_metrics[cls]['recall']):.3f} ± {np.std(class_metrics[cls]['recall']):.3f}",
            'f1-score': f"{np.mean(class_metrics[cls]['f1-score']):.3f} ± {np.std(class_metrics[cls]['f1-score']):.3f}"
        }
    
    # Save final report
    pd.DataFrame(final_report).transpose().to_csv('classification_report.csv')
    print("\nClass-wise Performance:")
    print(pd.DataFrame(final_report).transpose())

def main():
    # Configuration
    config = {
        'model_class': lambda: torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50', pretrained = True),
        'dataset_path': "vrwalking",
        'num_epochs': 5,
        'batch_size': 8,
        'num_folds': 2,
        'learning_rate': 0.001,
        'device': "cuda"
    }
    
    # Train model
    train_model(**config)

if __name__ == "__main__":
    main()


Training Fold 1/2


Using cache found in C:\Users\jyoti/.cache\torch\hub\facebookresearch_pytorchvideo_main
Training: 100%|██████████| 9/9 [06:14<00:00, 41.65s/it]
Validating: 100%|██████████| 9/9 [05:41<00:00, 37.92s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/5
Train Loss: 1.0583
Val Loss: 23.0019


Training: 100%|██████████| 9/9 [07:03<00:00, 47.04s/it]
Validating: 100%|██████████| 9/9 [06:39<00:00, 44.35s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Train Loss: 1.0276
Val Loss: 20.9693


Training: 100%|██████████| 9/9 [08:37<00:00, 57.51s/it]
Validating: 100%|██████████| 9/9 [08:23<00:00, 55.99s/it]


Epoch 3/5
Train Loss: 0.8452
Val Loss: 30.5809


Training: 100%|██████████| 9/9 [09:08<00:00, 60.96s/it]
Validating: 100%|██████████| 9/9 [08:32<00:00, 56.93s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4/5
Train Loss: 0.8123
Val Loss: 6.8510


Training: 100%|██████████| 9/9 [58:16<00:00, 388.45s/it]   
Validating: 100%|██████████| 9/9 [11:01<00:00, 73.54s/it]


Epoch 5/5
Train Loss: 0.7061
Val Loss: 6.5737


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Training Fold 2/2


Using cache found in C:\Users\jyoti/.cache\torch\hub\facebookresearch_pytorchvideo_main
Training: 100%|██████████| 9/9 [11:00<00:00, 73.44s/it]
Validating: 100%|██████████| 9/9 [11:43<00:00, 78.17s/it] 
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/5
Train Loss: 0.9967
Val Loss: 363.5230


Training: 100%|██████████| 9/9 [12:33<00:00, 83.69s/it]
Validating: 100%|██████████| 9/9 [12:02<00:00, 80.29s/it] 
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2/5
Train Loss: 1.0123
Val Loss: 146.8923


Training: 100%|██████████| 9/9 [12:27<00:00, 83.07s/it]
Validating: 100%|██████████| 9/9 [11:34<00:00, 77.19s/it] 
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 3/5
Train Loss: 0.8507
Val Loss: 20.5166


Training: 100%|██████████| 9/9 [12:30<00:00, 83.40s/it] 
Validating: 100%|██████████| 9/9 [11:25<00:00, 76.19s/it]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 4/5
Train Loss: 0.7823
Val Loss: 3.0489


Training: 100%|██████████| 9/9 [13:12<00:00, 88.07s/it] 
Validating: 100%|██████████| 9/9 [10:55<00:00, 72.86s/it]


Epoch 5/5
Train Loss: 0.7321
Val Loss: 3.2890

Final Cross-Validation Results:

Class-wise Performance:
       precision         recall       f1-score
0  0.550 ± 0.036  1.000 ± 0.000  0.709 ± 0.030
1  0.250 ± 0.250  0.031 ± 0.031  0.056 ± 0.056
2  0.000 ± 0.000  0.000 ± 0.000  0.000 ± 0.000
