# 0. Important Imports

In [2]:
import torch
from tqdm import tqdm
from torchvision import transforms
from datasetclasses import MeanVideoFramesDataset  # Dataset created specifically to calculate normalization values

Helper functions

In [3]:

def calculate_mean_std(root, train_val_file, transform, only_normal):
    dataset = MeanVideoFramesDataset(root, train_val_file, transform, only_normal)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loader = torch.utils.data.DataLoader(dataset,batch_size=128,shuffle=False,num_workers=24,pin_memory=True)

    # Initialize accumulators
    mean = torch.zeros(dataset[0].size(0)).to(device)  # Auto-detect number of channels
    std = torch.zeros(dataset[0].size(0)).to(device)
    total_samples = 0

    for imgs in tqdm(loader):
        # imgs shape: [B, C, H, W]
        imgs = imgs.to(device)
        batch_samples = imgs.size(0)
        
        # Reshape to [B, C, H*W]
        imgs = imgs.view(batch_samples, imgs.size(1), -1)
        
        # Calculate mean and std per image, then sum across batch
        mean += imgs.mean(dim=2).sum(dim=0)  # Sum across batch -> [C]
        std += imgs.std(dim=2).sum(dim=0)    # Sum across batch -> [C]
        
        total_samples += batch_samples

        # Average across all samples
    mean /= total_samples
    std /= total_samples

    print("Mean:", mean.cpu().numpy())
    print("Std :", std.cpu().numpy())
    return mean, std

# 1. Metrics

### Approach 1: Use Gray Scale or RGB Frames of Size 204x204

In [None]:
root = "/home/public/mkamal/datasets/deep_learning/projdata/uploaded_data"
train_val_file = "train_val.txt"

conv_grayscale_tranform204x204 = transforms.Compose([
    transforms.Resize((204, 204)),
    transforms.ToTensor()
])

calculate_mean_std(root, train_val_file, conv_grayscale_tranform204x204, True)

# (1x204x204) Grayscale Normal Normalized on train_val.txt
conv_grayscale_tranform204x204 = transforms.Compose([
    transforms.Resize((204, 204)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize([0.43333843], [0.25460896])
])

# (3x204x204) Grayscale Normal Normalized on train_val.txt
conv_RGB_tranform204x204 = transforms.Compose([
    transforms.Resize((204, 204)),
    transforms.ToTensor(),
    transforms.Normalize([0.42010325, 0.43749467, 0.44676054], [0.249865 , 0.2580571 ,0.26895407])
])

# (1x227x227) Grayscale Normal Normalized on train_val.txt
conv_grayscale_tranform227x227 = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize([0.43332753], [0.25522682])
])

# (3x227x227) RGB Normal Normalized on train_val.txt
conv_RGB_tranform_227x227 = transforms.Compose([
    transforms.Resize((227, 227)), 
    transforms.ToTensor(),
    transforms.Normalize([0.4200925, 0.43748456, 0.4467506 ], [0.2505051, 0.25867647, 0.2695316 ])
])

100%|██████████| 2248/2248 [04:01<00:00,  9.30it/s]


Mean: [0.42010325 0.43749467 0.44676054]
Std : [0.249865   0.2580571  0.26895407]


TypeError: ToTensor.__init__() takes 1 positional argument but 3 were given

==== (1x227x227) normal images Train_val.txt Noramlization ==== <br>
Mean: [0.43332753] <br>
Std : [0.25522682] <br><br>

==== (3x227x227) normal images Train_val.txt Noramlization ==== <br>
Mean: [0.4200925, 0.43748456, 0.4467506 ] <br>
Std :  [0.2505051, 0.25867647, 0.2695316 ] <br>

## Aproach 2: ViViT  Video Vision Transformer