In [1]:
import sys
import os

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src.*` imports below resolve. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate
# sys.path entry each time.
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.abspath(''), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

### X3D

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from src.dataset import EpicKitchens100Dataset
from torchvision.transforms import Compose, Lambda
from torchvision.transforms import Normalize, CenterCrop
from src.transforms import FixedSizeClipSampler, TransformKey

def scale_pixels(x):
    """Divide pixel values by 255.

    Args:
        x: Any value supporting true division by a float (presumably a frame
            tensor holding 0-255 intensities; confirm against the dataset).

    Returns:
        ``x / 255.0``.
    """
    max_pixel_value = 255.0
    return x / max_pixel_value

def permute_tensor(x):
    """Swap the first two axes of a 4-D tensor, leaving the last two in place.

    Presumably converts clips from (T, C, H, W) to (C, T, H, W); this matches
    the batched shape [2, 3, 32, 256, 256] the dataloader reports, but confirm
    against the clip sampler's output layout.

    Args:
        x: A 4-D tensor.

    Returns:
        A view of ``x`` with axes ordered (1, 0, 2, 3).
    """
    axis_order = (1, 0, 2, 3)
    return x.permute(*axis_order)

# Per-channel normalisation statistics and spatial crop used for the X3D clips.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256

# Frame preprocessing, applied in order: sample a 32-frame clip, scale pixels,
# normalise, centre-crop, then swap the first two axes. The pipeline is wrapped
# in TransformKey with the "frames" key (presumably so it only touches that
# entry of each sample; see src.transforms).
train_transform = TransformKey(
    "frames",
    Compose([
        FixedSizeClipSampler(num_frames=32),
        Lambda(scale_pixels),
        Normalize(mean, std),
        CenterCrop(crop_size),
        Lambda(permute_tensor),
    ]),
)

In [3]:
# Fetch the pretrained X3D-S network from the PyTorchVideo hub repository
# (torch.hub caches the repo locally after the first download).
model_name = 'x3d_s'
model = torch.hub.load(
    'facebookresearch/pytorchvideo',
    model_name,
    pretrained=True,
)
# Bare expression so the notebook renders the module tree for inspection.
model

Using cache found in C:\Users\Jani/.cache\torch\hub\facebookresearch_pytorchvideo_main


Net(
  (blocks): ModuleList(
    (0): ResNetBasicStem(
      (conv): Conv2plus1d(
        (conv_t): Conv3d(3, 24, kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), bias=False)
        (conv_xy): Conv3d(24, 24, kernel_size=(5, 1, 1), stride=(1, 1, 1), padding=(2, 0, 0), groups=24, bias=False)
      )
      (norm): BatchNorm3d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (activation): ReLU()
    )
    (1): ResStage(
      (res_blocks): ModuleList(
        (0): ResBlock(
          (branch1_conv): Conv3d(24, 24, kernel_size=(1, 1, 1), stride=(1, 2, 2), bias=False)
          (branch2): BottleneckBlock(
            (conv_a): Conv3d(24, 54, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
            (norm_a): BatchNorm3d(54, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (act_a): ReLU()
            (conv_b): Conv3d(54, 54, kernel_size=(3, 3, 3), stride=(1, 2, 2), padding=(1, 1, 1), groups=54, bias=False)
            (nor

In [4]:
# Inspect the projection layer of the model's last block: its in_features is
# the width fed to the classifier and its out_features is the number of
# classes the pretrained head predicts.
model.blocks[-1].proj

Linear(in_features=2048, out_features=400, bias=True)

In [6]:
from src.config import DevConfig

# Training dataset and loader, using the frame preprocessing defined above.
train_dataset = EpicKitchens100Dataset(
    DevConfig.ROOT_DIR,
    DevConfig.ANNOTATIONS_DIR_RELATIVE,
    transform=train_transform,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
)

# Sanity check: draw a single batch and display the shape of its frame tensor.
sample = next(iter(train_dataloader))
sample["frames"].shape

torch.Size([2, 3, 32, 256, 256])

In [None]:
# ------------------------------------------------------
# Currently hardcoded and temporary. To be changed later
# ------------------------------------------------------
num_classes = 88
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replace the pretrained classifier with a freshly initialised layer sized for
# our label set, keeping the feature width the backbone already produces.
head = model.blocks[-1]
head.proj = nn.Linear(in_features=head.proj.in_features, out_features=num_classes)

# nn.CrossEntropyLoss applies log-softmax internally and expects raw logits.
# NOTE(review): pytorchvideo's X3D builder appears to attach a Softmax
# activation to the head by default; confirm. If one is present, drop it so the
# loss is not computed on already-normalised probabilities. This also means
# later inference with this model returns logits, not probabilities.
if isinstance(getattr(head, "activation", None), nn.Softmax):
    head.activation = None

# Move the model to its device before the optimizer captures its parameters,
# and set training mode explicitly (the model contains batch-norm layers).
model.to(device)
model.train()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for batch_count, batch in enumerate(train_dataloader):
        if batch_count % 50 == 0:
            print(f"training on batch: {batch_count}")

        # `inputs` rather than `input`, to avoid shadowing the builtin.
        inputs = batch["frames"].to(device)
        labels = batch["verb_class"].to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Average over the batches actually processed. Unlike dividing by
    # len(train_dataloader), this cannot raise on an empty or length-less
    # loader; an epoch with no batches reports NaN.
    epoch_loss = running_loss / num_batches if num_batches else float("nan")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

### SlowFast

In [None]:
from src.transforms import PackPathway

# Load the pretrained SlowFast-R50 network from the PyTorchVideo hub and switch
# it to training mode (nn.Module.train() returns the module itself, so the call
# can be chained). Note that this rebinds `model`, replacing the X3D network.
model = torch.hub.load(
    "facebookresearch/pytorchvideo",
    "slowfast_r50",
    pretrained=True,
).train()

# NOTE(review): this repeats the scale_pixels definition from the X3D section
# of this notebook; consider keeping a single shared definition.
def scale_pixels(x):
    """Divide pixel values by 255.

    Args:
        x: Any value supporting true division by a float (presumably a frame
            tensor holding 0-255 intensities; confirm against the dataset).

    Returns:
        ``x / 255.0``.
    """
    max_pixel_value = 255.0
    return x / max_pixel_value

# NOTE(review): this repeats the permute_tensor definition from the X3D section
# of this notebook; consider keeping a single shared definition.
def permute_tensor(x):
    """Swap the first two axes of a 4-D tensor, leaving the last two in place.

    Presumably converts clips from (T, C, H, W) to (C, T, H, W); confirm
    against the clip sampler's output layout.

    Args:
        x: A 4-D tensor.

    Returns:
        A view of ``x`` with axes ordered (1, 0, 2, 3).
    """
    axis_order = (1, 0, 2, 3)
    return x.permute(*axis_order)

# Per-channel normalisation statistics and spatial crop used for SlowFast clips.
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256

# Same frame preprocessing as the X3D pipeline, with PackPathway appended as
# the final step (presumably splitting the clip into the per-pathway inputs
# SlowFast consumes; see src.transforms). Wrapped in TransformKey with the
# "frames" key.
train_transform = TransformKey(
    "frames",
    Compose([
        FixedSizeClipSampler(num_frames=32),
        Lambda(scale_pixels),
        Normalize(mean, std),
        CenterCrop(crop_size),
        Lambda(permute_tensor),
        PackPathway(),
    ]),
)

# Rebuild the dataset and loader so they use the SlowFast preprocessing.
train_dataset = EpicKitchens100Dataset(
    DevConfig.ROOT_DIR,
    DevConfig.ANNOTATIONS_DIR_RELATIVE,
    transform=train_transform,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    num_workers=0,
)

# ------------------------------------------------------
# Currently hardcoded and temporary. To be changed later
# ------------------------------------------------------
num_classes = 88
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Replace the pretrained classifier with a freshly initialised layer sized for
# our label set, keeping the feature width the backbone already produces.
head = model.blocks[-1]
head.proj = nn.Linear(in_features=head.proj.in_features, out_features=num_classes)

# Move the model to its device before the optimizer captures its parameters.
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for batch_count, batch in enumerate(train_dataloader):
        if batch_count % 50 == 0:
            print(f"training on batch: {batch_count}")

        # batch["frames"] is iterated as a collection of tensors (one per
        # pathway, presumably); each is moved to the device individually.
        inputs = [pathway.to(device) for pathway in batch["frames"]]
        labels = batch["verb_class"].to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Average over the batches actually processed. Unlike dividing by
    # len(train_dataloader), this cannot raise on an empty or length-less
    # loader; an epoch with no batches reports NaN.
    epoch_loss = running_loss / num_batches if num_batches else float("nan")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")