In [23]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from torchvision.io import read_video
from torchvision import transforms
from torchvision.models import resnet50


In [24]:
# Dataset locations and the clip under study.
# The Windows paths are the original defaults; set EPIC_VIDEOS_ROOT and/or
# EPIC_ANNOT_CSV in the environment to run the notebook on another machine
# without editing this cell.
VIDEOS_ROOT = os.environ.get(
    "EPIC_VIDEOS_ROOT",
    r"C:\Users\User\OneDrive\RD_Project\RD_Project",
)
ANNOT_CSV = os.environ.get(
    "EPIC_ANNOT_CSV",
    r"C:\Users\User\OneDrive\RD_Project\RD_Project\EPIC_100_04.csv",
)
PARTICIPANT = "P01"     # matched against the CSV's participant_id column
VIDEO_ID    = "P01_04"  # matched against video_id; also the video file stem


In [25]:
# Read every annotation row, then keep only the rows that belong to the
# participant / video chosen in the configuration cell.
df = pd.read_csv(ANNOT_CSV)

is_target_clip = df["participant_id"].eq(PARTICIPANT) & df["video_id"].eq(VIDEO_ID)
df_video = df.loc[is_target_clip].reset_index(drop=True)

# Cell output: a preview of the filtered rows and how many there are.
df_video.head(), len(df_video)


(  narration_id participant_id video_id narration_timestamp start_timestamp  \
 0     P01_04_0            P01   P01_04        00:00:00.780     00:00:00.11   
 1     P01_04_1            P01   P01_04        00:00:03.480     00:00:02.87   
 2    P01_04_10            P01   P01_04        00:00:31.750     00:00:30.75   
 3    P01_04_11            P01   P01_04        00:00:37.410     00:00:37.33   
 4    P01_04_12            P01   P01_04        00:00:41.899     00:00:41.36   
 
   stop_timestamp  start_frame  stop_frame               narration       verb  \
 0    00:00:03.04            6         182                take cup       take   
 1    00:00:05.10          172         306            put down cup   put-down   
 2    00:00:35.04         1845        2102         fold tablecloth       fold   
 3    00:00:38.81         2239        2328    pull down tablecloth  pull-down   
 4    00:00:42.13         2481        2527  take washing up liquid       take   
 
    verb_class               noun  n

In [26]:
IMAGE_SIZE = 224  # side length of the square centre crop
T_FRAMES = 8  # temporal samples per clip

# Per-frame preprocessing, applied in order: resize, centre-crop to
# IMAGE_SIZE, convert to float32, then normalise each channel.
# NOTE(review): mean/std look like the usual ImageNet channel statistics,
# which would match the ImageNet-pretrained backbone used later - confirm.
_preprocess_steps = [
    transforms.Resize(256),
    transforms.CenterCrop(IMAGE_SIZE),
    transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
frame_transform = transforms.Compose(_preprocess_steps)


In [27]:
# Sanity check: is the clip where the dataset class will look for it?
print("VIDEO_ID:", VIDEO_ID)
print("VIDEOS_ROOT:", VIDEOS_ROOT)

# NOTE(review): the saved directory listing for this cell shows the file as
# 'P01_04.mp4'; the upper-case '.MP4' used here only resolves to it on a
# case-insensitive filesystem - confirm before running on another OS.
video_path = os.path.join(VIDEOS_ROOT, f"{VIDEO_ID}.MP4")  # same as before
print("video_path:", video_path)

video_exists = os.path.exists(video_path)
print("Exists:", video_exists)

if video_exists:
    print("Size (bytes):", os.path.getsize(video_path))

# List the folder whether or not the file was found: the listing is most
# useful when the path is wrong and the real file name must be looked up.
# Guarded so a missing root directory does not raise.
if os.path.isdir(VIDEOS_ROOT):
    print("Dir listing:")
    print(os.listdir(VIDEOS_ROOT))


VIDEO_ID: P01_04
VIDEOS_ROOT: C:\Users\User\OneDrive\RD_Project\RD_Project
video_path: C:\Users\User\OneDrive\RD_Project\RD_Project\P01_04.MP4
Exists: True
Size (bytes): 11099311
Dir listing:
['.git', '.gitignore', 'EPIC_100_04.csv', 'EPIC_100_train.csv', 'EPIC_100_validation.csv', 'notebooks', 'P01_04.mp4', 'README.md', 'src']


In [28]:
class EpicSingleVideoDataset(Dataset):
    """Annotated action segments of one video, served as fixed-length clips.

    The whole video is decoded once in the constructor and kept in memory;
    each item is ``t_frames`` frames picked from the annotated
    ``start_frame``..``stop_frame`` range, together with the row's
    ``verb_class`` and ``noun_class`` as Python ints.

    Args:
        video_path: path handed to ``read_video``.
        annotations_df: DataFrame with one row per segment; must provide the
            ``start_frame``, ``stop_frame``, ``verb_class`` and ``noun_class``
            columns.
        t_frames: number of frames returned per segment.
        transform: optional callable applied to every selected frame
            individually.
    """

    def __init__(self, video_path, annotations_df, t_frames=8, transform=None):
        self.annotations = annotations_df.reset_index(drop=True)
        self.t_frames = t_frames
        self.transform = transform

        # Decode every frame up front; only the frame tensor is kept.
        # read_video returns frames channels-last: (num_frames, H, W, 3) uint8.
        decoded, _, _ = read_video(video_path, pts_unit="sec")
        self.video = decoded.permute(0, 3, 1, 2)  # -> (num_frames, 3, H, W)
        self.num_frames = self.video.shape[0]

        # Handy for eyeballing the frame count against the annotations.
        print("Loaded video with", self.num_frames, "frames")

    def __len__(self):
        """Number of annotated segments."""
        return len(self.annotations)

    def _sample_indices(self, start_f, end_f):
        """Pick ``t_frames`` frame indices evenly spread over [start_f, end_f].

        Segments with fewer than ``t_frames`` frames contribute every frame
        once and are then padded by repeating the final index. The result is
        clamped to the decoded video's valid range and returned as a long
        tensor.
        """
        first, last = int(start_f), int(end_f)
        wanted = self.t_frames
        span = max(last - first + 1, 1)

        # Never ask linspace for more points than the segment has frames.
        steps = wanted if span >= wanted else span
        picks = torch.linspace(first, last, steps=steps)

        if span < wanted:
            # Short segment: top up by repeating its last frame.
            filler = picks[-1].repeat(wanted - span)
            picks = torch.cat([picks, filler])

        return picks.clamp(0, self.num_frames - 1).long()

    def __getitem__(self, idx):
        """Return ``(frames, verb_label, noun_label)`` for segment ``idx``.

        ``frames`` has shape (T, 3, H, W) before any transform is applied.
        """
        record = self.annotations.iloc[idx]

        seg_start = record["start_frame"]
        seg_stop = record["stop_frame"]
        verb_label = int(record["verb_class"])
        noun_label = int(record["noun_class"])

        clip = self.video[self._sample_indices(seg_start, seg_stop)]  # (T, 3, H, W)

        if self.transform is not None:
            # The transform sees one frame at a time; results are re-stacked.
            clip = torch.stack([self.transform(frame) for frame in clip], dim=0)

        return clip, verb_label, noun_label


In [29]:
# Build the dataset for the selected clip from the filtered annotation rows.
video_path = os.path.join(VIDEOS_ROOT, VIDEO_ID + ".MP4")  # or .avi
dataset = EpicSingleVideoDataset(
    video_path,
    df_video,
    t_frames=T_FRAMES,
    transform=frame_transform,
)

# Cell output: number of annotated segments.
len(dataset)




Loaded video with 6308 frames


32

In [32]:
# Smoke test: dataset size, then the shape and labels of the first sample.
print(f"len(dataset): {len(dataset)}")

frames, verb, noun = dataset[0]
print(f"frames shape: {frames.shape}")  # expect (T, 3, H, W)
print(f"verb: {verb} noun: {noun}")


len(dataset): 32
frames shape: torch.Size([8, 3, 224, 224])
verb: 0 noun: 13


In [31]:
# Pull a single shuffled mini-batch to confirm collation works.
# num_workers=0 loads in the main process; a num_workers=2 variant was tried
# earlier and left disabled.
loader = DataLoader(dataset, num_workers=0, shuffle=True, batch_size=2)
batch_frames, batch_verbs, batch_nouns = next(iter(loader))

# Expected shapes, in order: (B, T, 3, H, W), (B,), (B,)
for batch_tensor in (batch_frames, batch_verbs, batch_nouns):
    print(batch_tensor.shape)


torch.Size([2, 8, 3, 224, 224])
torch.Size([2])
torch.Size([2])


In [33]:
class Epic2DResNet(nn.Module):
    """Frame-wise ResNet-50 with temporal average pooling and two heads.

    Every frame of a clip goes through the same 2D backbone; the per-frame
    feature vectors are averaged over time and fed to separate linear
    classifiers for verbs and nouns.

    Args:
        num_verbs: output size of the verb head.
        num_nouns: output size of the noun head.
        weights: forwarded to ``resnet50(weights=...)``. Defaults to the
            pretrained ``"IMAGENET1K_V2"`` weights; pass ``None`` to build
            the backbone without loading (and downloading) pretrained weights.
    """

    def __init__(self, num_verbs, num_nouns, weights="IMAGENET1K_V2"):
        super().__init__()
        base = resnet50(weights=weights)

        # Keep everything except the final fc layer, i.e. up to global avg pool.
        self.backbone = nn.Sequential(*list(base.children())[:-1])
        self.feat_dim = base.fc.in_features  # 2048 for resnet50

        self.fc_verb = nn.Linear(self.feat_dim, num_verbs)
        self.fc_noun = nn.Linear(self.feat_dim, num_nouns)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: tensor of shape (B, T, C, H, W).

        Returns:
            ``(verb_logits, noun_logits)`` with shapes (B, num_verbs) and
            (B, num_nouns).

        Raises:
            ValueError: if ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected input of shape (B, T, C, H, W), got {tuple(x.shape)}"
            )
        B, T, C, H, W = x.shape

        # Fold time into the batch axis so the 2D backbone sees single frames.
        # reshape (not view) so non-contiguous clips, e.g. the result of a
        # permute/transpose, are accepted as well.
        x = x.reshape(B * T, C, H, W)     # (B*T, C, H, W)

        feats = self.backbone(x)          # (B*T, feat_dim, 1, 1)
        feats = feats.reshape(B, T, -1)   # (B, T, feat_dim)

        vid_feat = feats.mean(dim=1)      # temporal average -> (B, feat_dim)

        verb_logits = self.fc_verb(vid_feat)
        noun_logits = self.fc_noun(vid_feat)
        return verb_logits, noun_logits


In [34]:
# Head sizes: largest class id present in the loaded annotations, plus one.
num_verbs, num_nouns = (
    int(df[label_col].max()) + 1 for label_col in ("verb_class", "noun_class")
)

# Prefer the GPU when one is visible to torch.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Epic2DResNet(num_verbs, num_nouns)
model.to(device)


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to C:\Users\User/.cache\torch\hub\checkpoints\resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:07<00:00, 13.5MB/s]


Epic2DResNet(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2

In [35]:
# Shape check only: run the forward pass under no_grad so no autograd graph
# is built and kept alive by the logits left in the notebook namespace.
batch_frames = batch_frames.to(device)
with torch.no_grad():
    verb_logits, noun_logits = model(batch_frames)

print(verb_logits.shape)  # (B, num_verbs)
print(noun_logits.shape)  # (B, num_nouns)


torch.Size([2, 61])
torch.Size([2, 64])


In [36]:
# One cross-entropy criterion is shared by the verb and the noun head.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), weight_decay=1e-4, lr=1e-4)

# Small training set for a quick test run: the first (at most) 20 segments.
subset_size = min(20, len(dataset))
subset_indices = [*range(subset_size)]
subset = torch.utils.data.Subset(dataset, subset_indices)
train_loader = DataLoader(subset, shuffle=True, batch_size=4)


In [37]:
for epoch in range(5):  # small number for test
    model.train()
    running_loss = 0.0

    for frames, v, n in train_loader:
        # Clip batch (B,T,3,H,W) and both label vectors go to the model's device.
        frames, v, n = frames.to(device), v.to(device), n.to(device)

        optimizer.zero_grad()
        verb_logits, noun_logits = model(frames)

        # Joint objective: unweighted sum of the two classification losses.
        loss_verb, loss_noun = criterion(verb_logits, v), criterion(noun_logits, n)
        loss = loss_verb + loss_noun

        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is a
        # per-sample average even when the final batch is smaller.
        running_loss += frames.size(0) * loss.item()

    avg_loss = running_loss / subset_size
    print(f"Epoch {epoch+1}: loss = {avg_loss:.4f}")


Epoch 1: loss = 8.1432
Epoch 2: loss = 7.2685
Epoch 3: loss = 6.2510
Epoch 4: loss = 5.1479
Epoch 5: loss = 4.1542


In [38]:
# Spot check on sample 0. The training subset starts at index 0, so this
# sample was seen during training.
model.eval()
with torch.no_grad():
    frames, v, n = dataset[0]
    frames = frames[None].to(device)  # add batch axis -> (1,T,3,H,W)
    verb_logits, noun_logits = model(frames)
    verb_pred, noun_pred = (
        logits.argmax(dim=1).item() for logits in (verb_logits, noun_logits)
    )

print(f"True verb: {v} Pred verb: {verb_pred}")
print(f"True noun: {n} Pred noun: {noun_pred}")


True verb: 0 Pred verb: 0
True noun: 13 Pred noun: 13


In [39]:
class EpicDataset(Dataset):
    """Annotated action segments drawn from several videos.

    Each annotation row names its video through ``participant_id`` and
    ``video_id``; the video is expected at
    ``<root_videos>/<participant_id>/<video_id>.MP4``. An item is
    ``t_frames`` frames sampled from the row's ``start_frame``..``stop_frame``
    range plus the row's ``verb_class`` and ``noun_class`` as Python ints.

    Only the most recently used video is kept decoded in memory, so iterating
    rows grouped by video decodes each file once, while random access across
    videos re-decodes on every switch.

    Args:
        root_videos: directory containing one sub-folder per participant.
        annotations_df: DataFrame with ``participant_id``, ``video_id``,
            ``start_frame``, ``stop_frame``, ``verb_class`` and ``noun_class``
            columns.
        t_frames: number of frames returned per segment.
        transform: optional callable applied to every selected frame
            individually.
    """

    def __init__(self, root_videos, annotations_df, t_frames=8, transform=None):
        self.annotations = annotations_df.reset_index(drop=True)
        self.t_frames = t_frames
        self.transform = transform
        self.root_videos = root_videos

        # Single-entry cache: (participant, video_id) of the decoded video.
        self._cached_key = None
        self._cached_video = None

    def __len__(self):
        """Number of annotated segments."""
        return len(self.annotations)

    def _load_video(self, participant, video_id):
        """Decode one whole video and return it as (num_frames, 3, H, W)."""
        path = os.path.join(self.root_videos, participant, f"{video_id}.MP4")
        video, _, _ = read_video(path, pts_unit="sec")
        return video.permute(0, 3, 1, 2)  # (num_frames,3,H,W)

    def _get_video(self, participant, video_id):
        """Return the decoded video, reusing the cached one when it matches."""
        key = (participant, video_id)
        if self._cached_key != key:
            self._cached_video = self._load_video(participant, video_id)
            self._cached_key = key
        return self._cached_video

    def _sample_indices(self, start_f, end_f, num_frames):
        """Pick ``t_frames`` indices evenly spread over [start_f, end_f].

        Segments shorter than ``t_frames`` contribute every frame once and
        are padded by repeating the final index. Indices are clamped to
        ``[0, num_frames - 1]`` and returned as a long tensor.
        """
        start_f = int(start_f)
        end_f = int(end_f)
        length = max(end_f - start_f + 1, 1)
        t = self.t_frames

        if length >= t:
            indices = torch.linspace(start_f, end_f, steps=t)
        else:
            indices = torch.linspace(start_f, end_f, steps=length)
            indices = torch.cat([indices, indices[-1].repeat(t - length)])

        return indices.clamp(0, num_frames - 1).long()

    def __getitem__(self, idx):
        """Return ``(frames, verb_label, noun_label)`` for segment ``idx``.

        Raises:
            RuntimeError: if the referenced video decodes to zero frames.
        """
        row = self.annotations.iloc[idx]

        video = self._get_video(row["participant_id"], row["video_id"])
        num_frames = video.shape[0]
        if num_frames == 0:
            raise RuntimeError(
                f"no frames decoded for video {row['video_id']!r}"
            )

        frame_indices = self._sample_indices(
            row["start_frame"], row["stop_frame"], num_frames
        )
        frames = video[frame_indices]  # (T, 3, H, W)

        if self.transform is not None:
            # The transform sees one frame at a time; results are re-stacked.
            frames = torch.stack([self.transform(f) for f in frames], dim=0)

        return frames, int(row["verb_class"]), int(row["noun_class"])


In [40]:
# Class counts: largest class id present in the loaded annotations, plus one.
num_verbs, num_nouns = (
    int(df[label_col].max()) + 1 for label_col in ("verb_class", "noun_class")
)
