In [9]:
! pip install pandas opencv-python numpy pillow



In [None]:
import pandas as pd
import cv2
import numpy as np
from torch.utils.data import Dataset, Subset, DataLoader
from IPython import display
import torch
import time
from PIL import Image

DEVICE = 'cuda'

class VideoDataset(Dataset):
    def __init__(self, annotations_file, vid_file, device):
        self.annotations = pd.read_csv(annotations_file)
        labels = self.annotations['label']
        self.label_map = {c: i for i, c in enumerate(set(labels))}
        self.labels = torch.tensor([self.label_map[s] for s in labels]).to(device)
        self.cap = cv2.VideoCapture(vid_file)
        self.device = device

    def __len__(self):
        return len(self.annotations)
    
    def get_clip(self, start, length):
        self.cap.set(cv2.CAP_PROP_POS_FRAMES, start)
        
        return np.expand_dims(np.array([self.cap.read()[1][:,:,0] for _ in range(length)]).astype(np.float32), 1)

    def __getitem__(self, idx):
        row = self.annotations.loc[idx]
        return torch.tensor(self.get_clip(row['offset'], row['length'])).to(self.device), self.labels[idx]

def show_frame(frame):
    frame.cpu()
    display.display(Image.fromarray(frame.numpy().astype(np.uint8)))

def play_video(vid: torch.Tensor):
    for frame in vid[:,0,:,:]:
        display.clear_output(wait=True)
        show_frame(frame)
        time.sleep(1/15)
    
def get_first_last(vid):
    frame_start = vid[0][0].cpu()
    frame_end = vid[-1][0].cpu()
    
    show_frame(frame_start)
    show_frame(frame_end)

def stats(i):
    get_first_last(ds[i][0])
    print(ds.annotations.loc[i])
        
ds = VideoDataset('labels.csv', 'wlasl_downsampled.mp4', DEVICE)

stats(0)



AttributeError: module 'av' has no attribute 'AVError'

In [134]:
import torch
from sklearn.model_selection import train_test_split
import torch.nn.utils.rnn as rnn_utils

TEST_SIZE = 0.8
BATCH_SIZE = 64
SEED = 10

train_i, test_i = train_test_split(
    range(len(ds)),
    stratify=ds.annotations['label'],
    test_size=TEST_SIZE,
    random_state=SEED
    )

torch.manual_seed(SEED)

train_split = Subset(ds, train_i)
test_split = Subset(ds, test_i)

def collate(batch):
    batch.sort(key=lambda x: x[0].size(0), reverse=True)
    videos, labels = zip(*batch)

    lengths = [video.size(0) for video in videos]

    padded_videos = rnn_utils.pad_sequence(videos)
    
    return padded_videos, lengths, torch.stack(labels)

train_batches = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
test_batches = DataLoader(test_split, batch_size=BATCH_SIZE, collate_fn=collate)


In [None]:
from torch import nn


class TransformerModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers_2d = nn.Sequential(
            nn.Flatten(0, 1),
            nn.MaxPool2d(4),
            nn.BatchNorm2d(1),
            nn.Conv2d(1, 1, 5),
            )
        
        self.layers_1d = nn.Sequential(
            nn.Flatten(),
            nn.Linear(1680, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),
        )

        self.rnn = nn.GRU(100, 100)

        self.output = nn.Sequential(
            nn.BatchNorm1d(100),
            nn.Linear(100, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),
            nn.Linear(100, 2000),
            nn.Softmax(dim=1)
        )


    def forward(self, inp, lengths):
        x = inp
        frames, batch, channels, h, w = x.shape
        x = self.layers_2d(x)
        x = self.layers_1d(x)
        x = x.view(frames, batch, -1)
        x = rnn_utils.pack_padded_sequence(x, lengths, enforce_sorted=False)
        x, h = self.rnn(x)
        x = h[0, :, :]
        output = self.output(x)
        
        return output

model = TransformerModule()

def print_hook(module, args, output):
    inp = args[0]
    if isinstance(inp, torch.Tensor):
        inp = inp.shape
    elif isinstance(inp, rnn_utils.PackedSequence) :
        inp = inp.data.shape
    
    if isinstance(output, tuple):
        output = output[0]
    outp = output.shape
    name = type(module).__name__
    print(f'{name}: {inp}, {outp}')

handles = []

for child in model.children():
    if child.children():
        for child_ in child.children():
            handles.append(child_.register_forward_hook(print_hook))
    else:
        handles.append(child.register_forward_hook(print_hook))

sample = rnn_utils.pad_sequence(torch.rand([32, 110, 1, 136, 242]))
lengths = torch.randint(1, 100, (32,))

result = model(sample, lengths)

print(result.shape)

for handle in handles:
    handle.remove()

Flatten: torch.Size([110, 32, 1, 136, 242]), torch.Size([3520, 1, 136, 242])
MaxPool2d: torch.Size([3520, 1, 136, 242]), torch.Size([3520, 1, 34, 60])
BatchNorm2d: torch.Size([3520, 1, 34, 60]), torch.Size([3520, 1, 34, 60])
Conv2d: torch.Size([3520, 1, 34, 60]), torch.Size([3520, 1, 30, 56])
Flatten: torch.Size([3520, 1, 30, 56]), torch.Size([3520, 1680])
Linear: torch.Size([3520, 1680]), torch.Size([3520, 100])
ReLU: torch.Size([3520, 100]), torch.Size([3520, 100])
BatchNorm1d: torch.Size([3520, 100]), torch.Size([3520, 100])
BatchNorm1d: torch.Size([32, 100]), torch.Size([32, 100])
Linear: torch.Size([32, 100]), torch.Size([32, 100])
ReLU: torch.Size([32, 100]), torch.Size([32, 100])
BatchNorm1d: torch.Size([32, 100]), torch.Size([32, 100])
Linear: torch.Size([32, 100]), torch.Size([32, 100])
ReLU: torch.Size([32, 100]), torch.Size([32, 100])
BatchNorm1d: torch.Size([32, 100]), torch.Size([32, 100])
Linear: torch.Size([32, 100]), torch.Size([32, 2000])
Softmax: torch.Size([32, 2000]

In [136]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch import profiler

criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

model.to(DEVICE)


num_epochs = 1

with profiler.profile(
    activities=[
        profiler.ProfilerActivity.CPU,
        profiler.ProfilerActivity.CUDA,
    ]
) as prof:
    with profiler.record_function("model_forward"):
        model.train()
        for epoch in range(num_epochs):
            running_loss = 0.0
            for inputs, lengths, annotations in tqdm(train_batches, smoothing=0.8):
                optimizer.zero_grad()
                outputs = model(inputs, lengths)
                loss = criterion(outputs, annotations)
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_batches)}")

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


# model.eval()
# correct = 0
# total = 0
# with torch.no_grad():
#     for inputs, lengths, annotations in tqdm(test_batches, smoothing=0.8):
#         outputs = model(inputs, lengths)
#         _, predicted = torch.max(outputs.data, 1)
#         total += annotations.size(0)
#         correct += (predicted == annotations).sum().item()

# print(f"Accuracy: {100 * correct / total}%")

100%|██████████| 37/37 [00:19<00:00,  1.86it/s]


Epoch 1/1, Loss: 7.600675711760649
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          model_forward         4.60%     952.701ms        96.20%       19.934s       19.934s        1.515s         7.10%       19.932s       19.932s             1  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        71.21%       14.754s        84.26%       17.458s     459.419ms       14.645s        68.66%       17.458s     