In [1]:
import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler
from torchinfo import summary
from tqdm import tqdm
from transformers.optimization import get_linear_schedule_with_warmup
import wandb

from dataset import ATMADataset
from models.timesformer_gru import TimesformerGRU

2025-01-08 09:21:06.397040: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Build the classifier: a Timesformer backbone loaded from the named pretrained
# checkpoint, combined with a 2-layer GRU (hidden size 128) and 2 output classes.
model = TimesformerGRU(
    pretrained_tsf="facebook/timesformer-base-finetuned-k400",
    gru_hidden_size=128,
    gru_layers=2,
    num_classes=2,
)

In [3]:
# Random stand-in data: a list of 8 arrays, each shaped (3, 224, 224).
# NOTE(review): `dummy_input` is not passed to `summary`, so the call below only
# lists layers and parameter counts.
dummy_input = [frame for frame in np.random.rand(8, 3, 224, 224)]
summary(model=model)

Layer (type:depth-idx)                                            Param #
TimesformerGRU                                                    --
├─TimesformerModel: 1-1                                           --
│    └─TimesformerEmbeddings: 2-1                                 158,208
│    │    └─TimesformerPatchEmbeddings: 3-1                       (590,592)
│    │    └─Dropout: 3-2                                          --
│    │    └─Dropout: 3-3                                          --
│    └─TimesformerEncoder: 2-2                                    --
│    │    └─ModuleList: 3-4                                       (120,508,416)
│    └─LayerNorm: 2-3                                             (1,536)
├─GRU: 1-2                                                        344,832
├─Dropout: 1-3                                                    --
├─Linear: 1-4                                                     258
Total params: 121,603,842
Trainable params: 345,090
Non-trainabl

In [4]:
class TrainingLoop:
    """Minimal supervised training loop for a classification model.

    Every batch is run through the model and scored with cross-entropy, followed
    by one optimizer step and one scheduler step. Once the final epoch has
    finished, the model's ``state_dict`` is written to ``save_path``.

    Args:
        model: ``torch.nn.Module`` that maps a batch of inputs to class logits.
        dataloader: Iterable yielding ``(inputs, labels)`` tensor pairs.
        optimizer: Optimizer that updates the model's parameters.
        scheduler: Learning-rate scheduler; it is stepped once per batch, so it
            must be configured for ``len(dataloader) * num_epochs`` steps.
        num_epochs: Number of passes over ``dataloader``.
        device: Device the model and every batch are moved to.
        save_path: Destination of the final weights. Defaults to
            ``"timesformer_gru.pth"`` (the previously hard-coded location).
    """

    def __init__(self, model, dataloader, optimizer, scheduler, num_epochs, device,
                 save_path="timesformer_gru.pth"):
        self.model = model
        self.train_dataloader = dataloader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.num_epochs = num_epochs
        self.device = device
        self.save_path = save_path

    def _grad_norm(self):
        """Return the global L2 norm of the gradients currently stored on the model.

        Parameters whose ``.grad`` is ``None`` are skipped. Returns ``0.0`` when
        no parameter holds a gradient.
        """
        squared_sum = 0.0
        for p in self.model.parameters():
            if p.grad is not None:
                # detach() replaces the legacy `.data` accessor; the norm is only
                # read for monitoring and must not be tracked by autograd.
                squared_sum += p.grad.detach().norm(2).item() ** 2
        return squared_sum ** 0.5

    def train(self):
        """Train for ``num_epochs`` epochs, then save the weights to ``save_path``.

        The running batch loss and gradient norm are shown in the progress bar.
        Weights are saved only once, after the last epoch completes.
        """
        self.model.to(self.device)
        self.model.train()
        for epoch in range(self.num_epochs):
            epoch_iterator = tqdm(self.train_dataloader, desc=f"Epoch {epoch + 1}/{self.num_epochs}")
            for inputs, labels in epoch_iterator:
                inputs, labels = inputs.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                outputs = self.model(inputs)

                loss = torch.nn.functional.cross_entropy(outputs, labels)

                loss.backward()
                # Measure the gradients produced by this batch; the optimizer step
                # does not modify them, so the value matches a post-step reading.
                grad_norm = self._grad_norm()
                self.optimizer.step()
                self.scheduler.step()

                batch_loss = loss.item()
                # wandb.log({"batch_loss": batch_loss, "grad_norm": grad_norm, "epoch": epoch})

                # Surface the gradient norm so the per-step computation is not discarded.
                epoch_iterator.set_postfix(loss=batch_loss, grad_norm=grad_norm)

        # Save model
        torch.save(self.model.state_dict(), self.save_path)

In [5]:
# Training hyperparameters used by the data loader, optimizer and scheduler cells.
num_epochs = 100     # passes over the training set
batch_size = 1       # samples per DataLoader batch
lr = 1e-4            # Adam learning rate
warm_up_steps = 0    # scheduler warm-up steps (0 = no warm-up)

In [6]:
# Training data, served in random order.
dataset = ATMADataset(vid_folder_path="./datasets/ATMA-V/videos/train/aug",
                    label_path="./datasets/ATMA-V/labels/labels.txt")

train_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=train_sampler, batch_size=batch_size)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# TrainingLoop steps the scheduler once per batch for `num_epochs` epochs, so the
# linear schedule has to span every batch of every epoch. Sizing it to a single
# epoch (len(train_dataloader)) drives the learning rate to 0 after epoch 1 and
# leaves all remaining epochs without any effective update.
total_training_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_up_steps,
                                            num_training_steps=total_training_steps)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

training_loop = TrainingLoop(model=model,
                             dataloader=train_dataloader,
                             optimizer=optimizer,
                             scheduler=scheduler,
                             num_epochs=num_epochs,
                             device=device)

In [7]:
# Profiling notes
# Input batch shape: torch.Size([batch, 30, 16, 3, 224, 224])
# Tensor loading time: 0.5-3 s
# Bottleneck: Timesformer feature extraction, ~19 s

In [8]:
# Run the full training loop. The model weights are written to disk only after the
# last epoch completes, so an interrupted run leaves no checkpoint behind.
# NOTE(review): the recorded run of this cell stopped on the first batch with a CUDA
# out-of-memory error (batch shape [30, 16, 3, 224, 224] on a 3.63 GiB GPU).
training_loop.train()

Epoch 1/100:   0%|          | 0/2572 [00:00<?, ?it/s]

Batch shape: torch.Size([30, 16, 3, 224, 224])


Epoch 1/100:   0%|          | 0/2572 [00:05<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 830.00 MiB. GPU 0 has a total capacity of 3.63 GiB of which 150.00 MiB is free. Including non-PyTorch memory, this process has 3.47 GiB memory in use. Of the allocated memory 2.62 GiB is allocated by PyTorch, and 605.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)