# X3D (eXpanded 3D) Implementation - Design Alternative 1

`Warning:` Do not run all the code, read the notes first before running them as they may reduce your device's memory when not careful, leading to a kernel crash or Out-Of-Memory (OOM) Error, sige ka magla-lag yang pc mo

`Note`: Isipin mo alternate universe na training toh

**Local System Specifications used to train the model:** <br>
| Component  | Device   |
| :------------- | :--------------- | 
| CPU   | AMD Ryzen 5 5600x     | 
| GPU   | Nvidia GeForce RTX 3070  8GB   | 
| RAM   | 32 GB     | 


`Note`: This notebook's codes will come from [PytorchVideo](https://pytorchvideo.org/docs/tutorial_overview) and from this Talha Anwar's [YouTube Playlist](https://www.youtube.com/playlist?list=PLtGXgNsNHqPRAscIi6dMUuPCfDQ1Vho2U)

#### Import Libraries

In [2]:
import sys
sys.path.append('pytorchvideo')
import torchvision.transforms.functional as F_t

sys.modules["torchvision.transforms.functional_tensor"] = F_t
import pytorchvideo
from pytorchvideo.data import *
import pytorch_lightning
import torchmetrics
import torch

import pandas as pd
import pathlib
import os
import numpy as np
import matplotlib.pyplot as plt

In [3]:
torch.cuda.is_available(), torch.cuda.get_device_name() # Check if GPU is available

(True, 'NVIDIA GeForce RTX 3070')

In [4]:
#pathlib.PosixPath = pathlib.WindowsPath # For windows only

#### Prepare the dataset

In [5]:
dataset_dir = pathlib.Path("./PD_Dataset")

In [6]:
def renameFile():
    for file_path in dataset_dir.rglob("*.avi"):
        # Get the parent directory name and clean it up
        dir_name_clean = file_path.parent.name.replace(" ", "_")
        # Clean the file name by replacing spaces with underscores
        file_clean = file_path.name.replace(" ", "_")
        
        # Create the new file name
        new_name = f"{dir_name_clean}_{file_clean}"
        new_path = file_path.parent / new_name
        
        print(f"Renaming: {file_path} -> {new_path}")
        file_path.rename(new_path)

In [7]:
#renameFile() #run once...

In [8]:
categories = ["BarbellCurl", "Deadlift", "LateralRaises", "OverheadPress", "Squat"]

video_labels = [
    [video for video in (dataset_dir / category).iterdir() if video.is_file()]
    for category in categories
]

# Flatten and create labels
label = [i for i, videos in enumerate(video_labels) for _ in videos]
combined = [video for sublist in video_labels for video in sublist]

print(label)
print(combined)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [9]:
df = pd.DataFrame(zip(combined, label), columns=["file", "label"])
df.head()

Unnamed: 0,file,label
0,PD_Dataset/BarbellCurl/BarbellCurl_barbell_cur...,0
1,PD_Dataset/BarbellCurl/BarbellCurl_barbell_cur...,0
2,PD_Dataset/BarbellCurl/BarbellCurl_barbell_cur...,0
3,PD_Dataset/BarbellCurl/BarbellCurl_barbell_cur...,0
4,PD_Dataset/BarbellCurl/BarbellCurl_barbell_cur...,0


In [10]:
df["label"].value_counts()

label
3    157
4    132
2    125
1    124
0    123
Name: count, dtype: int64

data is not balanced so... Remove tayo randomly :)

In [11]:
## Function to balance
def balance_dataset(df):
    # Find the minimum count among all classes
    min_count = df["label"].value_counts().min()
    
    # Downsample each class to match min_count
    balanced_df = (
        df.groupby("label").apply(lambda x: x.sample(n=min_count, random_state=43)).reset_index(drop=True)
    )
    
    return balanced_df

In [12]:
df = balance_dataset(df)
df["label"].value_counts() # Check for the number of data

  df.groupby("label").apply(lambda x: x.sample(n=min_count, random_state=43)).reset_index(drop=True)


label
0    123
1    123
2    123
3    123
4    123
Name: count, dtype: int64

#### Split the dataset

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

train_df, val_df = train_test_split(df, test_size=0.3, shuffle=True)
val_df, test_df = train_test_split(val_df, test_size=0.1, shuffle=False)

In [14]:
len(train_df), len(val_df), len(test_df)

(430, 166, 19)

In [15]:
train_df.to_csv("dataset20/train_df.csv", header=False , sep=" ", index=False) 
val_df.to_csv("dataset20/val_df.csv", header=False, sep=" ", index=False)
test_df.to_csv("dataset20/test_df.csv", header=False, sep=" ", index=False)

#### Prepare and augment the dataset for better generalization of the mudel 

In [16]:
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip
)

In [17]:
class KineticsDataModule(pytorch_lightning.LightningDataModule):

  # Dataset configuration
  _DATA_PATH = "dataset20"
  _CLIP_DURATION = 2  # Duration of sampled clip for each video
  _BATCH_SIZE = 6
  _NUM_WORKERS = 0  # Number of parallel processes fetching data

  train_transform = Compose(
            [
            ApplyTransformToKey(
              key="video",
              transform=Compose(
                  [
                    UniformTemporalSubsample(15),
                    Lambda(lambda x: x / 255.0),
                    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                    RandomShortSideScale(min_size=248, max_size=256),
                    RandomCrop(224),
                    RandomHorizontalFlip(p=0.5),
                  ]
                ),
              ),
            ]
        )
  
  video_transform = Compose(
            [
            ApplyTransformToKey(
              key="video",
              transform=Compose(
                  [
                    UniformTemporalSubsample(15),
                    Lambda(lambda x: x / 255.0),
                    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225))
                  ]
                ),
              ),
            ]
        )

  def train_dataloader(self):
    """
    Create the Kinetics train partition from the list of video labels
    in {self._DATA_PATH}/train
    """

    train_dataset = Kinetics(
        data_path=os.path.join(self._DATA_PATH, "train_df.csv"),
        clip_sampler=make_clip_sampler("random", self._CLIP_DURATION),
        transform=self.train_transform,
        decode_audio=False,
    )
    return torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )

  def val_dataloader(self):
    """
    Create the Kinetics validation partition from the list of video labels
    in {self._DATA_PATH}/val
    """
    val_dataset = Kinetics(
        data_path=os.path.join(self._DATA_PATH, "val_df.csv"),
        clip_sampler=make_clip_sampler("uniform", self._CLIP_DURATION),
        transform=self.video_transform,
        decode_audio=False,
    )
    return torch.utils.data.DataLoader(
        val_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )
  
  def test_dataloader(self):
    """
    Create the Kinetics testing partition from the list of video labels
    in {self._DATA_PATH}/test
    """
    test_dataset = Kinetics(
        data_path=os.path.join(self._DATA_PATH, "test_df.csv"),
        clip_sampler=make_clip_sampler("uniform", self._CLIP_DURATION),
        transform=self.video_transform,
        decode_audio=False,
    )
    return torch.utils.data.DataLoader(
        test_dataset,
        batch_size=self._BATCH_SIZE,
        num_workers=self._NUM_WORKERS,
    )

In [18]:
video_transform = Compose(
            [
            ApplyTransformToKey(
              key="video",
              transform=Compose(
                  [
                    UniformTemporalSubsample(15),
                    Lambda(lambda x: x / 255.0),
                    Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225))
                  ]
                ),
              ),
            ]
        )

test_dataset = Kinetics("dataset20/test_df.csv",
                                      clip_sampler=make_clip_sampler('uniform', 2),
                                      transform=video_transform,
                                      decode_audio=False
                                      )
loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=6,
    num_workers=0,
    pin_memory=True
)

batch = next(iter(loader))
print(batch.keys())
print(batch['video'].shape, batch['label'].shape)

dict_keys(['video', 'video_name', 'video_index', 'clip_index', 'aug_index', 'label'])
torch.Size([6, 3, 15, 240, 320]) torch.Size([6])


We can see that each clip in the dataset is in following dictionary format:
```
 {
     'video': <video_tensor>,     # Shape: (C, T, H, W)
     'audio': <audio_tensor>,     # Shape: (S)
     'label': <action_label>,     # Integer defining class annotation
     'video_name': <video_path>,  # Video file path stem
     'video_index': <video_id>,   # index of video used by sampler
     'clip_index': <clip_id>      # index of clip sampled within video
  } 
```

In [19]:
from pytorchvideo.models.x3d import create_x3d

def make_kinetics_x3d():
    return create_x3d(
        input_channel=3,
        depth_factor=2.2,
        model_num_class=len(categories),
        dropout_rate=0.5,
        norm=torch.nn.BatchNorm3d,
        activation=torch.nn.ReLU
    )

In [None]:
class VideoClassificationLightningModule(pytorch_lightning.LightningModule):
  def __init__(self):
      super().__init__()
      self.model = make_kinetics_x3d()
      self.mca = torchmetrics.Accuracy(task='multiclass', num_classes=len(categories))

  def forward(self, x):
      return self.model(x)

  def training_step(self, batch, batch_idx):
      # The model expects a video tensor of shape (B, C, T, H, W), which is the
      # format provided by the dataset
      y_hat = self.model(batch["video"])

      # Compute cross entropy loss, loss.backwards will be called behind the scenes
      # by PyTorchLightning after being returned from this method.
      loss = torch.nn.functional.cross_entropy(y_hat, batch["label"])

      # Log the train loss to Tensorboard
      self.log("train_loss", loss.item())
      
      return loss

  def validation_step(self, batch, batch_idx):
      y_hat = self.model(batch["video"])
      loss = torch.nn.functional.cross_entropy(y_hat, batch["label"])
      metric = self.mca(y_hat, batch['label'].to(torch.int64))
      self.log("val_loss", loss)
      return loss

  def configure_optimizers(self):
      """
      Setup the Adam optimizer. Note, that this function also can return a lr scheduler, which is
      usually useful for training video models.
      """
      return torch.optim.Adam(self.parameters(), lr=1e-4)

In [21]:
import pytorch_lightning.callbacks

checkpoints_callback = pytorch_lightning.callbacks.ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints',
    filename='file',
    save_last=True)

lr_monitor = pytorch_lightning.callbacks.LearningRateMonitor(logging_interval='epoch')

In [22]:
classification_module = VideoClassificationLightningModule()

data_module = KineticsDataModule()
trainer = pytorch_lightning.Trainer(
    max_epochs=50,
    accelerator='gpu',
    devices=-1,
    accumulate_grad_batches=2,
    enable_progress_bar=True,
    num_sanity_val_steps=0,
    callbacks=[lr_monitor, checkpoints_callback],
    limit_train_batches=36,
    limit_val_batches=12
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/joker/Documents/devProjects/myenv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [22]:
trainer.fit(classification_module, data_module)

You are using a CUDA device ('NVIDIA GeForce RTX 3070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/home/joker/Documents/devProjects/myenv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/joker/Documents/devProjects/Python/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type | Params | Mode 
---------------------------------------
0 | model | Net  | 3.0 M  | train
---------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
11.940    Total estimated model params size (MB)
447       Modules in train mode
0         Modules in eval mode
/home/joker/Documents/devProjec

Epoch 0: 100%|██████████| 36/36 [00:58<00:00,  0.61it/s, v_num=79]

/home/joker/Documents/devProjects/myenv/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 6. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 7:   0%|          | 0/36 [00:00<?, ?it/s, v_num=79]         

Could not find ref with POC 0


Epoch 14: 100%|██████████| 36/36 [00:57<00:00,  0.63it/s, v_num=79]

Could not find ref with POC 0
 (repeated 5 more times)
Could not find ref with POC 1024


Epoch 49: 100%|██████████| 36/36 [01:06<00:00,  0.54it/s, v_num=79]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 36/36 [01:06<00:00,  0.54it/s, v_num=79]


In [30]:
trainer.validate(datamodule=data_module)

Restoring states from the checkpoint path at /home/joker/Documents/devProjects/Python/checkpoints/file-v3.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at /home/joker/Documents/devProjects/Python/checkpoints/file-v3.ckpt
/home/joker/Documents/devProjects/myenv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Validation DataLoader 0: 100%|██████████| 12/12 [00:13<00:00,  0.86it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        val_loss            1.5119069814682007
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 1.5119069814682007}]