In [1]:
!pip install remotezip



In [2]:
import tqdm
import random
import pathlib
import itertools
import collections

import cv2
import einops
import numpy as np
import remotezip as rz
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn

import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader

# import tensorflow as tf
# import keras
# from keras import layers

In [68]:
def list_files_per_class(zip_url):
  """
    List the files in each class of the dataset given the zip URL.

    Args:
      zip_url: URL from which the files can be unzipped.

    Return:
      files: List of files in each of the classes.
  """
  files = []
  with rz.RemoteZip(zip_url) as zip:
    for zip_info in zip.infolist():
      files.append(zip_info.filename)
  return files

def get_class(fname):
  """
    Retrieve the name of the class given a filename.

    Args:
      fname: Name of the file in the UCF101 dataset.

    Return:
      Class that the file belongs to.
  """
  return fname.split('_')[-3]

def get_files_per_class(files):
  """
    Retrieve the files that belong to each class.

    Args:
      files: List of files in the dataset.

    Return:
      Dictionary of class names (key) and files (values).
  """
  files_for_class = collections.defaultdict(list)
  for fname in files:
    class_name = get_class(fname)
    files_for_class[class_name].append(fname)
  return files_for_class

def download_from_zip(zip_url, to_dir, file_names):
  """
    Download the contents of the zip file from the zip URL.

    Args:
      zip_url: Zip URL containing data.
      to_dir: Directory to download data to.
      file_names: Names of files to download.
  """
  with rz.RemoteZip(zip_url) as zip:
    for fn in tqdm.tqdm(file_names):
      class_name = get_class(fn)
      zip.extract(fn, str(to_dir / class_name))
      unzipped_file = to_dir / class_name / fn

      fn = pathlib.Path(fn).parts[-1]
      output_file = to_dir / class_name / fn
      unzipped_file.rename(output_file,)

def split_class_lists(files_for_class, count):
  """
    Returns the list of files belonging to a subset of data as well as the remainder of
    files that need to be downloaded.

    Args:
      files_for_class: Files belonging to a particular class of data.
      count: Number of files to download.

    Return:
      split_files: Files belonging to the subset of data.
      remainder: Dictionary of the remainder of files that need to be downloaded.
  """
  split_files = []
  remainder = {}
  for cls in files_for_class:
    split_files.extend(files_for_class[cls][:count])
    remainder[cls] = files_for_class[cls][count:]
  return split_files, remainder

def download_ufc_101_subset(zip_url, num_classes, splits, download_dir):
  """
    Download a subset of the UFC101 dataset and split them into various parts, such as
    training, validation, and test.

    Args:
      zip_url: Zip URL containing data.
      num_classes: Number of labels.
      splits: Dictionary specifying the training, validation, test, etc. (key) division of data
              (value is number of files per split).
      download_dir: Directory to download data to.

    Return:
      dir: Posix path of the resulting directories containing the splits of data.
  """
  files = list_files_per_class(zip_url)
  for f in files:
    tokens = f.split('/')
    if len(tokens) <= 2:
      files.remove(f) # Remove that item from the list if it does not have a filename

  files_for_class = get_files_per_class(files)

  classes = list(files_for_class.keys())[:num_classes]

  for cls in classes:
    new_files_for_class = files_for_class[cls]
    random.shuffle(new_files_for_class)
    files_for_class[cls] = new_files_for_class

  # Only use the number of classes you want in the dictionary
  files_for_class = {x: files_for_class[x] for x in list(files_for_class)[:num_classes]}

  dirs = {}
  for split_name, split_count in splits.items():
    print(split_name, ":")
    split_dir = download_dir / split_name
    split_files, files_for_class = split_class_lists(files_for_class, split_count)
    download_from_zip(zip_url, split_dir, split_files)
    dirs[split_name] = split_dir

  return dirs

# def format_frames(frame, output_size):
#   """
#     Pad and resize an image from a video.

#     Args:
#       frame: Image that needs to resized and padded.
#       output_size: Pixel size of the output frame image.

#     Return:
#       Formatted frame with padding of specified output size.
#   """
#   frame = tf.image.convert_image_dtype(frame, tf.float32)
#   frame = tf.image.resize_with_pad(frame, *output_size)
#   return frame

def format_frames(frame, output_size):
    """
    Pad and resize an image from a video.

    Args:
      frame: Image that needs to be resized and padded.
      output_size: Pixel size of the output frame image (height, width).

    Returns:
      Formatted frame with padding of specified output size.
    """
    frame = frame.astype(float) / 255.0
    frame = np.transpose(frame, (2, 0, 1))
    # Define the padding and resizing transform
    transform = T.Compose([
        T.Resize(output_size),  # Resize with padding to fit output size
        # T.Pad([max(0, output_size[1] - frame.shape[2]), max(0, output_size[0] - frame.shape[1])]),
    ])
    # Apply the transform
    frame = transform(torch.from_numpy(frame))

    return frame


def frames_from_video_file(video_path, n_frames, output_size = (224,224), frame_step = 15):
  """
    Creates frames from each video file present for each category.

    Args:
      video_path: File path to the video.
      n_frames: Number of frames to be created per video file.
      output_size: Pixel size of the output frame image.

    Return:
      An NumPy array of frames in the shape of (n_frames, height, width, channels).
  """
  # Read each video frame by frame
  result = []
  src = cv2.VideoCapture(str(video_path))

  video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)

  need_length = 1 + (n_frames - 1) * frame_step

  if need_length > video_length:
    start = 0
  else:
    max_start = video_length - need_length
    start = random.randint(0, max_start + 1)

  src.set(cv2.CAP_PROP_POS_FRAMES, start)
  # ret is a boolean indicating whether read was successful, frame is the image itself
  ret, frame = src.read()
  result.append(format_frames(frame, output_size))

  for _ in range(n_frames - 1):
    for _ in range(frame_step):
      ret, frame = src.read()
    if ret:
      frame = format_frames(frame, output_size)
      result.append(frame)
    else:
      result.append(np.zeros_like(result[0]))
  src.release()
  result = np.array(result)#[..., [2, 1, 0]]
  return np.transpose(result, (1, 2, 3, 0))

# class FrameGenerator:
#   def __init__(self, path, n_frames, training = False):
#     """ Returns a set of frames with their associated label.

#       Args:
#         path: Video file paths.
#         n_frames: Number of frames.
#         training: Boolean to determine if training dataset is being created.
#     """
#     self.path = path
#     self.n_frames = n_frames
#     self.training = training
#     self.class_names = sorted(set(p.name for p in self.path.iterdir() if p.is_dir()))
#     self.class_ids_for_name = dict((name, idx) for idx, name in enumerate(self.class_names))

#   def get_files_and_class_names(self):
#     video_paths = list(self.path.glob('*/*.avi'))
#     classes = [p.parent.name for p in video_paths]
#     return video_paths, classes

#   def __call__(self):
#     video_paths, classes = self.get_files_and_class_names()

#     pairs = list(zip(video_paths, classes))

#     if self.training:
#       random.shuffle(pairs)

#     for path, name in pairs:
#       video_frames = frames_from_video_file(path, self.n_frames)
#       label = self.class_ids_for_name[name] # Encode labels
#       yield video_frames, label


class FrameGeneratorDataset(Dataset):
    def __init__(self, path, n_frames, training=False):
        """
        Returns a set of frames with their associated label.

        Args:
          path: Directory path containing video files.
          n_frames: Number of frames to extract from each video.
          training: Boolean indicating if this is the training dataset (to enable shuffling).
        """
        self.path = path
        self.n_frames = n_frames
        self.training = training

        # Get all class names and assign a unique ID to each
        self.class_names = sorted(set(p.name for p in self.path.iterdir() if p.is_dir()))
        self.class_ids_for_name = {name: idx for idx, name in enumerate(self.class_names)}

        # Get video file paths and their corresponding class labels
        self.video_paths, self.classes = self.get_files_and_class_names()

        # Shuffle the pairs if in training mode
        if self.training:
            data = list(zip(self.video_paths, self.classes))
            random.shuffle(data)
            self.video_paths, self.classes = zip(*data)

    def get_files_and_class_names(self):
        video_paths = list(self.path.glob('*/*.avi'))
        classes = [p.parent.name for p in video_paths]
        return video_paths, classes

    def __len__(self):
        # Return the number of video files available
        return len(self.video_paths)

    def __getitem__(self, idx):
        """
        Retrieve frames and the label for a given video index.

        Args:
          idx: Index of the video to retrieve.

        Returns:
          A tuple (frames, label) where frames are the extracted frames from the video,
          and label is the encoded class label.
        """
        path = self.video_paths[idx]
        class_name = self.classes[idx]

        # Extract frames from the video file
        video_frames = frames_from_video_file(path, self.n_frames)

        # Get label by encoding the class name
        label = self.class_ids_for_name[class_name]

        # Convert frames and label to tensors
        video_frames = torch.tensor(video_frames, dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.int64)

        return video_frames, label


In [4]:
URL = 'https://storage.googleapis.com/thumos14_files/UCF101_videos.zip'
download_dir = pathlib.Path('./UCF101_subset/')
subset_paths = download_ufc_101_subset(URL,
                        num_classes = 10,
                        splits = {"train": 30, "val": 10, "test": 10},
                        download_dir = download_dir)

train :


100%|██████████| 300/300 [00:19<00:00, 15.13it/s]


val :


100%|██████████| 100/100 [00:06<00:00, 15.48it/s]


test :


100%|██████████| 100/100 [00:06<00:00, 15.42it/s]


In [98]:
arr = np.array([[[1, 2, 3],
                [4, 5, 6]],

                [[7, 8, 9],
                [10, 11, 12]]])
# arr = np.array([[[1, 2, 3]],

#                 [[7, 8, 9]]])
print(arr.shape)
arr = np.transpose(arr, (2, 0, 1))
print(arr)
print(arr.shape)

(2, 2, 3)
[[[ 1  4]
  [ 7 10]]

 [[ 2  5]
  [ 8 11]]

 [[ 3  6]
  [ 9 12]]]
(3, 2, 2)


In [None]:
# n_frames = 10
# batch_size = 8

# output_signature = (tf.TensorSpec(shape = (None, None, None, 3), dtype = tf.float32),
#                     tf.TensorSpec(shape = (), dtype = tf.int16))

# train_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['train'], n_frames, training=True),
#                                           output_signature = output_signature)


# # Batch the data
# train_ds = train_ds.batch(batch_size)

# val_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['val'], n_frames),
#                                         output_signature = output_signature)
# val_ds = val_ds.batch(batch_size)

# test_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['test'], n_frames),
#                                          output_signature = output_signature)

# test_ds = test_ds.batch(batch_size)

In [None]:
frame = frames_from_video_file("/content/UCF101_subset/test/ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi", 4, output_size = (2,2), frame_step = 15)
print(frame.shape)
# print(frame.shape)

(3, 4, 2, 2)


In [69]:
# Parameters
n_frames = 10
batch_size = 8

# Dataset paths
train_dataset = FrameGeneratorDataset(subset_paths['train'], n_frames, training=True)
val_dataset = FrameGeneratorDataset(subset_paths['val'], n_frames, training=False)
test_dataset = FrameGeneratorDataset(subset_paths['test'], n_frames, training=False)

# DataLoaders with batching
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [70]:
print(train_dataset.__getitem__(0)[0].shape)

torch.Size([3, 224, 224, 10])


In [None]:
frames, label = train_dataset.__getitem__(0)
print(frames.size())

torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([3, 240, 320])
torch.Size([3, 224, 224])
torch.Size([10, 3, 224, 224])


In [12]:
count = 0
for i in val_loader:
  count += 1
  print(i[0].size())
  if count == 2:
    break

torch.Size([8, 3, 10, 224, 224])
torch.Size([8, 3, 10, 224, 224])


In [None]:
# for i in test_loader:
#   print(i)
#   assert False

[tensor([[[[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]],

          [[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]],

          [[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]],

          ...,

          [[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]],

          [[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]],

          [[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.],
           ...,
           [0., 0., 0.],
           [0., 0., 0.],
  

AssertionError: 

In [7]:
# Define the dimensions of one frame in the set of frames created
HEIGHT = 224
WIDTH = 224

In [None]:
# class Conv2Plus1D(keras.layers.Layer):
#   def __init__(self, filters, kernel_size, padding):
#     """
#       A sequence of convolutional layers that first apply the convolution operation over the
#       spatial dimensions, and then the temporal dimension.
#     """
#     super().__init__()
#     self.seq = keras.Sequential([
#         # Spatial decomposition
#         layers.Conv3D(filters=filters,
#                       kernel_size=(1, kernel_size[1], kernel_size[2]),
#                       padding=padding),
#         # Temporal decomposition
#         layers.Conv3D(filters=filters,
#                       kernel_size=(kernel_size[0], 1, 1),
#                       padding=padding)
#         ])

#   def call(self, x):
#     return self.seq(x)

In [71]:
class Conv2Plus1D(nn.Module):
    def __init__(self, input_channels, filters, kernel_size, padding):
        """
        A sequence of convolutional layers that first apply the convolution operation over the
        spatial dimensions, and then the temporal dimension.

        Args:
          filters (int): Number of filters in each convolutional layer.
          kernel_size (tuple): Kernel size for the convolution in (depth, height, width).
          padding (str): Padding mode ('same' or 'valid').
        """
        super(Conv2Plus1D, self).__init__()

        # Define padding as per PyTorch's Conv3d requirements
        if padding == "same":
            padding_spatial = (0, kernel_size[1] // 2, kernel_size[2] // 2)
            padding_temporal = (kernel_size[0] // 2, 0, 0)
        elif padding == "valid":
            padding_spatial = (0, 0, 0)
            padding_temporal = (0, 0, 0)
        else:
            raise ValueError("Unsupported padding mode. Use 'same' or 'valid'.")

        # Sequential container for spatial and temporal convolution
        self.seq = nn.Sequential(
            # Spatial decomposition
            nn.Conv3d(in_channels=input_channels,
                      out_channels=filters,
                      kernel_size=(1, kernel_size[1], kernel_size[2]),
                      padding=padding_spatial),
            # Temporal decomposition
            nn.Conv3d(in_channels=filters,
                      out_channels=filters,
                      kernel_size=(kernel_size[0], 1, 1),
                      padding=padding_temporal)
        )

    def forward(self, x):
        return self.seq(x)


In [57]:
Conv2Plus1D(3, 16, (3, 3, 3), 'same')

Conv2Plus1D(
  (seq): Sequential(
    (0): Conv3d(3, 16, kernel_size=(1, 3, 3), stride=(1, 1, 1), padding=(0, 1, 1))
    (1): Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
  )
)

In [None]:
# class ResidualMain(keras.layers.Layer):
#   """
#     Residual block of the model with convolution, layer normalization, and the
#     activation function, ReLU.
#   """
#   def __init__(self, filters, kernel_size):
#     super().__init__()
#     self.seq = keras.Sequential([
#         Conv2Plus1D(filters=filters,
#                     kernel_size=kernel_size,
#                     padding='same'),
#         layers.LayerNormalization(),
#         layers.ReLU(),
#         Conv2Plus1D(filters=filters,
#                     kernel_size=kernel_size,
#                     padding='same'),
#         layers.LayerNormalization()
#     ])

#   def call(self, x):
#     return self.seq(x)

In [72]:
class ResidualMain(nn.Module):
    """
    Residual block of the model with convolution, layer normalization, and ReLU activation.
    """
    def __init__(self, input_channels, filters, kernel_size, height, width):
        super(ResidualMain, self).__init__()
        self.seq = nn.Sequential(
            Conv2Plus1D(input_channels=input_channels, filters=filters, kernel_size=kernel_size, padding='same'),
            nn.LayerNorm(normalized_shape=[filters, height, width, n_frames]),  # LayerNorm for channels-first
            nn.ReLU(),
            Conv2Plus1D(input_channels=filters, filters=filters, kernel_size=kernel_size, padding='same'),
            nn.LayerNorm(normalized_shape=[filters, height, width, n_frames])   # LayerNorm for channels-first
        )

    def forward(self, x):
        return self.seq(x)


In [None]:
N, C, H, W = 8, 3, 224, 224
input = torch.randn(N, C, H, W)
layer_norm = nn.LayerNorm([C, H, W])
output = layer_norm(input)
output.shape

torch.Size([8, 3, 224, 224])

In [None]:
# class Project(keras.layers.Layer):
#   """
#     Project certain dimensions of the tensor as the data is passed through different
#     sized filters and downsampled.
#   """
#   def __init__(self, units):
#     super().__init__()
#     self.seq = keras.Sequential([
#         layers.Dense(units),
#         layers.LayerNormalization()
#     ])

#   def call(self, x):
#     return self.seq(x)

In [86]:
import torch
import torch.nn as nn

class Project(nn.Module):
    """
    Project certain dimensions of the tensor as the data is passed through different
    sized filters and downsampled.
    """
    def __init__(self, units, height, width):
        super(Project, self).__init__()
        self.seq = nn.Sequential(
            nn.Linear(in_features=units[0], out_features=units[1]),  # Linear projection
            nn.LayerNorm(units[1])
            # nn.LayerNorm(normalized_shape=[units[1], height, width, n_frames])  # Apply LayerNorm after the linear layer
        )

    def forward(self, x):
        return self.seq(x)


In [None]:
# def add_residual_block(input, filters, kernel_size):
#   """
#     Add residual blocks to the model. If the last dimensions of the input data
#     and filter size does not match, project it such that last dimension matches.
#   """
#   out = ResidualMain(filters,
#                      kernel_size)(input)

#   res = input
#   # Using the Keras functional APIs, project the last dimension of the tensor to
#   # match the new filter size
#   if out.shape[-1] != input.shape[-1]:
#     res = Project(out.shape[-1])(res)

#   return layers.add([res, out])

In [87]:
def add_residual_block(x, filters, kernel_size):
    """
    Add residual blocks to the model. If the last dimension of the input data
    and filter size do not match, project it so the last dimension matches.

    Args:
      x (torch.Tensor): Input tensor.
      filters (int): Number of filters in the residual block.
      kernel_size (tuple): Kernel size for the Conv2Plus1D layers.

    Returns:
      torch.Tensor: Output tensor with residual connection.
    """
    # Pass input through the residual block
    out = ResidualMain(x.size(1), filters, kernel_size, x.size(2), x.size(3))(x)
    # Check if projection is needed
    res = x
    if out.shape[1] != x.shape[1]:  # Compare channel dimensions in PyTorch (C dimension)
        res = Project(units=(x.shape[1], out.shape[1]), height=x.size(2), width=x.size(3))(res.permute(0, 4, 2, 3, 1))
        res = res.permute(0, 4, 2, 3, 1)
        # Project channels to match out

    # Element-wise addition for the residual connection
    return torch.add(res, out)


In [None]:
# class ResizeVideo(keras.layers.Layer):
#   def __init__(self, height, width):
#     super().__init__()
#     self.height = height
#     self.width = width
#     self.resizing_layer = layers.Resizing(self.height, self.width)

#   def call(self, video):
#     """
#       Use the einops library to resize the tensor.

#       Args:
#         video: Tensor representation of the video, in the form of a set of frames.

#       Return:
#         A downsampled size of the video according to the new height and width it should be resized to.
#     """
#     # b stands for batch size, t stands for time, h stands for height,
#     # w stands for width, and c stands for the number of channels.
#     old_shape = einops.parse_shape(video, 'b t h w c')
#     images = einops.rearrange(video, 'b t h w c -> (b t) h w c')
#     images = self.resizing_layer(images)
#     videos = einops.rearrange(
#         images, '(b t) h w c -> b t h w c',
#         t = old_shape['t'])
#     return videos

In [100]:
output_tensor = einops.rearrange(arr, 't b c -> b c t')
output_tensor

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [75]:
import torch.nn.functional as F
import einops

class ResizeVideo(nn.Module):
    def __init__(self, height, width):
        super(ResizeVideo, self).__init__()
        self.height = height
        self.width = width

    def forward(self, video):
        """
        Resize the tensor using einops and PyTorch's interpolation.

        Args:
          video (torch.Tensor): Tensor representation of the video, as a batch of frames.

        Returns:
          torch.Tensor: Resized video tensor with the new height and width.
        """
        # Parse the shape and rearrange for resizing
        old_shape = einops.parse_shape(video, 'b c h w t')
        images = einops.rearrange(video, 'b c h w t -> b t h w c')
        images = einops.rearrange(images, 'b t h w c -> (b t) h w c')

        # Resize images using interpolate
        images = F.interpolate(images.permute(0, 3, 1, 2), size=(self.height, self.width), mode='bilinear')
        images = images.permute(0, 2, 3, 1)  # Convert back to (batch, height, width, channels)

        # Reshape back to video format
        videos = einops.rearrange(images, '(b t) h w c -> b t h w c', t=old_shape['t'])
        videos = einops.rearrange(videos, 'b t h w c -> b c h w t')
        return videos


In [88]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoModel(nn.Module):
    def __init__(self, height, width):
        super(VideoModel, self).__init__()
        self.height = height
        self.width = width

        # Initial Conv2Plus1D block
        self.initial_conv = nn.Sequential(
            Conv2Plus1D(input_channels=3, filters=16, kernel_size=(3, 7, 7), padding='same'),
            nn.BatchNorm3d(16),
            nn.ReLU(),
            ResizeVideo(height // 2, width // 2)
        )

        # Residual Blocks with Resizing
        self.block1 = ResizeVideo(height // 4, width // 4)

        self.block2 = ResizeVideo(height // 8, width // 8)

        self.block3 = ResizeVideo(height // 16, width // 16)

        # Pooling and Output Layers
        self.global_avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(128, 10)  # Adjust the input to `Linear` based on the final channel size

    def forward(self, x):
        # print("0", x.size())
        x = self.initial_conv(x)
        # print("1", x.size())
        # Residual Blocks
        x = add_residual_block(x, 16, (3, 3, 3))
        # print("2", x.size())
        x = self.block1(x)
        # print("3", x.size())
        x = add_residual_block(x, 32, (3, 3, 3))
        # print("4", x.size())
        x = self.block2(x)
        # print("5", x.size())
        x = add_residual_block(x, 64, (3, 3, 3))
        # print("6", x.size())
        x = self.block3(x)
        # print("7", x.size())
        x = add_residual_block(x, filters=128, kernel_size=(3, 3, 3))
        # print("8", x.size())

        # Global average pooling and flatten
        x = self.global_avg_pool(x)
        x = self.flatten(x)
        x = self.fc(x)

        return x

In [91]:
# Training function
def train_model(model, optimizer, criterion, train_loader, val_loader, epochs=50):
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
    best = 0
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for frames, labels in train_loader:
            frames, labels = frames.to(device), labels.to(device)

            optimizer.zero_grad()
            # print(epoch, frames.size())
            outputs = model(frames)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Validation phase
        val_loss, val_accuracy = evaluate_model(model, val_loader, criterion)

        train_loss = running_loss / len(train_loader)

        if val_accuracy > best:
              torch.save(model.state_dict(), "best.pt")
              best = val_accuracy
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['val_accuracy'].append(val_accuracy)

        print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    return history

def evaluate_model(model, val_loader, criterion):
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for frames, labels in val_loader:
            frames, labels = frames.to(device), labels.to(device)
            outputs = model(frames)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss = val_loss / len(val_loader)
    val_accuracy = correct / total
    return val_loss, val_accuracy


# Generate actual and predicted labels
def get_actual_predicted_labels(model, dataset_loader):
    model.eval()
    actual = []
    predicted = []
    with torch.no_grad():
        for frames, labels in dataset_loader:
            frames = frames.to(device)
            outputs = model(frames)
            _, preds = torch.max(outputs, 1)
            actual.extend(labels.cpu().numpy())
            predicted.extend(preds.cpu().numpy())
    return actual, predicted


In [92]:
import torch.optim as optim

# Set up the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VideoModel(height=HEIGHT, width=WIDTH).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [16]:
model

VideoModel(
  (initial_conv): Sequential(
    (0): Conv2Plus1D(
      (seq): Sequential(
        (0): Conv3d(3, 16, kernel_size=(1, 7, 7), stride=(1, 1, 1), padding=(0, 3, 3))
        (1): Conv3d(16, 16, kernel_size=(3, 1, 1), stride=(1, 1, 1), padding=(1, 0, 0))
      )
    )
    (1): BatchNorm3d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): ResizeVideo()
  )
  (block1): ResizeVideo()
  (block2): ResizeVideo()
  (block3): ResizeVideo()
  (global_avg_pool): AdaptiveAvgPool3d(output_size=(1, 1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc): Linear(in_features=128, out_features=10, bias=True)
)

In [None]:
# Training
history = train_model(model, optimizer, criterion, train_loader, val_loader)

0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 224, 224, 10])
0 torch.Size([8, 3, 

In [None]:
# Evaluation
test_metrics = evaluate_model(model, test_loader)

# Generate and plot confusion matrix for test set
actual, predicted = get_actual_predicted_labels(model, test_loader)

In [None]:
# Plot confusion matrix
def plot_confusion_matrix(actual, predicted, labels, ds_type):
    cm = confusion_matrix(actual, predicted)
    plt.figure(figsize=(12, 12))
    ax = sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
    ax.set_title(f'Confusion Matrix of Action Recognition for {ds_type}')
    ax.set_xlabel('Predicted Action')
    ax.set_ylabel('Actual Action')
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    plt.show()

In [None]:
plot_confusion_matrix(actual, predicted, labels=['label_1', 'label_2', '...'], ds_type="Test")

In [None]:
from sklearn.metrics import confusion_matrix