In [1]:
from pathlib import Path
from PIL import Image

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, random_split
from torchvision import models, transforms

In [2]:
# Check whether PyTorch can see a CUDA-capable GPU; the boolean is shown as the cell output.
torch.cuda.is_available()

False

In [4]:
# Shape of a single frame, used to build the example input for the model.
frame_shape = (3, 224, 224)  # (channels, height, width)

In [None]:
# Define the CNN-LSTM model
class CNNLSTM(nn.Module):
    """Per-frame CNN feature extractor followed by an LSTM sequence classifier.

    Each frame of a clip is passed through ``cnn_model`` (with its
    classification head replaced by ``nn.Identity``), projected to
    ``lstm_input_size`` features, and the resulting sequence is fed to a
    2-layer LSTM. The LSTM output at the last time step is mapped to class
    logits.

    Args:
        cnn_model: CNN backbone. Its ``classifier`` attribute is replaced in
            place; if the model has no ``classifier`` but has an ``fc``
            attribute, ``fc`` is replaced instead. NOTE: the passed-in module
            itself is modified.
        lstm_input_size: Size of the per-frame feature vector fed to the LSTM.
        lstm_hidden_size: Hidden size of the LSTM.
        num_classes: Number of output classes.

    Attributes:
        cnn_output_size: Feature size produced by the headless CNN, measured
            with a single 3x224x224 probe image.
    """

    def __init__(self, cnn_model, lstm_input_size, lstm_hidden_size, num_classes):
        super().__init__()

        # Use a pretrained CNN model
        self.cnn = cnn_model

        # Remove the classification head, keep the feature extractor.
        # Fall back to `fc` only when the model has no `classifier` attribute.
        has_classifier = hasattr(self.cnn, "classifier")
        head_name = "fc" if (hasattr(self.cnn, "fc") and not has_classifier) else "classifier"
        setattr(self.cnn, head_name, nn.Identity())

        # Dynamically compute CNN output size
        self.cnn_output_size = self._probe_cnn_output_size()

        # Layer to match CNN output size to LSTM input size
        self.cnn2lstm = nn.Linear(self.cnn_output_size, lstm_input_size)

        # The LSTM consumes the projected features, so its input size must be
        # `lstm_input_size` (not a fixed constant).
        self.lstm = nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=lstm_hidden_size,
            num_layers=2,
            batch_first=True,
        )

        # Fully connected layer for classification
        self.fc = nn.Linear(lstm_hidden_size, num_classes)

    def _probe_cnn_output_size(self):
        """Return the feature size of the headless CNN using one dummy frame."""
        # Create the probe on the same device/dtype as the backbone weights.
        first_param = next(self.cnn.parameters(), None)
        tensor_kwargs = {}
        if first_param is not None:
            tensor_kwargs = {"device": first_param.device, "dtype": first_param.dtype}
        probe = torch.zeros(1, 3, 224, 224, **tensor_kwargs)  # a single frame

        # Run in eval mode so the all-zeros probe does not update BatchNorm
        # running statistics (or trigger dropout); restore the mode afterwards.
        was_training = self.cnn.training
        self.cnn.eval()
        try:
            with torch.no_grad():
                return self.cnn(probe).shape[-1]
        finally:
            self.cnn.train(was_training)

    def forward(self, x):
        """Classify a batch of clips.

        Args:
            x: Tensor of shape (batch_size, seq_length, channels, height, width).

        Returns:
            Tensor of shape (batch_size, num_classes) with class logits.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                f"expected input of shape (batch, seq, channels, height, width), got {tuple(x.shape)}"
            )
        seq_length = x.size(1)

        # Extract CNN features frame by frame and project them to the LSTM input size
        cnn_features = [self.cnn2lstm(self.cnn(x[:, t])) for t in range(seq_length)]

        # Stack features along the sequence dimension
        cnn_features = torch.stack(cnn_features, dim=1)  # -> (batch_sz, seq_length, feature_sz)

        # Pass through LSTM
        lstm_out, _ = self.lstm(cnn_features)  # -> (batch_sz, seq_length, lstm_hidden_sz)

        # Use the last time step's output for classification
        return self.fc(lstm_out[:, -1, :])  # Shape: (batch_size, num_classes)

In [None]:
# Load a pretrained model

# DenseNet-201 backbone with pretrained weights.
# NOTE(review): newer torchvision releases appear to deprecate `pretrained=True` in favour
# of a `weights=` argument — confirm against the installed version before changing.
pretrained_model = models.densenet201(pretrained=True)
# pretrained_model = models.resnet34(pretrained=True)

Downloading: "https://download.pytorch.org/models/densenet201-c1103571.pth" to C:\Users\jai/.cache\torch\hub\checkpoints\densenet201-c1103571.pth
100%|██████████| 77.4M/77.4M [00:00<00:00, 114MB/s]


In [13]:
# Define the model with LSTM
model = CNNLSTM(pretrained_model, lstm_input_size=512, lstm_hidden_size=512, num_classes=4)

# Example input
batch_size = 8
seq_length = 15  # Number of frames (half a second at 30 FPS)
channels, height, width = frame_shape
dummy_input = torch.randn(batch_size, seq_length, channels, height, width)

# Smoke-test forward pass.
# Run it in eval mode and without autograd: the input is random noise, so in
# training mode it would update the backbone's BatchNorm running statistics,
# and keeping the autograd graph for batch_size * seq_length frames wastes memory.
was_training = model.training
model.eval()
with torch.no_grad():
    output = model(dummy_input)
model.train(was_training)  # restore whatever mode the model was in
print(output.shape)  # Expected output: (batch_size, num_classes)

torch.Size([8, 4])


In [8]:
# Alias the pretrained backbone (same object, not a copy) for inspection.
cnn = pretrained_model

In [9]:
# Display the backbone's module tree as the cell output.
cnn

DenseNet(
  (features): Sequential(
    (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu0): ReLU(inplace=True)
    (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): _DenseBlock(
      (denselayer1): _DenseLayer(
        (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu2): ReLU(inplace=True)
        (conv2): Conv2d(128, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (denselayer2): _DenseLayer(
        (norm1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu