in this instead of using 3d cnn on the video i used only 2d cnn on a single frame of the video
which lead to similar mae as that of linear regression

In [None]:
import os
import cv2
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torch import nn, optim
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
# Custom Dataset to load videos
class VideoDataset(Dataset):
    def __init__(self, folder_path, labels_csv, transform=None):
        self.folder_path = folder_path
        self.labels = pd.read_csv(labels_csv)
        self.transform = transform
        
        # Convert video_summary to tuple of (x_summary, y_summary)
        self.labels['video_summary'] = self.labels['video_summary'].apply(eval)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        video_id = self.labels.iloc[idx]['video_id']
        video_path = os.path.join(self.folder_path, f"{video_id}.mp4")
        
        # OpenCV to read the first frame of the video
        cap = cv2.VideoCapture(video_path)
        ret, frame = cap.read()
        cap.release()
        
        if not ret:
            raise ValueError(f"Failed to read video: {video_path}")
        
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
        frame = cv2.resize(frame, (224, 224))  # Resize to 224x224 for ResNet
        
        if self.transform:
            frame = self.transform(frame)

        # Extract x_summary and y_summary
        x_summary, y_summary = self.labels.iloc[idx]['video_summary']
        
        return frame, torch.tensor([x_summary, y_summary], dtype=torch.float32)

In [5]:
# Define the ResNet model
class ResNet2D(nn.Module):
    def __init__(self, num_classes=2):
        super(ResNet2D, self).__init__()
        self.resnet = models.resnet18(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, num_classes)

    def forward(self, x):
        return self.resnet(x)

In [6]:
# Define transformations
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load data and split 80% for training and 20% for testing
folder_path = 'BH25/Training_Data/Train_Videos'  
labels_csv = 'BH25/Training_Data/train.csv'  

dataset = VideoDataset(folder_path, labels_csv, transform=transform)
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [7]:
# Check if CUDA (GPU) is available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model and move it to the  device
model = ResNet2D(num_classes=2).to(device)

# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for frames, labels in train_loader:
        # Move frames and labels to the same device as the model (GPU or CPU)
        frames, labels = frames.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(frames)
        
        # Compute loss
        loss = loss_fn(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the trained model
torch.save(model.state_dict(), 'resnet2d.pth')




Epoch 1/10, Loss: 239.1704
Epoch 2/10, Loss: 233.8458
Epoch 3/10, Loss: 189.9032
Epoch 4/10, Loss: 83.4688
Epoch 5/10, Loss: 36.6052
Epoch 6/10, Loss: 24.9026
Epoch 7/10, Loss: 20.4779
Epoch 8/10, Loss: 16.9844
Epoch 9/10, Loss: 17.5873
Epoch 10/10, Loss: 18.9559


model is overfitted here as loss is less but on validation set mae is high

In [8]:
import torch
from sklearn.metrics import mean_absolute_error
import numpy as np

# Check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model
model = ResNet2D(num_classes=2)
model.load_state_dict(torch.load('resnet2d.pth'))
model.to(device)  # Move the model to the selected device
model.eval()

# Initialize lists to store predictions and true labels
all_preds = []
all_labels = []

# Evaluate the model
with torch.no_grad():
    for frames, labels in test_loader:
        frames, labels = frames.to(device), labels.to(device)  # Move data to the selected device
        
        # Forward pass to get predictions
        outputs = model(frames)
        
        # Collect predictions and true labels
        all_preds.append(outputs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Convert lists to numpy arrays for evaluation
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Calculate Mean Absolute Error (MAE)
mae_loss = mean_absolute_error(all_labels, all_preds)
print(f"Mean Absolute Error (MAE): {mae_loss:.4f}")

  model.load_state_dict(torch.load('resnet2d.pth'))


Mean Absolute Error (MAE): 13.3050
