In this i used normal cnns to extract and pred video summary values

In [3]:
import os
import cv2
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F 
from torchvision import transforms
from torch import nn, optim
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image

# Custom Dataset to load videos
class VideoDataset(Dataset):
    def __init__(self, folder_path, labels_csv, transform=None):
        self.folder_path = folder_path
        self.labels = pd.read_csv(labels_csv)
        self.transform = transform
        
        # Convert video_summary to tuple of (x_summary, y_summary)
        self.labels['video_summary'] = self.labels['video_summary'].apply(eval) 

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        video_id = self.labels.iloc[idx]['video_id']
        video_path = os.path.join(self.folder_path, f"{video_id}.mp4")  # Ensure the filename matches video_id
        
        # OpenCV to read the video frames
        cap = cv2.VideoCapture(video_path)
        frames = []
        
        # Read 20 frames from the video
        for _ in range(20):
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  
            frame = cv2.resize(frame, (64, 64))  
            frames.append(frame)
        
        cap.release()
        
        frames = np.stack(frames)  # Stack to get shape (20, 64, 64, 3)
        
        # Convert frames to the proper format: (3, 20, 64, 64)
        frames = np.transpose(frames, (3, 0, 1, 2))  
        
        frames = torch.tensor(frames, dtype=torch.float32)
        
        # Apply transformations if any
        if self.transform:
            frames = self.transform(frames)
        
        # Extract x_summary and y_summary
        x_summary, y_summary = self.labels.iloc[idx]['video_summary']
        
        return frames, torch.tensor([x_summary, y_summary], dtype=torch.float32)

# Define transformations (for tensor)
transform = transforms.Compose([
    transforms.Lambda(lambda x: x / 255.0)  # Normalize the frames to [0, 1]
])

# Load data and split 80% for training and 20% for testing
folder_path = 'BH25/Training_Data/Train_Videos' 
labels_csv = 'BH25/Training_Data/train.csv' 

dataset = VideoDataset(folder_path, labels_csv, transform=transform)
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


# Limit the train_dataset to the first 1000 videos
train_dataset = torch.utils.data.Subset(train_dataset, range(2000))

# custom CNN model with 3 layers
class CustomCNN(nn.Module):
    def __init__(self, num_classes=2):
        super(CustomCNN, self).__init__()
        self.conv1 = nn.Conv3d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv3d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv3d(64, 128, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool3d(kernel_size=2, stride=2, padding=0)
        self.fc = nn.Linear(128 * 8 * 8 * 2, num_classes)  

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model and move it to the device
model = CustomCNN(num_classes=2).to(device)

# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for frames, labels in train_loader:
        # Move frames and labels to the same device as the model (GPU or CPU)
        frames, labels = frames.to(device), labels.to(device)
        
        # Forward pass
        outputs = model(frames)
        
        # Compute loss
        loss = loss_fn(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Save the trained model
torch.save(model.state_dict(), 'selfcnn.pth')


Epoch 1/10, Loss: 238.6948
Epoch 2/10, Loss: 238.5713
Epoch 3/10, Loss: 238.4915
Epoch 4/10, Loss: 238.5207
Epoch 5/10, Loss: 238.3798
Epoch 6/10, Loss: 238.2071
Epoch 7/10, Loss: 238.0210
Epoch 8/10, Loss: 237.6506
Epoch 9/10, Loss: 237.4782
Epoch 10/10, Loss: 237.3383


In [4]:
import torch
from sklearn.metrics import mean_absolute_error
import numpy as np

# Check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model
model = CustomCNN(num_classes=2)
model.load_state_dict(torch.load('selfcnn.pth'))
model.to(device)  
model.eval()

# Initialize lists to store predictions and true labels
all_preds = []
all_labels = []

# Evaluate the model
with torch.no_grad():
    for frames, labels in test_loader:
        frames, labels = frames.to(device), labels.to(device)  
        
        # Forward pass to get predictions
        outputs = model(frames)
        
        # Collect predictions and true labels
        all_preds.append(outputs.cpu().numpy())
        all_labels.append(labels.cpu().numpy())

# Convert lists to numpy arrays for evaluation
all_preds = np.concatenate(all_preds, axis=0)
all_labels = np.concatenate(all_labels, axis=0)

# Calculate Mean Absolute Error (MAE)
mae_loss = mean_absolute_error(all_labels, all_preds)
print(f"Mean Absolute Error (MAE): {mae_loss:.4f}")


  model.load_state_dict(torch.load('selfcnn.pth'))


Mean Absolute Error (MAE): 12.3206


making the csv file for test folder

In [5]:
import torch
import os
import cv2
import pandas as pd
from torchvision import transforms

# Check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the trained model
model = CustomCNN(num_classes=2)
model.load_state_dict(torch.load('selfcnn.pth'))
model.to(device)  # Move the model to the selected device
model.eval()

# Define transformation for input video frames
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((64, 64)),  # Resize if needed
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Directory containing test videos
test_video_dir = 'BH25/Testing_Data'  # The folder where test videos are stored
output_csv_path = 'self_test_summary.csv'  # Path to output CSV file

# List to hold video_id and video_summary (tuple of x_summary, y_summary)
video_data = []

# Iterate over all videos in the test folder
for video_id in os.listdir(test_video_dir):
    if video_id.endswith(".mp4"):  # Ensure the file is a video
        video_path = os.path.join(test_video_dir, video_id)
        
        # Open the video file using OpenCV
        cap = cv2.VideoCapture(video_path)
        frames = []
        
        # Read 20 frames (or all available frames if the video has fewer than 20)
        for _ in range(20):
            ret, frame = cap.read()
            if not ret:
                break  # Stop if video ends or fewer frames are available
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert to RGB
            frames.append(transform(frame))  # Apply transform
        
        # If there are less than 20 frames, pad with zeros (optional, depending on your needs)
        if len(frames) < 20:
            for _ in range(20 - len(frames)):
                frames.append(torch.zeros((3, 64, 64)))  # Add empty frames
        
        # Stack frames into a tensor of shape (20, 3, 64, 64)
        frames_tensor = torch.stack(frames)  # Shape: [20, 3, 64, 64]
        
        # Permute the dimensions to match the expected input shape for 3D CNNs
        # New shape: [3, 20, 64, 64]
        frames_tensor = frames_tensor.permute(1, 0, 2, 3)

        # Add batch dimension to the tensor (shape becomes [1, 3, 20, 64, 64])
        video_tensor = frames_tensor.unsqueeze(0).to(device)  # Shape: [1, 3, 20, 64, 64]
        
        # Predict using the trained model
        with torch.no_grad():
            outputs = model(video_tensor)
        
        # Get x_summary and y_summary values from the model output
        x_summary, y_summary = outputs.cpu().numpy().flatten()

        # Add video_id and video_summary as a tuple (x_summary, y_summary) to the results list
        video_summary = (x_summary, y_summary)
        video_data.append([video_id.split('.')[0], video_summary])

        # Release the video capture object
        cap.release()

# Sort the video_data list by video_id
video_data.sort(key=lambda x: x[0])  # Sort by video_id
# Create a DataFrame to save the results
df = pd.DataFrame(video_data, columns=["video_id", "video_summary"])

# Save to CSV file
df.to_csv(output_csv_path, index=False)

print(f"Video summaries have been saved to {output_csv_path}")

  model.load_state_dict(torch.load('selfcnn.pth'))


Video summaries have been saved to self_test_summary.csv
