### Prerequisite Packages

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold

In [None]:
sys.path.append('../')

from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss
from modules.linear_transformation import LinearTransformations
from modules.output_max import output_max
from evaluation_validation.train_model import train_model
from evaluation_validation.evaluate_model import evaluate_model
from evaluation_validation.test_model import test_model

### Data Loading

In [None]:
class MultimodalDataset(Dataset):
    """Dataset pairing text, audio and video features with a binary label.

    Each ``*_features`` argument is an iterable of ``(file_path, tensor)``
    pairs. The lookup key for a movie is derived from the file's base name:

    * text:  everything before the first ``.``
    * audio: the second ``_``-separated field, up to its first ``.``
      (a base name without ``_`` raises ``IndexError``)
    * video: everything before the first ``_``

    Rows of ``id_label_df`` whose ``'IMDBid'`` is missing from any of the
    three modalities are skipped and recorded in ``missing_files``.
    """

    # Maps the values of the 'Label' column to binary class ids.
    LABEL_MAP = {'red': 0, 'green': 1}

    # Shapes of the zero tensors returned if a feature is absent at lookup
    # time. Unreachable for indices produced by `_filter_valid_files`, kept for
    # backward compatibility in case the dictionaries are modified afterwards.
    TEXT_FALLBACK_SHAPE = (1024,)
    AUDIO_FALLBACK_SHAPE = (1, 197, 768)
    VIDEO_FALLBACK_SHAPE = (95, 768)

    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df

        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # Entries of the form {'IMDBid': ...} for rows lacking some modality
        self.missing_files = []

        # Positional row indices (into id_label_df) that have all modalities
        self.valid_files = self._filter_valid_files()

    def _filter_valid_files(self):
        """Return positional indices of rows present in all three modalities.

        Rows that fail the check are appended to ``self.missing_files`` and a
        summary is printed.
        """
        valid_files = []
        # Read the id column once instead of materialising a full row per index.
        for idx, imdbid in enumerate(self.id_label_df['IMDBid'].tolist()):
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_files.append(idx)
            else:
                self.missing_files.append({'IMDBid': imdbid})

        # Print missing files after checking all
        if self.missing_files:
            print("Missing files:")
            for item in self.missing_files:
                print(f"IMDBid: {item['IMDBid']}")
            print(f"Total IMDB IDs with missing files: {len(self.missing_files)}")
        else:
            print("No missing files.")

        return valid_files

    @staticmethod
    def _lookup(features, key, fallback_shape):
        """Return ``features[key]``, or zeros of ``fallback_shape`` if absent.

        The fallback is only allocated when it is actually needed.
        """
        if key in features:
            return features[key]
        return torch.zeros(fallback_shape)

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        """Return ``(text, audio, video, label)`` for the idx-th valid row.

        ``label`` is a float32 tensor of shape ``(1,)`` holding 0.0 for
        ``'red'`` and 1.0 for ``'green'``.

        Raises:
            KeyError: if the row's label is not a key of ``LABEL_MAP``.
        """
        # Translate the filtered index back to a row position in the DataFrame
        row = self.id_label_df.iloc[self.valid_files[idx]]
        imdbid = row['IMDBid']
        label = row['Label']

        text_data = self._lookup(self.text_features, imdbid, self.TEXT_FALLBACK_SHAPE)
        audio_data = self._lookup(self.audio_features, imdbid, self.AUDIO_FALLBACK_SHAPE)
        video_data = self._lookup(self.video_features, imdbid, self.VIDEO_FALLBACK_SHAPE)

        try:
            label_id = self.LABEL_MAP[label]
        except KeyError:
            # Report the raw label; str(KeyError) is already quoted and would
            # print doubled quotes inside this message.
            print(f"Error: Label '{label}' not found in label_map.")
            raise

        # Stored as float32 (not an integer dtype) with shape (1,)
        label_data = torch.tensor([label_id], dtype=torch.float32)

        return text_data, audio_data, video_data, label_data


In [None]:
def collate_fn(batch):
    """Merge a list of (text, audio, video, label) samples into batch tensors.

    Text, audio and label tensors are stacked along a new leading dimension.
    Video tensors may differ in size along dim 0, so each one is zero-padded
    at the end up to the longest video in the batch before stacking.
    Assumes 2-D video tensors, so that the padded axis is dim 0 — TODO confirm.

    Returns:
        Tuple ``(text, audio, padded_video, labels)``.
    """
    texts, audios, videos, labels = zip(*batch)

    text_batch = torch.stack(texts)
    audio_batch = torch.stack(audios)

    # Longest video (along dim 0) decides the padded length for the batch.
    longest = max([clip.size(0) for clip in videos])

    padded_clips = []
    for clip in videos:
        shortfall = longest - clip.size(0)
        # F.pad reads the spec from the last dim backwards: (0, 0) leaves the
        # last dim alone, (0, shortfall) appends zeros after the existing
        # entries of the second-to-last dim.
        padded_clips.append(F.pad(clip, (0, 0, 0, shortfall), "constant", 0))
    video_batch = torch.stack(padded_clips)

    # Per-sample label tensors become one tensor with a leading batch dim.
    label_batch = torch.stack(labels)

    return text_batch, audio_batch, video_batch, label_batch

In [None]:
# Load the labels DataFrame (spreadsheet with one row per movie)
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
# NOTE(review): these are absolute, machine-specific Windows paths, while the
# "Test on Entire Dataset" cell further down loads from '../misc/...'.
# Confirm which location is intended before running on another machine.
text_features_dir = 'D:\\Projects\\Thesis\\Text'
audio_features_dir = 'D:\\Projects\\Thesis\\Audio'
video_features_dir = 'D:\\Projects\\Thesis\\Video'

# Load the feature vectors from each directory
# (later cells unpack each element as a (file name, feature) pair)
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

# Report how many feature entries were loaded for each modality
print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

### Important Functions

In [None]:
# Cross Attention Function
def PairCrossAttention(modalityAlpha, modalityBeta, d_out_kq=768, d_out_v=768):
    """Apply a newly constructed CrossAttentionB to a pair of modalities.

    Args:
        modalityAlpha: first input; its last dimension is used as the first
            input size of the attention module.
        modalityBeta: second input; its last dimension is used as the second
            input size of the attention module.
        d_out_kq: third constructor argument of CrossAttentionB.
        d_out_v: fourth constructor argument of CrossAttentionB.

    Returns:
        The result of calling the attention module on
        ``(modalityAlpha, modalityBeta)``.

    NOTE(review): a new CrossAttentionB is built on every call, so its
    parameters are presumably re-initialised each time and never trained.
    Confirm that this is intended.
    """
    cross_attn = CrossAttentionB(
        modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v
    )
    # Keep the module on the same device as its input; otherwise inputs that
    # live on the GPU would meet CPU weights and the call would fail.
    cross_attn = cross_attn.to(modalityAlpha.device)
    return cross_attn(modalityAlpha, modalityBeta)

In [None]:
def get_optimizer(parameters, lr=1e-3):
    """Build an Adam optimizer over ``parameters``.

    Args:
        parameters: the parameters (or parameter groups) to optimize.
        lr: learning rate passed to Adam (default 1e-3).

    Returns:
        The ``optim.Adam`` instance.
    """
    adam = optim.Adam(params=parameters, lr=lr)
    return adam

In [None]:
def train_model(dense_layer, dataloader, criterion, optimizer, device):
    """Run one training epoch of ``dense_layer`` and return the mean batch loss.

    Args:
        dense_layer: module mapping fused features to predictions.
        dataloader: sized iterable of ``(fused_features, targets)`` batches.
        criterion: callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        optimizer: optimizer over the parameters of ``dense_layer``.
        device: device the batches are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: if ``dataloader`` is empty.
    """
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for fused_features, targets in dataloader:
        # Move tensors to the specified device
        fused_features = fused_features.to(device)
        targets = targets.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Drop singleton dimensions, e.g. (batch, 1) -> (batch,)
        predictions = dense_layer(fused_features).squeeze()

        # Targets of shape (batch, 1) do not match squeezed predictions of
        # shape (batch,). Depending on the criterion, that mismatch raises or
        # silently broadcasts to (batch, batch). Align the shapes whenever
        # both tensors hold the same number of elements.
        if targets.shape != predictions.shape and targets.numel() == predictions.numel():
            targets = targets.reshape(predictions.shape)

        loss = criterion(predictions, targets)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [None]:
def evaluate_model(dense_layer, dataloader, criterion, device):
    """Evaluate a binary classifier and print precision, recall and F1.

    Predictions above 0.5 are counted as the positive class.

    Args:
        dense_layer: module mapping fused features to predictions.
        dataloader: sized iterable of ``(fused_features, targets)`` batches.
        criterion: callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        device: device the batches and metrics are moved to.

    Returns:
        Tuple ``(average_loss, precision, recall, f1_score)``, where
        ``average_loss`` is the summed batch loss divided by
        ``len(dataloader)``.

    Raises:
        ZeroDivisionError: if ``dataloader`` is empty.
    """
    dense_layer.eval()
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features = fused_features.to(device)
            targets = targets.to(device).squeeze()

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()

            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Threshold once and flatten to 1-D. With a batch of one sample
            # `.squeeze()` yields 0-dim tensors, while torcheval's binary
            # metrics take 1-D (n_sample,) inputs.
            binary_preds = (predictions > 0.5).long().reshape(-1)
            binary_targets = targets.long().reshape(-1)

            for metric in (precision_metric, recall_metric, f1_metric):
                metric.update(binary_preds, binary_targets)

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


In [None]:
def test_model(dense_layer, dataloader, criterion, device):
    """Run a binary classifier on test data and print loss, precision, recall, F1.

    Predictions above 0.5 are counted as the positive class.

    Args:
        dense_layer: module mapping fused features to predictions.
        dataloader: sized iterable of ``(fused_features, targets)`` batches.
        criterion: callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        device: device the batches and metrics are moved to.

    Returns:
        Tuple ``(average_loss, precision, recall, f1_score)``, where
        ``average_loss`` is the summed batch loss divided by
        ``len(dataloader)``.

    Raises:
        ZeroDivisionError: if ``dataloader`` is empty.
    """
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features = fused_features.to(device)
            targets = targets.to(device).squeeze()

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()

            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Threshold once and flatten to 1-D. With a batch of one sample
            # `.squeeze()` yields 0-dim tensors, while torcheval's binary
            # metrics take 1-D (n_sample,) inputs.
            binary_preds = (predictions > 0.5).long().reshape(-1)
            binary_targets = targets.long().reshape(-1)

            for metric in (precision_metric, recall_metric, f1_metric):
                metric.update(binary_preds, binary_targets)

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


### SMCA Functions and Model

In [None]:
# Stage 1 of SMCA
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device):
    """Fuse two modalities with cross-attention applied in both directions.

    One CrossAttentionB module is built and called as
    ``(modalityAlpha, modalityBeta)`` and ``(modalityBeta, modalityAlpha)``.
    The shorter result (along dim 1) is tiled and truncated to the length of
    the longer one, then both are concatenated along the last dimension.

    Args:
        modalityAlpha: first modality — assumed (batch, seq, features), TODO confirm.
        modalityBeta: second modality — assumed (batch, seq, features), TODO confirm.
        d_out_kq: third constructor argument of CrossAttentionB.
        d_out_v: fourth constructor argument of CrossAttentionB.
        device: device the attention module is moved to.

    Returns:
        Concatenation of both attention outputs along the last dimension.

    NOTE(review): the same module serves both directions although its input
    sizes are given in (alpha, beta) order, and it is rebuilt on every call.
    Confirm that both points are intended.
    """
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v).to(device)

    # Cross-attention: Alpha -> Beta
    alphaBeta = cross_attn(modalityAlpha, modalityBeta)

    # Cross-attention: Beta -> Alpha
    betaAlpha = cross_attn(modalityBeta, modalityAlpha)

    # Get the sequence lengths
    seq_len_alpha = alphaBeta.size(1)
    seq_len_beta = betaAlpha.size(1)
    target_len = max(seq_len_alpha, seq_len_beta)

    # Tile the shorter output and cut it to target_len, as Stage 2 does.
    # `expand` only works when the shorter length is exactly 1; for that case
    # tiling yields the same values, and it also covers other lengths.
    if seq_len_beta < target_len:
        betaAlpha = betaAlpha.repeat(1, target_len // seq_len_beta + 1, 1)[:, :target_len, :]
    elif seq_len_alpha < target_len:
        alphaBeta = alphaBeta.repeat(1, target_len // seq_len_alpha + 1, 1)[:, :target_len, :]

    # Concatenate cross-attention outputs along the feature dimension (-1)
    modalityAlphaBeta = torch.cat((alphaBeta, betaAlpha), dim=-1)

    return modalityAlphaBeta

In [None]:
class ProjectionLayer(nn.Module):
    """Single linear layer mapping ``input_dim`` features to ``output_dim``."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Attribute name `linear` also determines the state-dict keys.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear map to the last dimension of ``x``."""
        projected = self.linear(x)
        return projected

def SMCAStage2(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, device):
    """Fuse the Stage 1 output with a third modality and pool over the sequence.

    Steps:
      1. Project ``modalityAlphaBeta`` to ``d_out_v`` features with a new
         ProjectionLayer.
      2. Build one CrossAttentionB module and call it in both directions.
      3. Tile the shorter result along dim 1 and cut it to the longer length.
      4. Concatenate along the last dimension and average over dim 1.

    Args:
        modalityAlphaBeta: output of Stage 1.
        modalityGamma: third modality.
        d_out_kq: third constructor argument of CrossAttentionB.
        d_out_v: projection width and fourth constructor argument of
            CrossAttentionB.
        device: device the projection and attention modules are moved to.

    Returns:
        The concatenated attention outputs averaged over dim 1.
    """
    # Bring the Stage 1 output to d_out_v features.
    projector = ProjectionLayer(modalityAlphaBeta.shape[-1], d_out_v).to(device)
    projected = projector(modalityAlphaBeta)

    # One attention module is shared by both directions.
    attention = CrossAttentionB(projected.shape[-1], modalityGamma.shape[-1], d_out_kq, d_out_v).to(device)
    fused_to_gamma = attention(projected, modalityGamma)
    gamma_to_fused = attention(modalityGamma, projected)

    len_fused = fused_to_gamma.size(1)
    len_gamma = gamma_to_fused.size(1)

    # Only the shorter tensor is touched; equal lengths need no adjustment.
    if len_gamma < len_fused:
        copies = len_fused // len_gamma + 1
        gamma_to_fused = gamma_to_fused.repeat(1, copies, 1)[:, :len_fused, :]
    elif len_fused < len_gamma:
        copies = len_gamma // len_fused + 1
        fused_to_gamma = fused_to_gamma.repeat(1, copies, 1)[:, :len_gamma, :]

    # Join along the feature axis, then average over the sequence axis (dim 1).
    joined = torch.cat((fused_to_gamma, gamma_to_fused), dim=-1)
    return joined.mean(dim=1)


In [None]:
class SMCAModel(nn.Module):
    """Two-stage cross-attention wrapper around SMCAStage1 and SMCAStage2.

    The module only stores its configuration. It registers no sub-modules
    itself; the stage functions build their layers when called.
    """

    def __init__(self, d_out_kq, d_out_v, device):
        super().__init__()
        self.d_out_kq, self.d_out_v, self.device = d_out_kq, d_out_v, device

    def forward(self, modalityAlpha, modalityBeta, modalityGamma):
        """Fuse alpha with beta, then fuse that result with gamma."""
        stage_config = (self.d_out_kq, self.d_out_v, self.device)

        # Stage 1: bidirectional cross-attention between alpha and beta
        stage_one = SMCAStage1(modalityAlpha, modalityBeta, *stage_config)
        print("Stage 1: ", stage_one.shape)

        # Stage 2: bidirectional cross-attention between the Stage 1 output
        # and gamma, pooled over the sequence
        return SMCAStage2(stage_one, modalityGamma, *stage_config)

### Test on One Instance

In [None]:
# Device configuration: use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SMCAModel(768, 768, device) # Dimension for d_out_kq and d_out_v

# Select the entry at index 5 of each modality's feature list (for testing)
# NOTE(review): the three lists are loaded independently, so index 5 is not
# guaranteed to be the same movie in each — compare the printed file names.
video_file_name, video_feature = video_features[5]
audio_file_name, audio_feature = audio_features[5]
text_file_name, text_feature = text_features[5]

# Print the file names
print("\nSelected File Names:")
print("Audio file:", audio_file_name)
print("Video file:", video_file_name)
print("Text file:", text_file_name)

video_feature = video_feature.unsqueeze(0).to(device)  # Add batch dimension
text_feature = text_feature.unsqueeze(0).to(device)  # Add batch dimension
audio_feature = audio_feature.to(device)    # Moved as-is, no unsqueeze — presumably already has a leading dim of size 1; TODO confirm

# Role assignment for the model: alpha = audio, beta = text, gamma = video
modalityAlpha=audio_feature 
modalityBeta=text_feature
modalityGamma=video_feature

# Apply linear transformation to match dimensions (every modality is mapped to 768 features)
linear_transform_Alpha = LinearTransformations(modalityAlpha.shape[-1], 768).to(device)
linear_transform_Beta = LinearTransformations(modalityBeta.shape[-1], 768).to(device)
linear_transform_Gamma = LinearTransformations(modalityGamma.shape[-1], 768).to(device)

modalityAlpha = linear_transform_Alpha(modalityAlpha).to(device)
modalityBeta = linear_transform_Beta(modalityBeta).to(device)
modalityGamma = linear_transform_Gamma(modalityGamma).to(device)

# Shapes after the linear transformations
print("Audio: ",modalityAlpha.shape)
print("Text: ",modalityBeta.shape)
print("Video: ",modalityGamma.shape)

# Run both SMCA stages; `outputs` is reused by the next cell
outputs = model(modalityAlpha, modalityBeta, modalityGamma)

print("Output shape:", outputs.shape)




In [None]:
num_classes = 2
# Size the classifier from the fused representation itself instead of
# hard-coding 1536, so the cell keeps working if d_out_v changes.
dense_layer = nn.Linear(outputs.shape[-1], num_classes).to(device)

# Pass the fused representation through the dense layer to get class scores,
# then normalise the scores of each sample (dim=1) into probabilities
predictions = dense_layer(outputs)
probabilities = nn.Softmax(dim=1)(predictions)
print("Probabilities:", probabilities)


### Test on Entire Dataset

In [None]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
# NOTE(review): these relative paths differ from the absolute 'D:\\...' paths
# used in the Data Loading section; this cell reloads and replaces the
# feature lists loaded there.
text_features_dir = '../misc/text_features'
audio_features_dir = '../misc/audio_features'
video_features_dir = '../misc/video_features'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Drop unnecessary columns
id_label_df = id_label_df.drop(columns=['Movie Title', 'URL'])

# Splitting data for training, validation, and testing (70% train, 30% held out)
# NOTE(review): the split is not stratified on 'Label' — confirm the classes
# are balanced enough for that.
train_df, val_test_df = train_test_split(id_label_df, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets (half each, i.e. 15% / 15% overall)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

print("-" * 30)

# Create datasets (each constructor prints the IMDB ids that lack a modality)
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Create DataLoaders
# NOTE(review): the validation and test loaders also use shuffle=True.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)

# Combine all data for K-fold cross-validation (dataset over the unsplit DataFrame)
full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)

In [None]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = SMCAModel(768, 768, device)  # Dimension for d_out_kq and d_out_v

# The projection modules are created once, on the first batch (their input
# sizes are read from the data), instead of being rebuilt for every batch.
linear_transform_Alpha = None
linear_transform_Beta = None
linear_transform_Gamma = None

# Forward-pass loop over the training batches (shape check only: no loss is
# computed and no optimizer step is taken here).
# The loop variables are deliberately NOT named text_features / audio_features /
# video_features: those names hold the loaded feature lists, and overwriting
# them with batch tensors would break re-running the dataset cell.
for text_batch, audio_batch, video_batch, targets in train_dataloader:
    # Move features to the specified device
    text_batch = text_batch.to(device)
    audio_batch = audio_batch.to(device)
    video_batch = video_batch.to(device)

    # Print dimensions for debugging
    print("Text Dimension: ", text_batch.shape)
    print("Audio Dimension: ", audio_batch.shape)
    print("Video Dimension: ", video_batch.shape)

    # Squeeze the audio features to remove the extra dimension
    audio_batch = audio_batch.squeeze(1)

    # Apply linear transformations to match dimensions. The modules must live
    # on the same device as the batches, otherwise the call fails on GPU.
    if linear_transform_Alpha is None:
        linear_transform_Alpha = LinearTransformations(audio_batch.shape[-1], 768).to(device)
        linear_transform_Beta = LinearTransformations(text_batch.shape[-1], 768).to(device)
        linear_transform_Gamma = LinearTransformations(video_batch.shape[-1], 768).to(device)

    # Transform features to match the target dimension
    modalityAlpha = linear_transform_Alpha(audio_batch)
    modalityBeta = linear_transform_Beta(text_batch)
    modalityGamma = linear_transform_Gamma(video_batch)

    # Print shapes after transformation to verify the batch dimension
    print("Transformed Audio Dimension: ", modalityAlpha.shape)
    print("Transformed Text Dimension: ", modalityBeta.shape)
    print("Transformed Video Dimension: ", modalityGamma.shape)

    # Pass inputs through the SMCA model
    outputs = model(
        modalityAlpha=modalityAlpha,  # Ensure to pass transformed modalities
        modalityBeta=modalityBeta,
        modalityGamma=modalityGamma,
    )

    print("Stage 2:", outputs.shape)  # Check the output shape
    print("--------")
