In [110]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold

In [111]:
# Append the path for module imports
sys.path.append('../')

# Import custom modules
from modules.cross_attention import CrossAttention
from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss
from modules.linear_transformation import LinearTransformations

### Data Loading

In [112]:

class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()


    def _filter_valid_files(self):
        valid_files = []
        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_files.append(idx)
            else:
                self.missing_files.append({'IMDBid': imdbid})

        # # Print missing files after checking all
        # if self.missing_files:
        #     print("Missing files:")
        #     for item in self.missing_files:
        #         print(f"IMDBid: {item['IMDBid']}")
        #     print(f"Total IMDB IDs with missing files: {len(self.missing_files)}")
        # else:
        #     print("No missing files.")

        return valid_files

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 0, 'green': 1} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)  # Ensure labels are integers
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise
        
        # Debugging output
        if label_data.shape[0] == 0:
            print(f"Empty target for IMDBid {imdbid} at index {idx}")

        return text_data, audio_data, video_data, label_data


In [113]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [114]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = 'D:\\Projects\\Thesis\\Text'
audio_features_dir = 'D:\\Projects\\Thesis\\Audio'
video_features_dir = 'D:\\Projects\\Thesis\\Video'


# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Splitting data for training, validation, and testing
train_df, val_test_df = train_test_split(id_label_df, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

# Create datasets
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn)

# Combine all data for K-fold cross-validation
full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)


Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


### SMCA Model Classes

In [115]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stage 1 of SMCA
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttention(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v).to(device)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

In [116]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stage 2 of SMCA - Model A: Stage 1 Output as Query
def SMCAStage2_ModelA(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, device):
    cross_attn = CrossAttention(modalityAlphaBeta.shape[-1], modalityGamma.shape[-1], d_out_kq, d_out_v).to(device)
    multimodal_representation = cross_attn(modalityAlphaBeta, modalityGamma)
    return multimodal_representation

### SMCAModelA

In [117]:
class SMCAModelA(nn.Module):
    def __init__(self, d_out_kq, d_out_v, device):
        super(SMCAModelA, self).__init__()
        self.d_out_kq = d_out_kq
        self.d_out_v = d_out_v
        self.device = device
    
    def forward(self, modalityAlpha, modalityBeta, modalityGamma, device):
        # Stage 1: Cross attention between modalityAlpha and modalityBeta
        modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, self.d_out_kq, self.d_out_v, device)
        
        # Stage 2: Cross attention with modalityAlphaBeta (as query) and modalityGamma (as key-value)
        multimodal_representation = SMCAStage2_ModelA(modalityAlphaBeta, modalityGamma, self.d_out_kq, self.d_out_v, device)
        
        # Flatten the output
        return torch.flatten(multimodal_representation, start_dim=1)  # Flatten all dimensions except batch

In [118]:
# Device configuration

# # Stage 2 of SMCA - Model B: Stage 1 Output as Key-Value
# def SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v, device):
#     cross_attn = CrossAttention(modalityGamma.shape[-1], modalityAlphaBeta.shape[-1], d_out_kq, d_out_v).to(device)
#     multimodal_representation = cross_attn(modalityGamma, modalityAlphaBeta)
#     return multimodal_representation

def SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttentionB(
        d_in_query=modalityGamma.shape[-1],
        d_in_kv=modalityAlphaBeta.shape[-1],
        d_out_kq=d_out_kq,
        d_out_v=d_out_v
    ).to(device)
    multimodal_representation = cross_attn(modalityGamma, modalityAlphaBeta)
    return multimodal_representation

In [119]:
class SMCAModelB(nn.Module):
    def __init__(self, d_out_kq, d_out_v, device):
        super(SMCAModelB, self).__init__()
        self.d_out_kq = d_out_kq
        self.d_out_v = d_out_v
        self.device = device
        
    def forward(self, modalityAlpha, modalityBeta, modalityGamma, device):
        # Stage 1: Cross attention between modalityAlpha and modalityBeta
        modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, self.d_out_kq, self.d_out_v, device)
        
        # Stage 2: Cross attention with modalityGamma (as query) and modalityAlphaBeta (as key-value)
        multimodal_representation = SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, self.d_out_kq, self.d_out_v, device)
        
        # Flatten the output
        return torch.flatten(multimodal_representation, start_dim=1)  # Flatten all dimensions except batch


### Test Model (For Debugging)

In [120]:
# Test the SMCA model using the items from dataloader as input

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

# Determine the output dimensions
d_out_kq = 512
d_out_v = 256

# Initialize the SMCA model A
model = SMCAModelB(d_out_kq, d_out_v, device)
model.to(device)

# Use DataLoader to get a batch of data
for batch in train_dataloader:  # You can use any DataLoader (train_dataloader, val_dataloader, etc.)
    text_data, audio_data, video_data, labels = batch
    
    text_data = text_data.to(device)
    audio_data = audio_data.to(device)
    video_data = video_data.to(device)
    labels = labels.to(device)
    
    # Feed the entire batch to the GMU model
    with torch.no_grad():
        output = model(modalityAlpha=audio_data, modalityBeta=text_data, modalityGamma=video_data, device=device)
        
    # Print the output shape
    print('-'*50)
    print("SMCA Output Shape:", output.shape)    
    # Break after the first batch for testing purposes
    break


Device: cuda
--------------------------------------------------
SMCA Output Shape: torch.Size([8, 276480])


In [121]:
# Test the SMCA model using a single file from each feature directory

from modules.dataloader import load_npy_files

# Define the directories
text_features_dir = 'D:\\Projects\\Thesis\\Text'
audio_features_dir = 'D:\\Projects\\Thesis\\Audio'
video_features_dir = 'D:\\Projects\\Thesis\\Video'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

index = 0

# Select the first file from each modality directories (for testing)
text_file_name, text_features = text_features[index]
audio_file_name, audio_features = audio_features[index]
video_file_name, video_features = video_features[index]

print("Selected File:")
print("Text file:", os.path.basename(text_file_name))
print("Audio file:", os.path.basename(audio_file_name))
print("Video file:", os.path.basename(video_file_name))
print("-"*50)

# Determine the output dimensions
d_out_kq = 512 
d_out_v = 256

# Initialize the SMCA model A
model = SMCAModelB(d_out_kq, d_out_v, device)

# Move model to the same device as your data (e.g., GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Reshape features
video_features = video_features.unsqueeze(0)  # Add batch dimension
text_features = text_features.unsqueeze(0)    # Add batch dimension

print("Text Feature Shape:", text_features.shape)
print("Audio Feature Shape:", audio_features.shape)
print("Video Feature Shape:", video_features.shape)
print("-"*50)

# Pass the data through the SMCA model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # No need to compute gradients
    output = model(modalityAlpha=text_features.to(device), modalityBeta=video_features.to(device), modalityGamma=audio_features.to(device), device=device)

# Print the output shape and the output itself
print("Model output shape:", output.shape, "###[batch_size, output_dim]")
print("-"*50)

Selected File:
Text file: tt0021814.npy
Audio file: feature_tt0021814.npy
Video file: tt0021814_features.npy
--------------------------------------------------
Text Feature Shape: torch.Size([1, 1024])
Audio Feature Shape: torch.Size([1, 197, 768])
Video Feature Shape: torch.Size([1, 95, 768])
--------------------------------------------------
Model output shape: torch.Size([1, 50432]) ###[batch_size, output_dim]
--------------------------------------------------


In [122]:
# Original Sample Test for the SMCA  (Non-Classes)
if __name__ == "__main__":
    torch.manual_seed(42)
    
    # Load .npy files
    video_features = load_npy_files(r'D:\\Projects\\Thesis\\Video')
    audio_features = load_npy_files(r'D:\\Projects\\Thesis\\Audio')
    text_features = load_npy_files(r'D:\\Projects\\Thesis\\Text')
    
    # Select the first file from each modality directories (for testing)
    video_file_name, video_features = video_features[0]
    audio_file_name, audio_features = audio_features[0]
    text_file_name, text_features = text_features[0]

    # Print the file names
    print("\nSelected File Names:")
    print("Audio file:", audio_file_name)
    print("Video file:", video_file_name)
    print("Text file:", text_file_name)
    
    # Reshape features
    video_features = video_features.unsqueeze(0)  # Add batch dimension
    text_features = text_features.unsqueeze(0)    # Add batch dimension

    # # Randomize assignment of Alpha, Beta, Gamma
    # modalityAlpha, modalityBeta, modalityGamma = randomize_modalities(audio_features, video_features, text_features)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Manual assignment of modalities
    modalityAlpha = audio_features.to(device)
    modalityBeta = text_features.to(device)
    modalityGamma = video_features.to(device)

    # Apply linear transformation to match dimensions
    linear_transform_Alpha = LinearTransformations(modalityAlpha.shape[-1], 768).to(device)
    linear_transform_Beta = LinearTransformations(modalityBeta.shape[-1], 768).to(device)
    linear_transform_Gamma = LinearTransformations(modalityGamma.shape[-1], 768).to(device)

    modalityAlpha = linear_transform_Alpha(modalityAlpha)
    modalityBeta = linear_transform_Beta(modalityBeta)
    modalityGamma = linear_transform_Gamma(modalityGamma)

    # Determine the output dimensions
    d_out_kq = 768  # Final transformed dimension
    d_out_v = 768

    # Stage 1: Bimodal Representation
    modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device)
    
    # Stage 2, Model A: Multimodal Representation (using AlphaBeta as Query)
    final_representation_A = SMCAStage2_ModelA(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, device)
    
    # Stage 2, Model B: Multimodal Representation (using AlphaBeta as Key-Value)
    final_representation_B = SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v, device)

    print("Modality Alpha Shape:", modalityAlpha.shape)
    print("Modality Beta Shape:", modalityBeta.shape)
    print("Modality Gamma Shape:", modalityGamma.shape)
    print("Stage 1 Bimodal Representation Shape:", modalityAlphaBeta.shape)
    print("Final Multimodal Representation (Model A) Shape:", final_representation_A.shape)
    print("Final Multimodal Representation (Model B) Shape:", final_representation_B.shape)



Selected File Names:
Audio file: D:\\Projects\\Thesis\\Audio\feature_tt0021814.npy
Video file: D:\\Projects\\Thesis\\Video\tt0021814_features.npy
Text file: D:\\Projects\\Thesis\\Text\tt0021814.npy
Modality Alpha Shape: torch.Size([1, 197, 768])
Modality Beta Shape: torch.Size([1, 768])
Modality Gamma Shape: torch.Size([1, 95, 768])
Stage 1 Bimodal Representation Shape: torch.Size([1, 197, 768])
Final Multimodal Representation (Model A) Shape: torch.Size([1, 197, 768])
Final Multimodal Representation (Model B) Shape: torch.Size([1, 95, 768])


### Model Training Functions

In [123]:
def get_optimizer(parameters, lr=1e-4):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [124]:
def train_model(model, dense_layer, dataloader, criterion, optimizer, device):
    model.train()
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for text_features, audio_features, video_features, targets in dataloader:
        text_features, audio_features, video_features, targets = (
            text_features.to(device),
            audio_features.to(device),
            video_features.to(device),
            targets.to(device).view(-1)
        )
        
        optimizer.zero_grad()
        
        # Pass inputs through SMCA model
        
        outputs = model(modalityAlpha=text_features, modalityBeta=audio_features, modalityGamma=video_features, device=device)
        # outputs = model(modalityAlpha=text_features, modalityBeta=video_features, modalityGamma=audio_features, device=device)
        # outputs = model(modalityAlpha=audio_features, modalityBeta=text_features, modalityGamma=video_features, device=device)
        # outputs = model(modalityAlpha=audio_features, modalityBeta=video_features, modalityGamma=text_features, device=device)
        # outputs = model(modalityAlpha=video_features, modalityBeta=audio_features, modalityGamma=text_features, device=device)
        # outputs = model(modalityAlpha=video_features, modalityBeta=text_features, modalityGamma=audio_features, device=device)
        
        # Check if padding is necessary
        output_size = outputs.size(1)
        dense_input_size = dense_layer.fc.in_features
        
        if output_size < dense_input_size:
            # Pad the outputs if they are smaller than the expected size for the dense layer
            padding_size = dense_input_size - output_size
            # Pad on the second dimension (feature dimension)
            outputs = torch.nn.functional.pad(outputs, (0, padding_size))
        elif output_size > dense_input_size:
            # In case outputs are larger (though unlikely, we trim)
            outputs = outputs[:, :dense_input_size]
        
        # Pass the fused features through the dense layer
        predictions = dense_layer(outputs).view(-1)

        # Compute loss
        loss = criterion(predictions, targets)
        total_loss += loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()



    return total_loss / len(dataloader)


In [125]:
def evaluate_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()
    total_loss = 0.0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    precision_metric.reset()
    recall_metric.reset()
    f1_metric.reset()

    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
         for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
        
            outputs = model(modalityAlpha=text_features, modalityBeta=audio_features, modalityGamma=video_features, device=device)
            # outputs = model(modalityAlpha=text_features, modalityBeta=video_features, modalityGamma=audio_features, device=device)
            # outputs = model(modalityAlpha=audio_features, modalityBeta=text_features, modalityGamma=video_features, device=device)
            # outputs = model(modalityAlpha=audio_features, modalityBeta=video_features, modalityGamma=text_features, device=device)
            # outputs = model(modalityAlpha=video_features, modalityBeta=audio_features, modalityGamma=text_features, device=device)
            # outputs = model(modalityAlpha=video_features, modalityBeta=text_features, modalityGamma=audio_features, device=device)

            # Check if padding is necessary
            output_size = outputs.size(1)
            dense_input_size = dense_layer.fc.in_features
            
            if output_size < dense_input_size:
                # Pad the outputs if they are smaller than the expected size for the dense layer
                padding_size = dense_input_size - output_size
                # Pad on the second dimension (feature dimension)
                outputs = torch.nn.functional.pad(outputs, (0, padding_size))
            elif output_size > dense_input_size:
                # In case outputs are larger (though unlikely, we trim)
                outputs = outputs[:, :dense_input_size]

            # Pass the fused features through the dense layer
            predictions = dense_layer(outputs).view(-1) 

            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Evaluation Loss: {average_loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    
    return average_loss, precision, recall, f1_score


In [126]:
def test_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
            outputs = model(modalityAlpha=text_features.to(device), modalityBeta=audio_features.to(device), modalityGamma=video_features.to(device), device=device)

            # Check if padding is necessary
            output_size = outputs.size(1)
            dense_input_size = dense_layer.fc.in_features
            
            if output_size < dense_input_size:
                # Pad the outputs if they are smaller than the expected size for the dense layer
                padding_size = dense_input_size - output_size
                # Pad on the second dimension (feature dimension)
                outputs = torch.nn.functional.pad(outputs, (0, padding_size))
            elif output_size > dense_input_size:
                # In case outputs are larger (though unlikely, we trim)
                outputs = outputs[:, :dense_input_size]

            # Pass the fused features through the dense layer
            predictions = torch.sigmoid(dense_layer(outputs)).view(-1)  
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


In [127]:
# Find the largest output size from the SMCA model
def find_largest_output_size(model, dataloader, device):
    max_output_size = 0

    model.eval()
    with torch.no_grad():
        for text_features, audio_features, video_features, targets in dataloader:
            # Move features to device
            text_features = text_features.to(device)
            audio_features = audio_features.to(device)
            video_features = video_features.to(device)

            # Pass inputs through the SMCA model
            outputs = model(
                modalityAlpha=audio_features, 
                modalityBeta=text_features, 
                modalityGamma=video_features,
                device=device
            )

            # Compare and store the maximum output size
            if outputs.size(1) > max_output_size:
                max_output_size = outputs.size(1)

    return max_output_size

### Fusion Model A

In [130]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Determine the output dimensions
    output_dim = 768

    # Initialize the SMCA model A
    model = SMCAModelA(512, 256, device) # Dimension for d_out_kq and d_out_v
    model.to(device)  # Move the model to the correct device

    # Loop through the dataloaders to determine the largest output size
    max_output_size_train = find_largest_output_size(model, train_dataloader, device)
    max_output_size_val = find_largest_output_size(model, val_dataloader, device)
    max_output_size_test = find_largest_output_size(model, test_dataloader, device)

    # Get the overall largest output size
    max_output_size = max(max_output_size_train, max_output_size_val, max_output_size_test)

    # Initialize the DenseLayer with the largest output size
    dense_layer = DenseLayer(input_size=512).to(device)  # Initialize and move to the correct device

    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    optimizer = get_optimizer(list(dense_layer.parameters())+ list(model.parameters()))  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(model=model, dense_layer=dense_layer, dataloader=test_dataloader, criterion=criterion, device=device)


Device: cuda
Epoch 1/10
Evaluation Loss: 0.6329
Precision: 0.7460
Recall: 0.9724
F1 Score: 0.8443
Training Loss: 0.7268, Validation Loss: 0.6329
------------------------------
Epoch 2/10
Evaluation Loss: 0.6759
Precision: 0.7360
Recall: 1.0000
F1 Score: 0.8480
Training Loss: 0.6602, Validation Loss: 0.6759
------------------------------
Epoch 3/10
Evaluation Loss: 0.6916
Precision: 0.7394
Recall: 0.8414
F1 Score: 0.7871
Training Loss: 0.6358, Validation Loss: 0.6916
------------------------------
Epoch 4/10
Evaluation Loss: 0.6592
Precision: 0.7212
Recall: 0.8207
F1 Score: 0.7677
Training Loss: 0.6235, Validation Loss: 0.6592
------------------------------
Epoch 5/10
Evaluation Loss: 0.6339
Precision: 0.7293
Recall: 0.9103
F1 Score: 0.8098
Training Loss: 0.6419, Validation Loss: 0.6339
------------------------------
Epoch 6/10
Evaluation Loss: 0.6463
Precision: 0.7348
Recall: 0.9172
F1 Score: 0.8160
Training Loss: 0.6483, Validation Loss: 0.6463
------------------------------
Epoch 7/1

### Fusion Model B

In [131]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Determine the output dimensions
    output_dim = 768

    # Initialize the SMCA model A
    model = SMCAModelB(512, 256, device) # Dimension for d_out_kq and d_out_v
    model.to(device)  # Move the model to the correct device


    # Loop through the dataloaders to determine the largest output size
    max_output_size_train = find_largest_output_size(model, train_dataloader, device)
    max_output_size_val = find_largest_output_size(model, val_dataloader, device)
    max_output_size_test = find_largest_output_size(model, test_dataloader, device)

    # Get the overall largest output size
    max_output_size = max(max_output_size_train, max_output_size_val, max_output_size_test)

    # Initialize the DenseLayer with the largest output size
    dense_layer = DenseLayer(input_size=512).to(device)  # Initialize and move to the correct device

    
    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    optimizer = get_optimizer(dense_layer.parameters())  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(model=model, dense_layer=dense_layer, dataloader=test_dataloader, criterion=criterion, device=device)
    


Device: cuda
Epoch 1/10
Evaluation Loss: 1.3887
Precision: 0.7667
Recall: 0.3172
F1 Score: 0.4488
Training Loss: 1.1935, Validation Loss: 1.3887
------------------------------
Epoch 2/10
Evaluation Loss: 0.7448
Precision: 0.7626
Recall: 0.7310
F1 Score: 0.7465
Training Loss: 1.4990, Validation Loss: 0.7448
------------------------------
Epoch 3/10
Evaluation Loss: 1.2374
Precision: 0.7525
Recall: 0.5241
F1 Score: 0.6179
Training Loss: 1.2258, Validation Loss: 1.2374
------------------------------
Epoch 4/10
Evaluation Loss: 1.0247
Precision: 0.7586
Recall: 0.6069
F1 Score: 0.6743
Training Loss: 1.2109, Validation Loss: 1.0247
------------------------------
Epoch 5/10
Evaluation Loss: 0.9895
Precision: 0.7007
Recall: 0.7103
F1 Score: 0.7055
Training Loss: 1.1509, Validation Loss: 0.9895
------------------------------
Epoch 6/10
Evaluation Loss: 1.0092
Precision: 0.7706
Recall: 0.5793
F1 Score: 0.6614
Training Loss: 1.0474, Validation Loss: 1.0092
------------------------------
Epoch 7/1