### Prerequisite Packages

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold

In [None]:
sys.path.append('../')

from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss, CustomLoss, BCEWithLogits
from modules.linear_transformation import LinearTransformations

### Data Loading

In [None]:
class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()


    def _filter_valid_files(self):
        valid_indices = []
        missing_files = []
        
        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_indices.append(idx)
            else:
                missing_files.append({'IMDBid': imdbid})

        # Filter id_label_df to only include valid rows
        self.id_label_df = self.id_label_df.iloc[valid_indices].reset_index(drop=True)
        self.missing_files = missing_files

        # Return valid indices
        return valid_indices
    
    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 1, 'green': 0} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)  # Ensure labels are integers
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise

        return text_data, audio_data, video_data, label_data


In [4]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [5]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\textStream_BERT\\feature_vectors\\feature_vectors'
audio_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\audio_fe\\logmel_spectrograms'
video_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\visualStream_ViT\\feature_vectors'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


### Important Functions

In [6]:
# Cross Attention Function
def PairCrossAttention(modalityAlpha, modalityBeta, d_out_kq=768, d_out_v=768):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

### SMCA Functions and Model

In [7]:
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v).to(device)

    # Cross-attention: Alpha -> Beta
    alphaBeta = cross_attn(modalityAlpha, modalityBeta)  # Shape: (batch_size, num_queries, d_out_v)

    # Cross-attention: Beta -> Alpha
    betaAlpha = cross_attn(modalityBeta, modalityAlpha)  # Shape: (batch_size, num_kv, d_out_v)

    # Get the sequence lengths
    seq_len_alpha = alphaBeta.size(1)  # This is num_queries
    seq_len_beta = betaAlpha.size(1)    # This is num_kv

    # Instead of expanding, use padding or trimming
    max_seq_len = max(seq_len_alpha, seq_len_beta)

    # Ensure both alphaBeta and betaAlpha are of shape (batch_size, max_seq_len, d_out_v)
    if seq_len_alpha < max_seq_len:
        alphaBeta = torch.nn.functional.pad(alphaBeta, (0, 0, 0, max_seq_len - seq_len_alpha), value=0)

    if seq_len_beta < max_seq_len:
        betaAlpha = torch.nn.functional.pad(betaAlpha, (0, 0, 0, max_seq_len - seq_len_beta), value=0)

    # Concatenate cross-attention outputs along the feature dimension (-1)
    modalityAlphaBeta = torch.cat((alphaBeta, betaAlpha), dim=-1)  # Shape: (batch_size, max_seq_len, 2 * d_out_v)

    return modalityAlphaBeta


In [8]:
import torch
import torch.nn as nn

class ProjectionLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ProjectionLayer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

def SMCAStage2(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, device):
    # modalityAlphaBeta: (batch_size, seq_len, 2 * d_out_v) [output of Stage 1]
    
    # Initialize the projection layer for modalityAlphaBeta
    projection_layer = ProjectionLayer(modalityAlphaBeta.shape[-1], d_out_v).to(device)

    # Project modalityAlphaBeta to (batch_size, seq_len, d_out_v)
    modalityAlphaBetaProjected = projection_layer(modalityAlphaBeta)

    # Initialize the cross-attention module
    cross_attn = CrossAttentionB(modalityAlphaBetaProjected.shape[-1], modalityGamma.shape[-1], d_out_kq, d_out_v).to(device)

    # Cross-attention: AlphaBeta -> Gamma
    alphaBetaGamma = cross_attn(modalityAlphaBetaProjected, modalityGamma)  # Shape: (batch_size, seq_len_alphaBeta, d_out_v)

    # Cross-attention: Gamma -> AlphaBeta
    gammaAlphaBeta = cross_attn(modalityGamma, modalityAlphaBetaProjected)  # Shape: (batch_size, seq_len_gamma, d_out_v)

    # Get the sequence lengths for both modalities
    seq_len_alphaBeta = alphaBetaGamma.size(1)
    seq_len_gamma = gammaAlphaBeta.size(1)

    # Pad the smaller sequence to match the larger one (expanding to before)
    max_seq_len = max(seq_len_alphaBeta, seq_len_gamma)

    if seq_len_alphaBeta < max_seq_len:
        alphaBetaGamma = torch.nn.functional.pad(alphaBetaGamma, (0, 0, 0, max_seq_len - seq_len_alphaBeta), value=0)

    if seq_len_gamma < max_seq_len:
        gammaAlphaBeta = torch.nn.functional.pad(gammaAlphaBeta, (0, 0, 0, max_seq_len - seq_len_gamma), value=0)

    # Concatenate along the feature dimension (-1)
    multimodal_representation = torch.cat((alphaBetaGamma, gammaAlphaBeta), dim=-1)  # Shape: (batch_size, max(seq_len_alphaBeta, seq_len_gamma), 2 * d_out_v)

    # Apply Global Average Pooling across the feature (sequence to before)
    GAP = torch.mean(multimodal_representation, dim=1)  # Shape: (batch_size, 2 * d_out_v)

    return GAP


In [9]:
class SMCAModel(nn.Module):
    def __init__(self, d_out_kq, d_out_v, device):
        super(SMCAModel, self).__init__()
        self.d_out_kq = d_out_kq
        self.d_out_v = d_out_v
        self.device = device
    
    def forward(self, modalityAlpha, modalityBeta, modalityGamma):
        # Stage 1: Cross attention between modalityAlpha and modalityBeta
        modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, self.d_out_kq, self.d_out_v, self.device)

        # Stage 2: Cross attention with modalityAlphaBeta (as query) and modalityGamma (as key-value)
        multimodal_representation = SMCAStage2(modalityAlphaBeta, modalityGamma, self.d_out_kq, self.d_out_v, self.device)
 
       
        return multimodal_representation

Test on one instance

In [10]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SMCAModel(768, 768, device) # Dimension for d_out_kq and d_out_v

# Select the first file from each modality directories (for testing)
video_file_name, video_feature = video_features[5]
audio_file_name, audio_feature = audio_features[5]
text_file_name, text_feature = text_features[5]

# Print the file names
print("\nSelected File Names:")
print("Audio file:", audio_file_name)
print("Video file:", video_file_name)
print("Text file:", text_file_name)

video_feature = video_feature.unsqueeze(0)  # Add batch dimension
text_feature = text_feature.unsqueeze(0)    # Add batch dimension

modalityAlpha=audio_feature.to(device) 
modalityBeta=text_feature.to(device)
modalityGamma=video_feature.to(device)

# Apply linear transformation to match dimensions
linear_transform_Alpha = LinearTransformations(modalityAlpha.shape[-1], 768).to(device)
linear_transform_Beta = LinearTransformations(modalityBeta.shape[-1], 768).to(device)
linear_transform_Gamma = LinearTransformations(modalityGamma.shape[-1], 768).to(device)

modalityAlpha = linear_transform_Alpha(modalityAlpha).to(device)
modalityBeta = linear_transform_Beta(modalityBeta).to(device)
modalityGamma = linear_transform_Gamma(modalityGamma).to(device)

print("Audio: ",modalityAlpha.shape)
print("Text: ",modalityBeta.shape)
print("Video: ",modalityGamma.shape)

outputs = model(modalityAlpha, modalityBeta, modalityGamma)

print("Stage 2:", outputs.shape)





Selected File Names:
Audio file: C:\Users\edjin\OneDrive\Documents\Programming Files\Thesis\SMCA\misc\audio_fe\logmel_spectrograms\feature_tt0042767.npy
Video file: C:\Users\edjin\OneDrive\Documents\Programming Files\Thesis\SMCA\misc\visualStream_ViT\feature_vectors\tt0042767_features.npy
Text file: C:\Users\edjin\OneDrive\Documents\Programming Files\Thesis\SMCA\misc\textStream_BERT\feature_vectors\feature_vectors\tt0042767.npy
Audio:  torch.Size([1, 197, 768])
Text:  torch.Size([1, 768])
Video:  torch.Size([1, 113, 768])
Stage 2: torch.Size([1, 1536])


Test on Entire Dataset

In [11]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\textStream_BERT\\feature_vectors\\feature_vectors'
audio_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\audio_fe\\logmel_spectrograms'
video_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\visualStream_ViT\\feature_vectors'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Drop unnecessary columns
id_label_df = id_label_df.drop(columns=['Movie Title', 'URL'])

# Splitting data for training, validation, and testing
train_df, val_test_df = train_test_split(id_label_df, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

print("-" * 30)

# Create datasets
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)

# Combine all data for K-fold cross-validation
full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)

Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353
------------------------------
Missing files:
IMDBid: tt2494280
IMDBid: tt1724962
IMDBid: tt1152836
IMDBid: tt0389790
IMDBid: tt3053228
IMDBid: tt1045778
IMDBid: tt1758795
IMDBid: tt0099385
IMDBid: tt2917484
IMDBid: tt4769836
IMDBid: tt0089652
IMDBid: tt0465494
IMDBid: tt3675748
IMDBid: tt2126362
IMDBid: tt0988083
IMDBid: tt2101341
IMDBid: tt0401997
IMDBid: tt1661461
IMDBid: tt1313139
IMDBid: tt1094661
IMDBid: tt5162658
IMDBid: tt0104839
IMDBid: tt1288558
IMDBid: tt5962210
IMDBid: tt2937696
IMDBid: tt0284363
IMDBid: tt5580390
IMDBid: tt2293750
IMDBid: tt2980472
IMDBid: tt0082186
IMDBid: tt0924129
IMDBid: tt0988595
IMDBid: tt1349482
IMDBid: tt4158096
IMDBid: tt1403241
IMDBid: tt2713642
IMDBid: tt1682940
IMDBid: tt10327354
IMDBid: tt1087842
IMDBid: tt1800302
IMDBid: tt0113855
IMDBid: tt2504022
IMDBid: tt7248248
IMDBid: tt1720164
IMDBid: tt1336621
IMDBid: t

In [12]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = SMCAModel(768, 768, device)  # Dimension for d_out_kq and d_out_v

print(f"Model parameters: {list(model.parameters())}")


# Training loop
for text_features, audio_features, video_features, targets in train_dataloader:
    # Move features to the specified device
    text_features = text_features.to(device)
    audio_features = audio_features.to(device)
    video_features = video_features.to(device)

    # Squeeze the audio features to remove the extra dimension
    audio_features = audio_features.squeeze(1) 

    # Print dimensions for debugging
    print("Text Dimension: ", text_features.shape)  
    print("Audio Dimension: ", audio_features.shape)  
    print("Video Dimension: ", video_features.shape) 

    # Apply linear transformations to match dimensions
    linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768).to(device) 
    linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768).to(device)   
    linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768).to(device)    

    # Transform features to match the target dimension
    modalityAlpha = linear_transform_Alpha(audio_features).to(device)  
    modalityBeta = linear_transform_Beta(text_features).to(device)    
    modalityGamma = linear_transform_Gamma(video_features).to(device)

    # Print shapes after transformation to verify the batch dimension
    print("Transformed Audio Dimension: ", modalityAlpha.shape)  
    print("Transformed Text Dimension: ", modalityBeta.shape)    
    print("Transformed Video Dimension: ", modalityGamma.shape)  

    # Pass inputs through the SMCA model
    outputs = model(
        modalityAlpha=modalityAlpha,  # Ensure to pass transformed modalities
        modalityBeta=modalityBeta,
        modalityGamma=modalityGamma,
    )

    print("Stage 2:", outputs.shape)  # Check the output shape
    print("--------")


Model parameters: []
Text Dimension:  torch.Size([16, 1024])
Audio Dimension:  torch.Size([16, 197, 768])
Video Dimension:  torch.Size([16, 180, 768])
Transformed Audio Dimension:  torch.Size([16, 197, 768])
Transformed Text Dimension:  torch.Size([16, 768])
Transformed Video Dimension:  torch.Size([16, 180, 768])
Stage 2: torch.Size([16, 1536])
--------
Text Dimension:  torch.Size([16, 1024])
Audio Dimension:  torch.Size([16, 197, 768])
Video Dimension:  torch.Size([16, 162, 768])
Transformed Audio Dimension:  torch.Size([16, 197, 768])
Transformed Text Dimension:  torch.Size([16, 768])
Transformed Video Dimension:  torch.Size([16, 162, 768])
Stage 2: torch.Size([16, 1536])
--------
Text Dimension:  torch.Size([16, 1024])
Audio Dimension:  torch.Size([16, 197, 768])
Video Dimension:  torch.Size([16, 169, 768])
Transformed Audio Dimension:  torch.Size([16, 197, 768])
Transformed Text Dimension:  torch.Size([16, 768])
Transformed Video Dimension:  torch.Size([16, 169, 768])
Stage 2: tor

### Model Training

In [13]:
def train_model(model, dense_layer, dataloader, criterion, optimizer, device):
    model.train()
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for text_features, audio_features, video_features, targets in dataloader:
        text_features, audio_features, video_features, targets = (
            text_features.to(device),
            audio_features.to(device),
            video_features.to(device),
            targets.to(device).view(-1)
        )
        
        optimizer.zero_grad()
        
        # Pass inputs through SMCA model
        # Squeeze the audio features to remove the extra dimension
        audio_features = audio_features.squeeze(1) 

        # Apply linear transformations to match dimensions
        linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768).to(device)   
        linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768).to(device)      
        linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768).to(device)       

        # Transform features to match the target dimension
        modalityAlpha = linear_transform_Alpha(audio_features).to(device)     
        modalityBeta = linear_transform_Beta(text_features).to(device)       
        modalityGamma = linear_transform_Gamma(video_features).to(device)   
        
        outputs = model(
            modalityAlpha=modalityAlpha,  # Ensure to pass transformed modalities
            modalityBeta=modalityBeta,
            modalityGamma=modalityGamma,
        )

        # Pass the fused features through the dense layer
        predictions = dense_layer(outputs).view(-1)

        # Compute loss
        loss = criterion(predictions, targets)
        total_loss += loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [14]:
def evaluate_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()
    total_loss = 0.0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    precision_metric.reset()
    recall_metric.reset()
    f1_metric.reset()

    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
         for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
            # Squeeze the audio features to remove the extra dimension
            audio_features = audio_features.squeeze(1) 

            # Apply linear transformations to match dimensions
            linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768).to(device)    
            linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768).to(device)      
            linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768).to(device)       

            # Transform features to match the target dimension
            modalityAlpha = linear_transform_Alpha(audio_features).to(device)     
            modalityBeta = linear_transform_Beta(text_features).to(device)      
            modalityGamma = linear_transform_Gamma(video_features).to(device)   
            
            outputs = model(modalityAlpha=modalityAlpha, modalityBeta=modalityBeta, modalityGamma=modalityGamma)

            # Pass the fused features through the dense layer
            predictions = dense_layer(outputs).view(-1) 

            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    
    return average_loss, precision, recall, f1_score


In [15]:
def test_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
            # Squeeze the audio features to remove the extra dimension
            audio_features = audio_features.squeeze(1) 

            # Apply linear transformations to match dimensions
            linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768).to(device)    
            linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768).to(device)      
            linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768).to(device)       

            # Transform features to match the target dimension
            modalityAlpha = linear_transform_Alpha(audio_features).to(device)     
            modalityBeta = linear_transform_Beta(text_features).to(device)       
            modalityGamma = linear_transform_Gamma(video_features).to(device)   
            
            outputs = model(modalityAlpha=modalityAlpha, modalityBeta=modalityBeta, modalityGamma=modalityGamma)
            
            # Pass the fused features through the dense layer
            predictions = dense_layer(outputs).view(-1)  
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


In [16]:
def get_optimizer(parameters, lr=1e-3):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [17]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Determine the output dimensions
    output_dim = 768

    # Initialize the SMCA model A
    model = SMCAModel(768, 768, device)  # Dimension for d_out_kq and d_out_v
    model.to(device)  # Move the model to the correct device

    # Initialize the DenseLayer with the largest output size
    dense_layer = DenseLayer(input_size=output_dim*2).to(device)  # Initialize and move to the correct device

    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    
    for param in model.parameters():
        if param.grad is None:
            print("No gradient for:", param)
    optimizer = get_optimizer(list(model.parameters()) + list(dense_layer.parameters()))


    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for
   
    for epoch in range(num_epochs):
        print("-" * 30)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)

        # Validate step
        val_loss, precision, recall, f1_score = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Testing the model
    print("-" * 30)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(model=model, dense_layer=dense_layer, dataloader=test_dataloader, criterion=criterion, device=device)


Device: cuda
------------------------------
Epoch 1/10
Precision: 0.1111
Recall: 0.0192
F1 Score: 0.0328
Training Loss: 0.6867, Validation Loss: 0.6828
------------------------------
Epoch 2/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6756, Validation Loss: 0.6671
------------------------------
Epoch 3/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6621, Validation Loss: 0.6585
------------------------------
Epoch 4/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6514, Validation Loss: 0.6505
------------------------------
Epoch 5/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6434, Validation Loss: 0.6431
------------------------------
Epoch 6/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6346, Validation Loss: 0.6410
------------------------------
Epoch 7/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Training Loss: 0.6239, Validation Loss: 0.6256
--------------

### K-Fold Evaluation

In [18]:
def cross_validate_model(
    dataset, 
    model_class, 
    dense_layer_class, 
    num_folds, 
    num_epochs, 
    batch_size, 
    learning_rate,
    output_file,
    device=None
):
    # Set device configuration
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Creates the directory if it does not exist
    
    # Determine the output dimensions
    output_dim = 768

    # Initialize the KFold splitter
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # lists to store metrics for each fold
    fold_losses = []
    fold_precisions = []
    fold_recalls = []
    fold_f1_scores = []
    f1_scores_per_fold = []  # List to store F1 scores for each fold
    


    # Perform K-Fold Cross-Validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print("-" * 50)
        print(f"------ Fold {fold + 1 }/{num_folds} ------")

        # Create data loaders for the train and validation sets
        train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_idx)
        
        train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, collate_fn=collate_fn)
        val_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler, collate_fn=collate_fn)
        
        # Initialize the model, dense layer, criterion, and optimizer for each fold
        model = model_class(768, 768, device).to(device)
        
        dense_layer = dense_layer_class(input_size=output_dim*2).to(device)
        criterion = BCELoss()
        optimizer = get_optimizer(list(model.parameters()) + list(dense_layer.parameters()), lr=learning_rate)

        # Training loop for each fold
        for epoch in range(num_epochs):
            print(f"Epoch {epoch + 1}/{num_epochs}")
            
            # Train and evaluate the model on the training and validation sets
            train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)
            val_loss, precision, recall, f1_score = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)
            
            print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
            print(f"Validation Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")
        
        # Store the validation metrics for this fold
        print(f"Results for Fold {fold}: Validation Loss = {val_loss:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1_score:.4f}")
        fold_losses.append(val_loss)
        fold_precisions.append(precision)
        fold_recalls.append(recall)
        fold_f1_scores.append(f1_score)
        f1_scores_per_fold.append(f1_score)  # Save F1 score for the current fold

    # Calculate the average metrics across all folds
    avg_loss = np.mean(fold_losses)
    avg_precision = np.mean(fold_precisions)
    avg_recall = np.mean(fold_recalls)
    avg_f1_score = np.mean(fold_f1_scores)

    print("-" * 50)
    print("\nK-Fold Cross-Validation Results:")
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1 Score: {avg_f1_score:.4f}")

    # Save F1 scores per fold to a .npy file
    np.save(output_file, np.array(f1_scores_per_fold))
    print(f"F1 scores per fold saved to {output_file}")

    return avg_loss, avg_precision, avg_recall, avg_f1_score

In [19]:
# Run k-fold cross-validation   
results = cross_validate_model(
    dataset=full_dataset,
    model_class=SMCAModel,
    dense_layer_class=DenseLayer,
    num_folds=10,
    num_epochs=10,
    batch_size=32,
    learning_rate=1e-5,
    output_file="results/SMCA-F1_scores.npy"
)


Device: cuda
--------------------------------------------------
------ Fold 1/10 ------
Epoch 1/10
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
Train Loss: 0.6879, Validation Loss: 0.6845
Validation Precision: 0.0000, Recall: 0.0000, F1 Score: 0.0000
Epoch 2/10
Precision: 0.3333
Recall: 0.0294
F1 Score: 0.0541
Train Loss: 0.6889, Validation Loss: 0.6865
Validation Precision: 0.3333, Recall: 0.0294, F1 Score: 0.0541
Epoch 3/10
Precision: 0.0968
Recall: 0.0882
F1 Score: 0.0923
Train Loss: 0.6887, Validation Loss: 0.6918
Validation Precision: 0.0968, Recall: 0.0882, F1 Score: 0.0923
Epoch 4/10
Precision: 0.2083
Recall: 0.2941
F1 Score: 0.2439
Train Loss: 0.6894, Validation Loss: 0.6929
Validation Precision: 0.2083, Recall: 0.2941, F1 Score: 0.2439
Epoch 5/10
Precision: 0.2500
Recall: 0.2353
F1 Score: 0.2424
Train Loss: 0.6879, Validation Loss: 0.6873
Validation Precision: 0.2500, Recall: 0.2353, F1 Score: 0.2424
Epoch 6/10
Precision: 0.2581
Recall: 0.2353
F1 Score: 0.2462
Train Loss: