### Prerequisite Packages

In [43]:
import sys
import os
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold

In [44]:
sys.path.append('../')

from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss, CustomLoss, BCEWithLogits
from modules.linear_transformation import LinearTransformations

### Data Loading

In [45]:
class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()

    def _filter_valid_files(self):
        valid_indices = []
        missing_files = []

        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_indices.append(idx)
            else:
                missing_files.append({'IMDBid': imdbid})

        # Filter id_label_df to only include valid rows
        self.id_label_df = self.id_label_df.iloc[valid_indices].reset_index(drop=True)
        self.missing_files = missing_files

        # Return valid indices
        return valid_indices

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 1, 'green': 0} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise

        return text_data, audio_data, video_data, label_data


In [46]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [47]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = '../../misc/text_features'
audio_features_dir = '../../misc/audio_features'
video_features_dir = '../../misc/video_features'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Drop unnecessary columns
id_label_df = id_label_df.drop(columns=['Movie Title', 'URL'])

full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)

# First, filter the id_label_df using the valid indices before creating dataset splits
filtered_id_label_df = id_label_df.iloc[full_dataset.valid_files].reset_index(drop=True)

# Now, perform train-test split on the filtered DataFrame
train_df, val_test_df = train_test_split(
    filtered_id_label_df, test_size=0.3, random_state=42, stratify=filtered_id_label_df['Label'])

# Further splitting remaining set into validation and test sets
val_df, test_df = train_test_split(
    val_test_df, test_size=0.5, random_state=42, stratify=val_test_df['Label'])

print("-" * 30)

# Now, create datasets based on these splits
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)


Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353
------------------------------


### Important Functions

In [48]:
# Cross Attention Function
def PairCrossAttention(modalityAlpha, modalityBeta, d_out_kq=768, d_out_v=768):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

### SMCA Functions and Model

In [49]:
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v).to(device)

    # Cross-attention: Alpha -> Beta
    alphaBeta = cross_attn(modalityAlpha, modalityBeta)  # Shape: (batch_size, num_queries, d_out_v)

    # Cross-attention: Beta -> Alpha
    betaAlpha = cross_attn(modalityBeta, modalityAlpha)  # Shape: (batch_size, num_kv, d_out_v)

    # Get the sequence lengths
    seq_len_alpha = alphaBeta.size(1)  # This is num_queries
    seq_len_beta = betaAlpha.size(1)    # This is num_kv

    # Instead of expanding, use padding or trimming
    max_seq_len = max(seq_len_alpha, seq_len_beta)

    # Ensure both alphaBeta and betaAlpha are of shape (batch_size, max_seq_len, d_out_v)
    if seq_len_alpha < max_seq_len:
        alphaBeta = torch.nn.functional.pad(alphaBeta, (0, 0, 0, max_seq_len - seq_len_alpha), value=0)

    if seq_len_beta < max_seq_len:
        betaAlpha = torch.nn.functional.pad(betaAlpha, (0, 0, 0, max_seq_len - seq_len_beta), value=0)

    # Concatenate cross-attention outputs along the feature dimension (-1)
    modalityAlphaBeta = torch.cat((alphaBeta, betaAlpha), dim=-1)  # Shape: (batch_size, max_seq_len, 2 * d_out_v)

    return modalityAlphaBeta


In [50]:
class ProjectionLayer(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ProjectionLayer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        return self.linear(x)

def SMCAStage2(modalityAlphaBeta, modalityGamma, d_out_kq, d_out_v, device):
    # modalityAlphaBeta: (batch_size, seq_len, 2 * d_out_v) [output of Stage 1]
    
    # Initialize the projection layer for modalityAlphaBeta
    projection_layer = ProjectionLayer(modalityAlphaBeta.shape[-1], d_out_v).to(device)

    # Project modalityAlphaBeta to (batch_size, seq_len, d_out_v)
    modalityAlphaBetaProjected = projection_layer(modalityAlphaBeta)

    # Initialize the cross-attention module
    cross_attn = CrossAttentionB(modalityAlphaBetaProjected.shape[-1], modalityGamma.shape[-1], d_out_kq, d_out_v).to(device)

    # Cross-attention: AlphaBeta -> Gamma
    alphaBetaGamma = cross_attn(modalityAlphaBetaProjected, modalityGamma)  # Shape: (batch_size, seq_len_alphaBeta, d_out_v)

    # Cross-attention: Gamma -> AlphaBeta
    gammaAlphaBeta = cross_attn(modalityGamma, modalityAlphaBetaProjected)  # Shape: (batch_size, seq_len_gamma, d_out_v)

    # Get the sequence lengths for both modalities
    seq_len_alphaBeta = alphaBetaGamma.size(1)
    seq_len_gamma = gammaAlphaBeta.size(1)

    # Pad the smaller sequence to match the larger one (expanding to before)
    max_seq_len = max(seq_len_alphaBeta, seq_len_gamma)

    if seq_len_alphaBeta < max_seq_len:
        alphaBetaGamma = torch.nn.functional.pad(alphaBetaGamma, (0, 0, 0, max_seq_len - seq_len_alphaBeta), value=0)

    if seq_len_gamma < max_seq_len:
        gammaAlphaBeta = torch.nn.functional.pad(gammaAlphaBeta, (0, 0, 0, max_seq_len - seq_len_gamma), value=0)

    # Concatenate along the feature dimension (-1)
    multimodal_representation = torch.cat((alphaBetaGamma, gammaAlphaBeta), dim=-1)  # Shape: (batch_size, max(seq_len_alphaBeta, seq_len_gamma), 2 * d_out_v)

    # Apply Global Average Pooling across the feature (sequence to before)
    GAP = torch.mean(multimodal_representation, dim=1)  # Shape: (batch_size, 2 * d_out_v)

    return GAP


In [51]:
class SMCAModel(nn.Module):
    def __init__(self, d_out_kq, d_out_v, device):
        super(SMCAModel, self).__init__()
        self.d_out_kq = d_out_kq
        self.d_out_v = d_out_v
        self.device = device
    
    def forward(self, modalityAlpha, modalityBeta, modalityGamma):
        # Stage 1: Cross attention between modalityAlpha and modalityBeta
        modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, self.d_out_kq, self.d_out_v, self.device)

        # Stage 2: Cross attention with modalityAlphaBeta (as query) and modalityGamma (as key-value)
        multimodal_representation = SMCAStage2(modalityAlphaBeta, modalityGamma, self.d_out_kq, self.d_out_v, self.device)

        return multimodal_representation

In [52]:
def train_model(model, dense_layer, dataloader, criterion, optimizer, device):
    model.train()
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for text_features, audio_features, video_features, targets in dataloader:
        text_features, audio_features, video_features, targets = (
            text_features.to(device),
            audio_features.to(device),
            video_features.to(device),
            targets.to(device).view(-1)
        )
        
        optimizer.zero_grad()
        
        # Pass inputs through SMCA model
        # Squeeze the audio features to remove the extra dimension
        audio_features = audio_features.squeeze(1) 

        # Apply linear transformations to match dimensions
        linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768) 
        linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768)   
        linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768)    

        # Transform features to match the target dimension
        modalityAlpha = linear_transform_Alpha(audio_features)  
        modalityBeta = linear_transform_Beta(text_features)    
        modalityGamma = linear_transform_Gamma(video_features)
        
        outputs = model(
            modalityAlpha=modalityAlpha,  # Ensure to pass transformed modalities
            modalityBeta=modalityBeta,
            modalityGamma=modalityGamma,
        )

        # Pass the fused features through the dense layer
        predictions = dense_layer(outputs).view(-1)

        # Compute loss
        loss = criterion(predictions, targets)
        total_loss += loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    return total_loss / len(dataloader)

In [53]:
def evaluate_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()
    total_loss = 0.0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)  # Unweighted F1 Score

    precision_metric.reset()
    recall_metric.reset()
    f1_metric.reset()

    total_tp = 0
    total_fp = 0
    total_fn = 0

    with torch.no_grad():
        for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
            audio_features = audio_features.squeeze(1) 

            # Apply linear transformations to match dimensions
            linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768) 
            linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768)   
            linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768)    

            # Transform features to match the target dimension
            modalityAlpha = linear_transform_Alpha(audio_features)  
            modalityBeta = linear_transform_Beta(text_features)    
            modalityGamma = linear_transform_Gamma(video_features)
            
            outputs = model(modalityAlpha=modalityAlpha, modalityBeta=modalityBeta, modalityGamma=modalityGamma)

            # Pass the fused features through the dense layer
            predictions = dense_layer(outputs).view(-1) 

            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()

            # Update precision, recall, and unweighted F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

            # Calculate TP, FP, FN for weighted F1 Score
            tp = ((preds == 1) & (targets == 1)).sum().item()
            fp = ((preds == 1) & (targets == 0)).sum().item()
            fn = ((preds == 0) & (targets == 1)).sum().item()

            total_tp += tp
            total_fp += fp
            total_fn += fn

    # Compute precision, recall, and unweighted F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    unweighted_f1_score = f1_metric.compute().item()  # Unweighted F1 Score

    # Compute the weighted F1 Score
    if total_tp + total_fp > 0 and total_tp + total_fn > 0:
        weighted_precision = total_tp / (total_tp + total_fp)
        weighted_recall = total_tp / (total_tp + total_fn)
        weighted_f1_score = 2 * (weighted_precision * weighted_recall) / (weighted_precision + weighted_recall)

        # Print metrics for debugging
        print(f"TP: {total_tp}, FP: {total_fp}, FN: {total_fn}")
        print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, Unweighted F1 Score: {unweighted_f1_score:.4f}")
        print(f"Weighted Precision: {weighted_precision:.4f}, Weighted Recall: {weighted_recall:.4f}, Weighted F1 Score: {weighted_f1_score:.4f}")
    else:
        weighted_f1_score = 0.0  # or np.nan, depending on your preference

    

    average_loss = total_loss / len(dataloader)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Unweighted F1 Score: {unweighted_f1_score:.4f}")
    print(f"Weighted F1 Score: {weighted_f1_score:.4f}")
    
    return average_loss, precision, recall, unweighted_f1_score, weighted_f1_score

In [54]:
def test_model(model, dense_layer, dataloader, criterion, device):
    model.eval()
    dense_layer.eval()
    total_loss = 0.0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)  # Unweighted F1 Score

    precision_metric.reset()
    recall_metric.reset()
    f1_metric.reset()

    total_tp = 0
    total_fp = 0
    total_fn = 0

    with torch.no_grad():
        for text_features, audio_features, video_features, targets in dataloader:
            text_features, audio_features, video_features, targets = (
                text_features.to(device),
                audio_features.to(device),
                video_features.to(device),
                targets.to(device).view(-1)
            )
            
            # Pass inputs through SMCA model
            audio_features = audio_features.squeeze(1) 

            # Apply linear transformations to match dimensions
            linear_transform_Alpha = LinearTransformations(audio_features.shape[-1], 768) 
            linear_transform_Beta = LinearTransformations(text_features.shape[-1], 768)   
            linear_transform_Gamma = LinearTransformations(video_features.shape[-1], 768)    

            # Transform features to match the target dimension
            modalityAlpha = linear_transform_Alpha(audio_features)  
            modalityBeta = linear_transform_Beta(text_features)    
            modalityGamma = linear_transform_Gamma(video_features)
            
            outputs = model(modalityAlpha=modalityAlpha, modalityBeta=modalityBeta, modalityGamma=modalityGamma)

            # Pass the fused features through the dense layer
            predictions = dense_layer(outputs).view(-1) 

            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()

            # Update precision, recall, and unweighted F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

            # Calculate TP, FP, FN for weighted F1 Score
            tp = ((preds == 1) & (targets == 1)).sum().item()
            fp = ((preds == 1) & (targets == 0)).sum().item()
            fn = ((preds == 0) & (targets == 1)).sum().item()

            total_tp += tp
            total_fp += fp
            total_fn += fn

    # Compute precision, recall, and unweighted F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    unweighted_f1_score = f1_metric.compute().item()  # Unweighted F1 Score

    # Compute the weighted F1 Score
    if total_tp + total_fp > 0 and total_tp + total_fn > 0:
        weighted_precision = total_tp / (total_tp + total_fp)
        weighted_recall = total_tp / (total_tp + total_fn)
        weighted_f1_score = 2 * (weighted_precision * weighted_recall) / (weighted_precision + weighted_recall)
    else:
        weighted_f1_score = 0.0  # or np.nan, depending on your preference

    average_loss = total_loss / len(dataloader)
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Unweighted F1 Score: {unweighted_f1_score:.4f}")
    print(f"Weighted F1 Score: {weighted_f1_score:.4f}")
    
    return average_loss, precision, recall, unweighted_f1_score, weighted_f1_score

In [55]:
def get_optimizer(parameters, lr=1e-3):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [None]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Determine the output dimensions
    output_dim = 768

    # Initialize the SMCA model A
    model = SMCAModel(768, 768, device)  # Dimension for d_out_kq and d_out_v
    model.to(device)  # Move the model to the correct device

    # Initialize the DenseLayer with the largest output size
    dense_layer = DenseLayer(input_size=output_dim*2).to(device)  # Initialize and move to the correct device

    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    
    for param in model.parameters():
        if param.grad is None:
            print("No gradient for:", param)
    optimizer = get_optimizer(list(model.parameters()) + list(dense_layer.parameters()))


    # Training loop
    num_epochs = 20  # Set the number of epochs you want to train for
   
    for epoch in range(num_epochs):
        print("-" * 30)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)

        # Validate step
        val_loss, precision, recall, f1_score, weighted_f1 = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Testing the model
    print("-" * 30)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score, weighted_f1 = test_model(model=model, dense_layer=dense_layer, dataloader=test_dataloader, criterion=criterion, device=device)


Device: cpu
------------------------------
Epoch 1/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weighted F1 Score: 0.0000
Training Loss: 0.6872, Validation Loss: 0.6815
------------------------------
Epoch 2/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weighted F1 Score: 0.0000
Training Loss: 0.6746, Validation Loss: 0.6683
------------------------------
Epoch 3/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weighted F1 Score: 0.0000
Training Loss: 0.6638, Validation Loss: 0.6620
------------------------------
Epoch 4/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weighted F1 Score: 0.0000
Training Loss: 0.6538, Validation Loss: 0.6475
------------------------------
Epoch 5/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weighted F1 Score: 0.0000
Training Loss: 0.6429, Validation Loss: 0.6397
------------------------------
Epoch 6/20
Precision: 0.0000
Recall: 0.0000
Unweighted F1 Score: 0.0000
Weigh