In [18]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split, KFold

In [19]:
sys.path.append('../../')

from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss
from modules.linear_transformation import LinearTransformations
from modules.output_max import output_max
from evaluation_validation.train_model import train_model
from evaluation_validation.evaluate_model import evaluate_model
from evaluation_validation.test_model import test_model

### Data Loading

In [20]:

class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()


    def _filter_valid_files(self):
        valid_files = []
        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_files.append(idx)
            else:
                self.missing_files.append({'IMDBid': imdbid})

        return valid_files

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 0, 'green': 1} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)  # Ensure labels are integers
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise
        
        # Debugging output
        if label_data.shape[0] == 0:
            print(f"Empty target for IMDBid {imdbid} at index {idx}")

        return text_data, audio_data, video_data, label_data


In [21]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [22]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = 'D:\\Projects\\Thesis\\Text'
audio_features_dir = 'D:\\Projects\\Thesis\\Audio'
video_features_dir = 'D:\\Projects\\Thesis\\Video'


# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Splitting data for training, validation, and testing
train_df, val_test_df = train_test_split(id_label_df, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=42)

# Create datasets
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0, collate_fn=collate_fn)

# Combine all data for K-fold cross-validation
full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)


Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


### SMCA Model Classes

In [23]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stage 1 of SMCA
def SMCAStage1(modalityAlpha, modalityBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v).to(device)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

In [24]:
def SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, d_out_kq, d_out_v, device):
    cross_attn = CrossAttentionB(
        d_in_query=modalityGamma.shape[-1],
        d_in_kv=modalityAlphaBeta.shape[-1],
        d_out_kq=d_out_kq,
        d_out_v=d_out_v
    ).to(device)
    multimodal_representation = cross_attn(modalityGamma, modalityAlphaBeta)
    return multimodal_representation

In [25]:
class SMCAModelB(nn.Module):
    def __init__(self, d_out_kq, d_out_v, device):
        super(SMCAModelB, self).__init__()
        self.d_out_kq = d_out_kq
        self.d_out_v = d_out_v
        self.device = device
        
    def forward(self, modalityAlpha, modalityBeta, modalityGamma, device):
        # Stage 1: Cross attention between modalityAlpha and modalityBeta
        modalityAlphaBeta = SMCAStage1(modalityAlpha, modalityBeta, self.d_out_kq, self.d_out_v, device)
        
        # Stage 2: Cross attention with modalityGamma (as query) and modalityAlphaBeta (as key-value)
        multimodal_representation = SMCAStage2_ModelB(modalityGamma, modalityAlphaBeta, self.d_out_kq, self.d_out_v, device)
        
        # Flatten the output
        return torch.flatten(multimodal_representation, start_dim=1)  # Flatten all dimensions except batch


### Model Training Functions

In [26]:
def get_optimizer(parameters, lr=1e-4):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

### Fusion Model B

In [27]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Device configuration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")

    # Determine the output dimensions
    output_dim = 768

    # Initialize the SMCA model A
    model = SMCAModelB(512, 256, device) # Dimension for d_out_kq and d_out_v
    model.to(device)  # Move the model to the correct device


    # Loop through the dataloaders to determine the largest output size
    max_output_size_train = output_max(model, train_dataloader, device)
    max_output_size_val = output_max(model, val_dataloader, device)
    max_output_size_test = output_max(model, test_dataloader, device)

    # Get the overall largest output size
    max_output_size = max(max_output_size_train, max_output_size_val, max_output_size_test)

    # Initialize the DenseLayer with the largest output size
    dense_layer = DenseLayer(input_size=512).to(device)  # Initialize and move to the correct device

    
    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    optimizer = get_optimizer(dense_layer.parameters())  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(model=model, dense_layer=dense_layer, dataloader=train_dataloader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(model=model, dense_layer=dense_layer, dataloader=val_dataloader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(model=model, dense_layer=dense_layer, dataloader=test_dataloader, criterion=criterion, device=device)
    


Device: cuda
Epoch 1/10
Evaluation Loss: 0.6920
Precision: 0.7455
Recall: 0.5655
F1 Score: 0.6431
Training Loss: 0.6976, Validation Loss: 0.6920
------------------------------
Epoch 2/10
Evaluation Loss: 0.6914
Precision: 0.6818
Recall: 0.5172
F1 Score: 0.5882
Training Loss: 0.6925, Validation Loss: 0.6914
------------------------------
Epoch 3/10
Evaluation Loss: 0.6897
Precision: 0.7376
Recall: 0.7172
F1 Score: 0.7273
Training Loss: 0.6922, Validation Loss: 0.6897
------------------------------
Epoch 4/10
Evaluation Loss: 0.6884
Precision: 0.7424
Recall: 0.6759
F1 Score: 0.7076
Training Loss: 0.6891, Validation Loss: 0.6884
------------------------------
Epoch 5/10
Evaluation Loss: 0.6845
Precision: 0.7452
Recall: 0.8069
F1 Score: 0.7748
Training Loss: 0.6866, Validation Loss: 0.6845
------------------------------
Epoch 6/10
Evaluation Loss: 0.6842
Precision: 0.7322
Recall: 0.9241
F1 Score: 0.8171
Training Loss: 0.6841, Validation Loss: 0.6842
------------------------------
Epoch 7/1