In [17]:
import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold

In [18]:
# Append the path for module imports
sys.path.append('../')

# Import custom modules
from modules.cross_attention import CrossAttention
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss

### Data Loading

In [19]:
class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()


    def _filter_valid_files(self):
        valid_files = []
        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_files.append(idx)
            else:
                self.missing_files.append({'IMDBid': imdbid})

        # Print missing files after checking all
        if self.missing_files:
            print("Missing files:")
            for item in self.missing_files:
                print(f"IMDBid: {item['IMDBid']}")
            print(f"Total IMDB IDs with missing files: {len(self.missing_files)}")
        else:
            print("No missing files.")

        return valid_files

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 0, 'green': 1} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)  # Ensure labels are integers
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise

        return text_data, audio_data, video_data, label_data


In [20]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [21]:
# Load the labels DataFrame
id_label_df = pd.read_excel('/Users/kyleandrecastro/Documents/GitHub/SMCA/misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = '/Users/kyleandrecastro/Documents/GitHub/SMCA/misc/textStream_BERT/feature_vectors/feature_vectors/'
audio_features_dir = '/Users/kyleandrecastro/Documents/GitHub/SMCA/misc/audio_fe/logmel_spectrograms/'
video_features_dir = '/Users/kyleandrecastro/Documents/GitHub/SMCA/misc/visualStream_ViT/feature_vectors/'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


In [22]:
# Cross Attention Function
def PairCrossAttention(modalityAlpha, modalityBeta, d_out_kq=768, d_out_v=768):
    cross_attn = CrossAttention(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

In [23]:
def HadamardProduct(tensor1, tensor2):
    # Ensure both tensors have the same shape
    if tensor1.shape != tensor2.shape:
        raise ValueError("Tensors must have the same shape for Hadamard product.")
    
    # Compute the Hadamard product
    return tensor1 * tensor2

In [24]:
class Flatten(nn.Module):
    def forward(self, x):
        # Flatten the input tensor except the batch dimension
        return x.view(x.size(0), -1)

In [25]:
class EmbracementLayer(nn.Module):
    def __init__(self, d_in, d_out):
        super(EmbracementLayer, self).__init__()
        self.fc = nn.Linear(d_in, d_out)
        self.norm = nn.LayerNorm(d_out)
        self.activation = nn.ReLU()

    def forward(self, video_features, audio_features, text_features):
        # Concatenate features along the last dimension
        combined_features = torch.cat([video_features, audio_features, text_features], dim=-1)
        
        # Apply linear transformation
        transformed_features = self.fc(combined_features)
        
        # Apply normalization and activation
        norm_features = self.norm(transformed_features)
        output = self.activation(norm_features)
        
        return output

In [26]:
def get_optimizer(parameters, lr=1e-3):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [27]:
def train_model(dense_layer, dataloader, criterion, optimizer, device):
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for batch in dataloader:
        # Assuming batch is a tuple (fused_features, targets)
        fused_features, targets = batch

        # Move tensors to the specified device
        fused_features = fused_features.to(device)
        targets = targets.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Pass the fused features through the dense layer
        predictions = dense_layer(fused_features).squeeze()  # Ensure correct shape

        # Compute loss
        loss = criterion(predictions, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [28]:
def evaluate_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)
    
    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Use fused features directly
                targets.to(device).squeeze()
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()  
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Evaluation Loss: {average_loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    
    return average_loss, precision, recall, f1_score


In [29]:
def test_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Move fused features to device
                targets.to(device).squeeze()  # Ensure targets are the correct shape
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()  
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


### Fusion

In [30]:
# Initialize list to store fused features for each file
fused_features_list = []
labels_list = []
batch_size = 32  # Adjust based on your available memory

for batch_start in range(0, len(text_features), batch_size):
    batch_end = min(batch_start + batch_size, len(text_features))
    batch_text_features = text_features[batch_start:batch_end]
    batch_audio_features = audio_features[batch_start:batch_end]
    batch_video_features = video_features[batch_start:batch_end]
    print(f"Batch: {batch_start}")
    # Loop through all the files in the dataset
    for i in range(len(batch_text_features)):
        # Extract features for the current file
        text_file_name, text_feature = text_features[i]  # Renamed to avoid shadowing
        audio_file_name, audio_feature = audio_features[i]  # Renamed to avoid shadowing
        video_file_name, video_feature = video_features[i]  # Renamed to avoid shadowing

        # print(f"Processing file {text_file_name}")

        # Check if any features are missing
        if text_feature is None or audio_feature is None or video_feature is None:
            print(f"Skipping file {i + 1}/{len(text_features)}: Missing features for {text_file_name}, {audio_file_name}, {video_file_name}")
            continue  # Skip to the next iteration

        # print("Text file name:", text_file_name)
        # print("Audio file name:", audio_file_name)
        # print("Video file name:", video_file_name)

        print("Text features shape:", text_feature.shape)
        print("Audio features shape:", audio_feature.shape)
        print("Video features shape:", video_feature.shape)

        # Reshape features
        audio_feature = audio_feature.squeeze(0)  # Changed from audio_features to audio_feature
        text_feature = text_feature.unsqueeze(0)  # Changed from text_features to text_feature

        # print("text_features shape:", text_feature.shape)
        # print("audio_features shape:", audio_feature.shape)
        # print("video_features shape:", video_feature.shape, '\n')


        with torch.no_grad():
            # Cross-Attention for every possible pair
            text_video = PairCrossAttention(text_feature, video_feature)
            text_audio = PairCrossAttention(text_feature, audio_feature)
            audio_video = PairCrossAttention(audio_feature, video_feature)
            audio_text = PairCrossAttention(audio_feature, text_feature)
            video_audio = PairCrossAttention(video_feature, audio_feature)
            video_text = PairCrossAttention(video_feature, text_feature)

            # print("video_audio shape:", video_audio.shape)
            # print("video_text shape:", video_text.shape)
            # print("audio_video shape:", audio_video.shape)
            # print("audio_text shape:", audio_text.shape)
            # print("text_video shape:", text_video.shape)
            # print("text_audio shape:", text_audio.shape, '\n')

            # Combine Cross-Attention outputs using Hadamard product
            text_combined = HadamardProduct(text_video, text_audio)
            audio_combined = HadamardProduct(audio_video, audio_text)
            video_combined = HadamardProduct(video_audio, video_text)

            # Fusion using Embracement Layer
            d_in = video_combined.shape[-1] + audio_combined.shape[-1] + text_combined.shape[-1]
            embracement_layer = EmbracementLayer(d_in, d_in)
            
            # Fused features for the current file
            fused_features = embracement_layer(video_combined[-1], audio_combined[-1], text_combined[-1])
            
            print("Fused Features Shape:", fused_features.shape, '\n')

            # Append the fused features and the corresponding label to the lists
            fused_features_list.append(fused_features)
            label = id_label_df.iloc[i]['Label']  # Assuming you have a column 'Label'
            labels_list.append(label)

        del text_feature, audio_feature, video_feature
        del text_video, text_audio, audio_video, audio_text, video_audio, video_text
        
# Stack all fused features into a tensor for training
fused_features_tensor = torch.stack(fused_features_list)

# Convert labels to tensor
label_map = {'red': 0, 'green': 1}  # Adjust if your labels differ
labels_tensor = torch.tensor([label_map[label] for label in labels_list], dtype=torch.float32)


Batch: 0
Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([79, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([129, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([145, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([136, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([82, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: tor

In [31]:
# Splitting data for training, validation, and testing
train_df, val_test_df, train_labels, val_test_labels = train_test_split(
    fused_features_tensor, labels_tensor, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets
val_df, test_df, val_labels, test_labels = train_test_split(
    val_test_df, val_test_labels, test_size=0.5, random_state=42)

# Create DataLoaders
train_loader = DataLoader(list(zip(train_df, train_labels)), batch_size=32, shuffle=True)
val_loader = DataLoader(list(zip(val_df, val_labels)), batch_size=32, shuffle=False)
test_loader = DataLoader(list(zip(test_df, test_labels)), batch_size=32, shuffle=False)

In [32]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Check for device availability
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Input dimension based on fused features tensor
    input_dim = fused_features_tensor.shape[1]  # Adjust if necessary
    dense_layer = DenseLayer(input_size=input_dim).to(device)  # Initialize and move the dense layer to the correct device

    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    optimizer = get_optimizer(dense_layer.parameters())  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(dense_layer=dense_layer, dataloader=train_loader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(dense_layer=dense_layer, dataloader=val_loader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(dense_layer=dense_layer, dataloader=test_loader, criterion=criterion, device=device)


Epoch 1/10
Evaluation Loss: 0.6870
Precision: 0.6284
Recall: 0.7381
F1 Score: 0.6788
Training Loss: 0.6829, Validation Loss: 0.6870
------------------------------
Epoch 2/10
Evaluation Loss: 0.6734
Precision: 0.6178
Recall: 0.9365
F1 Score: 0.7445
Training Loss: 0.5852, Validation Loss: 0.6734
------------------------------
Epoch 3/10
Evaluation Loss: 0.6765
Precision: 0.6237
Recall: 0.9603
F1 Score: 0.7563
Training Loss: 0.5145, Validation Loss: 0.6765
------------------------------
Epoch 4/10
Evaluation Loss: 0.6827
Precision: 0.6188
Recall: 0.8889
F1 Score: 0.7296
Training Loss: 0.4625, Validation Loss: 0.6827
------------------------------
Epoch 5/10
Evaluation Loss: 0.7082
Precision: 0.6193
Recall: 0.9683
F1 Score: 0.7554
Training Loss: 0.4146, Validation Loss: 0.7082
------------------------------
Epoch 6/10
Evaluation Loss: 0.6963
Precision: 0.6175
Recall: 0.8968
F1 Score: 0.7314
Training Loss: 0.3791, Validation Loss: 0.6963
------------------------------
Epoch 7/10
Evaluation 