In [1]:

import sys
import os
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset, Subset
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold


In [2]:
# Append the path for module imports
sys.path.append('../')

# Import custom modules
from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss, BCEWithLogits

### Data Loading

In [3]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\textStream_BERT\\feature_vectors\\feature_vectors'
audio_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\audio_fe\\logmel_spectrograms'
video_features_dir = 'C:\\Users\\edjin\\OneDrive\\Documents\\Programming Files\\Thesis\\SMCA\\misc\\visualStream_ViT\\feature_vectors'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


In [4]:
# Cross Attention Function
def PairCrossAttention(modalityAlpha, modalityBeta, d_out_kq=768, d_out_v=768):
    cross_attn = CrossAttentionB(modalityAlpha.shape[-1], modalityBeta.shape[-1], d_out_kq, d_out_v)
    modalityAlphaBeta = cross_attn(modalityAlpha, modalityBeta)
    return modalityAlphaBeta

In [5]:
def HadamardProduct(tensor1, tensor2):
    # Ensure both tensors have the same shape
    if tensor1.shape != tensor2.shape:
        raise ValueError("Tensors must have the same shape for Hadamard product.")
    
    # Compute the Hadamard product
    return tensor1 * tensor2

In [6]:
class EmbracementLayer(nn.Module):
    def __init__(self, d_in, d_out):
        super(EmbracementLayer, self).__init__()
        self.fc = nn.Linear(d_in, d_out)
        self.norm = nn.LayerNorm(d_out)
        self.activation = nn.ReLU()

    def forward(self, video_features, audio_features, text_features):
        # Concatenate features along the last dimension
        combined_features = torch.cat([video_features, audio_features, text_features], dim=-1)
        
        # Apply linear transformation
        transformed_features = self.fc(combined_features)
        
        # Apply normalization and activation
        norm_features = self.norm(transformed_features)
        output = self.activation(norm_features)
        
        return output

### Model Training

In [7]:
def get_optimizer(parameters, lr=1e-3):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [8]:
def train_model(dense_layer, dataloader, criterion, optimizer, device):
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for batch in dataloader:
        # Assuming batch is a tuple (fused_features, targets)
        fused_features, targets = batch

        # Move tensors to the specified device
        fused_features = fused_features.to(device)
        targets = targets.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Pass the fused features through the dense layer
        predictions = dense_layer(fused_features).view(-1)    # Ensure correct shape

        # Compute loss
        loss = criterion(predictions, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [9]:
def evaluate_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)
    
    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Use fused features directly
                targets.to(device).squeeze()
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).view(-1)
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Evaluation Loss: {average_loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    
    return average_loss, precision, recall, f1_score


In [10]:
def test_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Move fused features to device
                targets.to(device).squeeze()  # Ensure targets are the correct shape
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).view(-1)  
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (predictions > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


### Fusion

In [11]:
# Initialize list to store fused features for each file
fused_features_list = []
labels_list = []
batch_size = 32  # Adjust based on your available memory

for batch_start in range(0, len(text_features), batch_size):
    batch_end = min(batch_start + batch_size, len(text_features))
    batch_text_features = text_features[batch_start:batch_end]
    batch_audio_features = audio_features[batch_start:batch_end]
    batch_video_features = video_features[batch_start:batch_end]
    print(f"Batch: {batch_start}")
    # Loop through all the files in the dataset
    for i in range(len(batch_text_features)):
        # Extract features for the current file
        text_file_name, text_feature = text_features[i]  # Renamed to avoid shadowing
        audio_file_name, audio_feature = audio_features[i]  # Renamed to avoid shadowing
        video_file_name, video_feature = video_features[i]  # Renamed to avoid shadowing

        # print(f"Processing file {text_file_name}")

        # Check if any features are missing
        if text_feature is None or audio_feature is None or video_feature is None:
            print(f"Skipping file {i + 1}/{len(text_features)}: Missing features for {text_file_name}, {audio_file_name}, {video_file_name}")
            continue  # Skip to the next iteration

        # print("Text file name:", text_file_name)
        # print("Audio file name:", audio_file_name)
        # print("Video file name:", video_file_name)

        print("Text features shape:", text_feature.shape)
        print("Audio features shape:", audio_feature.shape)
        print("Video features shape:", video_feature.shape)

        # Reshape features
        audio_feature = audio_feature.squeeze(0)  # Changed from audio_features to audio_feature
        text_feature = text_feature.unsqueeze(0)  # Changed from text_features to text_feature

        # print("text_features shape:", text_feature.shape)
        # print("audio_features shape:", audio_feature.shape)
        # print("video_features shape:", video_feature.shape, '\n')


        with torch.no_grad():
            # Cross-Attention for every possible pair
            text_video = PairCrossAttention(text_feature, video_feature)
            text_audio = PairCrossAttention(text_feature, audio_feature)
            audio_video = PairCrossAttention(audio_feature, video_feature)
            audio_text = PairCrossAttention(audio_feature, text_feature)
            video_audio = PairCrossAttention(video_feature, audio_feature)
            video_text = PairCrossAttention(video_feature, text_feature)

            # print("video_audio shape:", video_audio.shape)
            # print("video_text shape:", video_text.shape)
            # print("audio_video shape:", audio_video.shape)
            # print("audio_text shape:", audio_text.shape)
            # print("text_video shape:", text_video.shape)
            # print("text_audio shape:", text_audio.shape, '\n')

            # Combine Cross-Attention outputs using Hadamard product
            text_combined = HadamardProduct(text_video, text_audio)
            audio_combined = HadamardProduct(audio_video, audio_text)
            video_combined = HadamardProduct(video_audio, video_text)

            # Fusion using Embracement Layer
            d_in = video_combined.shape[-1] + audio_combined.shape[-1] + text_combined.shape[-1]
            embracement_layer = EmbracementLayer(d_in, d_in)
            
            # Fused features for the current file
            fused_features = embracement_layer(video_combined[-1], audio_combined[-1], text_combined[-1])
            
            print("Fused Features Shape:", fused_features.shape, '\n')

            # Append the fused features and the corresponding label to the lists
            fused_features_list.append(fused_features)
            label = id_label_df.iloc[i]['Label']  # Assuming you have a column 'Label'
            labels_list.append(label)

        del text_feature, audio_feature, video_feature
        del text_video, text_audio, audio_video, audio_text, video_audio, video_text
        
# Stack all fused features into a tensor for training
fused_features_tensor = torch.stack(fused_features_list)

# Convert labels to tensor
label_map = {'red': 1, 'green': 0}  # Adjust if your labels differ
labels_tensor = torch.tensor([label_map[label] for label in labels_list], dtype=torch.float32)


Batch: 0
Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([95, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([124, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([127, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([136, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: torch.Size([116, 768])
Fused Features Shape: torch.Size([2304]) 

Text features shape: torch.Size([1024])
Audio features shape: torch.Size([1, 197, 768])
Video features shape: to

In [12]:
# Splitting data for training, validation, and testing
train_df, val_test_df, train_labels, val_test_labels = train_test_split(
    fused_features_tensor, labels_tensor, test_size=0.3, random_state=42)

# Further splitting remaining set into validation and test sets
val_df, test_df, val_labels, test_labels = train_test_split(
    val_test_df, val_test_labels, test_size=0.5, random_state=42)

# Create DataLoaders
train_loader = DataLoader(list(zip(train_df, train_labels)), batch_size=32, shuffle=True)
val_loader = DataLoader(list(zip(val_df, val_labels)), batch_size=32, shuffle=False)
test_loader = DataLoader(list(zip(test_df, test_labels)), batch_size=32, shuffle=False)


In [13]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Check for device availability
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Input dimension based on fused features tensor
    input_dim = fused_features_tensor.shape[1]  # Adjust if necessary
    dense_layer = DenseLayer(input_size=input_dim).to(device)  # Initialize and move the dense layer to the correct device

    # Define the loss function and optimizer
    criterion = BCELoss()  # Use appropriate loss function
    optimizer = get_optimizer(dense_layer.parameters())  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 10  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(dense_layer=dense_layer, dataloader=train_loader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(dense_layer=dense_layer, dataloader=val_loader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(dense_layer=dense_layer, dataloader=test_loader, criterion=criterion, device=device)


Epoch 1/10
Evaluation Loss: 0.6981
Precision: 0.3544
Recall: 0.3636
F1 Score: 0.3590
Training Loss: 0.6798, Validation Loss: 0.6981
------------------------------
Epoch 2/10
Evaluation Loss: 0.6740
Precision: 0.3810
Recall: 0.1039
F1 Score: 0.1633
Training Loss: 0.5861, Validation Loss: 0.6740
------------------------------
Epoch 3/10
Evaluation Loss: 0.6790
Precision: 0.4286
Recall: 0.1558
F1 Score: 0.2286
Training Loss: 0.5219, Validation Loss: 0.6790
------------------------------
Epoch 4/10
Evaluation Loss: 0.6884
Precision: 0.3333
Recall: 0.1948
F1 Score: 0.2459
Training Loss: 0.4729, Validation Loss: 0.6884
------------------------------
Epoch 5/10
Evaluation Loss: 0.7035
Precision: 0.5000
Recall: 0.0390
F1 Score: 0.0723
Training Loss: 0.4269, Validation Loss: 0.7035
------------------------------
Epoch 6/10
Evaluation Loss: 0.6998
Precision: 0.3684
Recall: 0.1818
F1 Score: 0.2435
Training Loss: 0.3941, Validation Loss: 0.6998
------------------------------
Epoch 7/10
Evaluation 

### K-Fold Evaluation

In [14]:
def cross_validate_model(
    fused_features_tensor, 
    labels_tensor, 
    num_folds, 
    num_epochs, 
    batch_size, 
    learning_rate,
    output_file,
    device=None
):
    # Set device configuration
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
  
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)  # Creates the directory if it does not exist
    
    # Create a TensorDataset from the features and labels
    dataset = TensorDataset(fused_features_tensor, labels_tensor)

    # Initialize KFold
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # lists to store metrics for each fold
    fold_losses = []
    fold_precisions = []
    fold_recalls = []
    fold_f1_scores = []
    f1_scores_per_fold = []  # List to store F1 scores for each fold

    for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
        print("-" * 50)
        print(f"------ Fold {fold + 1 }/{num_folds} ------")
     
        # Create data loaders for the train and validation sets
        train_sampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_sampler = torch.utils.data.SubsetRandomSampler(val_idx)
        
        train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler)
        val_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=val_sampler)

        # Initialize dense_layer, criterion, and optimizer
        dense_layer = DenseLayer(input_size=input_dim).to(device)  # Initialize and move the dense layer to the correct device  # Initialize your model here
        criterion = BCELoss()  # Example loss for binary classification
        optimizer = get_optimizer(dense_layer.parameters(), lr=learning_rate)

        # Train the model for the specified number of epochs
        for epoch in range(num_epochs):
            print(f"Epoch {epoch + 1}/{num_epochs}")
        
            train_loss = train_model(dense_layer, train_dataloader, criterion, optimizer, device)
            val_loss, precision, recall, f1_score = evaluate_model(dense_layer, val_dataloader, criterion, device)
            
            print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
            print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")
            
        # Store the validation metrics for this fold
        print(f"Results for Fold {fold}: Validation Loss = {val_loss:.4f}, Precision = {precision:.4f}, Recall = {recall:.4f}, F1 Score = {f1_score:.4f}")
        fold_losses.append(val_loss)
        fold_precisions.append(precision)
        fold_recalls.append(recall)
        fold_f1_scores.append(f1_score)
        f1_scores_per_fold.append(f1_score)  # Save F1 score for the current fold

    # Calculate the average metrics across all folds
    avg_loss = np.mean(fold_losses)
    avg_precision = np.mean(fold_precisions)
    avg_recall = np.mean(fold_recalls)
    avg_f1_score = np.mean(fold_f1_scores)

    print("-" * 50)
    print("\nK-Fold Cross-Validation Results:")
    print(f"Average Loss: {avg_loss:.4f}")
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1 Score: {avg_f1_score:.4f}")
    
    # Save F1 scores per fold to a .npy file
    np.save(output_file, np.array(f1_scores_per_fold))
    print(f"F1 scores per fold saved to {output_file}")

    return avg_loss, avg_precision, avg_recall, avg_f1_score


In [27]:
# Run k-fold cross-validation   
results = cross_validate_model(
    fused_features_tensor, 
    labels_tensor, 
    num_folds=5,
    num_epochs=50,
    batch_size=32,
    learning_rate=1e-5,
    output_file="results/simulParallel-Xie-F1_scores.npy"
)


# Input dimension based on fused features tensor
input_dim = fused_features_tensor.shape[1]  # Adjust if necessary
dense_layer = DenseLayer(input_size=input_dim).to(device)  # Initialize and move the dense layer to the correct device
criterion = BCELoss()  # Use appropriate loss function

# Set device configuration
if device is None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
    
# Testing the model
print("Testing the model on the test set...")
test_loss, test_precision, test_recall, test_f1_score = test_model(dense_layer=dense_layer, dataloader=test_loader, criterion=criterion, device=device)


Device: cuda
--------------------------------------------------
------ Fold 1/5 ------
Epoch 1/50
Evaluation Loss: 0.7036
Precision: 0.3846
Recall: 0.4369
F1 Score: 0.4091
Training Loss: 0.7194, Validation Loss: 0.7036
Precision: 0.3846, Recall: 0.4369, F1 Score: 0.4091
Epoch 2/50
Evaluation Loss: 0.6808
Precision: 0.3939
Recall: 0.2524
F1 Score: 0.3077
Training Loss: 0.6832, Validation Loss: 0.6808
Precision: 0.3939, Recall: 0.2524, F1 Score: 0.3077
Epoch 3/50
Evaluation Loss: 0.6749
Precision: 0.4103
Recall: 0.1553
F1 Score: 0.2254
Training Loss: 0.6631, Validation Loss: 0.6749
Precision: 0.4103, Recall: 0.1553, F1 Score: 0.2254
Epoch 4/50
Evaluation Loss: 0.6754
Precision: 0.3889
Recall: 0.0680
F1 Score: 0.1157
Training Loss: 0.6530, Validation Loss: 0.6754
Precision: 0.3889, Recall: 0.0680, F1 Score: 0.1157
Epoch 5/50
Evaluation Loss: 0.6803
Precision: 0.2000
Recall: 0.0194
F1 Score: 0.0354
Training Loss: 0.6475, Validation Loss: 0.6803
Precision: 0.2000, Recall: 0.0194, F1 Score: 