In [78]:
import sys
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, KFold

In [79]:
# Append the path for module imports
sys.path.append('../')

# Import custom modules
from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss

### Data Loading

In [80]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = '/Users/david/Documents/THESIS/DATA/TEXT'
audio_features_dir = '/Users/david/Documents/THESIS/DATA/AUDIO'
video_features_dir = '/Users/david/Documents/THESIS/DATA/VIDEO'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353


In [81]:
import math

In [82]:
import torch.nn.functional as F

In [83]:
## MCA Class
class MutualCrossAttention(nn.Module):
    def __init__(self, dropout):
        super(MutualCrossAttention, self).__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x1, x2):
        # Assign x1 and x2 to query and key
        query = x1
        key = x2
        d = query.shape[-1]

        # Basic attention mechanism formula to get intermediate output A
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        output_A = torch.bmm(self.dropout(F.softmax(scores, dim=-1)), x2)
        # Basic attention mechanism formula to get intermediate output B
        scores = torch.bmm(key, query.transpose(1, 2)) / math.sqrt(d)
        output_B = torch.bmm(self.dropout(F.softmax(scores, dim=-1)), x1)

        # Make the summation of the two intermediate outputs
        output = output_A + output_B  # shape (1280, 32, 60)

        return output

In [84]:
# Initialize list to store fused features for each file
fused_features_list = []
labels_list = []
batch_size = 32  # Adjust based on your available memory

# Assuming MCA is initialized beforehand with the required dropout parameter
attention_merge = MutualCrossAttention(0.2)

for batch_start in range(0, len(text_features), batch_size):
    batch_end = min(batch_start + batch_size, len(text_features))
    batch_text_features = text_features[batch_start:batch_end]
    batch_audio_features = audio_features[batch_start:batch_end]
    batch_video_features = video_features[batch_start:batch_end]
    print(f"Batch: {batch_start}")
    
    # Loop through all the files in the dataset
    for i in range(len(batch_text_features)):
        # Extract features for the current file
        text_file_name, text_feature = batch_text_features[i]  # Renamed to avoid shadowing
        audio_file_name, audio_feature = batch_audio_features[i]  # Renamed to avoid shadowing
        video_file_name, video_feature = batch_video_features[i]  # Renamed to avoid shadowing

        # Check if any features are missing
        if text_feature is None or audio_feature is None or video_feature is None:
            print(f"Skipping file {i + 1}/{len(batch_text_features)}: Missing features for {text_file_name}, {audio_file_name}, {video_file_name}")
            continue  # Skip to the next iteration

        # Reshape features
        #audio_feature = audio_feature.squeeze(0)  # Changed from audio_features to audio_feature
        text_feature = text_feature.unsqueeze(0).unsqueeze(0)  # Changed from text_features to text_feature
        
        text_feature = torch.nn.Linear(1024, 768)(text_feature)  # Project to 768
        
        print("Text features shape:", text_feature.shape)
        print("Audio features shape:", audio_feature.shape)

        with torch.no_grad():
            # Cross-Attention for each modality using the MCA class
            fused_features = attention_merge(audio_feature, text_feature)
            
            print("Fused Features Shape:", fused_features.shape, '\n')

            # Append the fused features and the corresponding label to the lists
            fused_features_list.append(fused_features)
            label = id_label_df.iloc[i]['Label']  # Assuming you have a column 'Label'
            labels_list.append(label)

        del text_feature, audio_feature, video_feature
        
# Stack all fused features into a tensor for training
fused_features_tensor = torch.stack(fused_features_list)

# Convert labels to tensor
label_map = {'red': 1, 'green': 0}  # Adjust if your labels differ
labels_tensor = torch.tensor([label_map[label] for label in labels_list], dtype=torch.float32)


Batch: 0
Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197, 768]) 

Text features shape: torch.Size([1, 1, 768])
Audio features shape: torch.Size([1, 197, 768])
Fused Features Shape: torch.Size([1, 197

In [85]:
import numpy as np

In [86]:
# normalize
norm_features = np.array(fused_features_tensor)


x_min = norm_features.min(axis=(1, 2), keepdims=True)
x_max = norm_features.max(axis=(1, 2), keepdims=True)
mca_normal = (norm_features-x_min)/(x_max-x_min)

print(mca_normal.shape)
print(type(mca_normal))

  norm_features = np.array(fused_features_tensor)


(1353, 1, 197, 768)
<class 'numpy.ndarray'>


In [87]:
# Splitting data for training, validation, and testing
train_df, val_test_df, train_labels, val_test_labels = train_test_split(
    mca_normal, labels_tensor, test_size=0.3, random_state=42, stratify=labels_tensor)

# Further splitting remaining set into validation and test sets
val_df, test_df, val_labels, test_labels = train_test_split(
    val_test_df, val_test_labels, test_size=0.5, random_state=42, stratify=val_test_labels)

# Create DataLoaders
train_loader = DataLoader(list(zip(train_df, train_labels)), batch_size=16, shuffle=True)
val_loader = DataLoader(list(zip(val_df, val_labels)), batch_size=16, shuffle=False)
test_loader = DataLoader(list(zip(test_df, test_labels)), batch_size=16, shuffle=False)

In [133]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dims=[786, 512], dropout_prob=0.5):
        super(DenseClassifier, self).__init__()
        
        # Define the sequence of fully connected layers with dropout
        self.fc1 = nn.Linear(input_dim, hidden_dims[0])  # First fully connected layer
        self.fc2 = nn.Linear(hidden_dims[0], hidden_dims[1])  # Second fully connected layer
        self.fc3 = nn.Linear(hidden_dims[1], 1)  # Final layer for binary classification

        # Define dropout
        self.dropout = nn.Dropout(dropout_prob)  # Dropout with specified probability

    def forward(self, x):
        # Remove extra dimension if necessary
        x = x.squeeze(1) if x.dim() == 4 else x  # Adjust if needed based on input shape

        # Apply global average pooling across the sequence length dimension (seq_len)
        x = x.mean(dim=1)  # Results in shape (batch_size, feature_dim)

        # Forward pass through the fully connected layers with ReLU activations and dropout
        x = F.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout after first layer
        x = F.relu(self.fc2(x))
        x = self.dropout(x)  # Apply dropout after second layer

        # Final layer produces logits for binary classification
        logits = self.fc3(x)

        return logits


In [134]:
# Test the DenseClassifier with random data
input_dim = 197 * 768  # Flattened input size
batch_size = 16
model = DenseClassifier(768)

# Create a random input tensor of shape (batch_size, seq_len, feature_dim)
x = torch.randn(batch_size, 197, 768)

# Forward pass
output = model(x)

print(output.shape)  # Should be (batch_size, 1) for binary classification


torch.Size([16, 1])


In [135]:
for batch in train_loader:
        # Assuming batch is a tuple (fused_features, targets)
        fused_features, targets = batch

        print(targets.shape)

torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16])
torch.Size([16

In [136]:
def get_optimizer(parameters, lr=1e-4):
    # Create an optimizer, for example, Adam
    return optim.Adam(parameters, lr=lr)

In [137]:
def train_model(dense_layer, dataloader, criterion, optimizer, device):
    dense_layer.train()  # Set the model to training mode
    total_loss = 0.0

    for batch in dataloader:
        # Assuming batch is a tuple (fused_features, targets)
        fused_features, targets = batch

        # Move tensors to the specified device
        fused_features = fused_features.to(device).squeeze()
        # Make sure the target tensor has the shape (batch_size, 1)
        targets = targets.unsqueeze(1)
        targets = targets.to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Pass the fused features through the dense layer
        predictions = dense_layer(fused_features)  # Ensure correct shape

        # Compute loss
        loss = criterion(predictions, targets)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [138]:
def evaluate_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)
    
    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Use fused features directly
                targets.to(device).squeeze()
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (torch.sigmoid(predictions) > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Evaluation Loss: {average_loss:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")
    
    return average_loss, precision, recall, f1_score


In [139]:
def test_model(dense_layer, dataloader, criterion, device):
    dense_layer.eval()  # Set the model to evaluation mode
    total_loss = 0

    # Initialize the metrics for binary classification
    precision_metric = BinaryPrecision().to(device)
    recall_metric = BinaryRecall().to(device)
    f1_metric = BinaryF1Score().to(device)

    with torch.no_grad():
        for fused_features, targets in dataloader:
            fused_features, targets = (
                fused_features.to(device),  # Move fused features to device
                targets.to(device).squeeze()  # Ensure targets are the correct shape
            )

            # Pass the fused features through the dense layer
            predictions = dense_layer(fused_features).squeeze()
            
            # Compute loss
            loss = criterion(predictions, targets)
            total_loss += loss.item()

            # Apply threshold to get binary predictions
            preds = (torch.sigmoid(predictions) > 0.5).float()
            
            # Update the precision, recall, and F1 score metrics
            precision_metric.update(preds.long(), targets.long())
            recall_metric.update(preds.long(), targets.long())
            f1_metric.update(preds.long(), targets.long())

    # Compute precision, recall, and F1 score
    precision = precision_metric.compute().item()
    recall = recall_metric.compute().item()
    f1_score = f1_metric.compute().item()

    average_loss = total_loss / len(dataloader)

    print(f"Test Loss: {average_loss:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1_score:.4f}")

    return average_loss, precision, recall, f1_score


In [140]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Check for device availability
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    # Input dimension based on fused features tensor
    input_dim = fused_features_tensor.shape[1]  # Adjust if necessary
    dense_layer = DenseClassifier(input_dim=768).to(device)  # Initialize and move the dense layer to the correct device

    # Define the loss function and optimizer
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))  # Use appropriate loss function
    optimizer = get_optimizer(dense_layer.parameters())  # Pass only the dense layer parameters

    # Training loop
    num_epochs = 20  # Set the number of epochs you want to train for

    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_model(dense_layer=dense_layer, dataloader=train_loader, criterion=criterion, optimizer=optimizer, device=device)
        
        # Validate model
        val_loss, precision, recall, f1_score = evaluate_model(dense_layer=dense_layer, dataloader=val_loader, criterion=criterion, device=device)

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print("-" * 30)
    
    # Testing the model
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_model(dense_layer=dense_layer, dataloader=test_loader, criterion=criterion, device=device)


Epoch 1/20
Evaluation Loss: 1.1152
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1323, Validation Loss: 1.1152
------------------------------
Epoch 2/20
Evaluation Loss: 1.1164
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1188, Validation Loss: 1.1164
------------------------------
Epoch 3/20
Evaluation Loss: 1.1186
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1213, Validation Loss: 1.1186
------------------------------
Epoch 4/20
Evaluation Loss: 1.1166
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1098, Validation Loss: 1.1166
------------------------------
Epoch 5/20
Evaluation Loss: 1.1156
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1165, Validation Loss: 1.1156
------------------------------
Epoch 6/20
Evaluation Loss: 1.1152
Precision: 0.3448
Recall: 1.0000
F1 Score: 0.5128
Training Loss: 1.1110, Validation Loss: 1.1152
------------------------------
Epoch 7/20
Evaluation 