### Prerequisite Packages

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torcheval.metrics import BinaryPrecision, BinaryRecall, BinaryF1Score
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

In [2]:
sys.path.append('../../')

from modules.cross_attentionb import CrossAttentionB
from modules.dataloader import load_npy_files
from modules.classifier import DenseLayer, BCELoss, CustomLoss, BCEWithLogits
from modules.linear_transformation import LinearTransformations

### Data Loading

In [3]:
class MultimodalDataset(Dataset):
    def __init__(self, id_label_df, text_features, audio_features, video_features):
        self.id_label_df = id_label_df
        
        # Convert feature lists to dictionaries for fast lookup
        self.text_features = {os.path.basename(file).split('.')[0]: tensor for file, tensor in text_features}
        self.audio_features = {os.path.basename(file).split('_')[1].split('.')[0]: tensor for file, tensor in audio_features}
        self.video_features = {os.path.basename(file).split('_')[0]: tensor for file, tensor in video_features}

        # List to store missing files
        self.missing_files = []

        # Filter out entries with missing files
        self.valid_files = self._filter_valid_files()

    def _filter_valid_files(self):
        valid_indices = []
        missing_files = []

        for idx in range(len(self.id_label_df)):
            imdbid = self.id_label_df.iloc[idx]['IMDBid']

            # Check if the IMDBid exists in each modality's features
            if imdbid in self.text_features and imdbid in self.audio_features and imdbid in self.video_features:
                valid_indices.append(idx)
            else:
                missing_files.append({'IMDBid': imdbid})

        # Filter id_label_df to only include valid rows
        self.id_label_df = self.id_label_df.iloc[valid_indices].reset_index(drop=True)
        self.missing_files = missing_files

        # Return valid indices
        return valid_indices

    def __len__(self):
        return len(self.valid_files)

    def __getitem__(self, idx):
        # Get the original index from the filtered valid files
        original_idx = self.valid_files[idx]
        imdbid = self.id_label_df.iloc[original_idx]['IMDBid']
        label = self.id_label_df.iloc[original_idx]['Label']

        # Retrieve data from the loaded features
        text_data = self.text_features.get(imdbid, torch.zeros((1024,)))
        audio_data = self.audio_features.get(imdbid, torch.zeros((1, 197, 768)))
        video_data = self.video_features.get(imdbid, torch.zeros((95, 768)))
        
        # Define label mapping
        label_map = {'red': 1, 'green': 0} 
        
        # Convert labels to tensor using label_map
        try:
            label_data = torch.tensor([label_map[label]], dtype=torch.float32)
        except KeyError as e:
            print(f"Error: Label '{e}' not found in label_map.")
            raise

        return text_data, audio_data, video_data, label_data


In [4]:
def collate_fn(batch):
    text_data, audio_data, video_data, label_data = zip(*batch)

    # Convert lists to tensors
    text_data = torch.stack(text_data)
    audio_data = torch.stack(audio_data)

    # Padding for video data
    # Determine maximum length of video sequences in the batch
    video_lengths = [v.size(0) for v in video_data]
    max_length = max(video_lengths)

    # Pad video sequences to the maximum length
    video_data_padded = torch.stack([
        F.pad(v, (0, 0, 0, max_length - v.size(0)), "constant", 0)
        for v in video_data
    ])

    # Convert labels to tensor and ensure the shape [batch_size, 1]
    label_data = torch.stack(label_data)  # Convert list of tensors to a single tensor

    return text_data, audio_data, video_data_padded, label_data

In [5]:
# Load the labels DataFrame
id_label_df = pd.read_excel('../../misc/MM-Trailer_dataset.xlsx')

# Define the directories
text_features_dir = '../../misc/textStream_BERT/feature_vectors'
audio_features_dir = '../../misc/audio_fe/logmel_spectrograms'
video_features_dir = '../../misc/visualStream_ViT'

# Load the feature vectors from each directory
text_features = load_npy_files(text_features_dir)
audio_features = load_npy_files(audio_features_dir)
video_features = load_npy_files(video_features_dir)

print(f"Number of text feature vectors loaded: {len(text_features)}")
print(f"Number of audio feature vectors loaded: {len(audio_features)}")
print(f"Number of video feature vectors loaded: {len(video_features)}")

# Drop unnecessary columns
id_label_df = id_label_df.drop(columns=['Movie Title', 'URL'])

full_dataset = MultimodalDataset(id_label_df, text_features, audio_features, video_features)

# First, filter the id_label_df using the valid indices before creating dataset splits
filtered_id_label_df = id_label_df.iloc[full_dataset.valid_files].reset_index(drop=True)

# perform train-test split on the filtered DataFrame
train_df, val_test_df = train_test_split(
    filtered_id_label_df, test_size=0.3, random_state=42, stratify=filtered_id_label_df['Label'])

# Further splitting remaining set into validation and test sets
val_df, test_df = train_test_split(
    val_test_df, test_size=0.5, random_state=42, stratify=val_test_df['Label'])

print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

print("Train label distribution:", train_df['Label'].value_counts())
print("Validation label distribution:", val_df['Label'].value_counts())
print("Test label distribution:", test_df['Label'].value_counts())

print("-" * 40)

# create datasets based on these splits
train_dataset = MultimodalDataset(train_df, text_features, audio_features, video_features)
val_dataset = MultimodalDataset(val_df, text_features, audio_features, video_features)
test_dataset = MultimodalDataset(test_df, text_features, audio_features, video_features)

# Calculate weights for the classes in the training set
class_counts = train_df['Label'].value_counts().to_dict()
class_weights = {label: 1.0 / count for label, count in class_counts.items()}
sample_weights = [class_weights[label] for label in train_df['Label']]

# Initialize the WeightedRandomSampler
weighted_sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True
)

# RESAMPLING
# train_dataloader = DataLoader(train_dataset, batch_size=32, sampler=weighted_sampler, num_workers=0, collate_fn=collate_fn)

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)


Number of text feature vectors loaded: 1353
Number of audio feature vectors loaded: 1353
Number of video feature vectors loaded: 1353
(947, 2)
(203, 2)
(203, 2)
Train label distribution: Label
green    707
red      240
Name: count, dtype: int64
Validation label distribution: Label
green    151
red       52
Name: count, dtype: int64
Test label distribution: Label
green    152
red       51
Name: count, dtype: int64
----------------------------------------


In [6]:
for idx in range(len(train_dataset)):
    # Get the original index and IMDb ID
    original_idx = full_dataset.valid_files[idx]
    imdbid = full_dataset.id_label_df.iloc[original_idx]['IMDBid']
    label = full_dataset.id_label_df.iloc[original_idx]['Label']
    
    # Retrieve filenames for each modality
    text_filename = next((file for file, _ in text_features if imdbid in file), "Not found")
    audio_filename = next((file for file, _ in audio_features if imdbid in file), "Not found")
    video_filename = next((file for file, _ in video_features if imdbid in file), "Not found")
    
    # Print details
    print(f"IMDb ID: {imdbid}, Label: {label}")
    print(f"Text File: {text_filename}")
    print(f"Audio File: {audio_filename}")
    print(f"Video File: {video_filename}")
    print("-" * 40)


IMDb ID: tt0790799, Label: green
Text File: ../../misc/textStream_BERT/feature_vectors/tt0790799.npy
Audio File: ../../misc/audio_fe/logmel_spectrograms/feature_tt0790799.npy
Video File: ../../misc/visualStream_ViT/tt0790799_features.npy
----------------------------------------
IMDb ID: tt0478087, Label: green
Text File: ../../misc/textStream_BERT/feature_vectors/tt0478087.npy
Audio File: ../../misc/audio_fe/logmel_spectrograms/feature_tt0478087.npy
Video File: ../../misc/visualStream_ViT/tt0478087_features.npy
----------------------------------------
IMDb ID: tt1629439, Label: red
Text File: ../../misc/textStream_BERT/feature_vectors/tt1629439.npy
Audio File: ../../misc/audio_fe/logmel_spectrograms/feature_tt1629439.npy
Video File: ../../misc/visualStream_ViT/tt1629439_features.npy
----------------------------------------
IMDb ID: tt1649444, Label: red
Text File: ../../misc/textStream_BERT/feature_vectors/tt1649444.npy
Audio File: ../../misc/audio_fe/logmel_spectrograms/feature_tt1649

### Inspecting the Data

In [7]:
# Sample index to inspect
sample_idx = 0

# Retrieve the sample
sample = train_dataset[sample_idx]

# Access each modality feature by index
text_feature = sample[0]
audio_feature = sample[1]
video_feature = sample[2]
label = sample[3]

# Print the dimensions
print("Text feature size:", text_feature.size())
print("Audio feature size:", audio_feature.size())
print("Video feature size:", video_feature.size())
print("Label size:", label.size())


Text feature size: torch.Size([1024])
Audio feature size: torch.Size([1, 197, 768])
Video feature size: torch.Size([65, 768])
Label size: torch.Size([1])


### Important Functions for Indiv Modality Check

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import recall_score, precision_score, f1_score


### TEXT ONLY

In [9]:
class TextModel(nn.Module):
    def __init__(self, input_size=1024):
        super(TextModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [10]:
def train_text_model(model, dataloader, criterion, optimizer, device='cpu'):
    model.train()
    total_loss = 0

    for text_features, _, _, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)

            optimizer.zero_grad()

            outputs = model(text_features)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
        
    return avg_loss  # Optionally, return the last average loss if needed


In [11]:
def evaluate_text_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for text_features, _, _, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(text_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Text Model Evaluation - Loss: {avg_loss:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [12]:
def test_text_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for text_features, _, _, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(text_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)
    
    print(f"Text Model Test - Loss: {avg_loss:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}, Test F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [13]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Define models for each modality
    text_model = TextModel()

    # Define the criterion
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))

    for name, param in text_model.named_parameters():
        if param.grad is None:
            print("model:", "No gradient for:", name)
            
    # Train each modality
    optimizer = optim.Adam(text_model.parameters(), lr=0.0001)
    num_epochs = 10

    print("\nTraining Text Model...")
    for epoch in range(num_epochs):
        print("-" * 40)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_text_model(text_model, train_dataloader, criterion, optimizer, device='cpu')

        # Validate step
        val_loss, precision, recall, f1 = evaluate_text_model(text_model, val_dataloader, criterion, device='cpu')

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Testing the model
    print("-" * 40)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_text_model(text_model, test_dataloader, criterion, device='cpu')


model: No gradient for: fc1.weight
model: No gradient for: fc1.bias
model: No gradient for: fc2.weight
model: No gradient for: fc2.bias

Training Text Model...
----------------------------------------
Epoch 1/10
Text Model Evaluation - Loss: 0.9881, Recall: 0.8846, Precision: 0.3239, F1: 0.4742
Training Loss: 1.0208, Validation Loss: 0.9881
----------------------------------------
Epoch 2/10
Text Model Evaluation - Loss: 0.9360, Recall: 0.7885, Precision: 0.3694, F1: 0.5031
Training Loss: 0.9333, Validation Loss: 0.9360
----------------------------------------
Epoch 3/10
Text Model Evaluation - Loss: 0.8942, Recall: 0.5385, Precision: 0.4828, F1: 0.5091
Training Loss: 0.8659, Validation Loss: 0.8942
----------------------------------------
Epoch 4/10
Text Model Evaluation - Loss: 0.8670, Recall: 0.5577, Precision: 0.5000, F1: 0.5273
Training Loss: 0.8169, Validation Loss: 0.8670
----------------------------------------
Epoch 5/10
Text Model Evaluation - Loss: 0.8541, Recall: 0.5577, Pr

### AUDIO MODEL ONLY


In [14]:
import torch
import torch.nn as nn

class AudioModel(nn.Module):
    def __init__(self):
        super(AudioModel, self).__init__()
        self.fc1 = nn.Linear(197 * 768, 512)  # Flattened input size
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, 1)  # Output layer for binary classification

    def forward(self, x):
        # Assuming x has shape [batch_size, 197, 768]
        x = x.view(x.size(0), -1)  # Flatten to [batch_size, 197 * 768]
        x = self.relu(self.fc1(x))  # First fully connected layer
        x = self.fc2(x)  # Output layer (logits)
        return x

In [15]:
def train_audio_model(model, dataloader, criterion, optimizer, device='cpu'):
    model.train()
    total_loss = 0

    for _, audio_features, _, labels in dataloader:  # Only use text data
            
            optimizer.zero_grad()

            outputs = model(audio_features)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
        
    return avg_loss  # Optionally, return the last average loss if needed


In [16]:
def evaluate_audio_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for _, audio_features, _, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(audio_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Audio Model Evaluation - Loss: {avg_loss:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [17]:
def test_audio_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for _, audio_features, _, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(audio_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)
    
    print(f"Audio Model Test - Loss: {avg_loss:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}, Test F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [18]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Define models for each modality
    audio_model = AudioModel()

    # Define the criterion
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))

    for name, param in audio_model.named_parameters():
        if param.grad is None:
            print("model:", "No gradient for:", name)
            
    # Train each modality
    optimizer = optim.Adam(audio_model.parameters(), lr=0.0001)
    num_epochs = 10

    print("\nTraining on Audio Model...")
    for epoch in range(num_epochs):
        print("-" * 40)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_audio_model(audio_model, train_dataloader, criterion, optimizer, device='cpu')

        # Validate step
        val_loss, precision, recall, f1 = evaluate_audio_model(audio_model, val_dataloader, criterion, device='cpu')

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Testing the model
    print("-" * 40)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_audio_model(audio_model, test_dataloader, criterion, device='cpu')


model: No gradient for: fc1.weight
model: No gradient for: fc1.bias
model: No gradient for: fc2.weight
model: No gradient for: fc2.bias

Training on Audio Model...
----------------------------------------
Epoch 1/10
Audio Model Evaluation - Loss: 1.0020, Recall: 0.9808, Precision: 0.3269, F1: 0.4904
Training Loss: 1.2558, Validation Loss: 1.0020
----------------------------------------
Epoch 2/10
Audio Model Evaluation - Loss: 0.8905, Recall: 0.4423, Precision: 0.5349, F1: 0.4842
Training Loss: 0.8054, Validation Loss: 0.8905
----------------------------------------
Epoch 3/10
Audio Model Evaluation - Loss: 0.7970, Recall: 0.6923, Precision: 0.5294, F1: 0.6000
Training Loss: 0.7262, Validation Loss: 0.7970
----------------------------------------
Epoch 4/10
Audio Model Evaluation - Loss: 0.8086, Recall: 0.5962, Precision: 0.5962, F1: 0.5962
Training Loss: 0.6570, Validation Loss: 0.8086
----------------------------------------
Epoch 5/10
Audio Model Evaluation - Loss: 0.7446, Recall: 0

### VIDEO ONLY

In [19]:
import torch
import torch.nn as nn

class VideoModel(nn.Module):
    def __init__(self, dropout_rate=0.2):
        super(VideoModel, self).__init__()
        self.fc1 = nn.Linear(768, 512)
        self.dropout1 = nn.Dropout(dropout_rate)  # Dropout after the first layer
        self.fc2 = nn.Linear(512, 256)  # Additional hidden layer
        self.dropout2 = nn.Dropout(dropout_rate)  # Dropout after the second layer
        self.fc3 = nn.Linear(256, 128)  # Another hidden layer
        self.dropout3 = nn.Dropout(dropout_rate)  # Dropout after the third layer
        self.fc4 = nn.Linear(128, 64)   # Deeper hidden layer
        self.dropout4 = nn.Dropout(dropout_rate)  # Dropout after the fourth layer
        self.fc5 = nn.Linear(64, 1)      # Output layer
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.mean(dim=1)  # Global Average Pooling over sequence dimension
        x = self.relu(self.fc1(x))
        x = self.dropout1(x)  # Apply dropout after first layer
        x = self.relu(self.fc2(x))
        x = self.dropout2(x)  # Apply dropout after second layer
        x = self.relu(self.fc3(x))  # Pass through the new deeper layer
        x = self.dropout3(x)  # Apply dropout after third layer
        x = self.relu(self.fc4(x))   # Another activation after the deeper layer
        x = self.dropout4(x)  # Apply dropout after fourth layer
        x = self.fc5(x)  # Final output shape [batch_size, 1]
        return x  # Ensure output shape is [batch_size, 1]


In [20]:
def train_video_model(model, dataloader, criterion, optimizer, device='cpu'):
    model.train()
    total_loss = 0

    for _, _, video_features, labels in dataloader:  # Only use video data
            
            optimizer.zero_grad()
            
            outputs = model(video_features)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
        
    return avg_loss  # Optionally, return the last average loss if needed


In [21]:
def evaluate_video_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for _, _, video_features, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(video_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Audio Model Evaluation - Loss: {avg_loss:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [22]:
def test_video_model(model, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for _, _, video_features, labels in dataloader:  # Only use text data
            #text_features, labels = text_features.to(device), labels.to(device).view(-1)
            outputs = model(video_features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            outputs = torch.sigmoid(outputs)

            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)
    
    print(f"Video Model Test - Loss: {avg_loss:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}, Test F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [23]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Define models for each modality
    video_model = VideoModel()

    # Define the criterion
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2))

    for name, param in video_model.named_parameters():
        if param.grad is None:
            print("model:", "No gradient for:", name)
            
    # Train each modality
    optimizer = optim.Adam(video_model.parameters(), lr=0.001)
    num_epochs = 10

    print("\nTraining on Video Model...")
    for epoch in range(num_epochs):
        print("-" * 40)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Ensure you have a dataloader that yields inputs and targets
        train_loss = train_video_model(video_model, train_dataloader, criterion, optimizer, device='cpu')

        # Validate step
        val_loss, precision, recall, f1 = evaluate_video_model(video_model, val_dataloader, criterion, device='cpu')

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Testing the model
    print("-" * 40)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_video_model(video_model, test_dataloader, criterion, device='cpu')


model: No gradient for: fc1.weight
model: No gradient for: fc1.bias
model: No gradient for: fc2.weight
model: No gradient for: fc2.bias
model: No gradient for: fc3.weight
model: No gradient for: fc3.bias
model: No gradient for: fc4.weight
model: No gradient for: fc4.bias
model: No gradient for: fc5.weight
model: No gradient for: fc5.bias

Training on Video Model...
----------------------------------------
Epoch 1/10


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Audio Model Evaluation - Loss: 0.8194, Recall: 0.0000, Precision: 0.0000, F1: 0.0000
Training Loss: 0.8526, Validation Loss: 0.8194
----------------------------------------
Epoch 2/10
Audio Model Evaluation - Loss: 0.6737, Recall: 0.6346, Precision: 0.6226, F1: 0.6286
Training Loss: 0.7533, Validation Loss: 0.6737
----------------------------------------
Epoch 3/10
Audio Model Evaluation - Loss: 0.5533, Recall: 0.6538, Precision: 0.7727, F1: 0.7083
Training Loss: 0.5658, Validation Loss: 0.5533
----------------------------------------
Epoch 4/10
Audio Model Evaluation - Loss: 0.5217, Recall: 0.7885, Precision: 0.6029, F1: 0.6833
Training Loss: 0.4813, Validation Loss: 0.5217
----------------------------------------
Epoch 5/10
Audio Model Evaluation - Loss: 0.4761, Recall: 0.8654, Precision: 0.5921, F1: 0.7031
Training Loss: 0.4433, Validation Loss: 0.4761
----------------------------------------
Epoch 6/10
Audio Model Evaluation - Loss: 0.5264, Recall: 0.8654, Precision: 0.5844, F1: 0.

### Bimodal Fusion Classification

#### Concat

In [24]:
# import torch
# import torch.nn as nn

# class FusionModule(nn.Module):
#     def __init__(self, alpha_dim, beta_dim, hidden_dim, common_dim=768, num_heads=8):
#         super(FusionModule, self).__init__()
        
#         # Project ModalityAlpha and ModalityBeta to a common dimension
#         self.alpha_proj = nn.Linear(alpha_dim, common_dim) if alpha_dim != common_dim else nn.Identity()
#         self.beta_proj = nn.Linear(beta_dim, common_dim) if beta_dim != common_dim else nn.Identity()

#         # MultiheadAttention with the common embedding dimension
#         self.attn1 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
#         self.attn2 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        
#         # Fully connected layer for final output fusion
#         self.fc = nn.Linear(common_dim * 2, hidden_dim)  # For concatenated outputs

#     def forward(self, alpha_features, beta_features):
#         # Project both modalities to the common embedding dimension
#         alpha_features = self.alpha_proj(alpha_features)  # Shape: (batch_size, seq_len_alpha, common_dim)
#         beta_features = self.beta_proj(beta_features)     # Shape: (batch_size, seq_len_beta, common_dim)

#         # First cross-attention: ModalityAlpha as Query, ModalityBeta as Key-Value
#         alpha_out, _ = self.attn1(alpha_features, beta_features, beta_features)  # (batch, seq_len_alpha, common_dim)
        
#         # Second cross-attention: ModalityBeta as Query, ModalityAlpha as Key-Value
#         beta_out, _ = self.attn2(beta_features, alpha_features, alpha_features)  # (batch, seq_len_beta, common_dim)

#         # Find the maximum sequence length between alpha_out and beta_out
#         max_seq_len = max(alpha_out.size(1), beta_out.size(1))

#         # Expand both outputs to the maximum sequence length
#         if alpha_out.size(1) < max_seq_len:
#             alpha_out = alpha_out.repeat_interleave(max_seq_len // alpha_out.size(1), dim=1)[: , :max_seq_len, :]
#         if beta_out.size(1) < max_seq_len:
#             beta_out = beta_out.repeat_interleave(max_seq_len // beta_out.size(1), dim=1)[: , :max_seq_len, :]

#         # Concatenate the outputs along the feature dimension
#         combined_output = torch.cat((alpha_out, beta_out), dim=-1)  # Shape: (batch_size, max_seq_len, common_dim * 2)
        
#         # Final linear layer to produce the fused output
#         fused_output = self.fc(combined_output)  # Shape: (batch_size, max_seq_len, hidden_dim)

#         return combined_output


#### AVG

In [25]:
class FusionModule(nn.Module):
    def __init__(self, alpha_dim, beta_dim, hidden_dim, common_dim=768, num_heads=8):
        super(FusionModule, self).__init__()
        
        # Project both modalities to common dimension
        self.alpha_proj = nn.Linear(alpha_dim, common_dim) if alpha_dim != common_dim else nn.Identity()
        self.beta_proj = nn.Linear(beta_dim, common_dim) if beta_dim != common_dim else nn.Identity()

        # MultiheadAttention with the common embedding dimension
        self.attn1 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        self.attn2 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        
        # Changed: Remove the concatenation FC layer since we're using averaging
        self.output_proj = nn.Linear(common_dim, hidden_dim)

    def forward(self, alpha_features, beta_features):
        # Project both modalities to the common embedding dimension
        alpha_features = self.alpha_proj(alpha_features)
        beta_features = self.beta_proj(beta_features)

        # Cross-attention in both directions
        alpha_out, _ = self.attn1(alpha_features, beta_features, beta_features)
        beta_out, _ = self.attn2(beta_features, alpha_features, alpha_features)

        # Find the maximum sequence length
        max_seq_len = max(alpha_out.size(1), beta_out.size(1))

        # Expand both outputs to the maximum sequence length
        if alpha_out.size(1) < max_seq_len:
            alpha_out = alpha_out.repeat_interleave(max_seq_len // alpha_out.size(1), dim=1)[:, :max_seq_len, :]
        if beta_out.size(1) < max_seq_len:
            beta_out = beta_out.repeat_interleave(max_seq_len // beta_out.size(1), dim=1)[:, :max_seq_len, :]

        # Use element-wise averaging instead of concatenation
        combined_output = (alpha_out + beta_out) / 2
        
        # Project to final hidden dimension
        output = self.output_proj(combined_output)
        
        return output

In [26]:
# Define dimensions
alpha_dim = 768  # Dimensionality of ModalityAlpha features
beta_dim = 1024   # Dimensionality of ModalityBeta features
hidden_dim = 768 # Dimensionality of the output from the fusion module

# Create an instance of the FusionModule
fusion_module = FusionModule(alpha_dim=alpha_dim, beta_dim=beta_dim, hidden_dim=hidden_dim)

# Example input tensors
batch_size = 32
seq_len_alpha = 1     # Sequence length for ModalityAlpha (e.g., audio)
seq_len_beta = 197    # Sequence length for ModalityBeta (e.g., text)

# Create random input tensors for alpha and beta features
alpha_features = torch.rand(batch_size, seq_len_alpha, alpha_dim)  # Shape: (batch_size, seq_len_alpha, alpha_dim)
beta_features = torch.rand(batch_size, seq_len_beta, beta_dim)     # Shape: (batch_size, seq_len_beta, beta_dim)

# Forward pass through the fusion module
fused_output = fusion_module(alpha_features, beta_features)

# Print the shape of the fused output
print(f"Fused Output Shape: {fused_output.shape}")  # Expected: (batch_size, max(seq_len_alpha, seq_len_beta), hidden_dim)

pooled_output = torch.mean(fused_output, dim=1) 

print("Pooled Fused Output: ", pooled_output.shape)


Fused Output Shape: torch.Size([32, 197, 768])
Pooled Fused Output:  torch.Size([32, 768])


In [27]:
for text_features, audio_features, _, labels in train_dataloader:  # Only use video data

    # Define dimensions
    audio_dim = 768  # Feature dimension of audio
    text_dim = 1024   # Feature dimension of text
    hidden_dim = 768  # Hidden dimension for fusion

    print(text_features.unsqueeze(1).shape)
    print(audio_features.squeeze(1).shape)
    
    # Create an instance of the FusionModule
    fusion_module = FusionModule(audio_dim, text_dim, hidden_dim)

    text_features = text_features.unsqueeze(1)
    audio_features = audio_features.squeeze(1)

    # Forward pass through the fusion module
    combined_output = fusion_module(audio_features, text_features)

    print('Fused AlphaBeta: ', combined_output.shape)



torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1024])
torch.Size([32, 197, 768])
Fused AlphaBeta:  torch.Size([32, 197, 768])
torch.Size([32, 1, 1

In [28]:
class AlphaBetaClassifier(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=768, dropout_prob=0.3):  # Changed input_dim from 1536 to 768
        super(AlphaBetaClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.batch_norm1 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [29]:
def train_alphabeta_model(model, classifier, dataloader, criterion, optimizer, device='cpu'):
    model.train()
    classifier.train()
    total_loss = 0

    for text_features, audio_features, video_features, labels in dataloader:  # Only use video data
            
            optimizer.zero_grad()

            text_features = text_features.unsqueeze(1)
            audio_features = audio_features.squeeze(1)

            combined_output = fusion_module(audio_features, text_features)

            pooled_output = torch.mean(combined_output, dim=1) 
            
            logits = classifier(pooled_output)

            loss = criterion(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
        
    return avg_loss  # Optionally, return the last average loss if needed


In [30]:
def evaluate_alphabeta_model(model, classifier, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    classifier.eval()  # Set the classifier to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for text_features, audio_features, video_features, labels in dataloader:

            text_features = text_features.unsqueeze(1)
            audio_features = audio_features.squeeze(1)

            # Get the combined output from the fusion module
            combined_output = model(audio_features, text_features)  

            pooled_output = torch.mean(combined_output, dim=1) 
            
            # Get logits from the classifier
            logits = classifier(pooled_output)

            # Calculate loss
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Apply sigmoid to get probabilities
            outputs = torch.sigmoid(logits)

            # Convert probabilities to binary predictions
            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Alpha-Beta Model Evaluation - Loss: {avg_loss:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


In [31]:
def test_alphabeta_model(model, classifier, dataloader, criterion, device='cpu'):
    model.eval()  # Set the model to evaluation mode
    classifier.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for text_features, audio_features, _, labels in dataloader:
            
            text_features = text_features.unsqueeze(1)
            audio_features = audio_features.squeeze(1)

            # Forward pass through the model
            combined_output = model(audio_features, text_features)

            pooled_output = torch.mean(combined_output, dim=1) 

            logits = classifier(pooled_output)
            
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Apply sigmoid to obtain probabilities
            outputs = torch.sigmoid(logits)

            # Convert probabilities to binary predictions
            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)
    
    print(f"AlphaBeta Model Test - Loss: {avg_loss:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}, Test F1: {f1:.4f}")

    return avg_loss, precision, recall, f1


#### Concat

In [32]:
# if __name__ == "__main__":
#     torch.manual_seed(42)

#     # Define dimensions
#     audio_dim = 768  # Feature dimension of audio
#     text_dim = 1024   # Feature dimension of text
#     hidden_dim = 768  # Hidden dimension for fusion

#     # Define models for each modality
#     fusion_model = FusionModule(audio_dim, text_dim, hidden_dim)  # Define your FusionModule
#     classifier = AlphaBetaClassifier(hidden_dim*2)  # Define your Classifier

#     # Define the criterion
#     criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))

#     # Initialize the optimizer
#     optimizer = optim.Adam(list(fusion_model.parameters())+list(classifier.parameters()), lr=0.001)

#     num_epochs = 10

#     print("\nTraining on Alpha-Beta Model...")
#     for epoch in range(num_epochs):
#         print("-" * 40)
#         print(f"Epoch {epoch + 1}/{num_epochs}")

#         # Train step
#         train_loss = train_alphabeta_model(fusion_model, classifier, train_dataloader, criterion, optimizer, device='cpu')

#         # Validate step
#         val_loss, precision, recall, f1 = evaluate_alphabeta_model(fusion_model, classifier, val_dataloader, criterion, device='cpu')

#         print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

#     # Testing the model
#     print("-" * 40)
#     print("Testing the model on the test set...")
#     test_loss, test_precision, test_recall, test_f1_score = evaluate_alphabeta_model(fusion_model, classifier, test_dataloader, criterion, device='cpu')

#     print(f"Test Loss: {test_loss:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}, Test F1 Score: {test_f1_score:.4f}")


#### AVG

In [33]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Define dimensions
    audio_dim = 768  # Audio feature dimension
    text_dim = 1024  # Text feature dimension
    common_dim = 768 # Common projection dimension
    hidden_dim = 768 # Hidden dimension for fusion output

    # Initialize models
    fusion_model = FusionModule(
        alpha_dim=audio_dim,
        beta_dim=text_dim,
        hidden_dim=hidden_dim,
        common_dim=common_dim
    )
    classifier = AlphaBetaClassifier(input_dim=hidden_dim)  # Now takes hidden_dim as input

    # Define the criterion with class weighting
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))

    # Initialize the optimizer
    optimizer = optim.Adam(
        list(fusion_model.parameters()) + list(classifier.parameters()),
        lr=0.001
    )

    num_epochs = 10

    print("\nTraining Bimodal Fusion Model...")
    for epoch in range(num_epochs):
        print("-" * 40)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Train step
        train_loss = train_alphabeta_model(
            fusion_model, 
            classifier, 
            train_dataloader, 
            criterion, 
            optimizer, 
            device='cpu'
        )

        # Validate step
        val_loss, precision, recall, f1 = evaluate_alphabeta_model(
            fusion_model, 
            classifier, 
            val_dataloader, 
            criterion, 
            device='cpu'
        )

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
        print(f"Validation Metrics - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

    # Testing the model
    print("-" * 40)
    print("Testing the Bimodal Fusion model...")
    test_loss, test_precision, test_recall, test_f1_score = test_alphabeta_model(
        fusion_model, 
        classifier, 
        test_dataloader, 
        criterion, 
        device='cpu'
    )

    print(f"\nFinal Test Results:")
    print(f"Loss: {test_loss:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1_score:.4f}")


Training Bimodal Fusion Model...
----------------------------------------
Epoch 1/10
Alpha-Beta Model Evaluation - Loss: 1.0520, Recall: 0.8654, Precision: 0.2528, F1: 0.3913
Training Loss: 0.8806, Validation Loss: 1.0520
Validation Metrics - Precision: 0.2528, Recall: 0.8654, F1: 0.3913
----------------------------------------
Epoch 2/10
Alpha-Beta Model Evaluation - Loss: 1.0983, Recall: 0.7692, Precision: 0.2516, F1: 0.3791
Training Loss: 0.5653, Validation Loss: 1.0983
Validation Metrics - Precision: 0.2516, Recall: 0.7692, F1: 0.3791
----------------------------------------
Epoch 3/10
Alpha-Beta Model Evaluation - Loss: 1.3321, Recall: 0.9231, Precision: 0.2526, F1: 0.3967
Training Loss: 0.4664, Validation Loss: 1.3321
Validation Metrics - Precision: 0.2526, Recall: 0.9231, F1: 0.3967
----------------------------------------
Epoch 4/10
Alpha-Beta Model Evaluation - Loss: 1.3173, Recall: 0.1154, Precision: 0.1176, F1: 0.1165
Training Loss: 0.3501, Validation Loss: 1.3173
Validatio

# Three Modalities Concatentation

##### Concat

In [34]:
# class TrimodalFusionModule(nn.Module):
#     def __init__(self, text_dim, audio_dim, video_dim, hidden_dim, common_dim=768, num_heads=8):
#         super(TrimodalFusionModule, self).__init__()
        
#         # Project ModalityAlpha, ModalityBeta, and ModalityGamma to a common dimension
#         self.text_proj = nn.Linear(text_dim, common_dim) if text_dim != common_dim else nn.Identity()
#         self.audio_proj = nn.Linear(audio_dim, common_dim) if audio_dim != common_dim else nn.Identity()
#         self.video_proj = nn.Linear(video_dim, common_dim) if video_dim != common_dim else nn.Identity()

#         # MultiheadAttention with the common embedding dimension
#         self.attn1 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
#         self.attn2 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
#         self.attn3 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        
#         # Fully connected layer for final output fusion
#         self.fc = nn.Linear(common_dim * 3, hidden_dim)  # For concatenated outputs

#     def forward(self, text_features, audio_features, video_features):
#         # Project all modalities to the common embedding dimension
#         text_features = self.text_proj(text_features)  # Shape: (batch_size, seq_len_text, common_dim)
#         audio_features = self.audio_proj(audio_features)  # Shape: (batch_size, seq_len_audio, common_dim)
#         video_features = self.video_proj(video_features)  # Shape: (batch_size, seq_len_video, common_dim)

#         # First cross-attention: ModalityAlpha as Query, ModalityBeta as Key-Value
#         text_out, _ = self.attn1(text_features, audio_features, audio_features) 
        
#         # Second cross-attention: ModalityBeta as Query, ModalityGamma as Key-Value
#         audio_out, _ = self.attn2(audio_features, video_features, video_features)

#         # Third cross-attention: ModalityGamma as Query, ModalityAlpha as Key-Value
#         video_out, _ = self.attn3(video_features, text_features, text_features)

#         # Find the maximum sequence length among text_out, audio_out, and video_out
#         max_seq_len = max(text_out.size(1), audio_out.size(1), video_out.size(1))

#         if text_out.size(1) < max_seq_len:
#             padding = max_seq_len - text_out.size(1)
#             text_out = nn.functional.pad(text_out, (0, 0, 0, padding))
#         else:
#             text_out = text_out[:, :max_seq_len, :]

#         if audio_out.size(1) < max_seq_len:
#             padding = max_seq_len - audio_out.size(1)
#             audio_out = nn.functional.pad(audio_out, (0, 0, 0, padding))
#         else:
#             audio_out = audio_out[:, :max_seq_len, :]

#         if video_out.size(1) < max_seq_len:
#             padding = max_seq_len - video_out.size(1)
#             video_out = nn.functional.pad(video_out, (0, 0, 0, padding))
#         else:
#             video_out = video_out[:, :max_seq_len, :]

#         # Concatenate the outputs along the feature dimension
#         combined_output = torch.cat((text_out, audio_out, video_out), dim=-1)  # Shape: (batch_size, max_seq_len, common_dim * 3)
        
#         # Final linear layer to produce the fused output
#         fused_output = self.fc(combined_output)  # Shape: (batch_size, max_seq_len, hidden_dim)

#         return fused_output

##### AVG

In [35]:
class TrimodalFusionModule(nn.Module):
    def __init__(self, text_dim, audio_dim, video_dim, hidden_dim, common_dim=768, num_heads=8):
        super(TrimodalFusionModule, self).__init__()
        
        # Project all modalities to common dimension
        self.text_proj = nn.Linear(text_dim, common_dim) if text_dim != common_dim else nn.Identity()
        self.audio_proj = nn.Linear(audio_dim, common_dim) if audio_dim != common_dim else nn.Identity()
        self.video_proj = nn.Linear(video_dim, common_dim) if video_dim != common_dim else nn.Identity()

        # MultiheadAttention layers
        self.attn1 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        self.attn2 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        self.attn3 = nn.MultiheadAttention(embed_dim=common_dim, num_heads=num_heads, batch_first=True)
        
        # Added: Output projection to desired hidden dimension
        self.output_proj = nn.Linear(common_dim, hidden_dim)

    def forward(self, text_features, audio_features, video_features):
        # Project all modalities to the common embedding dimension
        text_features = self.text_proj(text_features)
        audio_features = self.audio_proj(audio_features)
        video_features = self.video_proj(video_features)

        # Cross-attention between modalities
        text_out, _ = self.attn1(text_features, audio_features, audio_features)
        audio_out, _ = self.attn2(audio_features, video_features, video_features)
        video_out, _ = self.attn3(video_features, text_features, text_features)

        # Handle different sequence lengths
        max_seq_len = max(text_out.size(1), audio_out.size(1), video_out.size(1))

        # Pad or trim sequences to match max_seq_len
        if text_out.size(1) < max_seq_len:
            text_out = nn.functional.pad(text_out, (0, 0, 0, max_seq_len - text_out.size(1)))
        else:
            text_out = text_out[:, :max_seq_len, :]
        
        if audio_out.size(1) < max_seq_len:
            audio_out = nn.functional.pad(audio_out, (0, 0, 0, max_seq_len - audio_out.size(1)))
        else:
            audio_out = audio_out[:, :max_seq_len, :]

        if video_out.size(1) < max_seq_len:
            video_out = nn.functional.pad(video_out, (0, 0, 0, max_seq_len - video_out.size(1)))
        else:
            video_out = video_out[:, :max_seq_len, :]

        # Element-wise averaging
        combined_output = (text_out + audio_out + video_out) / 3
        
        # Project to final hidden dimension
        output = self.output_proj(combined_output)
        
        return output

In [36]:
class TrimodalClassifier(nn.Module):
    def __init__(self, input_dim=768, hidden_dim=768, dropout_prob=0.3):  # Changed input_dim from 2304 to 768
        super(TrimodalClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.batch_norm1 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.batch_norm1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [37]:
def train_trimodal_model(fusion_model, classifier, dataloader, criterion, optimizer, device='cpu'):
    fusion_model.train()
    classifier.train()
    total_loss = 0

    for text_features, audio_features, video_features, labels in dataloader:
        optimizer.zero_grad()

        # Prepare inputs
        text_features = text_features.unsqueeze(1)  # Add sequence dimension
        audio_features = audio_features.squeeze(1)  # Adjust audio features
        video_features = video_features  # Video features already in correct shape

        # Forward pass through fusion model
        combined_output = fusion_model(text_features, audio_features, video_features)

        # Global average pooling
        pooled_output = torch.mean(combined_output, dim=1)

        # Forward pass through classifier
        logits = classifier(pooled_output)

        # Calculate loss
        loss = criterion(logits, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss


In [38]:
def evaluate_trimodal_model(fusion_model, classifier, dataloader, criterion, device='cpu'):
    fusion_model.eval()
    classifier.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for text_features, audio_features, video_features, labels in dataloader:
            # Prepare inputs
            text_features = text_features.unsqueeze(1)
            audio_features = audio_features.squeeze(1)
            video_features = video_features

            # Forward passes
            combined_output = fusion_model(text_features, audio_features, video_features)
            pooled_output = torch.mean(combined_output, dim=1)
            logits = classifier(pooled_output)

            # Calculate loss
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Get predictions
            outputs = torch.sigmoid(logits)
            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Trimodal Model Evaluation - Loss: {avg_loss:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}")

    return avg_loss, precision, recall, f1

In [39]:
def test_trimodal_model(fusion_model, classifier, dataloader, criterion, device='cpu'):
    fusion_model.eval()
    classifier.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for text_features, audio_features, video_features, labels in dataloader:
            # Prepare inputs
            text_features = text_features.unsqueeze(1)
            audio_features = audio_features.squeeze(1)
            video_features = video_features

            # Forward passes
            combined_output = fusion_model(text_features, audio_features, video_features)
            pooled_output = torch.mean(combined_output, dim=1)
            logits = classifier(pooled_output)

            # Calculate loss
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Get predictions
            outputs = torch.sigmoid(logits)
            all_preds.extend((outputs > 0.5).int().tolist())
            all_labels.extend(labels.int().tolist())

    # Calculate metrics
    recall = recall_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    avg_loss = total_loss / len(dataloader)

    print(f"Trimodal Model Test - Loss: {avg_loss:.4f}, Test Recall: {recall:.4f}, Test Precision: {precision:.4f}, Test F1: {f1:.4f}")

    return avg_loss, precision, recall, f1

#### Concat

In [40]:
# if __name__ == "__main__":
#     torch.manual_seed(42)

#     # Define dimensions
#     text_dim = 1024   # BERT features dimension
#     audio_dim = 768   # Audio features dimension
#     video_dim = 768   # Video features dimension
#     common_dim = 768  # Common projection dimension

#     # Initialize models
#     fusion_model = TrimodalFusionModule(text_dim, audio_dim, video_dim, common_dim)
#     classifier = TrimodalClassifier()

#     # Define loss function and optimizer
#     criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))
#     optimizer = optim.Adam(
#         list(fusion_model.parameters()) + list(classifier.parameters()),
#         lr=0.001
#     )

#     # Training loop
#     num_epochs = 10
#     print("\nTraining Trimodal Model...")
    
#     for epoch in range(num_epochs):
#         print("-" * 40)
#         print(f"Epoch {epoch + 1}/{num_epochs}")

#         # Train step
#         train_loss = train_trimodal_model(
#             fusion_model, classifier, train_dataloader, 
#             criterion, optimizer, device='cpu'
#         )

#         # Validation step
#         val_loss, precision, recall, f1 = evaluate_trimodal_model(
#             fusion_model, classifier, val_dataloader, 
#             criterion, device='cpu'
#         )

#         print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

#     # Final testing
#     print("-" * 40)
#     print("Testing the model on the test set...")
#     test_loss, test_precision, test_recall, test_f1_score = test_trimodal_model(
#         fusion_model, classifier, test_dataloader, 
#         criterion, device='cpu'
#     )

#     print(f"Test Results:")
#     print(f"Loss: {test_loss:.4f}")
#     print(f"Precision: {test_precision:.4f}")
#     print(f"Recall: {test_recall:.4f}")
#     print(f"F1 Score: {test_f1_score:.4f}")

#### AVG

In [41]:
if __name__ == "__main__":
    torch.manual_seed(42)

    # Define dimensions - all projected to common_dim before averaging
    text_dim = 1024
    audio_dim = 768
    video_dim = 768
    common_dim = 768
    hidden_dim = 768

    # Initialize models with new dimensions
    fusion_model = TrimodalFusionModule(text_dim, audio_dim, video_dim, hidden_dim, common_dim)
    classifier = TrimodalClassifier(input_dim=hidden_dim)  # Now takes hidden_dim as input

    # Define loss function and optimizer
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(2.94))
    optimizer = optim.Adam(
        list(fusion_model.parameters()) + list(classifier.parameters()),
        lr=0.001
    )

    # Training loop
    num_epochs = 10
    print("\nTraining Trimodal Model...")
    
    for epoch in range(num_epochs):
        print("-" * 40)
        print(f"Epoch {epoch + 1}/{num_epochs}")

        # Train step
        train_loss = train_trimodal_model(
            fusion_model, classifier, train_dataloader, 
            criterion, optimizer, device='cpu'
        )

        # Validation step
        val_loss, precision, recall, f1 = evaluate_trimodal_model(
            fusion_model, classifier, val_dataloader, 
            criterion, device='cpu'
        )

        print(f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Final testing
    print("-" * 40)
    print("Testing the model on the test set...")
    test_loss, test_precision, test_recall, test_f1_score = test_trimodal_model(
        fusion_model, classifier, test_dataloader, 
        criterion, device='cpu'
    )

    print(f"Test Results:")
    print(f"Loss: {test_loss:.4f}")
    print(f"Precision: {test_precision:.4f}")
    print(f"Recall: {test_recall:.4f}")
    print(f"F1 Score: {test_f1_score:.4f}")


Training Trimodal Model...
----------------------------------------
Epoch 1/10
Trimodal Model Evaluation - Loss: 0.9334, Recall: 0.8846, Precision: 0.4071, F1: 0.5576
Training Loss: 0.8666, Validation Loss: 0.9334
----------------------------------------
Epoch 2/10
Trimodal Model Evaluation - Loss: 0.7462, Recall: 0.7115, Precision: 0.5968, F1: 0.6491
Training Loss: 0.5790, Validation Loss: 0.7462
----------------------------------------
Epoch 3/10
Trimodal Model Evaluation - Loss: 2.0347, Recall: 1.0000, Precision: 0.2826, F1: 0.4407
Training Loss: 0.3573, Validation Loss: 2.0347
----------------------------------------
Epoch 4/10
Trimodal Model Evaluation - Loss: 0.7924, Recall: 0.8462, Precision: 0.5500, F1: 0.6667
Training Loss: 0.2369, Validation Loss: 0.7924
----------------------------------------
Epoch 5/10
Trimodal Model Evaluation - Loss: 0.8971, Recall: 0.8846, Precision: 0.5287, F1: 0.6619
Training Loss: 0.1622, Validation Loss: 0.8971
-------------------------------------