# Multimodal Depression Detection Model

This notebook implements a deep learning model for depression detection using multiple modalities:
- Text data from transcripts
- Audio features from speech
- Facial features from video

The model uses a fusion architecture to combine predictions from individual modality-specific models.

In [None]:
# Standard library imports
from typing import Dict, List, Tuple

import joblib
import matplotlib.pyplot as plt
# Third-party imports
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Local imports
from models.audio_rnn import AudioRNN
from models.face_strnn import FaceSTRNN
from models.multimodal_fusion import MultimodalFusion
from preprocessing.loader import ResultsLoader, TextLoader, AudioLoader, FaceLoader
from utils.training import MultimodalFusionTrainer

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
RANDOM_STATE = 42  # default `random_state` handed to the data loaders
DATA_PERCENTAGE = 0.02  # default `percentage` handed to the data loaders
BATCH_SIZE = 32  # default mini-batch size for the DataLoaders
N_EPOCHS = 50  # default number of epochs per grid-search configuration
FIGURE_SIZE = (15, 8)  # matplotlib figsize shared by the plots

# Search space for the grid search.
# NOTE(review): only 'learning_rate' and 'weight_decay' are read by
# train_model_with_grid_search; 'dropout' just multiplies the number of runs.
PARAM_GRID = {
    'learning_rate': [0.001, 0.0001],
    'weight_decay': [0.01, 0.001],
    'dropout': [0.2, 0.3],
}

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

## Data Preparation

Load and preprocess data from all modalities (text, audio, and facial features).

In [None]:
from typing import Any


def prepare_data(percentage: float = DATA_PERCENTAGE, random_state: int = RANDOM_STATE) -> pd.DataFrame:
    """Build a single feature table from the results, text, audio and face loaders.

    The audio and face frames are joined per (ID, TIMESTAMP), averaged per ID,
    and then joined with the text features and the results on ID.

    Args:
        percentage: Forwarded to every loader's ``get_data`` call
        random_state: Random seed forwarded to every loader's ``get_data`` call

    Returns:
        DataFrame with one row per ID that is present in all four sources
        (``merge`` is used with its default inner join)
    """
    # Instantiate every loader up front
    results_src, text_src, audio_src, face_src = (
        ResultsLoader(),
        TextLoader(),
        AudioLoader(),
        FaceLoader(),
    )

    # Arguments common to all loaders, plus the ones only the time-series
    # loaders (audio, face) receive
    sampling = {'percentage': percentage, 'random_state': random_state}
    windowing = {'ds_freq': "10s", 'rw_size': "10s"}

    labels = results_src.get_data(**sampling)
    text = text_src.get_data(**sampling)
    audio = audio_src.get_data(**sampling, **windowing)
    face = face_src.get_data(**sampling, **windowing)

    # Turn the index into ordinary columns so ID and TIMESTAMP can be used as
    # join keys (presumably both frames are indexed by them -- confirm in the loaders)
    audio_flat = audio.reset_index()
    face_flat = face.reset_index()

    # Align audio and face rows on both ID and TIMESTAMP
    per_window = audio_flat.merge(face_flat, on=['ID', 'TIMESTAMP'])

    # Collapse each ID's rows to the mean of every feature column
    feature_cols = [name for name in per_window.columns if name not in ['ID', 'TIMESTAMP']]
    per_id = per_window.groupby('ID').agg(dict.fromkeys(feature_cols, 'mean')).reset_index()

    # Attach the per-ID text features, then the results
    with_text = text.merge(per_id, on='ID')
    return with_text.merge(labels, on='ID')

def load_models() -> Tuple[Any, nn.Module, nn.Module, StandardScaler, StandardScaler]:
    """Restore the saved unimodal models and the audio/face scalers from disk.

    Reads ``text_model.joblib``, ``audio_model.pth`` and ``face_model.pth``
    relative to the current working directory.

    Returns:
        ``(text_model, audio_model, face_model, audio_scaler, face_scaler)``;
        note that no scaler is returned for the text model
    """
    # NOTE(review): joblib.load unpickles the file -- only load artefacts from a trusted source
    text_clf = joblib.load('text_model.joblib')

    import utils.training as train

    # train.load_model yields a (model, scaler) pair for each network
    audio_net, audio_norm = train.load_model(AudioRNN, "audio_model.pth", DEVICE)
    face_net, face_norm = train.load_model(FaceSTRNN, "face_model.pth", DEVICE)

    return text_clf, audio_net, face_net, audio_norm, face_norm

# Load and prepare data (uses the DATA_PERCENTAGE / RANDOM_STATE defaults);
# `df` is a notebook-level variable consumed by the later cells
df = prepare_data()

# Display data overview: first rows, (rows, columns) shape, and the number of
# columns per dtype.
# NOTE(review): `display` is not imported in this file -- presumably the
# IPython/Jupyter builtin, so this cell only runs inside a notebook kernel.
print("Data Overview:")
display(df.head())
print("\nData Shape:", df.shape)
print("\nFeature Types:")
print(df.dtypes.value_counts())

## Data Splitting

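Split the data into training, validation, and test sets while preserving row order: the last fold of a `TimeSeriesSplit` provides the train/test split, and the tail of the training portion is held out for validation.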

In [None]:
def prepare_data_splits(
    df: pd.DataFrame,
    n_splits: int = 5,
    train_fraction: float = 0.75,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """Split ``df`` into train, validation and test parts without shuffling.

    The last fold of a ``TimeSeriesSplit`` gives the train/test split, so the
    test rows are the final rows of ``df``. The leading ``train_fraction`` of
    the remaining rows becomes the training set and the rest the validation set.

    Args:
        df: DataFrame containing all features plus the ``PHQ_Binary`` label column
        n_splits: Number of folds for ``TimeSeriesSplit`` (only the last one is used)
        train_fraction: Share of the non-test rows kept for training; must lie
            strictly between 0 and 1

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where the ``X_*``
        are DataFrames and the ``y_*`` are Series sliced from ``df``

    Raises:
        ValueError: If ``train_fraction`` is not in (0, 1), or (raised by
            scikit-learn) if ``df`` has too few rows for ``n_splits`` folds
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(f"train_fraction must be in (0, 1), got {train_fraction}")

    # Prepare features and target
    # NOTE(review): every non-label column is kept as a feature, including 'ID'
    # and any non-numeric text columns -- confirm this is what the model expects.
    X = df.drop(['PHQ_Binary'], axis=1)
    y = df['PHQ_Binary']

    # Only the final fold is needed, so unpack it directly instead of
    # collecting the indices of every fold
    *_, (train_idx, test_idx) = TimeSeriesSplit(n_splits=n_splits).split(X)

    X_fit, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fit, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Hold out the tail of the training portion for validation
    cut = int(train_fraction * len(X_fit))
    X_train, X_val = X_fit.iloc[:cut], X_fit.iloc[cut:]
    y_train, y_val = y_fit.iloc[:cut], y_fit.iloc[cut:]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Prepare data splits; these six names are notebook-level variables that the
# training and evaluation cells below read
X_train, X_val, X_test, y_train, y_val, y_test = prepare_data_splits(df)

## Model Training

Train the multimodal fusion model with hyperparameter tuning.

In [None]:
def create_data_loaders(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                        y_train: np.ndarray, y_val: np.ndarray, y_test: np.ndarray,
                        batch_size: int = BATCH_SIZE) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """Create PyTorch DataLoaders for training, validation, and testing.

    Features and labels may be NumPy arrays or pandas DataFrames/Series; both
    are normalised through ``np.asarray`` before being turned into tensors.
    (Previously the features had to be array-like for ``torch.FloatTensor``
    while the labels had to be pandas objects exposing ``.values``.)

    Args:
        X_train, X_val, X_test: Feature tables; converted to float32 tensors
        y_train, y_val, y_test: Class labels; converted to int64 tensors
        batch_size: Batch size used by all three loaders

    Returns:
        Tuple containing train, validation, and test DataLoaders. Only the
        training loader shuffles its samples.

    Raises:
        ValueError: If a feature or label column cannot be converted to a
            number (for example string columns left in the features)
    """
    def _to_dataset(features, labels) -> TensorDataset:
        # np.asarray accepts ndarrays, DataFrames and Series alike; torch.tensor
        # copies, so the datasets never alias the caller's data
        feature_tensor = torch.tensor(np.asarray(features, dtype=np.float32))
        label_tensor = torch.tensor(np.asarray(labels, dtype=np.int64))
        return TensorDataset(feature_tensor, label_tensor)

    # Shuffle only the training samples; validation and test keep their order
    train_loader = DataLoader(_to_dataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(_to_dataset(X_val, y_val), batch_size=batch_size)
    test_loader = DataLoader(_to_dataset(X_test, y_test), batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_model_with_grid_search(X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray,
                                 param_grid: Dict, n_epochs: int = N_EPOCHS) -> Tuple[Dict, List[Dict]]:
    """Perform grid search to find optimal hyperparameters.

    One fusion model is trained per combination in ``param_grid`` and the
    combination with the lowest validation loss after the final epoch wins.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data
        param_grid: Dictionary of hyperparameters to search; each combination
            must provide 'learning_rate' and 'weight_decay'
        n_epochs: Number of training epochs per combination

    Returns:
        Tuple ``(best_params, results)``. ``results`` holds one dict per
        combination with the keys 'params', 'final_val_loss', 'train_losses'
        and 'val_losses'. ``best_params`` is None when the grid is empty or no
        run produced a final validation loss below infinity (e.g. all NaN).
    """
    # Only the train/validation loaders are used here. The validation split is
    # passed again in the test slot so that this function depends solely on its
    # arguments instead of the notebook globals X_test / y_test.
    train_loader, val_loader, _unused_loader = create_data_loaders(
        X_train, X_val, X_val, y_train, y_val, y_val
    )

    # Initialize tracking variables
    best_val_loss = float('inf')
    best_params = None
    results = []

    # Grid search
    # NOTE(review): keys other than 'learning_rate' and 'weight_decay' (such as
    # 'dropout') are not forwarded to the model or optimizer below.
    for params in tqdm(ParameterGrid(param_grid)):
        # Reload the pre-trained unimodal models for every configuration so
        # that no run starts from sub-model weights a previous run may have
        # updated (assumes MultimodalFusion keeps references to the objects it
        # is given -- confirm in models.multimodal_fusion).
        text_model, audio_model, face_model, _, _ = load_models()

        # Create multimodal model
        model = MultimodalFusion(
            text_model,
            audio_model,
            face_model
        ).to(DEVICE)

        # Training setup
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay']
        )
        # Cut the learning rate by 10x after 3 epochs without improvement
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=3,
            verbose=True
        )

        # Initialize trainer
        trainer = MultimodalFusionTrainer(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            device=DEVICE
        )

        # Train model
        train_losses, val_losses = trainer.train(
            train_loader=train_loader,
            val_loader=val_loader,
            n_epochs=n_epochs
        )

        # Record results; the run is scored by its last-epoch validation loss
        final_val_loss = val_losses[-1]
        results.append({
            'params': params,
            'final_val_loss': final_val_loss,
            'train_losses': train_losses,
            'val_losses': val_losses
        })

        # Update best parameters (strict '<' keeps the first of tied runs)
        if final_val_loss < best_val_loss:
            best_val_loss = final_val_loss
            best_params = params

    return best_params, results

# Train model with grid search
best_params, results = train_model_with_grid_search(X_train, y_train, X_val, y_val, PARAM_GRID)

# The run with the lowest final validation loss; its recorded loss replaces
# the function-local `best_val_loss`, which is not visible in this cell
best_result = min(results, key=lambda x: x['final_val_loss'])

# Print best parameters
print("\nBest parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")
print(f"Best validation loss: {best_result['final_val_loss']:.4f}")

# Plot training curves for best model (one point per epoch)
plt.figure(figsize=FIGURE_SIZE)
plt.plot(best_result['train_losses'], label='Training Loss')
plt.plot(best_result['val_losses'], label='Validation Loss')
plt.title('Training and Validation Loss (Best Model)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

## Model Evaluation

Evaluate the model's performance on the test set.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

def evaluate_model(model: nn.Module, test_loader: DataLoader, device: torch.device) -> Tuple[np.ndarray, np.ndarray]:
    """Run the model over ``test_loader`` and collect labels and predictions.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass. The predicted class of a sample is the index of its
    largest output along dimension 1.

    Args:
        model: Trained PyTorch model; called with one feature batch at a time
        test_loader: Iterable yielding ``(features, labels)`` batches
        device: Device each batch is moved to before the forward pass

    Returns:
        ``(y_true, y_pred)`` as NumPy arrays, in the order the loader yields them
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            targets = targets.to(device)
            scores = model(features)
            # torch.max over dim 1 returns (values, indices); keep the indices
            batch_preds = torch.max(scores, 1)[1]
            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(targets.cpu().numpy())

    return np.array(expected), np.array(predicted)

# Create loaders for all three splits: train/val to fit the final model,
# test for the evaluation
train_loader, val_loader, test_loader = create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test)

# Load individual models
text_model, audio_model, face_model, audio_scaler, face_scaler = load_models()

# Initialize best model
best_model = MultimodalFusion(
    text_model,
    audio_model,
    face_model
).to(DEVICE)

# Fit the fusion model with the best hyperparameters before evaluating it.
# A freshly constructed MultimodalFusion has not been trained, so evaluating
# (and later saving) it directly would not reflect the grid-search result.
# The setup mirrors the one used inside train_model_with_grid_search.
final_optimizer = torch.optim.AdamW(
    best_model.parameters(),
    lr=best_params['learning_rate'],
    weight_decay=best_params['weight_decay']
)
final_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    final_optimizer,
    mode='min',
    factor=0.1,
    patience=3,
    verbose=True
)
final_trainer = MultimodalFusionTrainer(
    model=best_model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=final_optimizer,
    scheduler=final_scheduler,
    device=DEVICE
)
final_trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    n_epochs=N_EPOCHS
)

# Evaluate model
y_true, y_pred = evaluate_model(best_model, test_loader, DEVICE)

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Plot confusion matrix on explicitly created axes; without `ax`,
# ConfusionMatrixDisplay.plot opens its own figure and a separately created
# figure would stay empty
fig, ax = plt.subplots(figsize=FIGURE_SIZE)
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title('Confusion Matrix')
plt.show()

## Save Model

Save the trained model for later use.

In [None]:
# Save the model: only the state_dict (parameters and buffers) is written, so
# loading it later requires constructing a MultimodalFusion instance first.
# NOTE(review): this persists whatever weights `best_model` holds at this
# point -- make sure it has actually been trained before this cell runs.
torch.save(best_model.state_dict(), 'multimodal_model.pth')
print("Model saved successfully!")