# Multimodal Depression Detection Model

In [1]:
from typing import Dict, List, Tuple

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import ParameterGrid, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

from models.audio_rnn import AudioRNN
from models.face_strnn import FaceSTRNN
from models.multimodal_fusion import MultimodalFusion
from preprocessing.loader_audio import AudioLoader
from preprocessing.loader_face import FaceLoader
from preprocessing.loader_results import ResultsLoader
from preprocessing.loader_text import TextLoader
from training.trainer_multimodal_fusion import MultimodalFusionTrainer

# Constants
RANDOM_STATE = 42
DATA_PERCENTAGE = 0.02  # Percentage of total data to use
BATCH_SIZE = 32
N_EPOCHS = 50
FIGURE_SIZE = (15, 8)

# Hyperparameter grid for model tuning
PARAM_GRID = {
    'learning_rate': [0.001, 0.0001],
    'weight_decay': [0.01, 0.001],
    'dropout': [0.2, 0.3]
}

# Device configuration
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: mps


[nltk_data] Downloading package stopwords to /Users/karlo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/karlo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/karlo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data Preparation

In [2]:
from typing import Any
from utils.pca_utils import load_and_transform_pca


def prepare_data(
    percentage: float = DATA_PERCENTAGE, random_state: int = RANDOM_STATE
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    # Initialize loaders
    results_loader = ResultsLoader()
    text_loader = TextLoader()
    audio_loader = AudioLoader()
    face_loader = FaceLoader()

    # Load data
    df_result = results_loader.get_data(
        percentage=percentage, random_state=random_state
    )
    df_text = text_loader.get_data(percentage=percentage, random_state=random_state)
    df_audio = audio_loader.get_data(
        percentage=percentage, random_state=random_state, ds_freq="10s", rw_size="10s"
    )
    df_face = face_loader.get_data(
        percentage=percentage, random_state=random_state, ds_freq="10s", rw_size="10s"
    )

    # text features preprocessing
    # ...

    # Audio features PCA
    df_audio_pca = load_and_transform_pca(df_audio, ["models/pca_audio.pkl"])

    # Face features PCA
    df_face_pca = load_and_transform_pca(
        df_face,
        [
            "models/pca_face_action_units.pkl",
            "models/pca_face_gaze.pkl",
            "models/pca_face_pose.pkl",
        ],
    )

    return df_text, df_audio_pca, df_face_pca, df_result


def load_models() -> Tuple[Any, nn.Module, nn.Module, StandardScaler, StandardScaler]:
    # Load individual models and their preprocessors.
    # Load text model
    text_model = joblib.load("text_model.joblib")

    import training.trainer as train

    # Load audio and face models
    audio_model, audio_scaler = train.load_model(AudioRNN, "audio_model.pth", DEVICE)

    face_model, face_scaler = train.load_model(FaceSTRNN, "face_model.pth",DEVICE)

    return text_model, audio_model, face_model, audio_scaler, face_scaler


# Load and prepare data
df_text, df_audio, df_face, df_result = prepare_data()

# Load models
# text_model, audio_model, face_model, audio_scaler, face_scaler = load_models()

# Display data overview
print("Text Data:")
display(df_text.head())

print("\nAudio Data:")
display(df_audio.head())

print("\nFace Data:")
display(df_face.head())

print("\nResults Data:")
display(df_result.head())

Text Data:


Unnamed: 0_level_0,TRANSCRIPT_text
ID,Unnamed: 1_level_1
386,synch introv4confirmation hi im ellie thanks c...
391,sync introv4confirmation hi im ellie thanks co...



Audio Data:


Unnamed: 0_level_0,Unnamed: 1_level_0,FORMANT_F2,FORMANT_F3,FORMANT_F1,FORMANT_F4,FORMANT_F5,COVAREP_F0,COVAREP_H1H2,COVAREP_HMPDM_12,COVAREP_MCEP_0,COVAREP_HMPDM_19
ID,TIMESTAMP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
386,0 days 00:00:00,1571.05937,2554.39041,616.2066,3445.9205,4352.0802,245.137862,3.553693,-0.042045,-10.86379,0.20337
386,0 days 00:00:10,1726.74306,2604.0072,719.887378,3483.4309,4403.4083,250.681,4.400846,0.161784,-11.367054,0.138979
386,0 days 00:00:20,1657.20697,2569.882,612.62378,3456.0944,4344.0295,248.715,3.023285,0.014002,-10.471386,0.145858
386,0 days 00:00:30,1657.41013,2599.1873,536.71279,3511.1177,4367.7521,234.705,3.291128,0.018634,-11.130249,0.219273
386,0 days 00:00:40,1633.53475,2605.3335,470.127492,3511.1365,4338.1031,220.724,4.272259,0.240675,-11.218786,0.339719



Face Data:


Unnamed: 0_level_0,Unnamed: 1_level_0,CLNFAUs_frame,CLNFAUs_AU12_c,CLNFAUs_AU04_c,CLNFAUs_AU28_c,CLNFAUs_AU15_c,CLNFAUs_AU23_c,CLNFAUs_AU45_c,CLNFAUs_AU04_r,CLNFAUs_AU01_r,CLNFAUs_AU25_r,...,CLNFgaze_confidence,CLNFpose_frame,CLNFpose_Tz,CLNFpose_Ty,CLNFpose_Tx,CLNFpose_success,CLNFpose_Rx,CLNFpose_confidence,CLNFpose_Rz,CLNFpose_Ry
ID,TIMESTAMP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
386,0 days 00:00:00,150.5,-28.936667,-28.793333,-28.82,-28.816667,-28.3,-28.793333,0.009116,0.125502,0.053306,...,0.843113,150.5,526.019457,75.269915,61.237442,0.823333,0.273747,0.843113,0.056375,-0.056813
386,0 days 00:00:10,450.5,-14.65,-14.263333,-14.423333,-14.596667,-13.833333,-14.413333,0.004848,0.025667,0.012548,...,0.862236,450.5,554.51113,40.483648,64.456001,0.853333,0.013761,0.862236,-0.064077,-0.057492
386,0 days 00:00:20,750.5,0.53,0.276667,0.196667,0.063333,0.993333,0.176667,0.005845,0.183318,0.197998,...,0.976928,750.5,529.75702,34.315661,44.2876,1.0,0.065726,0.976928,0.01685,0.01755
386,0 days 00:00:30,1050.5,0.013333,0.613333,0.63,0.113333,1.0,0.1,0.001677,0.022389,0.005373,...,0.98011,1050.5,528.234877,35.625105,45.062875,1.0,0.049616,0.98011,0.003288,0.019775
386,0 days 00:00:40,1350.5,0.433333,0.45,0.796667,0.023333,1.0,0.07,0.0,0.003746,0.017477,...,0.981177,1350.5,531.534767,33.735612,46.42873,1.0,0.046129,0.981177,0.004121,0.018015



Results Data:


Unnamed: 0_level_0,PHQ_Binary
ID,Unnamed: 1_level_1
386,1
391,0


## Data Splitting

In [3]:
# This function aligns and merges the three modalities (text, audio, face) by a common set of keys (ID and time window).
# Audio and face are both time series data, so they are expected to have features extracted per time window (e.g., every 10s).
# Text is non-time series, but for fusion, we align each text sample to the same time window as audio/face (e.g., by transcript segment or by aggregating text features per window).
# The merge ensures that each row in the final dataset corresponds to a single sample with all three modalities for the same subject and time window.
# After merging, the function performs a stratified train/val/test split, so that all splits are aligned across modalities.
# This ensures that each sample in the split contains the correct text, audio, and face features for the same instance.

from sklearn.model_selection import train_test_split


def prepare_aligned_data_splits(
    df_text: pd.DataFrame,
    df_audio: pd.DataFrame,
    df_face: pd.DataFrame,
    df_result: pd.DataFrame,
    test_size: float = 0.2,
    val_size: float = 0.1,
    random_state: int = RANDOM_STATE
):
    # Merge on ID and time window (adjust 'window' to your actual time window column if needed)
    merge_keys = ['ID', 'window'] if 'window' in df_audio.columns else ['ID']
    df = df_result.copy()
    df_all = df_text.merge(df_audio, on=merge_keys, suffixes=('_text', '_audio'))
    df_all = df_all.merge(df_face, on=merge_keys, suffixes=('', '_face'))
    df_all = df_all.merge(df_result, on='ID')

    # Drop rows with missing values (optional, or handle differently)
    df_all = df_all.dropna()

    # Prepare features and target
    text_features = df_all['TRANSCRIPT_text']  # or your text feature columns
    audio_features = df_all[[col for col in df_audio.columns if col not in merge_keys]]
    face_features = df_all[[col for col in df_face.columns if col not in merge_keys]]
    y = df_all['PHQ_Binary']

    # Train/val/test split (stratified if possible)
    X = pd.DataFrame({
        'text': text_features,
        'audio': list(audio_features.values),
        'face': list(face_features.values)
    })
    y = y.reset_index(drop=True)

    # First split into train+val and test
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # Then split train+val into train and val
    val_relative_size = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval, test_size=val_relative_size, random_state=random_state, stratify=y_trainval
    )

    # Return splits as tuples of (text, audio, face, y)
    def unpack_split(X_split, y_split):
        return {
            'text': list(X_split['text']),
            'audio': np.stack(X_split['audio']),
            'face': np.stack(X_split['face']),
            'label': y_split.values
        }

    return {
        'train': unpack_split(X_train, y_train),
        'val': unpack_split(X_val, y_val),
        'test': unpack_split(X_test, y_test)
    }
splits = prepare_aligned_data_splits(df_text, df_audio, df_face, df_result)


## Model Training

In [4]:
def create_data_loaders(
    splits: dict,
    text_vectorizer,
    batch_size: int = BATCH_SIZE
) -> Tuple[DataLoader, DataLoader, DataLoader]:
    def to_tensor(arr, dtype=torch.float32):
        return torch.tensor(np.array(arr), dtype=dtype)

    # Convert to tensors
    # The textual data is transformed into a numeric format using the first step from the pipeline
    X_train_text = to_tensor(text_vectorizer.transform(splits['train']['text']).toarray())
    X_train_audio = to_tensor(splits['train']['audio'])
    X_train_face = to_tensor(splits['train']['face'])
    y_train = torch.tensor(splits['train']['label'], dtype=torch.long)

    X_val_text = to_tensor(text_vectorizer.transform(splits['val']['text']).toarray())
    X_val_audio = to_tensor(splits['val']['audio'])
    X_val_face = to_tensor(splits['val']['face'])
    y_val = torch.tensor(splits['val']['label'], dtype=torch.long)

    X_test_text = to_tensor(text_vectorizer.transform(splits['test']['text']).toarray())
    X_test_audio = to_tensor(splits['test']['audio'])
    X_test_face = to_tensor(splits['test']['face'])
    y_test = torch.tensor(splits['test']['label'], dtype=torch.long)

    # Create datasets
    train_dataset = TensorDataset(X_train_text, X_train_audio, X_train_face, y_train)
    val_dataset = TensorDataset(X_val_text, X_val_audio, X_val_face, y_val)
    test_dataset = TensorDataset(X_test_text, X_test_audio, X_test_face, y_test)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader


def train_model_with_grid_search(splits: dict, param_grid: Dict, n_epochs: int = N_EPOCHS) -> Tuple[Dict, List[Dict]]:
    #Perform grid search to find optimal hyperparameters.
    # Load individual models
    text_model, audio_model, face_model, audio_scaler, face_scaler = load_models()
    text_vectorizer = text_model.named_steps["tfidf"]
    text_feature_dim = len(text_vectorizer.get_feature_names_out())

    # Create data loaders
    train_loader, val_loader, _ = create_data_loaders(splits, text_vectorizer, batch_size=BATCH_SIZE)

    # Initialize tracking variables
    best_val_loss = float('inf')
    best_params = None
    results = []

    # Grid search
    for params in tqdm(ParameterGrid(param_grid)):
        # Create multimodal model
        model = MultimodalFusion(
            text_feature_dim,
            audio_model,
            face_model
        ).to(DEVICE)

        # Training setup
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=params['learning_rate'],
            weight_decay=params['weight_decay']
        )
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=3,
            verbose=True
        )

        # Initialize trainer
        trainer = MultimodalFusionTrainer(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            device=DEVICE
        )

        # Train model
        train_losses, val_losses = trainer.train(
            train_loader=train_loader,
            val_loader=val_loader,
            n_epochs=n_epochs
        )

        # Record results
        final_val_loss = val_losses[-1]
        results.append({
            'params': params,
            'final_val_loss': final_val_loss,
            'train_losses': train_losses,
            'val_losses': val_losses
        })

        # Update best parameters
        if final_val_loss < best_val_loss:
            best_val_loss = final_val_loss
            best_params = params

    return best_params, results


# Train model with grid search
best_params, results = train_model_with_grid_search(splits, PARAM_GRID)

# Print best parameters
print("\nBest parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")
# print(f"Best validation loss: {best_val_loss:.4f}")

# Plot training curves for best model
plt.figure(figsize=FIGURE_SIZE)
best_result = min(results, key=lambda x: x['final_val_loss'])
plt.plot(best_result['train_losses'], label='Training Loss')
plt.plot(best_result['val_losses'], label='Validation Loss')
plt.title('Training and Validation Loss (Best Model)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()


Training:   0%|          | 0/341 [00:00<?, ?it/s][A
  0%|          | 0/8 [00:00<?, ?it/s]


RuntimeError: linear(): input and weight.T shapes cannot be multiplied (32x2 and 128x256)

## Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


def evaluate_model(model: nn.Module, test_loader: DataLoader, device: torch.device) -> Tuple[np.ndarray, np.ndarray]:
    #Evaluate the model on the test set.
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())

    return np.array(all_labels), np.array(all_preds)


# Create test loader
_, _, test_loader = create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test)

# Load individual models
text_model, audio_model, face_model, audio_scaler, face_scaler = load_models()

# Initialize best model
best_model = MultimodalFusion(
    text_model,
    audio_model,
    face_model
).to(DEVICE)

# Evaluate model
y_true, y_pred = evaluate_model(best_model, test_loader, DEVICE)

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Plot confusion matrix
plt.figure(figsize=FIGURE_SIZE)
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

## Save Model

In [None]:
# Save the model
torch.save(best_model.state_dict(), 'multimodal_model.pth')
print("Model saved successfully!")