# Audio-based Depression Detection Model

This notebook implements a deep learning model for depression detection using audio features.
The model uses an LSTM architecture with attention mechanisms to process temporal audio data.

In [1]:
# Standard library imports
import os
from typing import Dict, List, Tuple, Optional

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

# Local imports
from preprocessing.loader_results import ResultsLoader
from preprocessing.loader_audio import AudioLoader
from models.audio_rnn import AudioRNN
from utils.trainer_audio_rnn import AudioRNNTrainer

# Notebook-wide settings
RANDOM_STATE = 42  # seed forwarded to the data loaders
# Value handed to the loaders' `percentage` argument
# (presumably a 0-1 fraction -- TODO confirm against the loaders).
DATA_PERCENTAGE = 0.02
BATCH_SIZE = 32  # default mini-batch size for the DataLoaders
N_EPOCHS = 50  # default number of training epochs
FIGURE_SIZE = (15, 8)  # default figure size (width, height)

# Candidate values explored by the grid search, keyed by hyperparameter name
PARAM_GRID = dict(
    hidden_size=[64, 128],
    num_layers=[1, 2],
    dropout=[0.2, 0.3],
    learning_rate=[0.001, 0.0001],
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

ImportError: cannot import name 'ResultsLoader' from 'preprocessing.loader' (/Users/karlo/College/Diplomski/Code/multimodal_depression_detection/DepressionDetection/preprocessing/loader.py)

## Data Preparation

Load and preprocess the audio data and depression labels.

In [None]:
def load_data(percentage: float = DATA_PERCENTAGE, random_state: int = RANDOM_STATE) -> Tuple[
    pd.DataFrame, pd.DataFrame]:
    """Fetch the audio features and the depression labels from the project loaders.

    Both loaders receive the same `percentage` and `random_state`. The audio
    loader is additionally called with ds_freq="10s" and rw_size="10s"
    (presumably a down-sampling frequency and a rolling-window size --
    TODO confirm against AudioLoader).

    Args:
        percentage: Percentage of total data to use
        random_state: Random seed for reproducibility

    Returns:
        Tuple ``(df_audio, df_result)``: the audio features DataFrame
        followed by the results DataFrame
    """
    # Sampling arguments shared by both loaders
    sampling = dict(percentage=percentage, random_state=random_state)

    labels_source = ResultsLoader()
    audio_source = AudioLoader()

    labels_frame = labels_source.get_data(**sampling)
    audio_frame = audio_source.get_data(**sampling, ds_freq="10s", rw_size="10s")

    # Audio features come first in the returned pair
    return audio_frame, labels_frame


# Load the audio features and the depression labels
df_audio, df_result = load_data()

# Preview the first rows of each frame under its own heading
for heading, frame in (
    ("Audio Features Overview:", df_audio),
    ("\nDepression Labels Overview:", df_result),
):
    print(heading)
    display(frame.head())

## Exploratory Data Analysis

Analyze the distribution of audio features and their relationships.

In [None]:
def plot_feature_distributions(df: pd.DataFrame, features: List[str], fig_size: Tuple[int, int] = FIGURE_SIZE):
    """Plot a histogram for each of the requested feature columns.

    The histograms are laid out on a grid that is at most two columns wide.
    Grid cells that are left over (odd number of features) are hidden.
    Nothing is drawn when `features` is empty.

    Args:
        df: DataFrame containing the features
        features: List of feature (column) names to plot
        fig_size: Figure size tuple (width, height)
    """
    n_features = len(features)
    if n_features == 0:
        # plt.subplots() cannot build a grid with zero columns.
        return

    n_cols = min(2, n_features)
    n_rows = (n_features + 1) // 2

    # squeeze=False always returns a 2-D array of Axes. Without it a 1x1 grid
    # yields a bare Axes object, which has no .ravel().
    _, axes = plt.subplots(n_rows, n_cols, figsize=fig_size, squeeze=False)
    axes = axes.ravel()

    for ax, feature in zip(axes, features):
        sns.histplot(data=df[feature], ax=ax)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)

    # An odd feature count leaves the last cell of the grid unused.
    for unused_ax in axes[n_features:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


def plot_energy_distribution(df: pd.DataFrame, fig_size: Tuple[int, int] = FIGURE_SIZE):
    """Draw the 'AUDIO_AMPLITUDE' series of every subject on one shared figure.

    Args:
        df: DataFrame of audio features whose index has an 'ID' level
        fig_size: Figure size tuple (width, height)
    """
    plt.figure(figsize=fig_size)

    # One semi-transparent line per distinct value of the 'ID' index level
    subject_ids = df.index.get_level_values('ID').unique()
    for subject_id in subject_ids:
        amplitude = df.loc[subject_id]['AUDIO_AMPLITUDE']
        plt.plot(amplitude, alpha=0.5, label=f'Subject {subject_id}')

    plt.title('Energy Distribution Over Time')
    plt.xlabel('Time')
    plt.ylabel('Amplitude')
    plt.legend()
    plt.show()


def plot_formant_frequencies(df: pd.DataFrame, fig_size: Tuple[int, int] = FIGURE_SIZE):
    """Plot each subject's mean F1, F2 and F3 formant values as a connected line.

    Args:
        df: DataFrame of audio features whose index has an 'ID' level
        fig_size: Figure size tuple (width, height)
    """
    plt.figure(figsize=fig_size)
    formant_columns = ['FORMANT_F1', 'FORMANT_F2', 'FORMANT_F3']

    subject_ids = df.index.get_level_values('ID').unique()
    for subject_id in subject_ids:
        # Column-wise mean over all rows belonging to this subject
        mean_formants = df.loc[subject_id][formant_columns].mean()
        plt.plot(mean_formants, 'o-', label=f'Subject {subject_id}')

    plt.title('Average Formant Frequencies by Subject')
    plt.xlabel('Formant Number')
    plt.ylabel('Frequency (Hz)')
    # Replace the column names on the x-axis with short formant labels
    plt.xticks([0, 1, 2], ['F1', 'F2', 'F3'])
    plt.legend()
    plt.show()


# Summary statistics for the audio feature columns
print("Basic statistics for audio features:")
print(df_audio.describe())

# Histograms of the key audio features
audio_features = ['AUDIO_AMPLITUDE', 'FORMANT_F1', 'FORMANT_F2', 'FORMANT_F3']
plot_feature_distributions(df_audio, audio_features)

# Per-subject amplitude traces, then per-subject mean formant frequencies
plot_energy_distribution(df_audio)
plot_formant_frequencies(df_audio)

# Correlation heatmap over the COVAREP columns (skipped when there are none)
covarep_cols = [col for col in df_audio.columns if 'COVAREP' in col]
if covarep_cols:
    plt.figure(figsize=FIGURE_SIZE)
    covarep_corr = df_audio[covarep_cols].corr()
    sns.heatmap(covarep_corr, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation between COVAREP features')
    plt.tight_layout()
    plt.show()

# TODO: missing speech rate analysis (do i need it?)

## Principal Component Analysis

Perform PCA to reduce dimensionality and identify key features.

In [None]:
from sklearn.decomposition import PCA


def perform_pca(df: pd.DataFrame, n_components: int = 10) -> Tuple[pd.DataFrame, PCA]:
    """Project the float64/int64 columns of `df` onto principal components.

    Columns of any other dtype are ignored.

    Args:
        df: Input DataFrame
        n_components: Number of principal components to keep

    Returns:
        Tuple ``(pca_df, pca)``: a DataFrame with columns 'PC1', 'PC2', ...
        that shares the index of the input rows, and the fitted PCA object
    """
    # Restrict the input to float64 / int64 columns
    numeric = df.select_dtypes(include=['float64', 'int64'])

    # Fit the decomposition and project the rows in a single step
    model = PCA(n_components=n_components)  # adjust n_components
    projected = model.fit_transform(numeric)

    # Label the components PC1..PCk and keep the original row index
    component_names = [f'PC{k}' for k in range(1, projected.shape[1] + 1)]
    projected_df = pd.DataFrame(projected, index=numeric.index, columns=component_names)

    return projected_df, model


# Perform PCA
pca_df, pca = perform_pca(df_audio)

# Cumulative share of variance explained by the first k components,
# computed once and reused for the plot and the printout below
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Plot explained variance ratio
plt.figure(figsize=FIGURE_SIZE)
# Plot against 1..k so the x-axis really is the number of components
# (a bare plt.plot(y) would start the axis at 0).
plt.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('PCA Explained Variance Ratio')
plt.grid(True)
plt.show()

# Plot first two principal components, one scatter series per subject
plt.figure(figsize=FIGURE_SIZE)
id_level = pca_df.index.get_level_values('ID')
# The loop variable is `subject_id`, not `id`: at notebook top level a
# variable named `id` would replace the builtin id() for all later cells.
for subject_id in id_level.unique():
    subject_points = pca_df.loc[id_level == subject_id]
    plt.scatter(subject_points['PC1'],
                subject_points['PC2'],
                alpha=0.6,
                label=f'Subject {subject_id}')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')
plt.title('First Two Principal Components')
plt.legend()
plt.grid(True)
plt.show()

# Print explained variance ratios
print("\nExplained variance ratio for each component:")
for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
    print(f"PC{i}: {ratio:.4f}")

# Print cumulative explained variance ratio
print("\nCumulative explained variance ratio:")
print(cumulative_variance)

# TODO: missing get feature importance and plot heatmap of feature importance for the first few components

## Data Splitting

Split the data into training, validation, and test sets while preserving temporal order.

In [None]:
def prepare_data_splits(df_audio: pd.DataFrame, df_result: pd.DataFrame) -> Tuple[
    np.ndarray, np.ndarray, np.ndarray, pd.Series, pd.Series, pd.Series, "StandardScaler"]:
    """Prepare ordered train/validation/test splits with standardized features.

    The frames are merged on 'ID'. 'PHQ_Binary' is the target and every other
    merged column is a feature. Rows are split by position (no shuffling),
    and the scaler is fitted on the training features only.

    Args:
        df_audio: DataFrame containing audio features
        df_result: DataFrame containing depression labels

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test, scaler)``.
        The ``X_*`` entries are standardized NumPy arrays, the ``y_*``
        entries are pandas Series of 'PHQ_Binary', and ``scaler`` is the
        StandardScaler fitted on the training features.
    """
    # Merge audio features with depression labels
    df = pd.merge(df_audio, df_result, on='ID')  # TODO: check if this is the same df as the one modified in the PCA

    # Prepare features and target
    # NOTE(review): all merged columns except 'PHQ_Binary' become features --
    # confirm df_result has no other label-derived or non-numeric columns.
    X = df.drop(columns=['PHQ_Binary'])
    y = df['PHQ_Binary']

    # Only the final fold of the 5-way time series split is used for the
    # train/test partition, so take it directly instead of storing every fold.
    train_idx, test_idx = list(TimeSeriesSplit(n_splits=5).split(X))[-1]
    X_train_full, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train_full, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # First 75% of the training rows stay for training, the rest is validation
    train_size = int(0.75 * len(X_train_full))
    X_train, X_val = X_train_full.iloc[:train_size], X_train_full.iloc[train_size:]
    y_train, y_val = y_train_full.iloc[:train_size], y_train_full.iloc[train_size:]

    # Standardize features; statistics come from the training rows only and
    # are then applied unchanged to the validation and test rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, X_val, X_test, y_train, y_val, y_test, scaler


# Build the train/validation/test splits and keep the fitted scaler
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
    scaler,
) = prepare_data_splits(df_audio=df_audio, df_result=df_result)

## Model Training

Train the audio RNN model with hyperparameter tuning.

In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau


def create_data_loaders(X_train: np.ndarray, X_val: np.ndarray, X_test: np.ndarray,
                        y_train: np.ndarray, y_val: np.ndarray, y_test: np.ndarray,
                        batch_size: int = BATCH_SIZE) -> Tuple[DataLoader, DataLoader, DataLoader]:
    """Create PyTorch DataLoaders for training, validation, and testing.

    Features are converted to float tensors and labels to long tensors.
    Labels may be NumPy arrays or pandas Series. Only the training loader
    shuffles its samples.

    Args:
        X_train, X_val, X_test: Feature arrays
        y_train, y_val, y_test: Label arrays (or pandas Series)
        batch_size: Batch size used by all three loaders

    Returns:
        Tuple containing train, validation, and test DataLoaders
    """

    def _to_dataset(features, labels) -> TensorDataset:
        # Pair one split's features and labels as tensors.
        # np.asarray accepts ndarrays and pandas objects alike, so the labels
        # do not need a `.values` attribute.
        feature_tensor = torch.FloatTensor(np.asarray(features))
        label_tensor = torch.LongTensor(np.asarray(labels))
        return TensorDataset(feature_tensor, label_tensor)

    train_loader = DataLoader(_to_dataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(_to_dataset(X_val, y_val), batch_size=batch_size)
    test_loader = DataLoader(_to_dataset(X_test, y_test), batch_size=batch_size)

    return train_loader, val_loader, test_loader


def train_model_with_grid_search(X_train: np.ndarray, y_train: np.ndarray, X_val: np.ndarray, y_val: np.ndarray,
                                 param_grid: Dict, n_epochs: int = N_EPOCHS) -> Tuple[Dict, List[Dict]]:
    """Perform grid search to find optimal hyperparameters.

    One AudioRNN is trained per parameter combination. The combination with
    the lowest validation loss after the final epoch wins. The trained models
    themselves are not returned.

    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data
        param_grid: Dictionary of hyperparameters to search; each combination
            must provide 'hidden_size', 'num_layers', 'dropout' and
            'learning_rate'
        n_epochs: Number of training epochs

    Returns:
        Tuple ``(best_params, results)``. ``results`` holds one dict per
        combination with the keys 'params', 'final_val_loss', 'train_losses'
        and 'val_losses'. ``best_params`` is None for an empty grid.
    """
    # Only the train and validation loaders are needed here. The validation
    # split also fills the test slots of create_data_loaders, so the function
    # no longer reads X_test / y_test from notebook globals.
    train_loader, val_loader, _ = create_data_loaders(X_train, X_val, X_val, y_train, y_val, y_val)

    # Initialize tracking variables
    best_val_loss = float('inf')
    best_params = None
    results = []

    # Grid search
    for params in tqdm(ParameterGrid(param_grid)):
        # Model initialization
        model = AudioRNN(
            input_size=X_train.shape[1],
            hidden_size=params['hidden_size'],
            num_layers=params['num_layers'],
            dropout=params['dropout']
        ).to(DEVICE)

        # Training setup
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=params['learning_rate'])
        # `verbose` is intentionally not passed: it is deprecated in recent
        # PyTorch releases.
        scheduler = ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=3
        )

        # Initialize trainer
        trainer = AudioRNNTrainer(
            model=model,
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            device=DEVICE
        )

        # Train model
        train_losses, val_losses = trainer.train(
            train_loader=train_loader,
            val_loader=val_loader,
            n_epochs=n_epochs
        )

        # Record results
        final_val_loss = val_losses[-1]
        results.append({
            'params': params,
            'final_val_loss': final_val_loss,
            'train_losses': train_losses,
            'val_losses': val_losses
        })

        # Update best parameters; strict '<' keeps the first of equal losses
        if final_val_loss < best_val_loss:
            best_val_loss = final_val_loss
            best_params = params

    return best_params, results


# Train model with grid search
best_params, results = train_model_with_grid_search(X_train, y_train, X_val, y_val, PARAM_GRID)

# Run with the lowest final validation loss. The grid search keeps its best
# loss local, so it is recovered here from the recorded results.
best_result = min(results, key=lambda run: run['final_val_loss'])

# Print best parameters
print("\nBest parameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")
print(f"Best validation loss: {best_result['final_val_loss']:.4f}")

# Plot training curves for best model
plt.figure(figsize=FIGURE_SIZE)
plt.plot(best_result['train_losses'], label='Training Loss')
plt.plot(best_result['val_losses'], label='Validation Loss')
plt.title('Training and Validation Loss (Best Model)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

## Model Evaluation

Evaluate the model's performance on the test set.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay


def evaluate_model(model: nn.Module, test_loader: DataLoader, device: torch.device) -> Tuple[np.ndarray, np.ndarray]:
    """Run `model` over `test_loader` and gather labels and predictions.

    The model is put into eval mode and gradients are disabled. The predicted
    class of a sample is the index of the largest value along dimension 1 of
    the model output.

    Args:
        model: Trained PyTorch model
        test_loader: DataLoader yielding (features, labels) batches
        device: Device to run evaluation on

    Returns:
        Tuple ``(y_true, y_pred)`` of NumPy arrays, in loader order
    """
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for features, targets in test_loader:
            features = features.to(device)
            targets = targets.to(device)

            scores = model(features)
            # Index of the highest score per row is the predicted class
            batch_preds = scores.max(dim=1).indices

            predicted_labels.extend(batch_preds.cpu().numpy())
            true_labels.extend(targets.cpu().numpy())

    return np.array(true_labels), np.array(predicted_labels)


# Create the loaders; train/validation loaders are needed to fit the final model
train_loader, val_loader, test_loader = create_data_loaders(X_train, X_val, X_test, y_train, y_val, y_test)

# Initialize best model
# 'learning_rate' configures the optimizer, not the network, so it is kept out
# of the constructor arguments (the grid search builds AudioRNN the same way).
model_kwargs = {name: value for name, value in best_params.items() if name != 'learning_rate'}
best_model = AudioRNN(
    input_size=X_train.shape[1],
    **model_kwargs
).to(DEVICE)

# A freshly constructed network is untrained and the grid search does not hand
# back its fitted models, so fit once more with the winning configuration
# before evaluating.
# NOTE(review): assumes AudioRNNTrainer.train updates `best_model` in place,
# as the grid search relies on -- confirm.
best_optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['learning_rate'])
best_trainer = AudioRNNTrainer(
    model=best_model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=best_optimizer,
    scheduler=ReduceLROnPlateau(best_optimizer, mode='min', factor=0.1, patience=3),
    device=DEVICE
)
best_trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    n_epochs=N_EPOCHS
)

# Evaluate model
y_true, y_pred = evaluate_model(best_model, test_loader, DEVICE)

# Print classification report
print("Classification Report:")
print(classification_report(y_true, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
# Draw on an explicitly created axes: disp.plot() without `ax` opens its own
# figure, which would leave a separately created plt.figure() empty.
fig, ax = plt.subplots(figsize=FIGURE_SIZE)
disp.plot(ax=ax, cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix')
plt.show()

## Save Model

Save the trained model and scaler for later use.

In [None]:
from utils.trainer import save_model

# Persist the model together with the scaler applied to its input features
model_path = "audio_model.pth"
save_model(best_model, scaler, model_path)
print("Model and scaler saved successfully!")