In [6]:
import os
import librosa
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Concatenate, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from pathlib import Path
from textblob import TextBlob
import soundfile as sf
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report




In [7]:
import warnings                    # built‑in module

# Turn off *all* warnings:
warnings.filterwarnings("ignore")

In [8]:
BASE_DIR = Path.cwd()
AUDIO_DIR = BASE_DIR / "Processed"  # Directory where audio files are saved
TEXT_DIR = BASE_DIR / "Processed"   # Directory where transcript files are saved
SAVED_MODEL_PATH = "multimodal_depression_model.h5"

In [9]:
LABELS={
    "300": 0,
    "301": 0,
    "302": 0,
    "303": 0,
    "304": 0,
    "308": 1,
    "309": 1,
    "310": 0,
    "311": 1,
    "312": 0,
    "313": 0,
    "314": 0,
    "315": 0,
    "316": 0,
    "317": 0,
    "318": 0,
    "319": 1,
    "320": 1,
    "321": 1,
    "322": 0,
    "323": 0,
    "324": 0,
    "325": 1,
    "326": 0,
    "327": 0,
    "328": 0,
    "329": 0,
    "330": 1,
    "331": 0,
    "332": 1,
    "334": 0,
    "335": 1,
    "336": 0,
    "337": 1,
    "338": 1,
    "339": 1,
    "340": 0,
    "341": 0,
    "343": 0,
    "344": 1,
    "345": 1,
    "346": 1,
    "347": 1,
    "348": 1,
    "349": 0,
    "350": 1,
    "351": 1,
    "352": 1,
    "353": 1,
    "354": 1,
    "355": 1,
    "356": 1,
    "357": 0,
    "358": 0,
    "359": 1,
    "360": 0,
    "361": 0,
    "362": 1,
    "363": 0,
    "364": 0,
    "365": 1,
    "366": 0,
    "367": 1,
    "368": 0,
    "369": 0,
    "370": 0,
    "371": 0,
    "372": 1,
    "373": 0,
    "374": 0,
    "375": 0,
    "376": 1,
    "377": 1,
    "378": 0,
    "379": 0,
    "381": 1,
    "382": 0,
    "383": 0,
    "446": 0,
    "447": 0,
    "448": 1
}


In [10]:
def extract_audio_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

# Function to extract text features (Sentiment, Word Lengths, etc.)
def extract_text_features(file_path):
    with open(file_path, "r", encoding="utf8") as f:
        rows = json.load(f)  # load the JSON file created in the transcript processing
        print(f"Loaded data: {rows}")  # Debugging line

    # Check if data is a list or a dictionary
    if isinstance(rows, dict):
        # Assuming the file contains a dictionary instead of a list of dictionaries
        text = rows.get("text", "")
    elif isinstance(rows, list):
        # Assuming the file contains a list of rows (each with a 'text' key)
        text = ' '.join([row.get("text", "") for row in rows if "text" in row])
    else:
        raise ValueError("Unexpected data structure in JSON file")

    # Extract text features (Sentiment analysis using TextBlob)
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    return np.array([polarity, subjectivity])


In [11]:
import os

available_ids = []
for filename in os.listdir(AUDIO_DIR):
    if filename.endswith("_p.wav"):
        pid = filename.split("_")[0]
        available_ids.append(pid)

print(f"Total audio files found: {len(available_ids)}")


Total audio files found: 81


In [12]:
audio_features = []
text_features = []
labels = []

# Loop over the labels for the specified participants
for participant_id, label in LABELS.items():
    # Paths to the audio and text files
    audio_file = os.path.join(AUDIO_DIR, f"{participant_id}_p.wav")  # Assuming processed audio files
    text_file = os.path.join(TEXT_DIR, f"{participant_id}_transcript_features.json")  # Text features from the previous step

    if not os.path.exists(audio_file) or not os.path.exists(text_file):
        print(f"Skipping participant {participant_id}, missing files.")
        continue
    
    # Extract features
    audio_features.append(extract_audio_features(audio_file))
    text_features.append(extract_text_features(text_file))
    labels.append(label)

Loaded data: {'polarity_mean': 0.12626169328755535, 'polarity_std': 0.28250398287050743, 'subjectivity_mean': 0.2378644304290856, 'subjectivity_std': 0.331034277905171, 'len_char_mean': 12.913793103448276, 'len_char_std': 12.295895949685866, 'len_words_mean': 2.281609195402299, 'len_words_std': 2.0360921845440108, 'noun_ratio_mean': 0.4424169859514687, 'noun_ratio_std': 0.4049636976376597, 'verb_ratio_mean': 0.1446975916803503, 'verb_ratio_std': 0.23454448994722535, 'duration_total': 584.6800000000001, 'participant': '300'}
Loaded data: {'polarity_mean': 0.11830665076176126, 'polarity_std': 0.3024703120781089, 'subjectivity_mean': 0.319622398013282, 'subjectivity_std': 0.34715560173034643, 'len_char_mean': 29.740331491712706, 'len_char_std': 37.35005544991499, 'len_words_mean': 5.138121546961326, 'len_words_std': 6.29441858639063, 'noun_ratio_mean': 0.4263969933305288, 'noun_ratio_std': 0.3637257463961745, 'verb_ratio_mean': 0.1382527532723798, 'verb_ratio_std': 0.186540474681985, 'dur

Loaded data: {'polarity_mean': 0.11555318934351193, 'polarity_std': 0.3025793207714786, 'subjectivity_mean': 0.2670225682322457, 'subjectivity_std': 0.3365011274520047, 'len_char_mean': 17.370967741935484, 'len_char_std': 15.53677472283559, 'len_words_mean': 3.064516129032258, 'len_words_std': 2.577685748623629, 'noun_ratio_mean': 0.4671581703839768, 'noun_ratio_std': 0.3741017524737454, 'verb_ratio_mean': 0.1631137082749986, 'verb_ratio_std': 0.2188755439804549, 'duration_total': 599.79, 'participant': '319'}
Loaded data: {'polarity_mean': 0.07638099103277675, 'polarity_std': 0.3256317295513642, 'subjectivity_mean': 0.2609991668384526, 'subjectivity_std': 0.3470320419881211, 'len_char_mean': 13.357142857142858, 'len_char_std': 12.71190079866117, 'len_words_mean': 2.3607142857142858, 'len_words_std': 2.07957110371885, 'noun_ratio_mean': 0.43533588435374154, 'noun_ratio_std': 0.3960327895325974, 'verb_ratio_mean': 0.155531462585034, 'verb_ratio_std': 0.23579056104544407, 'duration_total

Loaded data: {'polarity_mean': 0.10684965352934103, 'polarity_std': 0.292441930747546, 'subjectivity_mean': 0.26488608180014434, 'subjectivity_std': 0.3337172696114487, 'len_char_mean': 17.545833333333334, 'len_char_std': 17.91769839071529, 'len_words_mean': 3.0208333333333335, 'len_words_std': 2.8335230472214747, 'noun_ratio_mean': 0.4327522246272247, 'noun_ratio_std': 0.3820902882144527, 'verb_ratio_mean': 0.13891248797498798, 'verb_ratio_std': 0.20214468106525546, 'duration_total': 849.02, 'participant': '336'}
Loaded data: {'polarity_mean': 0.051172120363296825, 'polarity_std': 0.26234996266638, 'subjectivity_mean': 0.24427897266132562, 'subjectivity_std': 0.302166159628229, 'len_char_mean': 30.997737556561084, 'len_char_std': 30.329744788013382, 'len_words_mean': 5.382352941176471, 'len_words_std': 5.096880543676748, 'noun_ratio_mean': 0.399182129148111, 'noun_ratio_std': 0.3326348665341999, 'verb_ratio_mean': 0.19286414347907493, 'verb_ratio_std': 0.2173947230388044, 'duration_to

Loaded data: {'polarity_mean': 0.11581895853579526, 'polarity_std': 0.2590885568936734, 'subjectivity_mean': 0.22582926568895956, 'subjectivity_std': 0.2962033199946134, 'len_char_mean': 24.229591836734695, 'len_char_std': 27.3640854244135, 'len_words_mean': 4.01530612244898, 'len_words_std': 4.4126589935081055, 'noun_ratio_mean': 0.41389540731377467, 'noun_ratio_std': 0.36573933832047795, 'verb_ratio_mean': 0.16959103255021626, 'verb_ratio_std': 0.21508435061597725, 'duration_total': 739.7, 'participant': '353'}
Loaded data: {'polarity_mean': 0.11090160584863976, 'polarity_std': 0.2829778990416202, 'subjectivity_mean': 0.2394871434278214, 'subjectivity_std': 0.31928023894179863, 'len_char_mean': 16.152542372881356, 'len_char_std': 17.68391689629505, 'len_words_mean': 2.8361581920903953, 'len_words_std': 2.9736041297199622, 'noun_ratio_mean': 0.4394648509055289, 'noun_ratio_std': 0.394810797475797, 'verb_ratio_mean': 0.1469396579566071, 'verb_ratio_std': 0.2308225183337289, 'duration_t

Loaded data: {'polarity_mean': 0.18512374488936992, 'polarity_std': 0.3203415204644097, 'subjectivity_mean': 0.30056091670675006, 'subjectivity_std': 0.34206049256462423, 'len_char_mean': 28.65625, 'len_char_std': 28.929384694121786, 'len_words_mean': 4.958333333333333, 'len_words_std': 4.910168642476647, 'noun_ratio_mean': 0.4184221986059415, 'noun_ratio_std': 0.34158137598570393, 'verb_ratio_mean': 0.19943238391259868, 'verb_ratio_std': 0.2305681948727681, 'duration_total': 993.175, 'participant': '369'}
Loaded data: {'polarity_mean': 0.1372534914876464, 'polarity_std': 0.30258611545448544, 'subjectivity_mean': 0.36999769661741494, 'subjectivity_std': 0.32974730754782544, 'len_char_mean': 48.859154929577464, 'len_char_std': 39.41504924301956, 'len_words_mean': 7.938967136150235, 'len_words_std': 6.434065652054897, 'noun_ratio_mean': 0.40143952352114026, 'noun_ratio_std': 0.28330831360002157, 'verb_ratio_mean': 0.20666397356045746, 'verb_ratio_std': 0.18726434043744178, 'duration_tota

Loaded data: {'polarity_mean': 0.0798980101201862, 'polarity_std': 0.25115846909775263, 'subjectivity_mean': 0.2176048423245267, 'subjectivity_std': 0.28743911530397315, 'len_char_mean': 28.45514950166113, 'len_char_std': 26.20741908431481, 'len_words_mean': 4.687707641196013, 'len_words_std': 4.357615754160535, 'noun_ratio_mean': 0.3693320324280848, 'noun_ratio_std': 0.31300104481813934, 'verb_ratio_mean': 0.2424452103170102, 'verb_ratio_std': 0.2558903773161318, 'duration_total': 1175.02, 'participant': '448'}


In [13]:
import numpy as np

# Combine audio and text features
features = audio_features
labels = np.array(labels)


In [66]:
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)


In [67]:
import os
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------------- Logistic Regression ----------------
logreg_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=500, solver='liblinear')
)
logreg_model.fit(X_train, y_train)
logreg_preds = logreg_model.predict(X_test)
logreg_acc = accuracy_score(y_test, logreg_preds)
print("LogReg Accuracy:", logreg_acc)

# ---------------- Support Vector Machine -------------
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=10, gamma='scale', probability=True)
)
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
svm_acc = accuracy_score(y_test, svm_preds)
print("SVM Accuracy:", svm_acc)

# ---------------- Decision Tree ----------------------
dt_model = make_pipeline(
    StandardScaler(),
    DecisionTreeClassifier(max_depth=None, random_state=42)
)
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_test)
dt_acc = accuracy_score(y_test, dt_preds)
print("Decision Tree Accuracy:", dt_acc)

# ---------------- Gaussian Naive Bayes ---------------
gnb_model = make_pipeline(
    StandardScaler(),
    GaussianNB()
)
gnb_model.fit(X_train, y_train)
gnb_preds = gnb_model.predict(X_test)
gnb_acc = accuracy_score(y_test, gnb_preds)
print("GaussianNB Accuracy:", gnb_acc)

# ---------------- Random Forest ----------------------
rf_model = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, random_state=42)
)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_preds)
print("Random Forest Accuracy:", rf_acc)

LogReg Accuracy: 0.6470588235294118
SVM Accuracy: 0.6470588235294118
Decision Tree Accuracy: 0.5882352941176471
GaussianNB Accuracy: 0.6470588235294118
Random Forest Accuracy: 0.5294117647058824


In [69]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score

# Resample with SMOTE (imbalanced dataset handling)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Build the deep learning model (Neural Network)
model = Sequential()

# Input layer with BatchNormalization
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))  
model.add(BatchNormalization())  # Adding BatchNormalization

# Hidden layers
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))  # Adding Dropout
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.5))  # Adding Dropout

# Output layer (binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model with a lower learning rate
model.compile(optimizer=Adam(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping to avoid overfitting
# early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
model.fit(X_train_scaled, y_train_resampled, epochs=50, batch_size=32, 
          validation_data=(X_test_scaled, y_test))

# Evaluate the model on the test set
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary labels

# Print accuracy
from sklearn.metrics import classification_report

# Classification report
print("📋 Classification Report:")
print(classification_report(y_test, y_pred))



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
📋 Classification Report:
              precision    recall  f1-score   support

           0       0.77      1.00      0.87        10
           1       1.00      0.57      0.73         7

    accuracy                           0.82        17
   macro avg       0.88      0.79      0.80        17
weighted avg       0.86      0.82      0.81        17



In [49]:
# Save the deep learning model
model.save('depression_model.h5')


In [50]:
import joblib

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']