In [None]:
import numpy as np
import pandas as pd

from prediction_functions import *

### Loading in and splitting the data 

In [None]:
# Directory that holds the pre-computed feature tables read further down.
data_path = "../data/prediction_data/"

# Directory and file name of the depression data set; the file must be a CSV.
depression_data_path = "../data/depression_data/"
depression_file = "synthetic_mdd_data.csv"

In [None]:
# CLAC feature table (first CSV column is used as the row index).
clac_data = pd.read_csv(f"{data_path}/clac_mdd_features.csv", index_col=0)

# Common Voice feature table (first CSV column is used as the row index).
common_data = pd.read_csv(f"{data_path}/common_mdd_features.csv", index_col=0)

In [None]:
# Preview the first five rows of the Common Voice feature table.
common_data.head(n=5)

In [None]:
# Preview the first five rows of the CLAC feature table.
clac_data.head(n=5)

### Create datasets for the two model versions

In [None]:
# Depression data set; the path is the configured directory followed by the file name.
mdd_raw = pd.read_csv(f"{depression_data_path}{depression_file}", index_col=0)

In [None]:
# Demographics only (plus participant id and PHQ-8 score). These should be
# identical across the datasets, but they are taken from mdd_raw to be safe.
demographics = mdd_raw.loc[
    :,
    [
        "pid",
        "age",
        "education_years",
        "gender",
        "phq8",
    ],
]

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Grouped 10-fold splitters: inner_cv is handed to GridSearchCV (hyper-parameter
# search), outer_cv to cross_validate (performance estimate).
inner_cv, outer_cv = GroupKFold(n_splits=10), GroupKFold(n_splits=10)

In [None]:
# Elastic-net pipeline: standardise the features, then fit ElasticNet with a
# fixed random_state. ElasticNet is sklearn.linear_model.ElasticNet; it should
# be imported explicitly rather than arriving via a wildcard import.
en_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("elasticnet", ElasticNet(random_state=30)),
])

# Hyper-parameter grid for GridSearchCV. Keys follow the "<step>__<param>"
# convention so they address the 'elasticnet' step of the pipeline.
en_param_grid = {
    "elasticnet__alpha": [0.1, 1.0, 10.0, 100],
    "elasticnet__l1_ratio": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
}

### Demographics models

#### Demographics only

In [None]:
# Predictors: the three demographic covariates only.
features = ["age", "gender", "education_years"]

X = mdd_raw[features]
y = mdd_raw[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = mdd_raw["pid"]

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_1 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Nested CV for the demographics-only model. `groups` drives the outer
# splitter; `params` is what cross_validate forwards to the estimator's fit
# (per the scikit-learn docs when metadata routing is disabled), so the inner
# grid search receives the group labels as well.
en_demo = cross_validate(
    estimator=en_Inner_Grid_1,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

#### Conventional + demographics

In [None]:
# Predictors: every column of mdd_raw except the participant id and the target.
features = mdd_raw.columns.drop(["pid", "phq8"])

X = mdd_raw[features]
y = mdd_raw[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = mdd_raw["pid"]

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_2 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Nested CV for the conventional + demographics model. `groups` drives the
# outer splitter; `params` is forwarded to the grid search's fit (per the
# scikit-learn docs when metadata routing is disabled).
en_raw = cross_validate(
    estimator=en_Inner_Grid_2,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

#### CLAC + demographics

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_3 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Predictors: every column of clac_data except the participant id and the target.
features = clac_data.columns.drop(["pid", "phq8"])

X = clac_data[features]
y = clac_data[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = clac_data["pid"]

In [None]:
# Nested CV for the CLAC + demographics model. `groups` drives the outer
# splitter; `params` is forwarded to the grid search's fit (per the
# scikit-learn docs when metadata routing is disabled).
clac_model = cross_validate(
    estimator=en_Inner_Grid_3,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

#### Common voice + demographics

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_4 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Predictors: every column of common_data except the participant id and the target.
features = common_data.columns.drop(["pid", "phq8"])

X = common_data[features]
y = common_data[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = common_data["pid"]

In [None]:
# Nested CV for the Common Voice + demographics model. `groups` drives the
# outer splitter; `params` is forwarded to the grid search's fit (per the
# scikit-learn docs when metadata routing is disabled).
common_model = cross_validate(
    estimator=en_Inner_Grid_4,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

### Models without demographics

#### Conventional speech features

In [None]:
# Predictors: mdd_raw without the participant id, the target and the three
# demographic covariates.
features = mdd_raw.columns.drop(["pid", "phq8", "gender", "education_years", "age"])

X = mdd_raw[features]
y = mdd_raw[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = mdd_raw["pid"]

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_5 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Nested CV for the conventional-features model without demographics.
# `groups` drives the outer splitter; `params` is forwarded to the grid
# search's fit (per the scikit-learn docs when metadata routing is disabled).
conventional_nodemo = cross_validate(
    estimator=en_Inner_Grid_5,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

#### CLAC

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_6 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)


In [None]:
# Predictors: clac_data without the participant id, the target and the three
# demographic covariates.
features = clac_data.columns.drop(["pid", "phq8", "gender", "education_years", "age"])

X = clac_data[features]
y = clac_data[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = clac_data["pid"]

In [None]:
# Nested CV for the CLAC model without demographics. `groups` drives the
# outer splitter; `params` is forwarded to the grid search's fit (per the
# scikit-learn docs when metadata routing is disabled).
clac_model_nodemo = cross_validate(
    estimator=en_Inner_Grid_6,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

#### Common voice

In [None]:
# Inner-loop grid search over the elastic-net grid, using the inner splitter.
en_Inner_Grid_7 = GridSearchCV(
    estimator=en_pipeline,
    param_grid=en_param_grid,
    cv=inner_cv,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Predictors: common_data without the participant id, the target and the
# three demographic covariates.
features = common_data.columns.drop(["pid", "phq8", "gender", "education_years", "age"])

X = common_data[features]
y = common_data[["phq8"]]
# Group labels are passed as a 1-D Series: scikit-learn documents `groups` as
# shape (n_samples,), whereas a single-column DataFrame is 2-D.
groups = common_data["pid"]

In [None]:
# Nested CV for the Common Voice model without demographics. `groups` drives
# the outer splitter; `params` is forwarded to the grid search's fit (per the
# scikit-learn docs when metadata routing is disabled).
common_model_nodemo = cross_validate(
    estimator=en_Inner_Grid_7,
    X=X,
    y=y,
    groups=groups,
    params={'groups': groups},
    scoring=scorers,
    cv=outer_cv,
    n_jobs=-1,
    return_train_score=True,
    return_estimator=True,
)

### Results

In [None]:
# Summarise one cross_validate result: mean outer-fold test scores, then the
# hyper-parameters selected by the inner grid search in each outer fold.
current_model = en_demo
print("outer")
print(f"test_rmse: {current_model['test_rmse'].mean()}")
print(f"test_r2: {current_model['test_r2'].mean()}")
print(f"test_mae: {current_model['test_mae'].mean()}")
print("---------------------------------")
print("best params")
# Iterate over every fitted estimator instead of hard-coding indices 0-4, so
# that all outer folds (outer_cv uses 10 splits) are reported.
for fold_search in current_model['estimator']:
    print(fold_search.best_params_)

In [None]:
# Mean outer-fold test scores for every model, one row per model and one
# column per metric.
metric_names = ["test_rmse", "test_r2", "test_mae"]

# Column label in the summary -> cross_validate result it is computed from.
cv_results_by_model = {
    # with demographics
    "demographics_only": en_demo,
    "conventional_dem": en_raw,
    "clac_dem": clac_model,
    "common_dem": common_model,
    # without demographics
    "conventional": conventional_nodemo,
    "clac": clac_model_nodemo,
    "common": common_model_nodemo,
}

results = {"metric": metric_names}
for model_label, cv_result in cv_results_by_model.items():
    results[model_label] = [cv_result[metric].mean() for metric in metric_names]

# Index by metric *before* transposing: transposing a frame that still holds
# the string "metric" column would turn every score into object dtype.
results_df = pd.DataFrame(results).set_index("metric").T
results_df