# **Initialization** 

In [1]:
import pandas as pd

### Control

In [2]:
# Paths
# First argument to load_raw_data in the merge cell (presumably the folder
# holding the raw input files — confirm in raw_data_loader).
RAW_DATA_FOLDER = 'raw_data'
# NOTE: despite the name, this is used as a directory: it is the second
# argument to load_raw_data, and the preprocessing cell reads
# 'depression_data.csv' from inside it.
TARGET_FILE_PATH = 'unprocessed_data'

# Flow Controls
# When True, the merge cell calls load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH);
# when False that call is skipped entirely.
RELOAD_RAW_DATA = False

# System variables
# Passed to preprocess_depression_data and to the forest and logistic training
# functions (presumably used as their random seed — confirm in those modules).
STATE = 42

### Merge raw data files

In [3]:
from raw_data_loader import load_raw_data

# Opt-in step: load_raw_data only runs when the RELOAD_RAW_DATA flag from the
# control cell is set; otherwise this cell does nothing.
if RELOAD_RAW_DATA:
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [4]:
from preprocessing import preprocess_depression_data

# Read the depression CSV that lives inside the TARGET_FILE_PATH directory.
dataset = pd.read_csv(f'{TARGET_FILE_PATH}/depression_data.csv')

# preprocess_depression_data returns six values, unpacked here as
# features, labels and "embed" targets, each in a train/test pair.
(
    X_train, X_test,
    y_train, y_test,
    y_embed_train, y_embed_test,
) = preprocess_depression_data(dataset, STATE)

# TODO: add the insomnia pipeline and rename these variables so that the
# depression and insomnia datasets each get their own set of names.

Dropped 142 columns with >50% missing. Remaining: 112 columns (was 254)
Dropped 0 training rows with >50% missing. Remaining: 3333 rows (was 3333)
Categorical features: ['PAD790U', 'PAD810U', 'SLQ300', 'SLQ310', 'SLQ320', 'SLQ330']
Numeric features: ['ACD010A', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ151', 'BPQ020', 'BPQ080', 'BPQ101D', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DIQ160', 'DIQ180', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNQ530', 'FNQ540', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD', 'FSD151', 'FSQ165', 'FSD162', 'HIQ011', 'HIQ032A', 'HIQ210', 'HOD051', 'HSQ590', 'HUQ010', 'HUQ030', 'HUQ042', 'HUQ055', 'HUQ090', 'INDFMMPI', 'INDFMMPC', 'INQ300', 'KIQ022', 'KIQ005', 'KIQ042', 'KIQ044', 'KIQ481', 'MCQ010', 'AGQ030', 'MCQ053', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ160P', 'MCQ160L', 'MCQ550', 'MCQ560', 'MCQ220', 'OSQ230', 'OCD150', 'OCQ180', 

# **Models**

### Training Random Forests

In [5]:
# Parameters
# First argument to every train_*_forest call in the next cell
# (presumably the number of trees per forest — confirm in forest_models).
N_ESTIMATORS = 100

In [6]:
# Import only the trainers this cell calls. A wildcard import hides where names
# come from and can silently overwrite notebook variables (e.g. X_train, STATE)
# if the module happens to define the same names.
from forest_models import (
    train_binary_forest,
    train_forest_with_embed_input,
    train_multitarget_forest,
)

# Each call trains one forest variant and prints its own evaluation report.

# Variant 1: features -> binary label only.
train_binary_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test)

# Variant 2: additionally receives the y_embed_* targets; its report includes
# a per-item MSE alongside the classification metrics.
train_multitarget_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)

# Variant 3: NOTE(review): judging by its name, this variant uses y_embed_* as
# model input. If those are the questionnaire items the label is derived from,
# its scores are not comparable with the other two variants — confirm.
train_forest_with_embed_input(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## binary Forest  ########################################
F1 score: 0.0
ROC-AUC: 0.830191361399672
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       775
           1       0.00      0.00      0.00        59

    accuracy                           0.93       834
   macro avg       0.46      0.50      0.48       834
weighted avg       0.86      0.93      0.90       834

Confusion matrix:
 [[775   0]
 [ 59   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])




 ######################################## Multitarget Forest  ########################################
MSE per DPQ item: [0.58468753 0.50118873 0.85106151 0.68359257 0.69430803 0.60432074
 0.6557054  0.45429916 0.18361739]
Average MSE: 0.5791978950173197
F1 score: 0.3877551020408163
ROC-AUC: 0.6481137233460907
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       775
           1       0.49      0.32      0.39        59

    accuracy                           0.93       834
   macro avg       0.72      0.65      0.67       834
weighted avg       0.92      0.93      0.92       834

Confusion matrix:
 [[755  20]
 [ 40  19]]


 ######################################## Forest with embedding as faetures  ########################################
F1 score: 0.6888888888888889
ROC-AUC: 0.9936139967195189
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       775
           1       1.00      0.

### Training Logistic Models

In [7]:
# Parameters
# First argument to every train_*_logistic call in the next cell
# (presumably the solver's iteration cap — confirm in logsitic_models).
MAX_ITERATIONS = 1000

In [8]:
# Import only the trainers this cell calls instead of a wildcard, so that the
# module cannot silently overwrite notebook variables or names imported earlier.
# NOTE(review): the module name is spelled "logsitic" — presumably this matches
# the file on disk; if it gets renamed, update this import at the same time.
from logsitic_models import (
    train_binary_logistic,
    train_logistic_with_embed_input,
    train_multitarget_logistic,
)

# Each call trains one logistic-regression variant and prints its own report.

# Variant 1: features -> binary label only.
train_binary_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test)

# Variant 2: additionally receives the y_embed_* targets; its report includes
# a per-item MSE alongside the classification metrics.
train_multitarget_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)

# Variant 3: NOTE(review): judging by its name, this variant uses y_embed_* as
# model input — confirm that this is not leaking the label into the features.
train_logistic_with_embed_input(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary Logistic Regression  ########################################
F1 score: 0.3148148148148148
ROC-AUC: 0.7912083105522143
              precision    recall  f1-score   support

           0       0.96      0.84      0.90       775
           1       0.22      0.58      0.31        59

    accuracy                           0.82       834
   macro avg       0.59      0.71      0.61       834
weighted avg       0.91      0.82      0.86       834

Confusion matrix:
 [[652 123]
 [ 25  34]]


 ######################################## Multitarget Logistic Regression ########################################
MSE per DPQ item: [2.59232614 1.07194245 1.68345324 1.42206235 1.69664269 1.29376499
 1.41127098 1.74460432 0.7853717 ]
Average MSE: 1.5223820943245403
F1 score: 0.2764227642276423
ROC-AUC: 0.6894259158009841
              precision    recall  f1-score   support

           0       0.96      0.80      0.87       775
           1       0.18   

### Training Bayesian Models

In [9]:
# Import only the trainers this cell calls instead of a wildcard, so that the
# module cannot silently overwrite notebook variables or names imported earlier.
from bayesian_models import (
    train_bayes_with_embed,
    train_binary_bayes,
    train_multitarget_bayes,
)

# Each call trains one Bayesian variant and prints its own report. Unlike the
# forest and logistic trainers, these take no hyperparameter or STATE argument.

# Variant 1: features -> binary label only.
train_binary_bayes(X_train, X_test, y_train, y_test)

# Variant 2: additionally receives the y_embed_* targets; its report includes
# a per-item MSE alongside the classification metrics.
train_multitarget_bayes(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)

# Variant 3: NOTE(review): judging by its name, this variant uses y_embed_* as
# model input — confirm that this is not leaking the label into the features.
train_bayes_with_embed(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary GaussianNB Model ########################################
F1 score: 0.13333333333333333
ROC-AUC: 0.5079496992892291
              precision    recall  f1-score   support

           0       0.94      0.10      0.18       775
           1       0.07      0.92      0.13        59

    accuracy                           0.16       834
   macro avg       0.51      0.51      0.16       834
weighted avg       0.88      0.16      0.18       834

Confusion matrix:
 [[ 78 697]
 [  5  54]]


 ######################################## Multitarget Bayesian Ridge Model ########################################
MSE per DPQ item: [0.57214148 0.5100414  0.82521296 0.66488339 0.68810514 0.58624875
 0.6234715  0.44562775 0.20339207]
Average MSE: 0.5687916054884328
F1 score: 0.3333333333333333
ROC-AUC: 0.6115472936030618
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       775
           1       0.56     