# **Initialization** 

In [1]:
import pandas as pd

### Control

In [2]:
# --- File-system locations ---
RAW_DATA_FOLDER: str = "raw_data"            # handed to the raw-data loader and the preprocessing step
TARGET_FILE_PATH: str = "unprocessed_data"   # folder from which depression_data.csv is read

# --- Pipeline switches ---
RELOAD_RAW_DATA: bool = False                # when True, the raw-data merge cell calls load_raw_data

# --- Shared numeric settings ---
STATE: int = 42                              # passed to preprocessing and to the forest/logistic trainers
TEST_SET_FRACTION: float = 0.20              # passed to the preprocessing/split step
MISSING_VALUES_THRESHOLD: float = 0.50       # passed to preprocessing (its log reports dropping columns with >50% missing)

# **Data Preparation**

### Merge raw data files

In [3]:
from raw_data_loader import load_raw_data

# The merge is opt-in: it only runs when RELOAD_RAW_DATA is switched on in the
# control cell; otherwise this cell is a no-op.
# NOTE(review): presumably load_raw_data writes its merged output under
# TARGET_FILE_PATH -- confirm in raw_data_loader.
if RELOAD_RAW_DATA:
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [None]:
from preprocessing_depression import clean_and_preprocess_depression_data

# Read the depression dataset from the target folder.
dataset = pd.read_csv(TARGET_FILE_PATH + '/depression_data.csv')

# Clean the data and unpack the six arrays every model cell below consumes:
# features, labels and "embed" targets, each as a train/test pair.
(
    X_train,
    X_test,
    y_train,
    y_test,
    y_embed_train,
    y_embed_test,
) = clean_and_preprocess_depression_data(
    dataset,
    RAW_DATA_FOLDER,
    TEST_SET_FRACTION,
    STATE,
    MISSING_VALUES_THRESHOLD,
)

# TODO: Add insomnia part, and change variable names to have one for depression and one for insomnia

# TODO: Fix issue with the time columns SLQ300/310/320/330

Dropped 142 columns with >50.0% missing values
Shape after dropping high-missing columns: (3333, 109)
Replaced 708 special code values with NaN
Replaced 186 special code values with NaN
Ordinal columns: 27
Nominal columns: 3
Binary columns: 2
Numerical columns: 74
Object columns (excluded): 3
Total columns identified: 109




# **Models**

### Training Bayesian Models

In [None]:
# Import only the entry points this cell uses; a wildcard import would pull
# every public name of bayesian_models into the notebook namespace and could
# silently shadow existing variables.
from bayesian_models import (
    train_bayes_with_embed,
    train_binary_bayes,
    train_multitarget_bayes,
)

# Each call trains one Bayesian variant and prints its own evaluation report.
train_binary_bayes(X_train, X_test, y_train, y_test)
train_multitarget_bayes(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_bayes_with_embed(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary GaussianNB Model ########################################
F1 score: 0.2334384858044164
ROC-AUC: 0.7281793329688354
              precision    recall  f1-score   support

           0       0.96      0.71      0.82       775
           1       0.14      0.63      0.23        59

    accuracy                           0.71       834
   macro avg       0.55      0.67      0.53       834
weighted avg       0.90      0.71      0.78       834

Confusion matrix:
 [[554 221]
 [ 22  37]]


 ######################################## Multitarget Bayesian Ridge Model ########################################
MSE per DPQ item: [0.60882897 0.55739587 0.8385397  0.68392227 0.69568258 0.62193537
 0.64541242 0.41914565 0.20716355]
Average MSE: 0.586447375888932
F1 score: 0.3157894736842105
ROC-AUC: 0.5984691088026243
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       775
           1       0.71      0

### Training Random Forests

In [5]:
# Random-forest settings
N_ESTIMATORS: int = 100  # first argument of every train_*_forest call; presumably the number of trees

In [6]:
# Import only the entry points this cell uses; a wildcard import would pull
# every public name of forest_models into the notebook namespace and could
# silently shadow existing variables.
from forest_models import (
    train_binary_forest,
    train_forest_with_embed_input,
    train_multitarget_forest,
)

# Each call trains one forest variant with the shared N_ESTIMATORS / STATE
# settings and prints its own evaluation report.
train_binary_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test)
train_multitarget_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_forest_with_embed_input(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## binary Forest  ########################################
F1 score: 0.0
ROC-AUC: 0.8172006560962274
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       775
           1       0.00      0.00      0.00        59

    accuracy                           0.93       834
   macro avg       0.46      0.50      0.48       834
weighted avg       0.86      0.93      0.90       834

Confusion matrix:
 [[775   0]
 [ 59   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])




 ######################################## Multitarget Forest  ########################################
MSE per DPQ item: [0.66014688 0.58068189 0.8504012  0.71599532 0.7357241  0.64858082
 0.67258993 0.45501235 0.22820779]
Average MSE: 0.6163711430855314
F1 score: 0.3181818181818182
ROC-AUC: 0.6089666484417715
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       775
           1       0.48      0.24      0.32        59

    accuracy                           0.93       834
   macro avg       0.71      0.61      0.64       834
weighted avg       0.91      0.93      0.92       834

Confusion matrix:
 [[760  15]
 [ 45  14]]


 ######################################## Forest with embedding as faetures  ########################################
F1 score: 0.7708333333333334
ROC-AUC: 0.9976927282668124
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       775
           1       1.00      0.

### Training Logistic Models

In [7]:
# Logistic-regression settings
MAX_ITERATIONS: int = 1000  # first argument of every train_*_logistic call; presumably the solver iteration cap

In [8]:
# Import only the entry points this cell uses; a wildcard import would pull
# every public name of logistic_models into the notebook namespace and could
# silently shadow existing variables.
from logistic_models import (
    train_binary_logistic,
    train_logistic_with_embed_input,
    train_multitarget_logistic,
)

# Each call trains one logistic variant with the shared MAX_ITERATIONS / STATE
# settings and prints its own evaluation report.
train_binary_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test)
train_multitarget_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_logistic_with_embed_input(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary Logistic Regression  ########################################
F1 score: 0.2943722943722944
ROC-AUC: 0.7993439037725534
              precision    recall  f1-score   support

           0       0.96      0.82      0.89       775
           1       0.20      0.58      0.29        59

    accuracy                           0.80       834
   macro avg       0.58      0.70      0.59       834
weighted avg       0.91      0.80      0.84       834

Confusion matrix:
 [[637 138]
 [ 25  34]]


 ######################################## Multitarget Logistic Regression ########################################
MSE per DPQ item: [2.52278177 1.30215827 1.80815348 1.52398082 1.62589928 1.35611511
 1.48800959 2.07793765 1.41127098]
Average MSE: 1.67958966160405
F1 score: 0.259927797833935
ROC-AUC: 0.6876653909240023
              precision    recall  f1-score   support

           0       0.96      0.77      0.85       775
           1       0.17      

### Train Split MLP

In [None]:
# torch must be imported explicitly: no earlier cell of this notebook imports
# it, so on a fresh kernel the device probe below would raise NameError.
import torch

from split_model import train_split_model

# Record which device is available so the run log shows whether a GPU was seen.
# Previously the torch.device(...) result was built and immediately discarded.
# NOTE(review): the device is not passed to train_split_model; presumably the
# model module selects its own device -- confirm in split_model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

train_split_model(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)

### Train Joint MLP

In [None]:
from joint_model import train_joint_model

# Train the joint MLP on the same train/test split used by the models above.
train_joint_model(
    X_train,
    X_test,
    y_train,
    y_test,
    y_embed_train,
    y_embed_test,
)