# **Initialization** 

In [1]:
import pandas as pd
import torch

### Control

In [None]:
# Paths
RAW_DATA_FOLDER = 'raw_data'
TARGET_FILE_PATH = 'unprocessed_data'

# Flow Controls
RELOAD_RAW_DATA = False

# System variables
STATE = 42
TEST_SET_FRACTION = 0.20
MISSING_VALUES_THRESHOLD = 0.50
SAMPLES_ELECTRICAL_CIRCUIT = 5000

# **Data Preparation**

### Merge raw data files

In [3]:
from raw_data_loader import load_raw_data

if (RELOAD_RAW_DATA):
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [None]:
from preprocessing_depression import clean_and_preprocess_depression_data
from preprocessing_insomnia import clean_and_preprocess_insomnia_data
from preprocessing_electrical_circuit import gen_and_preprocess_ec_data

dataset = pd.read_csv(TARGET_FILE_PATH + '/depression_data.csv')

dep_X_train, dep_X_test, dep_y_train, dep_y_test, dep_y_embed_train, dep_y_embed_test = clean_and_preprocess_depression_data(dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD)
ins_X_train, ins_X_test, ins_y_train, ins_y_test, ins_y_embed_train, ins_y_embed_test = 
ec_X_train, ec_X_test, ec_y_train, ec_y_test, ec_y_embed_train, ec_y_embed_test = gen_and_preprocess_ec_data(SAMPLES_ELECTRICAL_CIRCUIT, TEST_SET_FRACTION, STATE)

#TODO Fix issue with the time columns SLQ300/310/320/330

Dropped 142 columns with >50.0% missing values
Shape after dropping high-missing columns: (3333, 109)
Replaced 708 special code values with NaN
Replaced 186 special code values with NaN
Ordinal columns: 27
Nominal columns: 3
Binary columns: 2
Numerical columns: 74
Object columns (excluded): 3
Total columns identified: 109
Basic Electrical Circuit Simulator/Generator




ValueError: Found input variables with inconsistent numbers of samples: [10, 5000]

# **Models**

### Training Bayesian Models

In [5]:
from bayesian_models import *

train_binary_bayes(X_train, X_test, y_train, y_test)
train_multitarget_bayes(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_bayes_with_embed(X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary GaussianNB Model ########################################
F1 score: 0.2334384858044164
ROC-AUC: 0.728135593220339
              precision    recall  f1-score   support

           0       0.96      0.71      0.82       775
           1       0.14      0.63      0.23        59

    accuracy                           0.71       834
   macro avg       0.55      0.67      0.53       834
weighted avg       0.90      0.71      0.78       834

Confusion matrix:
 [[554 221]
 [ 22  37]]


 ######################################## Multitarget Bayesian Ridge Model ########################################
MSE per DPQ item: [0.6081899  0.55723907 0.83870486 0.6835447  0.69565646 0.62180295
 0.64496949 0.41919515 0.20707469]
Average MSE: 0.5862641415149625
F1 score: 0.3157894736842105
ROC-AUC: 0.5984691088026243
              precision    recall  f1-score   support

           0       0.94      0.99      0.97       775
           1       0.71      0

### Training Random Forests

In [6]:
# Parameters
N_ESTIMATORS = 100

In [7]:
from forest_models import *

train_binary_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test)
train_multitarget_forest(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_forest_with_embed_input(N_ESTIMATORS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## binary Forest  ########################################
F1 score: 0.03333333333333333
ROC-AUC: 0.8023728813559323
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       775
           1       1.00      0.02      0.03        59

    accuracy                           0.93       834
   macro avg       0.97      0.51      0.50       834
weighted avg       0.94      0.93      0.90       834

Confusion matrix:
 [[775   0]
 [ 58   1]]


 ######################################## Multitarget Forest  ########################################
MSE per DPQ item: [0.66152182 0.58451823 0.85873981 0.69939856 0.7436259  0.65868909
 0.67468957 0.45619149 0.2308952 ]
Average MSE: 0.6186966293631762
F1 score: 0.29213483146067415
ROC-AUC: 0.5992017495899399
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       775
           1       0.43      0.22      0.29      

### Training Logistic Models

In [8]:
# Parameters
MAX_ITERATIONS = 1000

In [9]:
from logistic_models import *

train_binary_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test)
train_multitarget_logistic(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
train_logistic_with_embed_input(MAX_ITERATIONS, STATE, X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)



 ######################################## Binary Logistic Regression  ########################################
F1 score: 0.2918454935622318
ROC-AUC: 0.7994532531437943
              precision    recall  f1-score   support

           0       0.96      0.82      0.89       775
           1       0.20      0.58      0.29        59

    accuracy                           0.80       834
   macro avg       0.58      0.70      0.59       834
weighted avg       0.91      0.80      0.84       834

Confusion matrix:
 [[635 140]
 [ 25  34]]


 ######################################## Multitarget Logistic Regression ########################################
MSE per DPQ item: [2.43884892 1.30935252 1.80215827 1.51918465 1.65707434 1.35611511
 1.47841727 2.08752998 1.5059952 ]
Average MSE: 1.6838529176658674
F1 score: 0.26523297491039427
ROC-AUC: 0.6954948059048661
              precision    recall  f1-score   support

           0       0.96      0.76      0.85       775
           1       0.17  

### Train Split MLP

In [10]:
from split_model import train_split_model

torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_split_model( X_train, X_test, y_train, y_test, y_embed_train, y_embed_test )

TypeError: 'int' object is not callable

### Train Joint MLP

In [None]:
from joint_model import train_joint_model

train_joint_model( X_train, X_test, y_train, y_test, y_embed_train, y_embed_test )