# **Initialization** 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch

### Control

In [2]:
# Paths
RAW_DATA_FOLDER = 'raw_data'            # handed to the raw-data loader and the preprocessing routines
TARGET_FILE_PATH = 'unprocessed_data'   # folder that holds the '<DATA>_data.csv' files read below

# Flow Controls
RELOAD_RAW_DATA = False   # when True, load_raw_data() is run before the dataset is read
DO_SMOTE = True           # when True, the training split is resampled (the preprocessing cell
                          # switches this off for 'insomnia' and 'electrical_circuit')
DATA = 'insomnia'  # Options: 'depression', 'insomnia', 'electrical_circuit'

# System variables
STATE = 42                          # seed shared by NumPy, torch, preprocessing and the sklearn models
TEST_SET_FRACTION = 0.20            # forwarded to preprocessing; presumably the held-out share - TODO confirm
MISSING_VALUES_THRESHOLD = 0.50     # forwarded to preprocessing; exact semantics live in those modules
SAMPLES_ELECTRICAL_CIRCUIT = 5000   # forwarded to gen_and_preprocess_ec_data
VERBOSE = True                      # forwarded to train_multitarget_baseline
FLIP_LABEL_FRACTION = 0.03          # share of training labels flipped in the label-noise cell

# Seed every random number generator this notebook relies on. Seeding only NumPy
# leaves torch's global generator unseeded, so torch-side randomness would differ
# between runs unless the training routines seed it themselves.
np.random.seed(STATE)
torch.manual_seed(STATE)

# **Data Preparation**

### Merge raw data files

In [3]:
from raw_data_loader import load_raw_data

# The raw-data loader only runs when explicitly requested via the flow control flag.
if RELOAD_RAW_DATA:
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [4]:
from preprocessing_depression import clean_and_preprocess_depression_data
from preprocessing_insomnia import clean_and_preprocess_insomnia_data
from preprocessing_electrical_circuit import gen_and_preprocess_ec_data

# Read the CSV of the selected dataset: <TARGET_FILE_PATH>/<DATA>_data.csv
dataset = pd.read_csv(f"{TARGET_FILE_PATH}/{DATA}_data.csv")

In [5]:
from visualizations import plot_phq9_distribution

# The PHQ-9 distribution plot is only drawn when the depression dataset is selected.
if DATA == 'depression':
    plot_phq9_distribution(
        dataset,
        save_path='./visualizations/phq9_distribution.png',
    )

In [6]:
# Run the preprocessing routine that matches the selected dataset. Each routine's
# result is unpacked into the same six names, so the unpacking happens once below.
if DATA == 'depression':
    split_parts = clean_and_preprocess_depression_data(
        dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
    )
elif DATA == 'insomnia':
    split_parts = clean_and_preprocess_insomnia_data(
        dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
    )
    # Resampling is switched off for this dataset, overriding the control cell.
    DO_SMOTE = False
elif DATA == 'electrical_circuit':
    # This dataset is generated rather than read from `dataset`.
    split_parts = gen_and_preprocess_ec_data(
        SAMPLES_ELECTRICAL_CIRCUIT, TEST_SET_FRACTION, STATE
    )
    DO_SMOTE = False
else:
    raise ValueError("Invalid dataset selected")

X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = split_parts

#TODO Fix issue with the time columns SLQ300/310/320/330 in depression and processing

  data["approximate_freq_moderate_LTPA"] = data["PAD790Q"] * data["PAD790U"].map(convert_frequency)
  data["approximate_freq_vigorous_LTPA"] = data["PAD810Q"] * data["PAD810U"].map(convert_frequency)
  data["approximate_mins_moderate_LTPA"] = data["approximate_freq_moderate_LTPA"] * data["PAD800"]
  data["approximate_mins_vigorous_LTPA"] = data["approximate_freq_vigorous_LTPA"] * data["PAD820"]
  data["insulin_time"] = data["DID060"] * data["DIQ060U"].map(convert_month_year)
  data["FNQ520_fixed"] = data["FNQ520"].map(order_corrected)
  data["FNQ540_fixed"] = data["FNQ540"].map(order_corrected)
  data["approximate_freq_moderate_LTPA"] = data["PAD790Q"] * data["PAD790U"].map(convert_frequency)
  data["approximate_freq_vigorous_LTPA"] = data["PAD810Q"] * data["PAD810U"].map(convert_frequency)
  data["approximate_mins_moderate_LTPA"] = data["approximate_freq_moderate_LTPA"] * data["PAD800"]
  data["approximate_mins_vigorous_LTPA"] = data["approximate_freq_vigorous_LTPA"] * data["PAD820"]


### Data Balancing

In [7]:
from data_balancing import resample_training_data

# Only the training arrays are resampled; the test split is not passed in.
if DO_SMOTE:
    X_train, y_train, y_embed_train = resample_training_data(
        X_train,
        y_train,
        y_embed_train,
        random_state=STATE,
    )

In [8]:
# Count how often each distinct label occurs in the training targets
classes, counts = np.unique(y_train, return_counts=True)
print("Class Distribution:\n", {label: count for label, count in zip(classes, counts)})

if len(classes) <= 1:
    print("\nOnly one class present.")
else:
    # np.unique returns the labels sorted, so this is the count of the second
    # label divided by the count of the first (smallest) label.
    class_ratio = counts[1] / counts[0]
    print(f"\nClass ratio: {class_ratio:.3f}")

Class Distribution:
 {np.int64(0): np.int64(4452), np.int64(1): np.int64(2177)}

Class ratio: 0.489


### Introduce Label Noise

In [9]:
# A fraction of exactly 0.0 is a valid way to disable label noise (the guard below
# skips the flipping in that case), so the lower bound is inclusive.
assert 0.0 <= FLIP_LABEL_FRACTION < 1.0, "FLIP_LABEL_FRACTION should be in the range [0.0, 1.0)"

# NOTE: this cell is not idempotent - every re-run flips a further random subset of
# the already-noisy labels. Re-run the preprocessing cell first to start clean.
# Randomly select indices to flip
if FLIP_LABEL_FRACTION > 0.0:
    num_to_flip = int(FLIP_LABEL_FRACTION * len(y_train))
    # Sampling without replacement guarantees that no label is flipped twice.
    flip_indices = np.random.choice(len(y_train), size=num_to_flip, replace=False)

    # `1 - y` only swaps the classes when the labels are encoded as 0/1.
    # If y_train is a pandas Series, convert to int for safe arithmetic
    if hasattr(y_train, 'iloc'):
        y_train = y_train.astype(int)
        y_train.iloc[flip_indices] = 1 - y_train.iloc[flip_indices]
    else:  # numpy array
        y_train[flip_indices] = 1 - y_train[flip_indices]

### Make everything a numpy array

In [10]:
def _as_ndarray(obj):
    """Return `obj.values` when the attribute exists, otherwise `np.array(obj)`."""
    return obj.values if hasattr(obj, "values") else np.array(obj)


# Feature matrices
X_train = _as_ndarray(X_train)
X_test = _as_ndarray(X_test)

# Label vectors are additionally flattened to one dimension
y_train = _as_ndarray(y_train).ravel()
y_test = _as_ndarray(y_test).ravel()

# Embedding targets
y_embed_train = _as_ndarray(y_embed_train)
y_embed_test = _as_ndarray(y_embed_test)

# Everything handed to the models from here on must be a plain numpy array
assert all(
    isinstance(converted, np.ndarray)
    for converted in (X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
)

# **Models**

In [11]:
from baseline_models import train_multitarget_baseline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Training Bayesian Models

In [12]:
# NOTE(review): GaussianNB is a scikit-learn classifier, yet it is run here with
# is_classifier=False, and the recorded output of this cell reports MSE values.
# The second return value is therefore presumably an MSE despite being named
# `acc_nb` - confirm that this setup is intended.
nb_model = GaussianNB()
y_pred_nb, acc_nb = train_multitarget_baseline(
    model=nb_model,
    is_classifier=False,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    verbose=VERBOSE,
)



 ######################################## GaussianNB Multitarget Regressor ########################################
Train MSE per embedding: [4.92008489e+05 1.04875977e+05 3.12703273e+01 5.39164105e+05
 1.17102485e+05 3.07934832e+01]
Test MSE per embedding: [4.83141042e+05 1.09818293e+05 3.12000000e+01 5.54333474e+05
 1.18742387e+05 3.05848943e+01]
Average train MSE: 208868.8532961231
Average test MSE: 211016.16354481372


### Training Random Forests

In [13]:
# Parameter
# Number of trees in the random-forest baseline (passed as `n_estimators` in the next cell).
N_ESTIMATORS = 100

In [14]:
# Random-forest regression baseline on the embedding targets; n_jobs=-1 uses all cores.
rf_model = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    random_state=STATE,
    n_jobs=-1,
)
y_pred_rf, mse_rf = train_multitarget_baseline(
    model=rf_model,
    is_classifier=False,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    verbose=VERBOSE,
)



 ######################################## RandomForestRegressor Multitarget Regressor ########################################
Train MSE per embedding: [4.21552754e+04 2.42151328e+03 3.26947353e-01 5.07578752e+04
 2.07041694e+03 3.73822281e-01]
Test MSE per embedding: [2.94573840e+05 1.61498089e+04 2.27471535e+00 3.75469639e+05
 1.42906254e+04 2.65758121e+00]
Average train MSE: 16234.2969424599
Average test MSE: 116748.14090667675


### Training Logistic Models

In [15]:
# Parameters
# Iteration cap for the logistic-regression baseline (passed as `max_iter` in the next cell).
MAX_ITERATIONS = 1000

In [16]:
# NOTE(review): the recorded output of this cell shows repeated
# "TOTAL NO. OF ITERATIONS REACHED LIMIT" warnings at max_iter=1000, so the fits
# did not converge; the warning itself suggests scaling the data - TODO confirm
# whether the features should be standardised before this baseline.
log_model = LogisticRegression(
    max_iter=MAX_ITERATIONS,
    class_weight='balanced',
    random_state=STATE,
)
y_pred_log, acc_log = train_multitarget_baseline(
    model=log_model,
    is_classifier=True,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    verbose=VERBOSE,
)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to 



 ######################################## LogisticRegression Multitarget Classifier ########################################
Train accuracy per embedding: [0.11464776 0.11645799 0.21602052 0.12203952 0.11932418 0.20108614]
Test accuracy per embedding: [0.03867069 0.01933535 0.14380665 0.04108761 0.03806647 0.14018127]
Average train accuracy: 0.14826268416553526
Average test accuracy: 0.0701913393756294


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Proposed MLPs

In [17]:
from proposed_models import train_joint_model, train_split_model, train_deep_joint_model, train_deep_split_model

# Training is pinned to the CPU. Automatic selection, kept for reference:
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cpu"
E_KEEP_RATE = 0.7  # passed to the training routines as `e_kept_ratio`

# Per-dataset value of `l` (passed as `l=` to the joint models; presumably a
# loss-weighting coefficient - TODO confirm). Unknown dataset names fall back to 1.
l = {
    'depression': 1e-2,
    'insomnia': 1e-2,
    'electrical_circuit': 1,
}.get(DATA, 1)

# Epoch budget; the augmentation and early-stopping settings are derived from it.
EPOCHS = 100
AUGMENT_EPOCHS = EPOCHS // 2
EARLY_STOP_EPOCHS = EPOCHS // 5

In [18]:
# Sanity Checks
print("Using ", DEVICE, " for torch")

assert X_train.shape[0] >= 100 and y_train.shape[0] >= 100 and y_embed_train.shape[0] >= 100, "Arrays must have at least 100 samples for the check."

# Compare the full sample counts. Comparing the lengths of the `[:100]` slices is
# always True once each array has at least 100 rows, so it could never detect a
# mismatch between features, labels and embedding targets.
aligned = X_train.shape[0] == y_train.shape[0] == y_embed_train.shape[0]
assert aligned, "X_train, y_train, and y_embed_train do not have the same number of samples."

# The test split is evaluated with the same three arrays, so it gets the same check.
aligned_test = X_test.shape[0] == y_test.shape[0] == y_embed_test.shape[0]
assert aligned_test, "X_test, y_test, and y_embed_test do not have the same number of samples."

Using  cpu  for torch


### Train Joint MLP

In [19]:
# Run the joint MLP training routine with the settings defined in the configuration cell.
train_joint_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    l=l,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
    device=DEVICE,
)

Training:	##################################################	[100.0%]

Regression Results:
MSE:	112187.7734375


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3068
           1       0.00      0.00      0.00      1572

    accuracy                           0.66      4640
   macro avg       0.33      0.50      0.40      4640
weighted avg       0.44      0.66      0.53      4640

Confusion matrix:
 [[3068    0]
 [1572    0]]
Regression Results:
MSE:	115471.2578125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]
Training:	-------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training:	##################################################	[100.0%]

Regression Results:
MSE:	115516.375


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Train Split MLP

In [20]:
# Run the split MLP training routine; unlike the joint variant it is called without `l`.
train_split_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
    device=DEVICE,
)

Training:	#############-------------------------------------	[26.0% - DONE]

Regression Results:
MSE:	117164.65625


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3068
           1       0.00      0.00      0.00      1572

    accuracy                           0.66      4640
   macro avg       0.33      0.50      0.40      4640
weighted avg       0.44      0.66      0.53      4640

Confusion matrix:
 [[3068    0]
 [1572    0]]
Regression Results:
MSE:	117930.28125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]
Training:	-----------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training:	######################----------------------------	[44.0% - DONE]

Regression Results:
MSE:	117930.28125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Train Deep Joint Model

In [21]:
# Run the deep joint model training routine with the shared training settings.
train_deep_joint_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    l=l,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
    device=DEVICE,
)

Training:	###########---------------------------------------	[22.0% - DONE]

Regression Results:
MSE:	463525.5


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3068
           1       0.00      0.00      0.00      1572

    accuracy                           0.66      4640
   macro avg       0.33      0.50      0.40      4640
weighted avg       0.44      0.66      0.53      4640

Confusion matrix:
 [[3068    0]
 [1572    0]]
Regression Results:
MSE:	464627.03125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]
Training:	---------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training:	####################------------------------------	[40.0% - DONE]

Regression Results:
MSE:	464627.03125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Train Deep Split Model

In [22]:
# Run the deep split model training routine; like the shallow split variant it is called without `l`.
train_deep_split_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
    device=DEVICE,
)

Training:	###########---------------------------------------	[22.0% - DONE]

Regression Results:
MSE:	463511.15625


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.66      1.00      0.80      3068
           1       0.00      0.00      0.00      1572

    accuracy                           0.66      4640
   macro avg       0.33      0.50      0.40      4640
weighted avg       0.44      0.66      0.53      4640

Confusion matrix:
 [[3068    0]
 [1572    0]]
Regression Results:
MSE:	464610.53125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]
Training:	-----------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training:	#####################-----------------------------	[42.0% - DONE]

Regression Results:
MSE:	464610.53125


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.67      1.00      0.80      1113
           1       0.00      0.00      0.00       542

    accuracy                           0.67      1655
   macro avg       0.34      0.50      0.40      1655
weighted avg       0.45      0.67      0.54      1655

Confusion matrix:
 [[1113    0]
 [ 542    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
