# **Initialization** 

In [1]:
import pandas as pd
import numpy as np
import torch

### Control

In [2]:
# --- Paths ---
# Folder handed to load_raw_data and to the depression/insomnia preprocessors.
RAW_DATA_FOLDER = 'raw_data'
# Folder the merged CSV is read from (also load_raw_data's second argument).
TARGET_FILE_PATH = 'unprocessed_data'

# --- Flow controls ---
RELOAD_RAW_DATA = False  # True: call load_raw_data(...) again before preprocessing
DO_SMOTE = True          # True: resample the training split (forced off for 'electrical_circuit')
DATA = 'depression'  # Options: 'depression', 'insomnia', 'electrical_circuit'

# --- System variables ---
STATE = 42                         # seed / random_state used throughout the notebook
TEST_SET_FRACTION = 0.20           # forwarded to the preprocessing functions
MISSING_VALUES_THRESHOLD = 0.50    # forwarded to the depression/insomnia preprocessors
SAMPLES_ELECTRICAL_CIRCUIT = 5000  # forwarded to gen_and_preprocess_ec_data
VERBOSE = True                     # forwarded as `verbose` to the baseline trainer
FLIP_LABEL_FRACTION = 0.03         # fraction of training labels flipped as label noise

# Seed every RNG the notebook relies on. NumPy drives the label flipping;
# torch drives the MLP training further down, so it has to be seeded as well
# for Restart & Run All to reproduce the reported numbers.
np.random.seed(STATE)
torch.manual_seed(STATE)

# **Data Preparation**

### Merge raw data files

In [3]:
from raw_data_loader import load_raw_data

# Rebuild the merged data from RAW_DATA_FOLDER only when explicitly requested
# through the RELOAD_RAW_DATA flag; otherwise this cell is a no-op.
# NOTE(review): presumably load_raw_data writes its result into TARGET_FILE_PATH,
# where the preprocessing cell reads it from - confirm against raw_data_loader.
if RELOAD_RAW_DATA:
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [4]:
from preprocessing_depression import clean_and_preprocess_depression_data
from preprocessing_insomnia import clean_and_preprocess_insomnia_data
from preprocessing_electrical_circuit import gen_and_preprocess_ec_data

# Build the train/test split for the dataset selected by DATA.
#
# The merged CSV is only needed by the 'depression' and 'insomnia' pipelines,
# so it is read inside those branches. The synthetic 'electrical_circuit' path
# therefore no longer requires the file to exist, and an invalid DATA value
# fails before any file I/O is attempted.
# NOTE(review): the insomnia pipeline is fed the same 'depression_data.csv'
# file as the depression pipeline (unchanged from before) - confirm intended.
if DATA == 'depression':
    dataset = pd.read_csv(TARGET_FILE_PATH + '/depression_data.csv')
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = clean_and_preprocess_depression_data(
        dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
    )
elif DATA == 'insomnia':
    dataset = pd.read_csv(TARGET_FILE_PATH + '/depression_data.csv')
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = clean_and_preprocess_insomnia_data(
        dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
    )
elif DATA == 'electrical_circuit':
    # Generated data: nothing is loaded from disk. `dataset` stays defined so
    # that any later cell referencing it does not hit a NameError.
    dataset = None
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = gen_and_preprocess_ec_data(
        SAMPLES_ELECTRICAL_CIRCUIT, TEST_SET_FRACTION, STATE
    )
    # Resampling is switched off for this dataset; the balancing cell reads DO_SMOTE.
    DO_SMOTE = False
else:
    raise ValueError("Invalid dataset selected")

#TODO Fix issue with the time columns SLQ300/310/320/330 in depression and processing

Dropped 142 columns with >50.0% missing values
Shape after dropping high-missing columns: (3333, 109)
Replaced 708 special code values with NaN
Replaced 186 special code values with NaN
Ordinal columns: 27
Nominal columns: 3
Binary columns: 2
Numerical columns: 74
Object columns (excluded): 3
Total columns identified: 109




### Data Balancing

In [5]:
from data_balancing import resample_training_data

# Resample the training split (features, labels and embeddings together) when
# balancing is enabled; the test split is never touched here.
# NOTE(review): the recorded run of this cell failed with
# "ImportError: cannot import name '_is_pandas_df' from 'sklearn.utils.validation'",
# which looks like a version mismatch between scikit-learn and a package
# imported by data_balancing - confirm and pin compatible versions.
if DO_SMOTE:
    resampled = resample_training_data(X_train, y_train, y_embed_train, random_state=STATE)
    X_train, y_train, y_embed_train = resampled

ImportError: cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (/usr/lib/python3.14/site-packages/sklearn/utils/validation.py)

In [6]:
# Report how many training samples carry each label.
classes, counts = np.unique(y_train, return_counts=True)
print("Class Distribution:\n", {label: count for label, count in zip(classes, counts)})

# The ratio compares the second label's count against the first label's count,
# so it is only reported when at least two labels are present.
if len(classes) <= 1:
    print("\nOnly one class present.")
else:
    class_ratio = counts[1] / counts[0]
    print(f"\nClass ratio: {class_ratio:.3f}")

Class Distribution:
 {np.int64(0): np.int64(3095), np.int64(1): np.int64(238)}

Class ratio: 0.077


### Introduce Label Noise

In [7]:
# Label noise: flip a random FLIP_LABEL_FRACTION of the training labels
# (each selected label y becomes 1 - y, i.e. 0 <-> 1 for binary labels).
#
# A fraction of exactly 0.0 is valid and simply disables the noise - that is
# what the guard below is for - so the range check accepts it.
assert 0.0 <= FLIP_LABEL_FRACTION < 1.0, "FLIP_LABEL_FRACTION should be in the range [0.0, 1.0)"

if FLIP_LABEL_FRACTION > 0.0:
    # Draw distinct positions so that no label is flipped twice (which would undo the flip).
    num_to_flip = int(FLIP_LABEL_FRACTION * len(y_train))
    flip_indices = np.random.choice(len(y_train), size=num_to_flip, replace=False)

    if hasattr(y_train, 'iloc'):
        # pandas container: cast to int for safe arithmetic. astype also
        # returns a new object, so the pre-noise labels are not modified.
        y_train = y_train.astype(int)
        y_train.iloc[flip_indices] = 1 - y_train.iloc[flip_indices]
    else:
        # Array-like: work on a copy so the array produced by the earlier
        # cells is not mutated in place (also accepts plain lists).
        y_train = np.array(y_train)
        y_train[flip_indices] = 1 - y_train[flip_indices]

### Make everything a numpy array

In [8]:
def _as_ndarray(obj, flatten=False):
    """Return ``obj`` as a numpy array, optionally flattened to 1-D.

    Objects exposing a ``values`` attribute (e.g. pandas containers) are
    unwrapped through it; anything else is passed to ``np.array``.
    """
    arr = obj.values if hasattr(obj, "values") else np.array(obj)
    return arr.ravel() if flatten else arr


# Features and embedding targets keep their shape; the label vectors are flattened.
X_train, X_test = _as_ndarray(X_train), _as_ndarray(X_test)
y_train, y_test = _as_ndarray(y_train, flatten=True), _as_ndarray(y_test, flatten=True)
y_embed_train, y_embed_test = _as_ndarray(y_embed_train), _as_ndarray(y_embed_test)

# Every array used from here on must be a plain numpy array.
assert all(
    isinstance(arr, np.ndarray)
    for arr in (X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
)

# **Models**

In [6]:
from baseline_models import train_multitarget_baseline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Training Bayesian Models

In [6]:
# Naive Bayes baseline on the embedding targets.
# NOTE(review): GaussianNB is a scikit-learn classifier, yet it is evaluated
# here with is_classifier=False (the recorded output reports MSE) while the
# returned score is bound to `acc_nb`. Either the flag or the variable name
# looks unintended - confirm which metric this baseline should report.
nb_model = GaussianNB()
y_pred_nb, acc_nb = train_multitarget_baseline(
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    model=nb_model,
    is_classifier=False,
    verbose=VERBOSE,
)



 ######################################## GaussianNB Multitarget Regressor ########################################
Train MSE per embedding: [ 52059.36025  41509.8905   54759.43225  63583.644    77929.185
  62877.18275 107183.147    47843.55025]
Test MSE per embedding: [ 56894.288  44889.096  63442.078  74432.063  90656.514  77874.897
 119260.47   49646.971]
Average train MSE: 63468.174
Average test MSE: 72137.047125


### Training Random Forests

In [7]:
# Random forest parameter: passed as `n_estimators` to RandomForestRegressor below.
N_ESTIMATORS = 100

In [8]:
# Random forest baseline, evaluated as a regressor on the embedding targets.
# n_jobs=-1 asks scikit-learn to use all available cores; random_state pins
# the forest to the notebook-wide seed.
rf_model = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    n_jobs=-1,
    random_state=STATE,
)
y_pred_rf, mse_rf = train_multitarget_baseline(
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    model=rf_model,
    is_classifier=False,
    verbose=VERBOSE,
)



 ######################################## RandomForestRegressor Multitarget Regressor ########################################
Train MSE per embedding: [ 495.97067715  696.42313045  593.53877665  391.28257722  123.41573847
 3346.21772207 5408.1000016   502.34868655]
Test MSE per embedding: [ 2943.6461006  5329.0526277  4048.9118285  2540.1975794   734.7480725
 24310.930406  40204.2232533  3802.708152 ]
Average train MSE: 1444.6621637718752
Average test MSE: 10489.302252499998


### Training Logistic Models

In [9]:
# Logistic regression parameter: passed as `max_iter` to LogisticRegression below.
MAX_ITERATIONS = 1000

In [10]:
# Logistic regression baseline, evaluated as a classifier on the embedding
# targets. class_weight='balanced' reweights classes by inverse frequency.
log_model = LogisticRegression(
    class_weight='balanced',
    max_iter=MAX_ITERATIONS,
    random_state=STATE,
)
y_pred_log, acc_log = train_multitarget_baseline(
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    model=log_model,
    is_classifier=True,
    verbose=VERBOSE,
)



 ######################################## LogisticRegression Multitarget Classifier ########################################
Train accuracy per embedding: [0.06525 0.069   0.06175 0.0595  0.0595  0.056   0.0525  0.061  ]
Test accuracy per embedding: [0.003 0.003 0.001 0.001 0.002 0.001 0.001 0.002]
Average train accuracy: 0.0605625
Average test accuracy: 0.00175


## Proposed MLPs

In [9]:
from proposed_models import train_joint_model, train_split_model, train_deep_joint_model, train_deep_split_model

# Device handed to every proposed-model trainer. Currently pinned to the CPU;
# the automatic selection is kept for reference:
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = "cpu"

# Forwarded as `e_kept_ratio` to all four trainers.
E_KEEP_RATE = 0.7

# Per-dataset value for the `l` argument of the joint trainers; any DATA value
# not listed here falls back to 1.
# NOTE(review): presumably a loss-weighting coefficient - confirm in proposed_models.
l = {
    'depression': 1e-2,
    'insomnia': 1e-2,
    'electrical_circuit': 1,
}.get(DATA, 1)

# Training schedule: the augment and early-stop settings are derived from the
# total epoch count (half and a fifth of it, rounded down).
EPOCHS = 100
AUGMENT_EPOCHS = EPOCHS // 2
EARLY_STOP_EPOCHS = EPOCHS // 5

In [10]:
# Sanity checks before training the proposed models.
print("Using ", DEVICE, " for torch")

# Features, labels and embedding targets must describe the same samples, so
# their sample counts have to match. The previous check compared the lengths
# of [:100] slices, which is always true once each array has >= 100 rows and
# therefore could never detect a mismatch; compare the full arrays instead.
aligned = X_train.shape[0] == y_train.shape[0] == y_embed_train.shape[0]
assert aligned, "X_train, y_train, and y_embed_train do not have the same number of samples."

test_aligned = X_test.shape[0] == y_test.shape[0] == y_embed_test.shape[0]
assert test_aligned, "X_test, y_test, and y_embed_test do not have the same number of samples."

Using  cpu  for torch


### Train Joint MLP

In [12]:
# Train and evaluate the joint MLP; the trainer prints its own
# regression and classification reports.
train_joint_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    device=DEVICE,
    e_kept_ratio=E_KEEP_RATE,
    l=l,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
)

Training:	###############################################---	[95.0% - DONE]

Regression Results:
MSE:	0.6838266849517822


Classification Results:
F1 score: 0.25
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       775
           1       0.23      0.27      0.25        59

    accuracy                           0.88       834
   macro avg       0.59      0.60      0.59       834
weighted avg       0.89      0.88      0.89       834

Confusion matrix:
 [[722  53]
 [ 43  16]]
Training:	##################################################	[100.0%]

Regression Results:
MSE:	0.8706539869308472


Classification Results:
F1 score: 0.1415929203539823
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       775
           1       0.15      0.14      0.14        59

    accuracy                           0.88       834
   macro avg       0.54      0.54      0.54       834
weighted avg       0.88    

### Train Split MLP

In [13]:
# Train and evaluate the split MLP (no `l` argument is passed to this variant);
# the trainer prints its own regression and classification reports.
train_split_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    device=DEVICE,
    e_kept_ratio=E_KEEP_RATE,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
)

Training:	##################################################	[100.0%]

Regression Results:
MSE:	1.0288583040237427


Classification Results:
F1 score: 0.19047619047619047
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       775
           1       0.32      0.14      0.19        59

    accuracy                           0.92       834
   macro avg       0.63      0.56      0.57       834
weighted avg       0.89      0.92      0.90       834

Confusion matrix:
 [[758  17]
 [ 51   8]]
Training:	##################################################	[100.0%]

Regression Results:
MSE:	3.390148401260376


Classification Results:
F1 score: 0.1797752808988764
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       775
           1       0.27      0.14      0.18        59

    accuracy                           0.91       834
   macro avg       0.60      0.55      0.57       834
weighted avg       

### Train Deep Joint Model

In [14]:
# Train and evaluate the deep joint model.
# NOTE(review): in the recorded run this model predicted class 0 for every
# test sample (F1 score 0.0) - worth investigating before relying on it.
train_deep_joint_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    device=DEVICE,
    e_kept_ratio=E_KEEP_RATE,
    l=l,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
)

Training:	######################################------------	[77.0% - DONE]

Regression Results:
MSE:	1.2267913818359375


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       775
           1       0.00      0.00      0.00        59

    accuracy                           0.93       834
   macro avg       0.46      0.50      0.48       834
weighted avg       0.86      0.93      0.90       834

Confusion matrix:
 [[775   0]
 [ 59   0]]
Training:	####----------------------------------------------	[8.0%]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Training:	##################################################	[100.0%]

Regression Results:
MSE:	1.2267913818359375


Classification Results:
F1 score: 0.0
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       775
           1       0.00      0.00      0.00        59

    accuracy                           0.93       834
   macro avg       0.46      0.50      0.48       834
weighted avg       0.86      0.93      0.90       834

Confusion matrix:
 [[775   0]
 [ 59   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Train Deep Split Model

In [None]:
# Train and evaluate the deep split model (no `l` argument is passed to this variant).
train_deep_split_model(
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test,
    device=DEVICE,
    e_kept_ratio=E_KEEP_RATE,
    epochs=EPOCHS,
    augment_epochs=AUGMENT_EPOCHS,
    early_stop_epochs=EARLY_STOP_EPOCHS,
)

Training:	################----------------------------------	[32.0%]