# **Initialization** 

In [10]:
import pandas as pd
import numpy as np
import torch

### Control

In [11]:
# Paths
RAW_DATA_FOLDER = 'raw_data'           # handed to the raw-data loader and the preprocessing functions
TARGET_FILE_PATH = 'unprocessed_data'  # folder the merged CSV is read from

# Flow Controls
RELOAD_RAW_DATA = False  # when True, load_raw_data() is run again
DO_SMOTE = True          # when True, the training split is resampled before training
DATA = 'electrical_circuit'  # Options: 'depression', 'insomnia', 'electrical_circuit'

# System variables
STATE = 42                         # shared random seed / random_state
TEST_SET_FRACTION = 0.20
MISSING_VALUES_THRESHOLD = 0.50
SAMPLES_ELECTRICAL_CIRCUIT = 5000  # sample count requested from the circuit generator
VERBOSE = True
FLIP_LABEL_FRACTION = 0.03         # share of training labels that get flipped as noise

# Seed every RNG this notebook relies on. Seeding NumPy alone leaves torch
# (weight initialisation, batch shuffling) different on every run.
np.random.seed(STATE)
torch.manual_seed(STATE)

# **Data Preparation**

### Merge raw data files

In [12]:
from raw_data_loader import load_raw_data

# Only re-run the raw-data loader when RELOAD_RAW_DATA is set; otherwise the
# files already present under TARGET_FILE_PATH are used as they are.
if RELOAD_RAW_DATA:
    load_raw_data(RAW_DATA_FOLDER, TARGET_FILE_PATH)

### Preprocessing and Split

In [13]:
from preprocessing_depression import clean_and_preprocess_depression_data
from preprocessing_insomnia import clean_and_preprocess_insomnia_data
from preprocessing_electrical_circuit import gen_and_preprocess_ec_data

# Reject an unknown selection before any file I/O happens.
if DATA not in ('depression', 'insomnia', 'electrical_circuit'):
    raise ValueError("Invalid dataset selected")

if DATA == 'electrical_circuit':
    # The circuit data is generated, so the CSV is not needed (and need not exist).
    dataset = None
    X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = gen_and_preprocess_ec_data(
        SAMPLES_ELECTRICAL_CIRCUIT, TEST_SET_FRACTION, STATE
    )
    # Resampling is switched off for the generated data.
    DO_SMOTE = False
else:
    # NOTE(review): the insomnia path reads 'depression_data.csv' as well, exactly as
    # before — confirm that this merged file is the intended input for both datasets.
    dataset = pd.read_csv(TARGET_FILE_PATH + '/depression_data.csv')

    if DATA == 'depression':
        X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = clean_and_preprocess_depression_data(
            dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
        )
    else:  # DATA == 'insomnia'
        X_train, X_test, y_train, y_test, y_embed_train, y_embed_test = clean_and_preprocess_insomnia_data(
            dataset, RAW_DATA_FOLDER, TEST_SET_FRACTION, STATE, MISSING_VALUES_THRESHOLD
        )

#TODO Fix issue with the time columns SLQ300/310/320/330 in depression and processing

Basic Electrical Circuit Simulator/Generator


### Data Balancing

In [14]:
from data_balancing import resample_training_data

# Rebalance the training split only (the test split is never passed in).
# Skipped entirely when DO_SMOTE is False.
if DO_SMOTE:
    resampled = resample_training_data(X_train, y_train, y_embed_train, random_state=STATE)
    X_train, y_train, y_embed_train = resampled

In [15]:
# Tally how many training samples carry each label value.
classes, counts = np.unique(y_train, return_counts=True)
distribution = dict(zip(classes, counts))
print("Class Distribution:\n", distribution)

# np.unique returns the labels sorted, so the ratio is second label over first.
if len(classes) <= 1:
    print("\nOnly one class present.")
else:
    class_ratio = counts[1] / counts[0]
    print(f"\nClass ratio: {class_ratio:.3f}")

Class Distribution:
 {np.int64(0): np.int64(2007), np.int64(1): np.int64(1993)}

Class ratio: 0.993


### Introduce Label Noise

In [16]:
# Inject label noise by flipping a random FLIP_LABEL_FRACTION of the training labels.
# 0.0 is a valid setting and simply disables the noise.
# Note: every execution of this cell draws a fresh set of indices from the global
# NumPy RNG, so running it twice flips a second, different subset.
assert 0.0 <= FLIP_LABEL_FRACTION < 1.0, "FLIP_LABEL_FRACTION should be in the range [0.0, 1.0)"

if FLIP_LABEL_FRACTION > 0.0:
    # Randomly select indices to flip (without replacement, so no label flips twice).
    num_to_flip = int(FLIP_LABEL_FRACTION * len(y_train))
    flip_indices = np.random.choice(len(y_train), size=num_to_flip, replace=False)

    # The `1 - y` flip assumes the labels are encoded as 0/1.
    if hasattr(y_train, 'iloc'):
        # pandas object: astype() returns a new object, so the original stays untouched.
        y_train = y_train.astype(int)
        y_train.iloc[flip_indices] = 1 - y_train.iloc[flip_indices]
    else:
        # Array-like: work on a copy so the array returned by preprocessing is not
        # mutated in place (mirrors the pandas branch); this also accepts plain lists.
        y_train = np.array(y_train)
        y_train[flip_indices] = 1 - y_train[flip_indices]

### Make everything a numpy array

In [17]:
def _to_ndarray(obj):
    """Return ``obj.values`` when that attribute exists, otherwise ``np.array(obj)``."""
    return obj.values if hasattr(obj, "values") else np.array(obj)


# Features
X_train = _to_ndarray(X_train)
X_test = _to_ndarray(X_test)

# Labels are additionally flattened to 1-D
y_train = _to_ndarray(y_train).ravel()
y_test = _to_ndarray(y_test).ravel()

# Embedding targets
y_embed_train = _to_ndarray(y_embed_train)
y_embed_test = _to_ndarray(y_embed_test)

# Every split must be a NumPy array from here on.
assert all(
    isinstance(arr, np.ndarray)
    for arr in (X_train, X_test, y_train, y_test, y_embed_train, y_embed_test)
)

# **Models**

In [18]:
from baseline_models import train_multitarget_baseline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

### Training Bayesian Models

In [19]:
# Gaussian naive Bayes baseline on the embedding targets.
# NOTE(review): with is_classifier=False the recorded output reports MSE, so
# `acc_nb` presumably holds an MSE rather than an accuracy — confirm.
nb_model = GaussianNB()
y_pred_nb, acc_nb = train_multitarget_baseline(
    model=nb_model,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    is_classifier=False,
    verbose=VERBOSE,
)



 ######################################## GaussianNB Multitarget Regressor ########################################
Train MSE per embedding: [ 52059.36025  41509.8905   54759.43225  63583.644    77929.185
  62877.18275 107183.147    47843.55025]
Test MSE per embedding: [ 56894.288  44889.096  63442.078  74432.063  90656.514  77874.897
 119260.47   49646.971]
Average train MSE: 63468.174
Average test MSE: 72137.047125


### Training Random Forests

In [20]:
# Parameter
# Number of trees for the random forest baseline (passed as n_estimators below).
N_ESTIMATORS = 100

In [21]:
# Random forest regression baseline on the embedding targets.
# n_jobs=-1 lets scikit-learn use all available cores.
rf_model = RandomForestRegressor(
    n_estimators=N_ESTIMATORS,
    random_state=STATE,
    n_jobs=-1,
)
y_pred_rf, mse_rf = train_multitarget_baseline(
    model=rf_model,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    is_classifier=False,
    verbose=VERBOSE,
)

KeyboardInterrupt: 

### Training Logistic Models

In [None]:
# Parameters
# Iteration cap for the logistic regression solver (passed as max_iter below).
MAX_ITERATIONS = 1000

In [None]:
# Logistic regression baseline, evaluated as a classifier on the embedding targets.
log_model = LogisticRegression(
    max_iter=MAX_ITERATIONS,
    class_weight='balanced',
    random_state=STATE,
)
y_pred_log, acc_log = train_multitarget_baseline(
    model=log_model,
    X_train=X_train,
    X_test=X_test,
    y_embed_train=y_embed_train,
    y_embed_test=y_embed_test,
    is_classifier=True,
    verbose=VERBOSE,
)



 ######################################## LogisticRegression Multitarget Classifier ########################################
Train accuracy per embedding: [0.06525 0.069   0.06175 0.0595  0.0595  0.056   0.0525  0.061  ]
Test accuracy per embedding: [0.003 0.003 0.001 0.001 0.002 0.001 0.001 0.002]
Average train accuracy: 0.0605625
Average test accuracy: 0.00175


## Proposed MLPs

In [22]:
from proposed_models import (
    train_joint_model,
    train_split_model,
    train_deep_joint_model,
    train_deep_split_model,
)

# Prefer the GPU when torch can see one.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
E_KEEP_RATE = 0.7

# Dataset-specific value handed to the joint trainer as `l`;
# any dataset not listed falls back to 1.0.
l = {
    'depression': 1e-2,
    'insomnia': 1e-2,
    'electrical_circuit': 1.0,
}.get(DATA, 1.0)

EPOCHS = 300
FINE_TUNE_EPOCHS = 30

In [23]:
# Running on GPU currently takes twice as long,
# so the auto-detected DEVICE from the previous cell is overridden with the CPU.
DEVICE = 'cpu'

In [24]:
# Sanity Checks
print("Using ", DEVICE, " for torch")

assert X_train.shape[0] >= 100 and y_train.shape[0] >= 100 and y_embed_train.shape[0] >= 100, "Arrays must have at least 100 samples for the check."

# Compare the full sample counts. Comparing the lengths of [:100] slices can never
# fail once the assertion above has passed (each slice has exactly 100 rows), so it
# would not catch features, labels and embeddings that have drifted apart.
aligned = X_train.shape[0] == y_train.shape[0] == y_embed_train.shape[0]
assert aligned, "X_train, y_train, and y_embed_train do not have the same number of samples."

Using  cpu  for torch


### Train Joint MLP

In [25]:
# Joint MLP: uses the dataset-specific `l` chosen in the configuration cell.
train_joint_model(
    X_train, X_test,
    y_train, y_test,
    y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    l=l,
    device=DEVICE,
    epochs=EPOCHS,
    fine_tune_epochs=FINE_TUNE_EPOCHS,
)

Regression Results:
MSE:	27534.015625


Classification Results:
F1 score: 0.6998867497168743
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       544
           1       0.72      0.68      0.70       456

    accuracy                           0.73      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.73      0.73      0.73      1000

Confusion matrix:
 [[426 118]
 [147 309]]
Regression Results:
MSE:	27734.30078125


Classification Results:
F1 score: 0.7084708470847084
              precision    recall  f1-score   support

           0       0.76      0.76      0.76       544
           1       0.71      0.71      0.71       456

    accuracy                           0.73      1000
   macro avg       0.73      0.73      0.73      1000
weighted avg       0.73      0.73      0.73      1000

Confusion matrix:
 [[413 131]
 [134 322]]


### Train Split MLP

In [None]:
# Split MLP: same data and schedule as the joint model, but no `l` argument is passed.
train_split_model(
    X_train, X_test,
    y_train, y_test,
    y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    device=DEVICE,
    epochs=EPOCHS,
    fine_tune_epochs=FINE_TUNE_EPOCHS,
)

Training:	###################-------------------------------	[39.0%]

KeyboardInterrupt: 

### Train Deep Joint Model

In [None]:
# TODO(review): `l` is hardcoded to 1 here, whereas train_joint_model receives the
# dataset-specific `l` from the configuration cell — is that intentional?
train_deep_joint_model(
    X_train, X_test,
    y_train, y_test,
    y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    l=1,
    device=DEVICE,
    epochs=EPOCHS,
    fine_tune_epochs=FINE_TUNE_EPOCHS,
)

Training:	####----------------------------------------------	[8.7%]

KeyboardInterrupt: 

### Train Deep Split Model

In [None]:
# Open question: no `l` argument is passed here, unlike the joint variants.
# NOTE(review): the recorded output of this cell ends in a default_collate
# TypeError (a NoneType in the batch) — investigate inside train_deep_split_model.
train_deep_split_model(
    X_train, X_test,
    y_train, y_test,
    y_embed_train, y_embed_test,
    e_kept_ratio=E_KEEP_RATE,
    device=DEVICE,
    epochs=EPOCHS,
    fine_tune_epochs=FINE_TUNE_EPOCHS,
)

Training:	#################################################-	[99.0%]
Regression Results:
MSE:	1.199455738067627


Classification Results:
F1 score: 0.06837606837606838
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       775
           1       0.07      0.07      0.07        59

    accuracy                           0.87       834
   macro avg       0.50      0.50      0.50       834
weighted avg       0.87      0.87      0.87       834

Confusion matrix:
 [[721  54]
 [ 55   4]]
Training:	--------------------------------------------------	[0.0%]

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>