In [69]:
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


import xgboost as xgb
import lightgbm as lgb
import catboost as cb


import tensorflow as tf
import keras
import keras_tuner
from keras.regularizers import l1
from keras.layers import LeakyReLU
from keras import layers
from keras.regularizers import l1


In [70]:
# loading data 
df = pd.read_csv('dataset_classification/diabetes_012_health_indicators_BRFSS2015_cleaned.csv')

In [71]:
df.head()

Unnamed: 0,HighBP,HighChol,BMI,Stroke,HeartDiseaseorAttack,PhysActivity,HvyAlcoholConsump,AnyHealthcare,GenHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes
0,1.0,1.0,40.0,0.0,0.0,0.0,0.0,1.0,5.0,15.0,1.0,0.0,9.0,4.0,3.0,0.0
1,1.0,0.0,27.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,11.0,3.0,6.0,0.0
2,1.0,1.0,24.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,11.0,5.0,4.0,0.0
3,1.0,1.0,25.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,0.0,1.0,10.0,6.0,8.0,0.0
4,1.0,0.0,30.0,0.0,0.0,0.0,0.0,1.0,3.0,14.0,0.0,0.0,9.0,6.0,7.0,0.0


In [72]:
df.shape   

(191994, 16)

In [73]:
df = df.astype(int)

In [74]:
df['Diabetes'].value_counts()   

Diabetes
0    166830
2     21274
1      3890
Name: count, dtype: int64

In [75]:
X = df.drop('Diabetes', axis=1)
y = df['Diabetes']

In [76]:
unsersample = RandomUnderSampler(sampling_strategy='majority')
X, y = unsersample.fit_resample(X, y)

In [77]:
y.value_counts()    

Diabetes
2    21274
0     3890
1     3890
Name: count, dtype: int64

In [78]:
print("X_train shape:", X.shape)
print("y_train shape:", y.shape)

X_train shape: (29054, 15)
y_train shape: (29054,)


In [79]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

In [None]:
# Copy y values for converting classes to matrix
y_train_n = y_train.copy()
y_val_n = y_val.copy()
y_test_n = y_test.copy()

In [None]:
# I am gonna  tf.keras.utils.to_categorical  to convert classes into matrix. Since I will use other models as well, I copied y values.
y_train_n= tf.keras.utils.to_categorical(y_train_n)
y_val_n= tf.keras.utils.to_categorical(y_val_n)
y_test_n= tf.keras.utils.to_categorical(y_test_n)

In [82]:
print("y_train_n:", y_train_n.shape)
print("y_val_n:", y_val_n.shape)
print("y_test_n:", y_test_n.shape)

y_train_n: (20337, 3)
y_val_n: (4358, 3)
y_test_n: (4359, 3)


In [83]:
# print the shapes of the training, validation, and testing sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (20337, 15)
y_train shape: (20337,)
X_val shape: (4358, 15)
y_val shape: (4358,)
X_test shape: (4359, 15)
y_test shape: (4359,)


In [84]:
# Save the categories into a list 
categories = list(np.unique(df['Diabetes']))    
categories

[0, 1, 2]

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [48]:
categories

[0, 1, 2]

In [None]:

# Creating a function to fine tune and adding hp object
def build_model(hp):
    model = keras.Sequential()
    
    # Adding the input layer
    model.add(keras.layers.BatchNormalization(
        momentum=hp.Float('bn_momentum', 0.1, 0.9, 0.1), input_shape=(len(X.columns),))),
  
     
    
    # First Hidden Layer 
    model.add(keras.layers.Dense(
        units=hp.Int('units', min_value=32, max_value=256, step=4),
        # For fine tuning the model, I am gonna use tow activation functions  relu and tanh
        activation=hp.Choice('activation', ['relu', "tanh"]),
        kernel_regularizer=keras.regularizers.l1(l1=hp.Float('l1', 0, 0.1, step=0.01),)
    ))
        
    # Dropout layer
    if hp.Boolean('dropout'):
        model.add(keras.layers.Dropout(rate=hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
        
        
    # Adding additional hidden layers
    for i in range(hp.Int("num_layers", 1, 4)):
        units = hp.Int(f"units_{i+1}", 16, 128, 2)
        activation = hp.Choice(f"activation_{i}", ['relu', 'tanh', 'LeakyReLU'])
        
        if activation == 'LeakyReLU':
           model.add(keras.layers.Dense(units))
           model.add(keras.layers.LeakyReLU(negative_slope=hp.Float('leaky_relu_slope', 0.1, 0.5, step=0.1)))
            
        else:
           model.add(keras.layers.Dense(units, activation=activation))
           
    model.add(layers.Dense(len(categories), activation="softmax"))
    
    # Setting up the optimizer and compiling the model 
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=5e-2, sampling="log")
    # Creating the dictionary for the optimizers for givin flexibility to the model
    optimizers = {
        'adam': keras.optimizers.Adam(learning_rate=learning_rate),
        'sgd': keras.optimizers.SGD(learning_rate=learning_rate, momentum=hp.Float('momentum', 0.0, 0.9, 0.1)),
        'rmsprop': keras.optimizers.RMSprop(learning_rate=learning_rate)}[hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop'])]
    model.compile(optimizer=optimizers, loss='categorical_crossentropy',metrics=['accuracy'])
    return model
 


build_model(keras_tuner.HyperParameters())
# Setting up the Keras tuner
tuner = keras_tuner.RandomSearch(
    hypermodel= build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=3,
    directory='dataset_classification/model_tuning',
    project_name="classification_model",
    overwrite=True  
)

callback = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5)
]

# Starting  searching
tuner.search(X_train_scaled, y_train_n, epochs=250, validation_data=(X_val_scaled, y_val_n), callbacks=callback, batch_size=126)   

# I got the main code from deep learning lecture notes and for debugging I used  Google and LLM

Trial 10 Complete [00h 01m 22s]
val_loss: 0.7612646818161011

Best val_loss So Far: 0.7612083355585734
Total elapsed time: 00h 27m 10s


In [None]:
# print out the result and suggestions
tuner.results_summary()

Results summary
Results in dataset_classification/model_tuning\classification_model
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 02 summary
Hyperparameters:
bn_momentum: 0.9
units: 128
activation: relu
l1: 0.0
dropout: True
num_layers: 3
units_1: 46
activation_0: tanh
lr: 0.0009730546622782074
momentum: 0.7000000000000001
optimizer: sgd
leaky_relu_slope: 0.4
units_2: 30
activation_1: relu
units_3: 38
activation_2: relu
units_4: 106
activation_3: LeakyReLU
dropout_rate: 0.4
Score: 0.9698121349016825

Trial 00 summary
Hyperparameters:
bn_momentum: 0.5
units: 164
activation: tanh
l1: 0.03
dropout: False
num_layers: 4
units_1: 96
activation_0: LeakyReLU
lr: 0.0007346830490058072
momentum: 0.2
optimizer: rmsprop
leaky_relu_slope: 0.1
units_2: 16
activation_1: relu
units_3: 16
activation_2: relu
units_4: 16
activation_3: relu
Score: 0.9745214581489563

Trial 07 summary
Hyperparameters:
bn_momentum: 0.1
units: 56
activation: tanh
l1: 0.04
dropout: False
num_layers

In [87]:
# Getting the best hyperparameters
best_hps = tuner.get_best_hyperparameters(1)[0]
# Making dataframe to show the best hyperparameters
best_hps_dict = best_hps.values
best_hps_df = pd.DataFrame(best_hps_dict.items(), columns=['Hyperparameter', 'Value'])
best_hps_df

Unnamed: 0,Hyperparameter,Value
0,bn_momentum,0.3
1,units,76
2,activation,tanh
3,l1,0.08
4,dropout,True
5,num_layers,3
6,units_1,68
7,activation_0,LeakyReLU
8,lr,0.000262
9,momentum,0.4


In [None]:
# Crating the category list for CatBoost model
category_features = ['HighBP', 'HighChol','Stroke', 'PhysActivity',  'HeartDiseaseorAttack', 'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Sex']

In [None]:
# Dictionary of classification models
models = {
    "Logistic Regression": LogisticRegression(C=0.5, solver='saga', max_iter=1000, n_jobs=-1),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=500, max_depth=15, min_samples_split=5, max_features='sqrt'),
    
    "SVC": SVC(C=5, kernel='linear', probability=True, max_iter=120000),
    "KNN": KNeighborsClassifier(n_neighbors=20, weights='distance', algorithm='auto',n_jobs=-1),    
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu', solver='adam', 
    early_stopping=True, alpha=0.0005, learning_rate_init=0.01, max_iter=1000),
    
    "Decision Tree": DecisionTreeClassifier(max_depth=15, min_samples_split=20, min_samples_leaf=10),
    "Extra Trees Classifier": ExtraTreesClassifier(n_estimators=600, max_depth=25, min_samples_split=5, max_features='sqrt', n_jobs=-1),
    "Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=600, learning_rate=0.01, max_depth=15),
    
    "XGBoost": xgb.XGBClassifier(n_estimators=600, learning_rate=0.01, max_depth=15, 
    enable_categorical=True, objective='multi:softprob'), 
    "LightGBM": lgb.LGBMClassifier(n_estimators=500, learning_rate=0.01, max_depth=12, verbose=0, objective='multiclass'),
    "CatBoost": cb.CatBoostClassifier(n_estimators=600, learning_rate=0.01, depth=6, verbose=0),
    
    "Neural Network": keras.models.Sequential([
         keras.Input(shape=(len(X.columns),)),
         layers.BatchNormalization( momentum=0.3),
         layers.Dense(76, activation='tanh'),
         layers.BatchNormalization(momentum=0.3),
         layers.Dense(68, activation='tanh'),  
         layers.BatchNormalization( momentum=0.3),
         layers.Dense(120, activation='relu'),
         layers.BatchNormalization(momentum=0.3),
         layers.Dense(56, activation='relu'),
         layers.BatchNormalization(momentum=0.3),      
         layers.Dense(16, activation=LeakyReLU(negative_slope=0.4)),
         layers.BatchNormalization(momentum=0.3),
         layers.Dense(len(categories), activation='softmax')])
}

In [None]:

# Creating a list to store the results
results = []
# for each model
for name, model in models.items():
    print("Starting ... " + name)
    start = time.time()
    # Training each model
    if name == 'Neural Network':
        model.compile(optimizer=keras.optimizers.SGD(
        learning_rate=0.0002616798647325732), loss='categorical_crossentropy', metrics=['accuracy']) 
        # Early Stopping 
        early_stop = keras.callbacks.EarlyStopping(
        patience=10, monitor='val_loss', mode='min',
        restore_best_weights=True)
        # Fitting the model
        model.fit(X_train_scaled, y_train_n,
        validation_data=(X_val_scaled, y_val_n), 
        callbacks=[early_stop], epochs=500,
        batch_size=64, verbose=0)
        # mAking predictions
        # I am going to use np.argmax to convert the matrix into classes
        predictions = model.predict(X_test_scaled)
        predictions = np.argmax(predictions, axis=1)
        y_test_n = np.argmax(y_test_n, axis=1)
        # Evaluating the model
        accuracy = accuracy_score(y_test_n, predictions)
        precision = precision_score(y_test_n, predictions, average="macro")
        recall = recall_score(y_test_n, predictions, average="macro")
        f1 = f1_score(y_test_n, predictions, average="macro")



    elif name in ['KNN', 'MLP Classifier', 'SVC']:
        # Training with scaled data
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
        probability = model.predict_proba(X_test_scaled) if hasattr(model, "predict_proba") else None
        
    elif name == 'CatBoost':
        model.fit(X_train, y_train, cat_features=category_features, verbose=0)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test)

    else:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    end = time.time()
    Train_Time = round(end - start, 2)
    # calculate the metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Calculate metrics for current model in training
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average="macro")
    recall = recall_score(y_test, predictions, average="macro")
    f1 = f1_score(y_test, predictions, average="macro")
    # ROC AUC
    if probability is not None:
        y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
        roc_auc = roc_auc_score(y_test, probability, multi_class='ovr')
    else:
        roc_auc = np.nan

    # save the metrics for this model into results
    results.append([name, accuracy, precision, recall, f1, roc_auc, Train_Time])
    
 #I got the main code from the lecture notes, and for debugging, I used Google and an LLM here as well

Starting ... Logistic Regression


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting ... Random Forest Classifier
Starting ... SVC


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting ... KNN
Starting ... MLP Classifier


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting ... Decision Tree
Starting ... Extra Trees Classifier
Starting ... Gradient Boosting Classifier
Starting ... XGBoost
Starting ... LightGBM


[WinError 2] The system cannot find the file specified
  File "c:\Users\murta\Desktop\Desktop\ML&DE\Fourth_Semester\Advanced_ML\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\murta\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\murta\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\murta\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1456, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


Starting ... CatBoost


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Starting ... Neural Network
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [92]:
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', "Precision", "Recall", "F1", "ROC AUC", 'Time'])
results_df.sort_values('F1', ascending=False)   

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC,Time
5,Decision Tree,0.705896,0.452776,0.431476,0.430812,0.635075,0.05
8,XGBoost,0.726543,0.481118,0.425692,0.424038,0.657004,21.2
9,LightGBM,0.749025,0.709854,0.428142,0.421494,0.691843,1.97
7,Gradient Boosting Classifier,0.710255,0.455721,0.416326,0.41795,0.630541,923.01
1,Random Forest Classifier,0.74696,0.5758,0.423366,0.41698,0.685178,6.06
6,Extra Trees Classifier,0.742602,0.523769,0.420678,0.416066,0.673568,2.62
10,CatBoost,0.747878,0.446358,0.421085,0.412793,0.690894,32.45
0,Logistic Regression,0.74696,0.441591,0.421622,0.412587,0.686393,3.08
3,KNN,0.71599,0.334378,0.334026,0.292287,0.514794,1.65
2,SVC,0.72769,0.242563,0.333333,0.280795,0.498816,37.23


**From these results, I see that XGBoost and CatBoost have better performing,  this makes sense because they use boosting methods.**<br>
**On the other hand, Logistic Regression and Decision Tree train very quickly and are easy to explaim, but their accuracy is lower.**<br>
**If I want the best performance and have enough time, I would choose XGBoost or CatBoost.**<br>

In [None]:
# saving the results
results_df.to_csv('dataset_classification/results.csv', index=False)