In [237]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import time

from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report


from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier


import xgboost as xgb
import lightgbm as lgb
import catboost as cb


import tensorflow as tf
import keras_tuner as kt
import keras
from keras import layers
import keras_tuner
from keras.regularizers import l1

In [238]:
# loading data 
df = pd.read_csv('dataset_classification/diabetes_012_health_indicators_BRFSS2015_cleaned.csv')

In [239]:
# convert dataset to intrger
df = df.astype(int)

In [240]:
df.head()

Unnamed: 0,Diabetes,HighBP,HighChol,BMI,Stroke,HeartDiseaseorAttack,PhysActivity,HvyAlcoholConsump,AnyHealthcare,GenHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,40,0,0,0,0,1,5,15,1,0,9,4,3
1,0,0,0,25,0,0,1,0,0,3,0,0,0,7,6,1
2,0,1,1,28,0,0,0,0,1,5,30,1,0,9,4,8
3,0,1,0,27,0,0,1,0,1,2,0,0,0,11,3,6
4,0,1,1,24,0,0,1,0,1,2,0,0,0,11,5,4


In [241]:
df.shape   

(229781, 16)

In [242]:
df['Diabetes'].value_counts()   

Diabetes
0    190055
2     35097
1      4629
Name: count, dtype: int64

In [243]:
X = df.drop('Diabetes', axis=1)
y = df['Diabetes']

In [244]:
rus = RandomUnderSampler(random_state=42)
X, y = rus.fit_resample(X, y)

In [245]:
y.value_counts()    

Diabetes
0    4629
1    4629
2    4629
Name: count, dtype: int64

In [246]:
print("X_train shape:", X.shape)
print("y_train shape:", y.shape)

X_train shape: (13887, 15)
y_train shape: (13887,)


In [247]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

In [248]:
y_train_n = y_train.copy()
y_val_n = y_val.copy()
y_test_n = y_test.copy()

In [249]:
y_train_n= tf.keras.utils.to_categorical(y_train_n)
y_val_n= tf.keras.utils.to_categorical(y_val_n)
y_test_n= tf.keras.utils.to_categorical(y_test_n)

In [250]:
print("y_train_n:", y_train_n.shape)
print("y_val_n:", y_val_n.shape)
print("y_test_n:", y_test_n.shape)

y_train_n: (9720, 3)
y_val_n: (2083, 3)
y_test_n: (2084, 3)


In [251]:
# print the shapes of the training, validation, and testing sets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (9720, 15)
y_train shape: (9720,)
X_val shape: (2083, 15)
y_val shape: (2083,)
X_test shape: (2084, 15)
y_test shape: (2084,)


In [252]:
# Save the categories into a list 
categories = list(np.unique(df['Diabetes']))    
categories

[0, 1, 2]

In [253]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [202]:
categories

[0, 1, 2]

In [203]:
# Creating a function to fine tune and adding hp object
def build_model(hp):
    model = keras.Sequential()
    
    # Adding the input layer
    
    model.add(keras.layers.BatchNormalization(
        momentum=hp.Float('bn_momentum', 0.1, 0.9, 0.1), input_shape=(len(X.columns),))),
  
     
    
    # First Hidden Layer 
    model.add(keras.layers.Dense(
        units=hp.Int('units', min_value=32, max_value=256, step=4),
        # For fine tuning the model, I am gonna use tow activation functions  relu and tanh
        activation=hp.Choice('activation', ['relu', "tanh"]),
        kernel_regularizer=keras.regularizers.l1(l1=hp.Float('l1', 0, 0.1, step=0.01),)
    ))
        
    # Dropout layer
    if hp.Boolean('dropout'):
        model.add(keras.layers.Dropout(rate=hp.Float('dropout_rate', 0.1, 0.5, step=0.1)))
        
        
    # Adding additional hidden layers
    for i in range(hp.Int("num_layers", 1, 4)):
        units = hp.Int(f"units_{i+1}", 16, 128, 2)
        activation = hp.Choice(f"activation_{i}", ['relu', 'tanh', 'LeakyReLU'])
        
        if activation == 'LeakyReLU':
           model.add(keras.layers.Dense(units))
           model.add(keras.layers.LeakyReLU(negative_slope=hp.Float('leaky_relu_slope', 0.1, 0.5, step=0.1)))
            
        else:
           model.add(keras.layers.Dense(units, activation=activation))
           
    model.add(layers.Dense(len(categories), activation="softmax"))
    
    # Setting up the optimizer and compiling the model 
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=5e-2, sampling="log")
    # Creating the dictionary for the optimizers for givin flexibility to the model
    optimizers = {
        'adam': keras.optimizers.Adam(learning_rate=learning_rate),
        'sgd': keras.optimizers.SGD(learning_rate=learning_rate, momentum=hp.Float('momentum', 0.0, 0.9, 0.1)),
        'rmsprop': keras.optimizers.RMSprop(learning_rate=learning_rate)}[hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop'])]
    
    model.compile(optimizer=optimizers, loss='categorical_crossentropy',metrics=['accuracy'])
    return model

build_model(keras_tuner.HyperParameters())
# Setting up the Keras tuner
tuner = keras_tuner.RandomSearch(
    hypermodel= build_model,
    objective="val_loss",
    max_trials=10,
    executions_per_trial=3,
    directory='dataset_classification/model_tuning',
    project_name="classification_model",
    overwrite=True  
)

callback = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5)
]
# Starting  searching
tuner.search(X_train_scaled, y_train_n, epochs=250, validation_data=(X_val_scaled, y_val_n), callbacks=callback)     


Trial 10 Complete [00h 05m 00s]
val_loss: 0.9877884785334269

Best val_loss So Far: 0.9698121349016825
Total elapsed time: 01h 11m 10s


In [204]:
# print out the result and suggestions
tuner.results_summary()

Results summary
Results in dataset_classification/model_tuning\classification_model
Showing 10 best trials
Objective(name="val_loss", direction="min")

Trial 02 summary
Hyperparameters:
bn_momentum: 0.9
units: 128
activation: relu
l1: 0.0
dropout: True
num_layers: 3
units_1: 46
activation_0: tanh
lr: 0.0009730546622782074
momentum: 0.7000000000000001
optimizer: sgd
leaky_relu_slope: 0.4
units_2: 30
activation_1: relu
units_3: 38
activation_2: relu
units_4: 106
activation_3: LeakyReLU
dropout_rate: 0.4
Score: 0.9698121349016825

Trial 00 summary
Hyperparameters:
bn_momentum: 0.5
units: 164
activation: tanh
l1: 0.03
dropout: False
num_layers: 4
units_1: 96
activation_0: LeakyReLU
lr: 0.0007346830490058072
momentum: 0.2
optimizer: rmsprop
leaky_relu_slope: 0.1
units_2: 16
activation_1: relu
units_3: 16
activation_2: relu
units_4: 16
activation_3: relu
Score: 0.9745214581489563

Trial 07 summary
Hyperparameters:
bn_momentum: 0.1
units: 56
activation: tanh
l1: 0.04
dropout: False
num_layers

In [205]:
# Get the top 2 models.
models = tuner.get_best_models(num_models=2)
best_model = models[0]
best_model.summary()

  saveable.load_own_variables(weights_store.get(inner_path))
  saveable.load_own_variables(weights_store.get(inner_path))


In [206]:
# Getting the best hyperparameters
best_hps = tuner.get_best_hyperparameters(1)[0]
# Making dataframe to show the best hyperparameters
best_hps_dict = best_hps.values
best_hps_df = pd.DataFrame(best_hps_dict.items(), columns=['Hyperparameter', 'Value'])
best_hps_df

Unnamed: 0,Hyperparameter,Value
0,bn_momentum,0.9
1,units,128
2,activation,relu
3,l1,0.0
4,dropout,True
5,num_layers,3
6,units_1,46
7,activation_0,tanh
8,lr,0.000973
9,momentum,0.7


In [207]:
df.head()

Unnamed: 0,Diabetes,HighBP,HighChol,BMI,Stroke,HeartDiseaseorAttack,PhysActivity,HvyAlcoholConsump,AnyHealthcare,GenHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,40,0,0,0,0,1,5,15,1,0,9,4,3
1,0,0,0,25,0,0,1,0,0,3,0,0,0,7,6,1
2,0,1,1,28,0,0,0,0,1,5,30,1,0,9,4,8
3,0,1,0,27,0,0,1,0,1,2,0,0,0,11,3,6
4,0,1,1,24,0,0,1,0,1,2,0,0,0,11,5,4


In [254]:
category = ['HighBP', 'HighChol','Stroke', 'PhysActivity',  'HeartDiseaseorAttack', 'HvyAlcoholConsump', 'AnyHealthcare', 'GenHlth', 'PhysHlth', 'DiffWalk', 'Sex']

In [255]:
from keras.layers import LeakyReLU
# Dictionary of classification models
models = {
    "Logistic Regression": LogisticRegression(C=0.5, solver='saga', max_iter=1000, multi_class='multinomial', n_jobs=-1),
    "Random Forest Classifier": RandomForestClassifier(n_estimators=500, max_depth=15, min_samples_split=5, max_features='sqrt'),
    "SVC": SVC(C=5, kernel='rbf', probability=True, cache_size=1000, max_iter=5000),
    "KNN": KNeighborsClassifier(n_neighbors=20, weights='distance', algorithm='auto',n_jobs=-1),    
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(256, 128, 64), activation='relu', solver='adam', 
                                    early_stopping=True, alpha=0.0005, learning_rate_init=0.01, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=15, min_samples_split=20, min_samples_leaf=10),
    "Extra Trees Classifier": ExtraTreesClassifier(n_estimators=600, max_depth=25, min_samples_split=5, max_features='sqrt', n_jobs=-1),
    #"Gradient Boosting Classifier": GradientBoostingClassifier(n_estimators=600, learning_rate=0.01, max_depth=15),
    "XGBoost": xgb.XGBClassifier(n_estimators=600, learning_rate=0.01, max_depth=15, 
                                 use_label_encoder=False, enable_categorical=True, objective='multi:softprobar', num_class=len(X.columns)), 
    "LightGBM": lgb.LGBMClassifier(n_estimators=500, learning_rate=0.01, max_depth=12, verbose=0, objective='multiclass'),
    "CatBoost": cb.CatBoostClassifier(n_estimators=600, learning_rate=0.01, depth=6, verbose=0),
    "Neural Network": keras.models.Sequential([
         keras.Input(shape=(len(X.columns),)),
         layers.BatchNormalization( momentum=0.9),
         layers.Dense(128, activation='relu'),
         layers.BatchNormalization(momentum=0.9),
         layers.Dense(46, activation='tanh'),  
         layers.BatchNormalization( momentum=0.9),
         layers.Dense(30, activation='relu'),
         layers.BatchNormalization(momentum=0.9),
         layers.Dense(30, activation='relu'),
         layers.BatchNormalization(momentum=0.9),      
         layers.Dense(106, activation=LeakyReLU(negative_slope=0.4)),
         layers.BatchNormalization(momentum=0.9),
         layers.Dense(len(categories), activation='softmax')])
}

In [256]:
# Creating a list to store the results
results = []
# for each model
for name, model in models.items():
    print("Starting ... " + name)
    model.random_state = 78

    start = time.time()

    if name == 'Neural Network':
        model.compile(optimizer=keras.optimizers.SGD(learning_rate=0.0009730546622782074), loss='categorical_crossentropy', metrics=['accuracy']) 
        early_stop = keras.callbacks.EarlyStopping(patience=10, monitor='val_loss', mode='min', restore_best_weights=True)
        model.fit(X_train_scaled, y_train_n, validation_data=(X_val_scaled, y_val_n), callbacks=[early_stop], epochs=500, batch_size=65, verbose=0)
        predictions = model.predict(X_test_scaled)
        predictions = np.argmax(predictions, axis=1)
        y_test_n = np.argmax(y_test_n, axis=1)
        accuracy = accuracy_score(y_test_n, predictions)
        precision = precision_score(y_test_n, predictions, average="macro")
        recall = recall_score(y_test_n, predictions, average="macro")
        f1 = f1_score(y_test_n, predictions, average="macro")

    elif name in ['SVR', 'KNN', 'MLP Classifier']:
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
        probability = model.predict_proba(X_test_scaled) if hasattr(model, "predict_proba") else None

    elif name == 'CatBoost':
        model.fit(X_train, y_train, cat_features=category)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test)

    else:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        probability = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    end = time.time()
    Train_Time = round(end - start, 2)
    # calculate the metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    # Calculate metrics for current model in training
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average="macro")
    recall = recall_score(y_test, predictions, average="macro")
    f1 = f1_score(y_test, predictions, average="macro")

    # ROC AUC
    if probability is not None:
        y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
        roc_auc = roc_auc_score(y_test, probability, multi_class='ovr')
    else:
        roc_auc = np.nan

    # save the metrics for this model into results
    results.append([name, accuracy, precision, recall, f1, roc_auc, Train_Time])

Starting ... Logistic Regression




Starting ... Random Forest Classifier
Starting ... SVC
Starting ... KNN
Starting ... MLP Classifier
Starting ... Decision Tree
Starting ... Extra Trees Classifier
Starting ... XGBoost


Parameters: { "use_label_encoder" } are not used.



Starting ... LightGBM
Starting ... CatBoost
Starting ... Neural Network
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [259]:
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', "Precision", "Recall", "F1", "ROC AUC", 'Time'])
results_df.sort_values('F1', ascending=False)   

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,ROC AUC,Time
2,SVC,0.515835,0.511317,0.513663,0.511967,0.690351,24.69
10,Neural Network,0.511516,0.506238,0.50861,0.504618,0.693933,68.31
0,Logistic Regression,0.510077,0.498788,0.507338,0.499631,0.689843,2.6
1,Random Forest Classifier,0.503839,0.496122,0.501249,0.497527,0.680041,4.96
8,LightGBM,0.503359,0.49552,0.500594,0.495909,0.690043,1.06
9,CatBoost,0.5,0.489439,0.496967,0.490625,0.693933,48.18
6,Extra Trees Classifier,0.482726,0.478406,0.480609,0.478943,0.65876,3.4
4,MLP Classifier,0.498081,0.477211,0.493841,0.473054,0.682072,13.0
5,Decision Tree,0.47025,0.465376,0.468615,0.466773,0.641925,0.03
3,KNN,0.46785,0.463044,0.465584,0.463247,0.640031,0.38
