In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


import shap
from ucimlrepo import fetch_ucirepo 


  from .autonotebook import tqdm as notebook_tqdm


## **1. Data Understanding**

#### 1.1 Dataset Overview

In [2]:
# Download the CDC diabetes health indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Unpack the feature table and the target table (both pandas DataFrames).
X, y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)


In [3]:
# Merge features and target so de-duplication looks at whole records, then
# drop every row that is an exact copy of an earlier one.
# NOTE: the logic below assumes y contributes exactly one column, placed last.
combined = pd.concat([X, y], axis=1).drop_duplicates()

# Check for identical X with different y and remove them.
# Exact duplicates are already gone, so any rows that still share the same
# feature values necessarily carry different targets. keep=False flags every
# member of such a contradictory group. (Additionally requiring the target
# value to be duplicated somewhere in the frame is unrelated to the
# contradiction and can let a contradictory row slip through.)
feature_columns = combined.columns[:-1]
inconsistent_mask = combined.duplicated(subset=feature_columns, keep=False)
inconsistent_indices = combined.index[inconsistent_mask]
if not inconsistent_indices.empty:
    combined = combined.drop(inconsistent_indices)

# Separate features and target after cleaning
X = combined.iloc[:, :-1]
y = pd.DataFrame(combined.iloc[:, -1], columns=['Diabetes_binary'])

In [4]:
# Hold out 20% of the cleaned rows for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Flatten the target frames to 1-D arrays so the estimators do not emit a
# DataConversionWarning when fitted.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

In [6]:
# Logistic Regression: search over `C` and the solver with 5-fold
# cross-validation, then score the refitted best estimator on the test split.
lr_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}
lr = LogisticRegression(max_iter=1000)
lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, cv=5, scoring='accuracy')
lr_grid.fit(X_train, y_train)

lr_test_predictions = lr_grid.best_estimator_.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_test_predictions)
print("Best Logistic Regression Params:", lr_grid.best_params_)
print("Logistic Regression Accuracy:", lr_accuracy)

Best Logistic Regression Params: {'C': 0.01, 'solver': 'lbfgs'}
Logistic Regression Accuracy: 0.855132651483355


In [7]:
# Decision Tree: search over tree depth and the minimum split size with 5-fold
# cross-validation, then score the refitted best estimator on the test split.
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features when choosing between equally good splits; without a seed
# the selected parameters and accuracy can differ between runs.
dt = DecisionTreeClassifier(random_state=0)
dt_params = {'max_depth': [5, 10, 15, 20, None], 'min_samples_split': [2, 5, 10]}
dt_grid = GridSearchCV(dt, dt_params, cv=5, scoring='accuracy')
dt_grid.fit(X_train, y_train)
dt_accuracy = accuracy_score(y_test, dt_grid.best_estimator_.predict(X_test))
print("Best Decision Tree Params:", dt_grid.best_params_)
print("Decision Tree Accuracy:", dt_accuracy)


Best Decision Tree Params: {'max_depth': 5, 'min_samples_split': 2}
Decision Tree Accuracy: 0.8571870374870221


In [8]:
# Random Forest: search over forest size, tree depth and the minimum split
# size with 5-fold cross-validation, then score the refitted best estimator on
# the test split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and per-split feature subsets are reproducible; without it every
# run yields different parameters and accuracy.
rf = RandomForestClassifier(random_state=0)
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15, None], 'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train, y_train)
rf_accuracy = accuracy_score(y_test, rf_grid.best_estimator_.predict(X_test))
print("Best Random Forest Params:", rf_grid.best_params_)
print("Random Forest Accuracy:", rf_accuracy)

Best Random Forest Params: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 200}
Random Forest Accuracy: 0.8586008085002982


In [9]:
# Gradient-Boosted Tree: search over the number of boosting stages, the
# learning rate and the tree depth with 5-fold cross-validation, then score
# the refitted best estimator on the test split.
# random_state is fixed (same seed as the train/test split) so the seeds
# handed to the individual trees are reproducible between runs.
gbt = GradientBoostingClassifier(random_state=0)
gbt_params = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]}
gbt_grid = GridSearchCV(gbt, gbt_params, cv=5, scoring='accuracy')
gbt_grid.fit(X_train, y_train)
gbt_accuracy = accuracy_score(y_test, gbt_grid.best_estimator_.predict(X_test))
print("Best GBT Params:", gbt_grid.best_params_)
print("GBT Accuracy:", gbt_accuracy)

Best GBT Params: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
GBT Accuracy: 0.8580264640261548


In [None]:
# Support Vector Machine
# svm = SVC()
# svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
# svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy')
# svm_grid.fit(X_train, y_train)
# svm_accuracy = accuracy_score(y_test, svm_grid.best_estimator_.predict(X_test))
# print("Best SVM Params:", svm_grid.best_params_)
# print("SVM Accuracy:", svm_accuracy)

In [6]:
# Naive Bayes: Gaussian NB is fitted with its default settings (no grid
# search) and scored directly on the test split.
nb = GaussianNB().fit(X_train, y_train)
nb_test_predictions = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_test_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.7640327818153704


In [7]:

# Neural Network (MLP): search over layer layout, activation and the L2
# penalty `alpha` with 5-fold cross-validation, then score the refitted best
# estimator on the test split.
# random_state is fixed (same seed as the train/test split) so weight
# initialisation and mini-batch shuffling are reproducible; without it the
# selected parameters and accuracy change from run to run.
mlp = MLPClassifier(max_iter=1000, random_state=0)
mlp_params = {'hidden_layer_sizes': [(50,), (100,), (100, 50)], 'activation': ['relu', 'tanh'], 'alpha': [0.0001, 0.001, 0.01]}
mlp_grid = GridSearchCV(mlp, mlp_params, cv=5, scoring='accuracy')
mlp_grid.fit(X_train, y_train)
mlp_accuracy = accuracy_score(y_test, mlp_grid.best_estimator_.predict(X_test))
print("Best MLP Params:", mlp_grid.best_params_)
print("MLP Accuracy:", mlp_accuracy)

Best MLP Params: {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (50,)}
MLP Accuracy: 0.858689169188628


In [9]:
# Selecting the best model: compare every model trained above by its accuracy
# on the held-out test split and report the highest scorer.
# All trained models take part in the comparison; leaving some out would make
# the reported "best" model only the best of a subset.
# NOTE(review): ranking on the test split makes the winning score slightly
# optimistic; consider ranking by each grid's cross-validated score instead.
accuracies = {
    'Logistic Regression': lr_accuracy,
    'Decision Tree': dt_accuracy,
    'Random Forest': rf_accuracy,
    'Gradient-Boosted Tree': gbt_accuracy,
    # SVM stays disabled: its training cell is commented out, so svm_accuracy
    # is never defined.
    # 'Support Vector Machine': svm_accuracy,
    'Naive Bayes': nb_accuracy,
    'Neural Network (MLP)': mlp_accuracy
}

# max() over the dict iterates its keys; accuracies.get ranks them by score.
best_model = max(accuracies, key=accuracies.get)
print("Best Model:", best_model, "with Accuracy:", accuracies[best_model])

Best Model: Neural Network (MLP) with Accuracy: 0.858689169188628
