In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import copy
import json
import seaborn as sns
from sklearn.linear_model import LinearRegression
from joblib import dump
from sklearn.metrics import classification_report, mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df = pd.read_csv("../data/diabetes.csv")

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Splitting Dataset

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
for col in cols:
    df[col] = df[col].replace(0, df[col].mean())

In [6]:
X = df.drop(columns="Outcome").values
y = df["Outcome"].values

X_train, temp_X, y_train, temp_y = train_test_split(
    X, y,
    test_size=0.4,
    stratify=y,
    random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    temp_X, temp_y,
    test_size=0.5,
    stratify=temp_y,
    random_state=42
)

# Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
log_reg = LogisticRegression(
    class_weight="balanced",
    solver="saga",
    max_iter=5000,
    random_state=42
)

In [9]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

pipeline_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(
        class_weight="balanced",
        solver="saga",
        max_iter=5000,
        random_state=42
    ))
])

param_grid_lr = {
    "model__C": [0.01, 0.1, 1, 10, 100]
}

search_lr = RandomizedSearchCV(
    pipeline_lr,
    param_grid_lr,
    n_iter=5,
    scoring="roc_auc",
    cv=5,
    random_state=42,
    n_jobs=-1
)

search_lr.fit(X_train, y_train)
best_lr_model = search_lr.best_estimator_

In [10]:
y_prob_lr = best_lr_model.predict_proba(X_val)[:, 1]
y_pred_lr = (y_prob_lr >= 0.5).astype(int)

In [11]:
print(classification_report(y_test, y_pred_lr))
print("F1:", f1_score(y_test, y_pred_lr))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_lr))


              precision    recall  f1-score   support

           0       0.66      0.58      0.62       101
           1       0.34      0.42      0.38        53

    accuracy                           0.53       154
   macro avg       0.50      0.50      0.50       154
weighted avg       0.55      0.53      0.53       154

F1: 0.37606837606837606
ROC-AUC: 0.5142910517466841


# Random Forest Regression

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

In [14]:
param_grid_rf = {
    "n_estimators": [200, 400, 600],
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"]
}

search_rf = RandomizedSearchCV(
    rf,
    param_grid_rf,
    n_iter=20,
    scoring="roc_auc",
    cv=5,
    random_state=42,
    n_jobs=-1
)

search_rf.fit(X_train, y_train)
best_rf_model = search_rf.best_estimator_

In [15]:
y_prob_rf = best_rf_model.predict_proba(X_test)[:, 1]
y_pred_rf = (y_prob_rf >= 0.5).astype(int)

In [16]:
print(classification_report(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))

              precision    recall  f1-score   support

           0       0.80      0.77      0.79       101
           1       0.60      0.64      0.62        53

    accuracy                           0.73       154
   macro avg       0.70      0.71      0.70       154
weighted avg       0.73      0.73      0.73       154

F1: 0.6181818181818182
ROC-AUC: 0.7978703530730431


# XGBoost

In [17]:
from xgboost import XGBClassifier

In [18]:
# Compute class ratio
pos = sum(y_train == 1)
neg = sum(y_train == 0)
scale_pos_weight = neg / pos

xgboost = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1
)

In [19]:
# Random search hyperparameter tuning
param_grid_xgb = {
    "n_estimators": [100, 300, 500],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0]
}

search_xgb = RandomizedSearchCV(
    xgboost,
    param_grid_xgb,
    n_iter=30,
    scoring="roc_auc",
    cv=5,
    random_state=42,
    n_jobs=-1
)

search_xgb.fit(X_train, y_train)
best_xgb_model = search_xgb.best_estimator_

In [20]:
y_val_prob_xgb = best_xgb_model.predict_proba(X_val)[:, 1]
y_pred_xgb = (y_val_prob_xgb >= 0.5).astype(int)

In [21]:
from sklearn.calibration import CalibratedClassifierCV

cal_xgb = CalibratedClassifierCV(
    best_xgb_model,
    method="isotonic",
    cv=5
)

cal_xgb.fit(X_train, y_train)

y_test_prob = cal_xgb.predict_proba(X_test)[:, 1]

In [22]:
thresholds = np.linspace(0.3, 0.7, 50)
best_f1 = 0
best_thresh = 0.5

for t in thresholds:
    preds = (y_val_prob_xgb >= t).astype(int)
    f1 = f1_score(y_val, preds)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

In [23]:
y_test_prob = cal_xgb.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_prob >= best_thresh).astype(int)

print(classification_report(y_test, y_test_pred))
print("F1:", f1_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_prob))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       101
           1       0.62      0.68      0.65        53

    accuracy                           0.75       154
   macro avg       0.72      0.73      0.73       154
weighted avg       0.75      0.75      0.75       154

F1: 0.6486486486486487
ROC-AUC: 0.7950681860638893


In [24]:
# Logistic Regression outperformed the other models in this case, this indicates that the underlying signal here is largely linear.

# Export Models

In [25]:
dump(best_rf_model, "../models/random_forest.joblib")
dump(best_xgb_model, "../models/xgboost.joblib")

['../models/xgboost.joblib']

## Export threshold

In [26]:
thresholds = {
    "xgboost": float(best_thresh)
}

with open("../models/thresholds.json", "w") as f:
    json.dump(thresholds, f)