In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
import lightgbm as lgb
import joblib
import numpy as np

# Load dataset
data = pd.read_csv('./archive/Diabetes Classification.csv')

# Define features and target
X = data.drop(columns=['Diagnosis'])  # Features
y = data['Diagnosis']  # Target (Diagnosis)

# Identify categorical and numerical columns
categorical_columns = ['Gender']
numerical_columns = ['Age', 'BMI', 'Chol', 'TG', 'HDL', 'LDL', 'Cr', 'BUN']

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())]), numerical_columns),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_columns)
    ])

# Define base learners for stacking
xgb_model = xgb.XGBClassifier(tree_method='auto', random_state=42, scale_pos_weight=np.sum(y == 0) / np.sum(y == 1))  # Adjust class weight
lgb_model = lgb.LGBMClassifier(random_state=42, class_weight='balanced')  # Class weight balanced for LightGBM
mlp_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, random_state=42)

# Stacking ensemble
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('mlp', mlp_model)
    ],
    final_estimator=MLPClassifier(hidden_layer_sizes=(64,), max_iter=1000, random_state=42),
    cv=5
)

# Combine preprocessing and stacking model into a single pipeline
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                                 ('classifier', stacked_model)])

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the final model on the entire training data
model_pipeline.fit(X_train, y_train)

# Predict probabilities to adjust the threshold
y_proba = model_pipeline.predict_proba(X_test)[:, 1]

# Custom threshold (lowering from 0.5 to 0.3 to favor positive predictions)
threshold = 0.3
y_pred = (y_proba >= threshold).astype(int)

# Evaluate the performance
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Recall (Positive Class):", recall_score(y_test, y_pred, pos_label=1))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the best model
joblib.dump(model_pipeline, 'best_diabetes_stacked_model_with_tuned_threshold.pkl')


[LightGBM] [Info] Number of positive: 1570, number of negative: 2535
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1535
[LightGBM] [Info] Number of data points in the train set: 4105, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 1256, number of negative: 2028
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000625 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1524
[LightGBM] [Info] Number of data points in the train set: 3284, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Numb

['best_diabetes_stacked_model_with_tuned_threshold.pkl']

In [22]:
import joblib
import numpy as np
import pandas as pd

# Load the saved model
model_pipeline = joblib.load('best_diabetes_stacked_model_with_tuned_threshold.pkl')

# Features in the dataset:
# 1. Gender (Categorical) - Male, Female
# 2. Age (Numerical) - Example: 45 (years)
# 3. BMI (Numerical) - Body Mass Index, Example: 24.5
# 4. Chol (Numerical) - Cholesterol level, Example: 4.2
# 5. TG (Numerical) - Triglycerides level, Example: 0.9
# 6. HDL (Numerical) - High-Density Lipoprotein level, Example: 2.4
# 7. LDL (Numerical) - Low-Density Lipoprotein level, Example: 1.4
# 8. Cr (Numerical) - Creatinine level, Example: 46 (mg/dL)
# 9. BUN (Numerical) - Blood Urea Nitrogen level, Example: 4.7 (mg/dL)
# 10. Diagnosis (Target) - 0: No Diabetes, 1: Diabetes

new_data = pd.DataFrame({
    'Gender': ['Male'],        # Categorical
    'Age': [45],               # Numerical
    'BMI': [28.7],             # Numerical
    'Chol': [220],             # Numerical
    'TG': [150],               # Numerical
    'HDL': [45],               # Numerical
    'LDL': [130],              # Numerical
    'Cr': [1.1],               # Numerical
    'BUN': [14.5]              # Numerical
})

# Predict the diagnosis for the new data sample
# Use the same model pipeline to ensure preprocessing is applied correctly
prediction = model_pipeline.predict(new_data)
predicted_proba = model_pipeline.predict_proba(new_data)

# Set thereshold for reduce false negative 
# threshold = 0.3
# y_pred_threshold = np.where(y_pred_proba >= threshold, 1, 0)

# Print the predicted class (0 or 1)
print("Predicted Diagnosis (0 = No Diabetes, 1 = Diabetes):", prediction[0])

# Print the predicted probability for each class
print("Predicted Probability (0 = No Diabetes, 1 = Diabetes):", predicted_proba)

Predicted Diagnosis (0 = No Diabetes, 1 = Diabetes): 1
Predicted Probability (0 = No Diabetes, 1 = Diabetes): [[0.08017943 0.91982057]]
