In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

Load Data

In [2]:
file_path = "../data/nephro_ai_REAL_patients_risk_classified.csv"
df = pd.read_csv(file_path)

print("Dataset Loaded:")
print(df.head())

Dataset Loaded:
   patient_id   age   spo2  bp_systolic  heart_rate  hypertension  diabetes  \
0  10000032.0  52.0  96.30         89.0        96.0           0.0       0.0   
1  10000690.0  86.0  95.70        123.0        84.0           1.0       0.0   
2  10000980.0  73.0  98.91        142.0        74.0           1.0       1.0   
3  10001217.0  55.0  95.02        127.0        87.0           1.0       0.0   
4  10001725.0  46.0  98.23        100.0        79.0           0.0       0.0   

  risk_category  
0      Low Risk  
1      Low Risk  
2     High Risk  
3      Low Risk  
4      Low Risk  


HANDLE MISSING VALUE

In [3]:
df = df.dropna()

3. ENCODE CATEGORICAL COLUMNS

In [4]:
label_enc = LabelEncoder()

# Example columns (adjust based on your dataset)
# Diabetes, Hypertension might be Yes/No
categorical_cols = ["diabetes", "hypertension", "risk_category"]

for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])

SELECT FEATURES + TARGET

In [5]:
# Advanced Feature Engineering
df['age_bp'] = df['age'] * df['bp_systolic']
df['diab_hyper'] = df['diabetes'] * df['hypertension']

# Binning (Categorizing continuous variables)
# BP Stages: Normal (<120), Elevated (120-129), High Stage 1 (130-139), High Stage 2 (>140)
df['bp_category'] = pd.cut(df['bp_systolic'], bins=[0, 120, 130, 140, 300], labels=[0, 1, 2, 3])
# Age Groups: Young (<30), Middle (30-60), Senior (>60)
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 60, 120], labels=[0, 1, 2])

# Convert bins to codes
df['bp_category'] = df['bp_category'].cat.codes
df['age_group'] = df['age_group'].cat.codes

# Select all features
X = df[["age", "bp_systolic", "diabetes", "hypertension", "age_bp", "diab_hyper", "bp_category", "age_group"]]
y = df["risk_category"]

 SPLIT TRAIN / TEST

In [6]:
# Reverting SMOTE as it decreased overall accuracy in favor of recall
# We will stick to the original distribution but use stratify to ensure train/test have same proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training data shape:", X_train.shape)

Training data shape: (2891, 8)


TRAIN RANDOM FOREST MODEL

In [7]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# --- 1. Normal (Baseline) Models ---
print("--- 1. Normal (Baseline) Models ---")

# Baseline Random Forest
rf_baseline = RandomForestClassifier(n_estimators=100, random_state=42)
rf_baseline.fit(X_train, y_train)
rf_pred = rf_baseline.predict(X_test)
print(f"Baseline Random Forest Accuracy: {accuracy_score(y_test, rf_pred):.4f}")

# Baseline XGBoost
# Set n_jobs=1 to avoid conflict with RandomizedSearchCV's n_jobs=-1
xgb_baseline = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', random_state=42, n_jobs=1)
xgb_baseline.fit(X_train, y_train)
xgb_pred = xgb_baseline.predict(X_test)
print(f"Baseline XGBoost Accuracy: {accuracy_score(y_test, xgb_pred):.4f}")

# --- 2. Fine-Tuning XGBoost ---
print("\n--- 2. Fine-Tuning XGBoost ---")
xgb_param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5)
}

xgb_search = RandomizedSearchCV(
    xgb_baseline, param_distributions=xgb_param_dist, 
    n_iter=20, cv=3, scoring='accuracy', random_state=42, n_jobs=-1, verbose=1
)

xgb_search.fit(X_train, y_train)
best_xgb = xgb_search.best_estimator_
print(f"Best XGB Params: {xgb_search.best_params_}")
best_xgb_pred = best_xgb.predict(X_test)
print(f"Fine-Tuned XGBoost Accuracy: {accuracy_score(y_test, best_xgb_pred):.4f}")

# --- 3. Stacking Classifier ---
print("\n--- 3. Stacking Classifier (Final Model) ---")
estimators = [
    ('xgb', best_xgb),
    ('rf', rf_baseline)
]

model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

model.fit(X_train, y_train)
print("Stacking Classifier Trained.")

--- 1. Normal (Baseline) Models ---
Baseline Random Forest Accuracy: 0.7469
Baseline XGBoost Accuracy: 0.7580

--- 2. Fine-Tuning XGBoost ---
Fitting 3 folds for each of 20 candidates, totalling 60 fits


3 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\U S E R\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\U S E R\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\U S E R\AppData\Local\Packages\PythonSoftwareFoundation.Python.

Best XGB Params: {'colsample_bytree': 0.718509402281633, 'gamma': 0.08263346953150125, 'learning_rate': 0.01469092202235818, 'max_depth': 3, 'n_estimators': 443, 'subsample': 0.7579526072702278}
Fine-Tuned XGBoost Accuracy: 0.7939

--- 3. Stacking Classifier (Final Model) ---
Stacking Classifier Trained.


 7. EVALUATE MODEL

In [8]:
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

Accuracy: 0.7828492392807745

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.66      0.68        79
           1       0.81      0.98      0.89       431
           2       0.72      0.44      0.54       213

    accuracy                           0.78       723
   macro avg       0.74      0.69      0.70       723
weighted avg       0.77      0.78      0.76       723



FEATURE IMPORTANCE

In [9]:
# Feature Importance (Using the Fine-Tuned XGBoost model)
# StackingClassifier doesn't have feature_importances_, so we use the best base model
importances = best_xgb.feature_importances_
feature_names = X.columns

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

print("Feature Ranking:")
for i in range(len(feature_names)):
    print(f"{feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

Feature Ranking:
diab_hyper: 0.3352
hypertension: 0.2832
diabetes: 0.1929
bp_category: 0.0787
bp_systolic: 0.0589
age_bp: 0.0198
age: 0.0173
age_group: 0.0140


In [10]:
import os
os.makedirs("../models", exist_ok=True)

joblib.dump(model, "../models/ckd_model.pkl")
joblib.dump(label_enc, "../models/label_encoder.pkl")
print("Model saved to ../models/ckd_model.pkl")
print("Label Encoder saved to ../models/label_encoder.pkl")

Model saved to ../models/ckd_model.pkl
Label Encoder saved to ../models/label_encoder.pkl
