In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import joblib

Load Data

In [27]:
file_path = "../data/nephro_ai_REAL_patients_risk_classified.csv"
df = pd.read_csv(file_path)

print("Dataset Loaded:")
print(df.head())

Dataset Loaded:
   patient_id   age   spo2  bp_systolic  heart_rate  hypertension  diabetes  \
0  10000032.0  52.0  96.30         89.0        96.0           0.0       0.0   
1  10000690.0  86.0  95.70        123.0        84.0           1.0       0.0   
2  10000980.0  73.0  98.91        142.0        74.0           1.0       1.0   
3  10001217.0  55.0  95.02        127.0        87.0           1.0       0.0   
4  10001725.0  46.0  98.23        100.0        79.0           0.0       0.0   

  risk_category  
0      Low Risk  
1      Low Risk  
2     High Risk  
3      Low Risk  
4      Low Risk  


HANDLE MISSING VALUE

In [28]:
df = df.dropna()

3. ENCODE CATEGORICAL COLUMNS

In [29]:
label_enc = LabelEncoder()

# Example columns (adjust based on your dataset)
# Diabetes, Hypertension might be Yes/No
categorical_cols = ["diabetes", "hypertension", "risk_category"]

for col in categorical_cols:
    df[col] = label_enc.fit_transform(df[col])

SELECT FEATURES + TARGET

In [44]:
# Advanced Feature Engineering
df['age_bp'] = df['age'] * df['bp_systolic']
df['diab_hyper'] = df['diabetes'] * df['hypertension']

# Binning (Categorizing continuous variables)
# BP Stages: Normal (<120), Elevated (120-129), High Stage 1 (130-139), High Stage 2 (>140)
df['bp_category'] = pd.cut(df['bp_systolic'], bins=[0, 120, 130, 140, 300], labels=[0, 1, 2, 3])
# Age Groups: Young (<30), Middle (30-60), Senior (>60)
df['age_group'] = pd.cut(df['age'], bins=[0, 30, 60, 120], labels=[0, 1, 2])

# Convert bins to codes
df['bp_category'] = df['bp_category'].cat.codes
df['age_group'] = df['age_group'].cat.codes

# Select all features
X = df[["age", "bp_systolic", "diabetes", "hypertension", "age_bp", "diab_hyper", "bp_category", "age_group"]]
y = df["risk_category"]

 SPLIT TRAIN / TEST

In [45]:
# Reverting SMOTE as it decreased overall accuracy in favor of recall
# We will stick to the original distribution but use stratify to ensure train/test have same proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training data shape:", X_train.shape)

Training data shape: (2891, 8)


TRAIN RANDOM FOREST MODEL

In [49]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stacking Classifier
# Level 0: XGBoost, Random Forest
# Level 1: Logistic Regression (Meta-learner)

estimators = [
    ('xgb', XGBClassifier(
        objective='multi:softmax', num_class=3, eval_metric='mlogloss',
        n_estimators=300, max_depth=5, learning_rate=0.05, random_state=42
    )),
    ('rf', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=42))
]

model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

model.fit(X_train, y_train)

 7. EVALUATE MODEL

In [47]:
pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred))

Accuracy: 0.7773167358229599

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.70      0.71        79
           1       0.81      0.95      0.88       431
           2       0.68      0.46      0.55       213

    accuracy                           0.78       723
   macro avg       0.74      0.70      0.71       723
weighted avg       0.76      0.78      0.76       723



FEATURE IMPORTANCE

In [34]:
importances = model.feature_importances_
feature_names = X.columns

for name, imp in zip(feature_names, importances):
    print(f"{name}: {imp:.4f}")


age: 0.0115
bp_systolic: 0.0896
diabetes: 0.4164
hypertension: 0.4824


In [50]:
import os
os.makedirs("../models", exist_ok=True)

joblib.dump(model, "../models/ckd_model.pkl")
joblib.dump(label_enc, "../models/label_encoder.pkl")
print("Model saved to ../models/ckd_model.pkl")
print("Label Encoder saved to ../models/label_encoder.pkl")

Model saved to ../models/ckd_model.pkl
Label Encoder saved to ../models/label_encoder.pkl
