<a href="https://colab.research.google.com/github/NSJayaweera/NCD-Risk_Prediction/blob/Chronic_Kidney_Diseases/Final%20model%20(XGBoost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive (Colab) so the dataset stored there can be read

from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

# Location of the kidney disease CSV inside the mounted Drive
DATA_PATH = "/content/drive/MyDrive/DSGP Kidney disease/kidney_disease_dataset.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# Seed intended for any stochastic step; kept as a named constant for reproducibility
RANDOM_STATE = 42

In [3]:
# Read the raw CSV; ckd_df is kept untouched as the reference copy
ckd_df = pd.read_csv(DATA_PATH)

# All cleaning below happens on a separate working frame
df = ckd_df.copy()
print(f"Shape of dataset: {df.shape}")

# "Dialysis_Needed" is treated as a leakage feature, so it is excluded
# from the working frame before any modelling
leakage_cols = ["Dialysis_Needed"]
df = df.drop(leakage_cols, axis="columns")

Shape of dataset: (2304, 9)


In [4]:
# Name of the label column
TARGET_COL = "CKD_Status"

# Columns handled as binary indicators
binary_cols = ["Diabetes", "Hypertension"]

# Every other column (neither a binary indicator nor the target) is
# handled as a continuous feature; original column order is preserved
non_continuous = set(binary_cols) | {TARGET_COL}
continuous_cols = [name for name in df.columns if name not in non_continuous]


In [5]:
# Handle missing values

# Rows without a label cannot be used for training, so drop them first
df = df.dropna(subset=[TARGET_COL])

# Impute the remaining gaps column by column:
#   continuous features -> that column's median
#   binary features     -> that column's most frequent value
imputed = {name: df[name].fillna(df[name].median()) for name in continuous_cols}
imputed.update(
    {name: df[name].fillna(df[name].mode()[0]) for name in binary_cols}
)
df = df.assign(**imputed)


In [6]:
# Remove duplicates

# With its default arguments drop_duplicates() compares all columns and keeps
# the first occurrence of each repeated row. The index is not reset here.
df = df.drop_duplicates()

In [7]:
# Remove Medically Impossible Values

# Inclusive (low, high) bounds per column; a row is kept only if every
# listed column falls inside its bounds. Series.between() is False for NaN,
# so rows with a missing value in any of these columns are dropped as well.
VALID_RANGES = {
    "Age": (0, 110),
    "Creatinine_Level": (0.1, 20),
    "BUN": (1, 150),
    "GFR": (0, 130),
    "Urine_Output": (0, 10000),
}

# Start from an all-True mask and AND in each column's range check
valid_filter = pd.Series(True, index=df.index)
for column, (low, high) in VALID_RANGES.items():
    valid_filter &= df[column].between(low, high)

# Apply filter. The explicit .copy() makes the result an independent frame,
# so assigning new columns to it later (the engineered ratio) does not
# trigger pandas' SettingWithCopyWarning when rows were actually removed.
df = df.loc[valid_filter].copy()

In [8]:
# Feature Engineering: BUN / Creatinine Ratio

# Treat a creatinine of exactly 0 as missing so the division yields NaN
# instead of an infinite ratio
creatinine_nonzero = df["Creatinine_Level"].replace(0, np.nan)
bun_creat_ratio = df["BUN"] / creatinine_nonzero

# Any row left without a ratio receives the median of the computed ratios
df["BUN_Creat_Ratio"] = bun_creat_ratio.fillna(bun_creat_ratio.median())

In [9]:
# Split the cleaned frame into the label vector (y) and the feature matrix (X)
y = df[TARGET_COL]
X = df.drop(columns=TARGET_COL)

# Rebuild the continuous-column list from X so that it also contains the
# engineered BUN_Creat_Ratio column
binary_set = set(binary_cols)
continuous_cols = [name for name in X.columns if name not in binary_set]

print("Feature shape:", X.shape)
print("Target shape:", y.shape)

Feature shape: (2304, 8)
Target shape: (2304,)


In [10]:
from xgboost import XGBClassifier

# Final classifier. The seed comes from the notebook-level RANDOM_STATE
# constant (instead of a repeated literal) so there is a single place to
# change it; subsample/colsample_bytree make training stochastic, which is
# why the seed matters for reproducibility.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    eval_metric="logloss",
)

# Fit on the entire cleaned dataset (this notebook keeps no hold-out split)
xgb_model.fit(X, y)

In [11]:
# Save the trained model
# Persisted with joblib under a path relative to the current working
# directory; predict_new_patients() loads it back from this same file name.
joblib.dump(xgb_model, "xgb_ckd_model.pkl")

['xgb_ckd_model.pkl']

In [12]:
# Persist everything the prediction helper needs besides the model itself

# Imputation values taken from the training features
train_medians = X[continuous_cols].median()
train_modes = X[binary_cols].mode().iloc[0]

# Column lists and the continuous-feature medians
for artifact, filename in (
    (continuous_cols, "continuous_cols.pkl"),
    (binary_cols, "binary_cols.pkl"),
    (train_medians, "train_medians.pkl"),
):
    joblib.dump(artifact, filename)

# Kept as the final expression so the cell still displays the written path
joblib.dump(train_modes, "train_modes.pkl")


['train_modes.pkl']

In [13]:
def predict_new_patients(new_df, artifacts=None):
    """Predict CKD status for new patient records with the saved model.

    The same preparation as in training is applied: the BUN/creatinine ratio
    is engineered, missing values are imputed with the training medians
    (continuous columns) and training modes (binary columns), and the columns
    are put in the order the model was fitted with.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Patient records, one row per patient. It is NOT modified; all work
        is done on a copy.
    artifacts : dict, optional
        Already-loaded objects under the keys ``"model"``,
        ``"continuous_cols"``, ``"binary_cols"``, ``"train_medians"`` and
        ``"train_modes"``. When omitted (the default) they are loaded from
        the ``.pkl`` files written earlier in this notebook, relative to the
        current working directory.

    Returns
    -------
    The value returned by ``model.predict`` for the prepared rows.

    Raises
    ------
    KeyError
        If a feature column required by the model is absent from ``new_df``
        and cannot be derived from it.
    ValueError
        If the model does not carry the feature names it was trained with.
    """
    if artifacts is None:
        # NOTE: joblib.load unpickles arbitrary objects; only load files
        # produced by this notebook or another trusted source.
        artifacts = {
            "model": joblib.load("xgb_ckd_model.pkl"),
            "continuous_cols": joblib.load("continuous_cols.pkl"),
            "binary_cols": joblib.load("binary_cols.pkl"),
            "train_medians": joblib.load("train_medians.pkl"),
            "train_modes": joblib.load("train_modes.pkl"),
        }

    model = artifacts["model"]
    continuous_cols = list(artifacts["continuous_cols"])
    binary_cols = list(artifacts["binary_cols"])
    train_medians = artifacts["train_medians"]
    train_modes = artifacts["train_modes"]

    # Work on a copy so the caller's DataFrame is left exactly as passed in
    features = new_df.copy()

    # Feature engineering for new data: a creatinine of 0 is treated as
    # missing, and any undefined ratio falls back to the training median
    if "BUN" in features.columns and "Creatinine_Level" in features.columns:
        ratio = features["BUN"] / features["Creatinine_Level"].replace(0, np.nan)
        features["BUN_Creat_Ratio"] = ratio.fillna(train_medians["BUN_Creat_Ratio"])

    # Fail early with a readable message instead of an opaque indexing error
    required = continuous_cols + binary_cols
    missing = [name for name in required if name not in features.columns]
    if missing:
        raise KeyError(f"new_df is missing required feature columns: {missing}")

    # Fill missing values with the statistics saved at training time
    features[continuous_cols] = features[continuous_cols].fillna(train_medians)
    features[binary_cols] = features[binary_cols].fillna(train_modes)

    # Column order comes from the fitted model itself rather than from a
    # notebook-global training frame, so this also works in a fresh kernel
    feature_order = model.get_booster().feature_names
    if feature_order is None:
        raise ValueError(
            "The loaded model has no stored feature names; "
            "cannot determine the training column order."
        )

    # Predict
    return model.predict(features[list(feature_order)])


In [16]:
# Example: score a single new patient given as one record of raw measurements
patient_record = {
    "Age": 55,
    "BUN": 20,
    "Creatinine_Level": 1.2,
    "GFR": 90,
    "Diabetes": 1,
    "Hypertension": 0,
    "Urine_Output": 1500,
}
new_patient = pd.DataFrame([patient_record])

pred = predict_new_patients(new_patient)
print("CKD Prediction:", pred[0])

CKD Prediction: 0
