<a href="https://colab.research.google.com/github/NSJayaweera/NCD-Risk_Prediction/blob/Chronic_Kidney_Diseases/Final%20model%20(XGBoost).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab only: attach Google Drive so the dataset stored there can be read.
from google.colab import drive

drive.mount("/content/drive")

# Location of the kidney disease CSV inside the mounted Drive.
DATA_PATH = "/content/drive/MyDrive/DSGP Kidney disease/kidney_disease_dataset.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Experiment configuration shared by the cells below.

# Seed handed to the data splitters so results are reproducible.
RANDOM_STATE = 42

# Fraction of all rows held out as the test set.
TEST_SIZE = 0.3

# Fraction of the remaining training rows used for validation.
VAL_SIZE = 0.3

# Name of the label column.
TARGET_COL = "CKD_Status"

In [3]:
# Read the raw CSV; ckd_df is kept untouched as the original data.
ckd_df = pd.read_csv(DATA_PATH)
print(f"Shape of dataset: {ckd_df.shape}\n")

# Working frame without the leakage feature.
# DataFrame.drop returns a new frame, so ckd_df itself is not modified.
df = ckd_df.drop(columns=["Dialysis_Needed"])

Shape of dataset: (2304, 9)



In [4]:
# Report how many missing entries each column contains
print("\nMissing Values per Column:")
display(df.isna().sum())


Missing Values per Column:


Unnamed: 0,0
Age,0
Creatinine_Level,0
BUN,0
Diabetes,0
Hypertension,0
GFR,0
Urine_Output,0
CKD_Status,0


In [5]:
# Handle missing values:
#   1. drop rows whose target is missing,
#   2. fill continuous columns with the column median,
#   3. fill binary columns with the column mode.
# NOTE(review): medians/modes are computed on the full dataset, i.e. before
# the train/test split performed in a later cell.

# Target
TARGET_COL = "CKD_Status"

# Binary columns
binary_cols = [
    "Diabetes",
    "Hypertension"
]

# Continuous columns (exclude target & binary)
continuous_cols = [
    col for col in df.columns
    if col not in binary_cols + [TARGET_COL]
]


# Remove rows with missing target.
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning when rows were dropped.
before_rows = df.shape[0]
df = df.dropna(subset=[TARGET_COL]).copy()
after_rows = df.shape[0]

print(f"Rows removed due to missing target: {before_rows - after_rows}")

# Series.fillna returns a new Series; the result must be assigned back,
# otherwise the imputation is silently discarded.
for col in continuous_cols:
    median_value = df[col].median()
    df[col] = df[col].fillna(median_value)

for col in binary_cols:
    col_modes = df[col].mode()
    if col_modes.empty:
        # Column has no non-missing values, so there is no mode to fill with.
        continue
    mode_value = col_modes.iloc[0]
    df[col] = df[col].fillna(mode_value)

print("\nMissing values after handling:")
display(df.isnull().sum())


Rows removed due to missing target: 0

Missing values after handling:


Unnamed: 0,0
Age,0
Creatinine_Level,0
BUN,0
Diabetes,0
Hypertension,0
GFR,0
Urine_Output,0
CKD_Status,0


In [6]:
# Count fully duplicated rows (header and count printed on separate lines)
print("\nDuplicate Rows:", df.duplicated().sum(), sep="\n")


Duplicate Rows:
0


In [7]:
# Drop exact duplicate rows and report how many were removed

before_rows = len(df)
df = df.drop_duplicates()
after_rows = len(df)

print("\nDuplicate rows removed:", before_rows - after_rows)
print("Shape after removing duplicates:", df.shape)

# Sanity check: the count below should be zero after the drop
print("\nDuplicate rows after removal:")
print(df.duplicated().sum())


Duplicate rows removed: 0
Shape after removing duplicates: (2304, 8)

Duplicate rows after removal:
0


In [8]:
# Flag measurements that fall outside the accepted range for each column.
# Each mask is True where the value is NOT inside the inclusive [left, right] range.

invalid_age = ~df["Age"].between(left=0, right=110)
invalid_creat = ~df["Creatinine_Level"].between(left=0.1, right=20)
invalid_bun = ~df["BUN"].between(left=1, right=150)
invalid_gfr = ~df["GFR"].between(left=0, right=130)
invalid_urine = ~df["Urine_Output"].between(left=0, right=10000)

# Label -> mask, in the order the counts are reported
invalid_masks = {
    "Age": invalid_age,
    "Creatinine_Level": invalid_creat,
    "BUN": invalid_bun,
    "GFR": invalid_gfr,
    "Urine_Output": invalid_urine,
}

print("\nInvalid counts per rule:")
for rule_name, rule_mask in invalid_masks.items():
    print(f"{rule_name}:", rule_mask.sum())



Invalid counts per rule:
Age: 0
Creatinine_Level: 0
BUN: 0
GFR: 0
Urine_Output: 0


In [9]:
# Remove medically impossible values

# A row is kept only when every measurement lies inside its inclusive range.
# Series.between is False for missing values, so rows with a NaN in any of
# these columns are removed as well.
valid_filter = (
    (df["Age"].between(0, 110)) &
    (df["Creatinine_Level"].between(0.1, 20)) &
    (df["BUN"].between(1, 150)) &
    (df["GFR"].between(0, 130)) &
    (df["Urine_Output"].between(0, 10000))
)

before_shape = df.shape

# Apply filter. .copy() yields an independent frame so that adding columns
# afterwards does not raise SettingWithCopyWarning when rows were removed.
df = df.loc[valid_filter].copy()

after_shape = df.shape

print("Shape before medical anomaly removal:", before_shape)
print("Shape after medical anomaly removal: ", after_shape)
print("Rows removed:", before_shape[0] - after_shape[0])


Shape before medical anomaly removal: (2304, 8)
Shape after medical anomaly removal:  (2304, 8)
Rows removed: 0


In [10]:
# Feature engineering: BUN-to-creatinine ratio

# A zero creatinine value becomes NaN so the division yields NaN for that row
safe_creatinine = df["Creatinine_Level"].replace(0, np.nan)
bun_creat_ratio = df["BUN"] / safe_creatinine

# Rows left as NaN by the step above receive the median ratio instead
df["BUN_Creat_Ratio"] = bun_creat_ratio
df["BUN_Creat_Ratio"] = df["BUN_Creat_Ratio"].fillna(df["BUN_Creat_Ratio"].median())

print("New Feature Added: BUN_Creat_Ratio")
print("Dataset Shape After Feature Engineering:", df.shape)

# Show the inputs next to the derived ratio for a quick visual check
display(df[["BUN", "Creatinine_Level", "BUN_Creat_Ratio"]].head())

New Feature Added: BUN_Creat_Ratio
Dataset Shape After Feature Engineering: (2304, 9)


Unnamed: 0,BUN,Creatinine_Level,BUN_Creat_Ratio
0,40.9,0.3,136.333333
1,17.1,1.79,9.553073
2,15.0,2.67,5.617978
3,31.1,0.97,32.061856
4,22.8,2.05,11.121951


In [11]:
# Separate the label column (y) from the remaining feature columns (X)
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

print("Feature shape:", X.shape)
print("Target shape:", y.shape)

Feature shape: (2304, 8)
Target shape: (2304,)


In [12]:
# Data splitting and scaling
#   1. stratified hold-out of the test set,
#   2. stratified train/validation split of the remainder,
#   3. z-score scaling of the continuous features, fitted on the training split only.

from sklearn.model_selection import train_test_split

# --- 1. Test hold-out (TEST_SIZE of all rows) ---
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

print("Train_full shape:", X_train_full.shape)
print("Test shape:      ", X_test.shape)

# --- 2. Validation split (VAL_SIZE of the remaining training rows) ---
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    stratify=y_train_full,
    test_size=VAL_SIZE,
    random_state=RANDOM_STATE,
)

print("Train shape:     ", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:      ", X_test.shape)

# --- Column groups: binary flags are left unscaled ---
binary_cols = ["Diabetes", "Hypertension"]
continuous_cols = [name for name in X.columns if name not in binary_cols]

print("Binary columns:     ", binary_cols)
print("Continuous columns: ", continuous_cols)

# --- 3. Z-score scaling of the continuous columns ---
# The scaler learns its statistics from the training split only; the same
# fitted transform is then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_train[continuous_cols])

# Work on copies so the unscaled splits stay available
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()

X_train_scaled[continuous_cols] = scaler.transform(X_train[continuous_cols])
X_val_scaled[continuous_cols] = scaler.transform(X_val[continuous_cols])
X_test_scaled[continuous_cols] = scaler.transform(X_test[continuous_cols])

print("Scaling complete.")
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_val_scaled shape:  ", X_val_scaled.shape)
print("X_test_scaled shape: ", X_test_scaled.shape)


Train_full shape: (1612, 8)
Test shape:       (692, 8)
Train shape:      (1128, 8)
Validation shape: (484, 8)
Test shape:       (692, 8)
Binary columns:      ['Diabetes', 'Hypertension']
Continuous columns:  ['Age', 'Creatinine_Level', 'BUN', 'GFR', 'Urine_Output', 'BUN_Creat_Ratio']
Scaling complete.
X_train_scaled shape: (1128, 8)
X_val_scaled shape:   (484, 8)
X_test_scaled shape:  (692, 8)


In [13]:
from xgboost import XGBClassifier

# XGBoost classifier with fixed hyperparameters.
# The seed comes from the shared RANDOM_STATE constant so the model and the
# data splits are controlled by a single value.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=RANDOM_STATE,
    eval_metric="logloss"
)

# Fitted on the unscaled training split; the scaled copies and the
# validation split are not used in this cell.
xgb_model.fit(X_train, y_train)