In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the baseline from Notebook 01
df = pd.read_csv('kidney_disease_cleaned.csv')
print("Dataset loaded for Feature Engineering.")

Dataset loaded for Feature Engineering.


In [2]:
# Encoding Categorical Features
# Identifying columns with text values
le = LabelEncoder()
object_cols = df.select_dtypes(include=['object']).columns

for col in object_cols:
    # Fill NA temporarily to avoid errors during encoding, then put them back
    null_mask = df[col].isnull()
    df[col] = le.fit_transform(df[col].astype(str))
    df.loc[null_mask, col] = np.nan

print(f"Encoded columns: {list(object_cols)}")

Encoded columns: ['red_blood_cells', 'pus_cell', 'pus_cell_clumps', 'bacteria', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'pedal_edema', 'anemia']


In [3]:
# Advanced Imputation using k-NN
# We use KNNImputer to fill missing values based on similarities
imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print("Missing values after Imputation:", df_imputed.isnull().sum().sum())

Missing values after Imputation: 0


In [4]:
# Feature Scaling
scaler = MinMaxScaler()

# We scale all features except the target 'classification'
X = df_imputed.drop('classification', axis=1)
y = df_imputed['classification']

X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
df_final = pd.concat([X_scaled, y], axis=1)

print("Features scaled to [0, 1] range.")
df_final.head()

Features scaled to [0, 1] range.


Unnamed: 0,age,blood_pressure,specific_gravity,albumin,sugar,red_blood_cells,pus_cell,pus_cell_clumps,bacteria,blood_glucose_random,...,packed_cell_volume,white_blood_cell_count,red_blood_cell_count,hypertension,diabetes_mellitus,coronary_artery_disease,appetite,pedal_edema,anemia,classification
0,0.522727,0.230769,0.75,0.2,0.0,0.8,1.0,0.0,0.0,0.211538,...,0.777778,0.231405,0.525424,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.056818,0.0,0.75,0.8,0.0,0.6,1.0,0.0,0.0,0.194444,...,0.644444,0.157025,0.484746,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.681818,0.230769,0.25,0.4,0.6,1.0,1.0,0.0,0.0,0.856838,...,0.488889,0.219008,0.288136,0.0,1.0,0.0,1.0,0.0,1.0,1.0
3,0.522727,0.153846,0.0,0.8,0.0,1.0,0.0,1.0,0.0,0.202991,...,0.511111,0.18595,0.305085,1.0,0.0,0.0,1.0,1.0,1.0,1.0
4,0.556818,0.230769,0.25,0.4,0.0,1.0,1.0,0.0,0.0,0.179487,...,0.577778,0.210744,0.423729,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# Save Final Preprocessed Data
df_final.to_csv('kidney_disease_final.csv', index=False)
print("Final preprocessed dataset saved as 'kidney_disease_final.csv'")

Final preprocessed dataset saved as 'kidney_disease_final.csv'
