In [1]:
# 📌 Step 1: Import Libraries

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Step 2: Load Dataset


# Load dataset
df = pd.read_csv(r"E:\MultipleDisease\kidney_disease.csv")  # Use raw string 'r' to handle Windows paths safely

# Clean column names: strip whitespace, lowercase, replace spaces with underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Display the first 5 rows
df.head()


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [11]:
# 📌 Step 3: Basic Data Cleaning

# Drop unnamed columns (e.g., 'Unnamed: 0') if present
df = df.loc[:, ~df.columns.str.contains('^unnamed', case=False)]

# Drop rows where 'classification' (target) is missing
df.dropna(subset=['classification'], inplace=True)

# Clean and encode the target variable
df['classification'] = (
    df['classification']
    .astype(str)                      # Ensure all are strings
    .str.strip()                      # Remove leading/trailing spaces or tabs
    .str.lower()                      # Convert to lowercase
    .replace({'ckd': 1, 'ckd\t': 1, 'notckd': 0, 'notckd\t': 0})
)

# Show class distribution
df['classification'].value_counts()


classification
1    250
0    150
Name: count, dtype: int64

In [12]:
# 📌 Step 4: Encode Categorical Variables
# Identify categorical columns
categorical_cols = df.select_dtypes(include='object').columns

# Initialize label encoder
le = LabelEncoder()

# Encode each categorical column
for col in categorical_cols:
    # Step 1: Clean and handle missing values
    df[col] = df[col].astype(str).str.strip().replace('?', pd.NA)

    # Step 2: Fill missing with mode (most frequent value)
    if df[col].isna().sum() > 0:
        df[col] = df[col].fillna(df[col].mode()[0])

    # Step 3: Encode with LabelEncoder
    df[col] = le.fit_transform(df[col])

# Display updated dataset
df.head()


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,-1.727726,-0.210031,0.254214,0.421486,0.076249,-0.380269,-0.560523,0.684468,-0.291825,-0.17932,...,0.189948,0.253875,-0.12562,1.298336,1.370741,-0.276686,-0.510723,-0.47269,-0.407926,1
1,-1.719066,-2.627234,-1.972476,0.421486,2.363728,-0.380269,-0.560523,0.684468,-0.291825,-0.17932,...,-0.390784,-0.325583,1.074572,-0.73827,-0.698302,-0.276686,-0.510723,-0.47269,-0.407926,1
2,-1.710406,0.615355,0.254214,-1.421074,0.838742,2.507853,0.895381,0.684468,-0.291825,-0.17932,...,-1.068304,0.181443,1.074572,-0.73827,1.370741,-0.276686,1.965511,-0.47269,2.357675,1
3,-1.701745,-0.210031,-0.488016,-2.342354,2.363728,-0.380269,0.895381,-1.838915,2.780013,-0.17932,...,-0.971515,-0.108286,-1.165786,1.298336,-0.698302,-0.276686,1.965511,2.048324,2.357675,1
4,-1.693085,-0.033163,0.254214,-1.421074,0.838742,-0.380269,0.895381,0.684468,-0.291825,-0.17932,...,-0.681149,0.10901,-0.605697,-0.73827,-0.698302,-0.276686,-0.510723,-0.47269,-0.407926,1


In [13]:
# 📌 Step 5: Handle Missing Numeric Values & Scale
from sklearn.preprocessing import StandardScaler

# Select only numeric columns, excluding the target
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('classification')

# Fill missing values with the median (more robust than mean)
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Standardize (z-score) the numerical features
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# View final preprocessed data
df.head()


Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,-1.727726,-0.210031,0.254214,0.421486,0.076249,-0.380269,-0.560523,0.684468,-0.291825,-0.17932,...,0.189948,0.253875,-0.12562,1.298336,1.370741,-0.276686,-0.510723,-0.47269,-0.407926,1
1,-1.719066,-2.627234,-1.972476,0.421486,2.363728,-0.380269,-0.560523,0.684468,-0.291825,-0.17932,...,-0.390784,-0.325583,1.074572,-0.73827,-0.698302,-0.276686,-0.510723,-0.47269,-0.407926,1
2,-1.710406,0.615355,0.254214,-1.421074,0.838742,2.507853,0.895381,0.684468,-0.291825,-0.17932,...,-1.068304,0.181443,1.074572,-0.73827,1.370741,-0.276686,1.965511,-0.47269,2.357675,1
3,-1.701745,-0.210031,-0.488016,-2.342354,2.363728,-0.380269,0.895381,-1.838915,2.780013,-0.17932,...,-0.971515,-0.108286,-1.165786,1.298336,-0.698302,-0.276686,1.965511,2.048324,2.357675,1
4,-1.693085,-0.033163,0.254214,-1.421074,0.838742,-0.380269,0.895381,0.684468,-0.291825,-0.17932,...,-0.681149,0.10901,-0.605697,-0.73827,-0.698302,-0.276686,-0.510723,-0.47269,-0.407926,1


In [9]:

df.to_csv("kidney_cleaned.csv", index=False)
print("✅ Cleaned dataset saved as 'kidney_cleaned.csv'")


✅ Cleaned dataset saved as 'kidney_cleaned.csv'


In [14]:

# 📊 Step 6: Univariate Analysis – Age Distribution
fig = px.histogram(
    df,
    x='age',
    nbins=30,
    title="Age Distribution of Patients",
    color_discrete_sequence=['indianred'],  # Optional styling
    marginal="box"  # Adds a boxplot on top
)
fig.update_layout(
    xaxis_title='Age',
    yaxis_title='Count',
    bargap=0.1
)
fig.show()


In [15]:

# 📊 Step 7: Univariate Analysis – CKD vs Not CKD

# Map numeric target to labels for better visualization
df['ckd_status'] = df['classification'].map({1: 'CKD', 0: 'Not CKD'})

fig = px.histogram(
    df,
    x='ckd_status',
    color='ckd_status',
    title="CKD vs Not CKD (Target Variable)",
    labels={'ckd_status': 'CKD Status'},
    color_discrete_sequence=['darkred', 'darkgreen']
)

fig.update_layout(
    xaxis_title='CKD Status',
    yaxis_title='Number of Patients',
    showlegend=False
)

fig.show()


In [None]:
# 📊 Step 8: Bivariate Analysis – Blood Urea vs CKD Status


# Ensure correct data type
df['bu'] = pd.to_numeric(df['bu'], errors='coerce')
df = df[df['bu'].notna()]  # Drop rows with NaN in blood urea

# Create box plot
fig = px.box(
    df,
    x='classification',
    y='bu',  # Corrected from 'blood_urea' to 'bu'
    points="all",
    title="Blood Urea by CKD Classification",
    color='classification',
    template="plotly_white"
)

# Update layout
fig.update_layout(
    xaxis_title="Classification (0 = Not CKD, 1 = CKD)",
    yaxis_title="Blood Urea (mg/dL)",
    showlegend=False
)

fig.show()


In [18]:
# 📊 Step 9: Bivariate – Correlation with Target (Bar Plot)

# Calculate correlation with target 'classification'
correlations = df.corr(numeric_only=True)['classification'].drop('classification').sort_values(ascending=False)

# Create bar plot using Plotly
fig = px.bar(
    x=correlations.index,
    y=correlations.values,
    title="Feature Correlation with Target (CKD)",
    labels={'x': 'Features', 'y': 'Correlation'},
    template="plotly_white"
)

# Update layout
fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Correlation Coefficient",
    xaxis_tickangle=-45
)

fig.show()


In [19]:
# 📊 Step 10: Multivariate Analysis – Correlation Heatmap


# Compute correlation matrix (numeric columns only)
corr = df.corr(numeric_only=True)

# Create heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=corr.values,
    x=corr.columns,
    y=corr.columns,
    colorscale='Viridis',
    colorbar=dict(title="Correlation"),
    zmin=-1, zmax=1  # Fix the range for better interpretation
))

# Update layout with title and formatting
fig.update_layout(
    title="Correlation Heatmap of Numerical Features",
    xaxis_nticks=len(corr.columns),
    yaxis_nticks=len(corr.columns),
    width=800,
    height=800
)

fig.show()


In [20]:
# 📊 Step 11: Multivariate – Pairplot Alternative (Scatter Matrix)

# Use correct column names from your dataset
subset_cols = ['age', 'bu', 'sc', 'classification']  # bu = blood urea, sc = serum creatinine

# Optional: Drop rows with NaN in selected columns
df_subset = df[subset_cols].dropna()

# Generate scatter matrix
fig = px.scatter_matrix(
    df_subset,
    dimensions=subset_cols[:-1],  # Only features, exclude 'classification' from axes
    color='classification',
    title="Scatter Matrix of Selected Features (Age, BU, SC)",
    template="plotly_white"
)

# Hide diagonal histograms for cleaner view
fig.update_traces(diagonal_visible=False)
fig.show()


In [21]:
# 📊 Step 12: Other – Hypertension Count

# Create histogram for hypertension
fig = px.histogram(
    df,
    x='htn',  # Correct column name for hypertension
    title="Hypertension Count (0 = No, 1 = Yes)",
    color='htn',  # Optional: add color distinction
    template="plotly_white"
)

# Update axis labels
fig.update_layout(
    xaxis_title="Hypertension (0 = No, 1 = Yes)",
    yaxis_title="Count",
    showlegend=False
)

fig.show()
