# Kidney Disease Analysis

This notebook analyzes the Chronic Kidney Disease (CKD) dataset to understand factors associated with kidney disease and develop predictive models.

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later removed the bare 'seaborn' alias, so requesting the old name
# raises OSError on current releases. Try the new name first and fall back
# to the legacy one so the notebook runs on both old and new matplotlib.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette('husl')

In [None]:
# Read the kidney-disease records from the CSV (path is relative to this
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/Kidney/kidney_disease.csv')

# Report the (rows, columns) shape of the loaded frame.
print(f"Dataset shape: {df.shape}")

# Preview the first rows as the cell's rich output.
df.head()

## 2. Data Overview

In [None]:
# Structural summary: column names, dtypes and non-null counts.
print("\nDataset Info:")
df.info()

# Per-column count of missing entries.
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Descriptive statistics; left as the final expression so the notebook
# renders the resulting table.
print("\nBasic Statistics:")
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Count of records per value of the target column 'classification'.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='classification', ax=ax)
ax.set_title('Distribution of Kidney Disease Cases')
ax.set_xlabel('Disease Status')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Box plot of 'age' grouped by the target column 'classification'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='classification', y='age', ax=ax)
ax.set_title('Age Distribution by Disease Status')
ax.set_xlabel('Disease Status')
ax.set_ylabel('Age')
plt.show()

In [None]:
# One count plot per categorical column, with bars split by 'classification'.
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']

for column_name in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=df, x=column_name, hue='classification', ax=ax)
    ax.set_title(f'{column_name} Distribution by Disease Status')
    # Tilt the category labels by 45 degrees.
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()

In [None]:
# Correlation matrix of numerical features.
# 'id' is excluded: the modelling cell drops it as a non-feature, so its
# correlations would only add a meaningless row/column to the heatmap.
# errors='ignore' keeps the cell working if 'id' is absent or non-numeric.
numerical_cols = (
    df.select_dtypes(include=[np.number])
    .columns
    .drop('id', errors='ignore')
)
plt.figure(figsize=(15, 10))
sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix of Numerical Features')
plt.show()

## 4. Feature Engineering

In [None]:
# Feature engineering: clean raw text, impute, encode, split and scale.
#
# NOTE(review): the cleaning steps below assume the raw CSV may contain stray
# whitespace/tab characters in text cells and numeric measurements stored as
# text - confirm against the data file. They are no-ops on clean data.
categorical_cols = ['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane']


def _strip_text(value):
    """Trim surrounding whitespace from strings; pass other values (e.g. NaN) through."""
    return value.strip() if isinstance(value, str) else value


# Work on a copy so the column assignments below never write into the frame
# object that earlier cells displayed.
df = df.copy()

# Normalise the target labels so whitespace variants of the same label do not
# become separate classes.
df['classification'] = df['classification'].map(_strip_text)

# Any remaining non-numeric column that is neither a declared categorical nor
# the id/target is treated as a numeric measurement stored as text. Coerce it
# to numbers (unparseable entries become NaN) so StandardScaler does not
# receive strings.
text_feature_cols = (
    df.select_dtypes(exclude=[np.number])
    .columns
    .difference(categorical_cols + ['id', 'classification'])
)
for col in text_feature_cols:
    df[col] = pd.to_numeric(df[col].map(_strip_text), errors='coerce')

# Handle missing values: mean-impute numeric columns only.
# numeric_only=True is required on pandas >= 2.0, where DataFrame.mean()
# raises TypeError if any object-dtype column is present.
# NOTE(review): the means are computed on the full data set before the
# train/test split, so test rows influence the imputed values.
df = df.fillna(df.mean(numeric_only=True))

# Encode categorical variables. Values are stripped first so whitespace
# variants collapse into one class; missing entries are stringified to 'nan'
# and therefore encoded as their own category.
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col].astype(str).str.strip())

# Prepare features and target
X = df.drop(['id', 'classification'], axis=1)
y = df['classification']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features: fit on the training split only, then apply the same
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Model Development

In [None]:
# Train Random Forest model (fixed seed for reproducible results)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the held-out split
y_pred = rf_model.predict(X_test_scaled)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Plot confusion matrix.
# Pin the row/column order to the classes the model was trained on and show
# them as tick labels; otherwise the heatmap only displays positional indices
# and the reader cannot tell which row/column is which class.
class_labels = rf_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Pair each feature column with the fitted forest's importance score and
# order the table from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

## 7. Risk Factor Analysis

In [None]:
# Box plot of each listed measurement, grouped by 'classification'.
risk_factors = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo']

for measurement in risk_factors:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=df, x='classification', y=measurement, ax=ax)
    ax.set_title(f'{measurement} Distribution by Disease Status')
    ax.set_xlabel('Disease Status')
    ax.set_ylabel(measurement)
    plt.show()

## 8. Conclusions and Recommendations

Based on the analysis, we can draw the following conclusions:

1. Key factors associated with kidney disease in this dataset (see the feature importances in Section 6 and the box plots in Section 7) include:
   - Age
   - Blood Pressure
   - Specific Gravity
   - Albumin levels
   - Blood Glucose levels
   - Blood Urea levels
   - Serum Creatinine levels
   - Hemoglobin levels

2. The Random Forest model shows good performance in predicting kidney disease on the held-out 20% test split (see the classification report and confusion matrix in Section 5).

3. Recommendations for early detection and prevention:
   - Regular monitoring of blood pressure
   - Regular kidney function tests
   - Monitoring of blood glucose levels
   - Age-appropriate screening programs
   - Regular check-ups for individuals with risk factors