In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
dataset_path = '/content/kidney_disease_dataset.csv'
df = pd.read_csv(dataset_path)

# Display first few rows to understand structure
print("Preview of dataset:")
print(df.head())

# Feature columns and target
feature_columns = ['Age', 'Creatinine_Level', 'BUN', 'Diabetes', 'Hypertension', 'Urine_Output']
target_column = 'GFR'  # Predicting kidney function
required_columns = feature_columns + [target_column]

# Fail fast, naming every absent column, before any of them is indexed below.
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Encode the 'Yes'/'No' columns as 1/0 (columns that are already numeric are left alone).
# The check is "not numeric" rather than "dtype == object" because text columns may be
# loaded with pandas' dedicated string dtype, which an object-dtype test would skip.
yes_no_map = {'Yes': 1, 'No': 0}
for col in ['Diabetes', 'Hypertension']:
    if not pd.api.types.is_numeric_dtype(df[col]):
        encoded = df[col].map(yes_no_map)
        # Anything that is present but not exactly 'Yes'/'No' maps to NaN and is removed
        # by the dropna below; report it instead of losing those rows silently.
        unrecognized = encoded.isna() & df[col].notna()
        if unrecognized.any():
            bad_values = df.loc[unrecognized, col].unique().tolist()
            print(f"Warning: {int(unrecognized.sum())} value(s) in '{col}' are not 'Yes'/'No' "
                  f"and will be treated as missing: {bad_values}")
        df[col] = encoded

# Drop rows with missing values in any modelling column, and say how many were lost.
rows_before = len(df)
df = df.dropna(subset=required_columns)
print(f"Dropped {rows_before - len(df)} of {rows_before} rows with missing values")

# An empty frame would otherwise surface later as an unrelated train_test_split error.
if df.empty:
    raise ValueError("No rows left after dropping missing values; check the dataset and column encoding")

# Optional: Visualize GFR categories
def gfr_category(gfr):
    """Map a GFR reading to a kidney-function category label.

    Bands (lower bound inclusive):
        >= 90 -> 'Normal'
        >= 60 -> 'Mild CKD'
        >= 30 -> 'Moderate CKD'
        >= 15 -> 'Severe CKD'
        <  15 -> 'Kidney Failure'

    Args:
        gfr: A single numeric GFR value, or a missing value (None / NaN / pd.NA).

    Returns:
        The category label as a string, or 'Unknown' when the value is missing.
    """
    # NaN compares False against every threshold, so without this guard a missing
    # reading would fall through to 'Kidney Failure'; None / pd.NA would raise.
    if pd.isna(gfr):
        return 'Unknown'

    if gfr >= 90:
        return 'Normal'
    elif gfr >= 60:
        return 'Mild CKD'
    elif gfr >= 30:
        return 'Moderate CKD'
    elif gfr >= 15:
        return 'Severe CKD'
    else:
        return 'Kidney Failure'

# Label every row with its GFR band so the class balance can be inspected.
df['GFR Category'] = df[target_column].apply(gfr_category)

# Pie chart of GFR categories
gfr_counts = df['GFR Category'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(gfr_counts, labels=gfr_counts.index, autopct='%1.1f%%', startangle=140,
       colors=sns.color_palette("pastel"))
ax.set_title('Distribution of GFR Categories')
plt.show()

# Prepare training data
X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n📊 Model Evaluation")
print(f"Mean Squared Error: {mse:.2f}")
# R² is the coefficient of determination: its best value is 1.0 and it can be negative,
# so it is reported as a plain score rather than as an "accuracy" percentage.
print(f"R-squared: {r2:.4f}")

# Plot Actual vs Predicted GFR
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred, ax=ax)
# Draw the y = x reference line across the range of the points actually plotted, so it
# still spans the figure when predictions fall outside the observed GFR range.
axis_min = min(y_test.min(), y_pred.min())
axis_max = max(y_test.max(), y_pred.max())
ax.plot([axis_min, axis_max], [axis_min, axis_max], 'r--')
ax.set_xlabel('Actual GFR')
ax.set_ylabel('Predicted GFR')
ax.set_title('Actual vs Predicted GFR')
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load dataset
dataset_path = '/content/kidney_disease_dataset.csv'  # change path if needed
df = pd.read_csv(dataset_path)

# Display first few rows
print("Dataset Preview:")
print(df.head())

# Define features and target
feature_columns = ['Age', 'Creatinine_Level', 'BUN', 'Diabetes', 'Hypertension', 'Urine_Output']
target_column = 'CKD_Status'  # Binary classification (0 = No CKD, 1 = CKD)
required_columns = feature_columns + [target_column]

# Fail fast with one message listing every absent column the model needs.
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Convert categorical columns (if they contain 'Yes'/'No') to numeric.
# - "not numeric" is used instead of "dtype == object" so that columns loaded with
#   pandas' dedicated string dtype are encoded as well.
# - 'Dialysis_Needed' is not used by this model, so its absence is tolerated.
binary_map = {'Yes': 1, 'No': 0}
for col in ['Diabetes', 'Hypertension', 'CKD_Status', 'Dialysis_Needed']:
    if col in df.columns and not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(binary_map)

# Drop rows with missing values (this also removes values the map did not recognise).
df = df.dropna(subset=required_columns)

# Fixed class order shared by the report and the confusion matrix.
class_ids = [0, 1]
status_labels = ['No CKD', 'CKD']
label_by_class = dict(zip(class_ids, status_labels))

# Pie chart: distribution of CKD status.
# Labels are derived from the classes actually present so the label list always matches
# the number of wedges, even if one class is missing from the data.
status_counts = df[target_column].value_counts().sort_index()
pie_labels = [label_by_class.get(value, str(value)) for value in status_counts.index]
plt.figure(figsize=(6,6))
plt.pie(status_counts, labels=pie_labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("pastel"))
plt.title('CKD Status Distribution')
plt.show()

# Prepare features (X) and target (y).
# The map above yields floats when any value was unmapped; cast back to integer classes.
X = df[feature_columns]
y = df[target_column].astype(int)

# Train-test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train logistic regression model
# NOTE(review): features are not standardised; if the solver reports a ConvergenceWarning,
# consider scaling the inputs or raising max_iter.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%\n")

# Classification report (labels pinned so names always line up with classes 0 and 1)
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=status_labels))

# Confusion matrix (labels pinned so it is always 2x2 and matches the tick labels)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=status_labels,
            yticklabels=status_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
dataset_path = '/content/kidney_disease_dataset.csv'  # update if needed
df = pd.read_csv(dataset_path)

# Preview data
print("Dataset preview:")
print(df.head())

# Define features and target
feature_columns = ['Age', 'Creatinine_Level', 'BUN', 'Diabetes', 'Hypertension', 'Urine_Output']
target_column = 'CKD_Status'  # 0 = No CKD, 1 = CKD
required_columns = feature_columns + [target_column]

# Stop early, listing every required column that the file does not provide.
missing_cols = [col for col in required_columns if col not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in dataset: {missing_cols}")

# Map categorical 'Yes'/'No' columns to 1/0.
# Any non-numeric column is encoded (not only object dtype), which also covers text
# columns loaded with pandas' dedicated string dtype. 'Dialysis_Needed' is optional
# here because the classifier below never reads it.
binary_map = {'Yes': 1, 'No': 0}
for col in ['Diabetes', 'Hypertension', 'CKD_Status', 'Dialysis_Needed']:
    if col not in df.columns:
        continue
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(binary_map)

# Drop rows with missing values in important columns
df = df.dropna(subset=required_columns)

# Class ids and their display names, in the order used by every report and plot below.
class_ids = [0, 1]
status_labels = ['No CKD', 'CKD']
label_by_class = dict(zip(class_ids, status_labels))

# Pie chart of CKD Status distribution.
# Wedge labels come from the classes present in the data, so the chart still renders
# when only one class survives the cleaning step.
status_counts = df[target_column].value_counts().sort_index()
pie_labels = [label_by_class.get(value, str(value)) for value in status_counts.index]
plt.figure(figsize=(6,6))
plt.pie(status_counts, labels=pie_labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("pastel"))
plt.title('CKD Status Distribution')
plt.show()

# Prepare data for training/testing.
# The target is cast to int because mapping with unmapped values produces floats.
X = df[feature_columns]
y = df[target_column].astype(int)

# Split dataset (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Initialize and train Decision Tree classifier (seeded for reproducible trees)
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%\n")

# Pin the label order so names stay aligned with classes 0 and 1.
print("Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=status_labels))

# Confusion Matrix (always 2x2 thanks to the explicit labels)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=status_labels,
            yticklabels=status_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Visualize decision tree (optional, can be large!)
plt.figure(figsize=(20,10))
plot_tree(model, feature_names=feature_columns, class_names=status_labels, filled=True, rounded=True)
plt.title("Decision Tree for CKD Status Classification")
plt.show()