# Capstone Project - Predicting 30-Day Readmission for Diabetic Patients

## Step 1: Load, Clean, and Encode Data

In [None]:
import pandas as pd
import numpy as np

# Read every field verbatim. keep_default_na=False disables pandas' built-in
# list of missing-value strings; pandas >= 2.0 counts the literal "None" among
# them, so with the default any column that uses "None" as a real category
# would become NaN and its rows would be removed by the dropna() below.
# NOTE(review): assumes the CSV marks missing values only with "?" (no empty
# fields) -- confirm against the file.
df = pd.read_csv("diabetic_data.csv", keep_default_na=False)

# Binary target: 1 when `readmitted` equals "<30", 0 for any other value.
df["readmitted_binary"] = (df["readmitted"] == "<30").astype(int)

# Remove the listed columns (including the original `readmitted` label, which
# `readmitted_binary` replaces), turn the "?" placeholder into NaN, and keep
# only rows without any missing value. Written as one chain so no frame is
# mutated in place.
# NOTE(review): `patient_nbr` is dropped without de-duplicating; if a patient
# can appear in several rows, those rows may later land on both sides of a
# train/test split -- confirm whether that matters here.
df_clean = (
    df.drop(
        columns=[
            "encounter_id",
            "patient_nbr",
            "weight",
            "payer_code",
            "medical_specialty",
            "readmitted",
        ]
    )
    .replace("?", np.nan)
    .dropna()
)

# Integer-code the remaining string columns. OrdinalEncoder is scikit-learn's
# encoder for feature columns (LabelEncoder is documented for the target only);
# codes follow sorted category order, and the fitted encoder is kept so the
# codes can be mapped back with inverse_transform.
from sklearn.preprocessing import OrdinalEncoder
label_cols = df_clean.select_dtypes(include=["object"]).columns
if len(label_cols) > 0:  # nothing to encode when no string columns remain
    feature_encoder = OrdinalEncoder(dtype=np.int64)
    df_clean[label_cols] = pd.DataFrame(
        feature_encoder.fit_transform(df_clean[label_cols]),
        columns=label_cols,
        index=df_clean.index,
    )

df_clean.head()

## Step 2: Balance Data with SMOTE

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

X = df_clean.drop(columns=["readmitted_binary"])
y = df_clean["readmitted_binary"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_res).value_counts())

## Step 3: Train Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest with default hyper-parameters and a fixed seed on the
# resampled training data (fit returns the estimator itself), then predict
# hard class labels for the held-out test features.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)
y_pred = rf_model.predict(X_test)

## Step 4: Evaluate Model Performance

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# Text summary of the test-set predictions.
print(classification_report(y_test, y_pred))

# Confusion matrix of actual vs. predicted labels, drawn as an annotated
# heatmap with integer cell counts.
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ["Not Readmitted", "Readmitted"]

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

## Step 5: Plot ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

# Scores for the ROC curve: the second column (index 1) of the model's
# predicted probabilities on the test features.
y_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal for reference
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve - Random Forest")
ax.legend(loc="lower right")
fig.tight_layout()
plt.show()

## Step 6: Feature Importance (Top 15)

In [None]:
importances = rf_model.feature_importances_
feature_names = X.columns

# Indices of the 15 largest importances, largest first.
sorted_idx = np.argsort(importances)[::-1][:15]

# barh draws position 0 at the bottom, so plot in reverse to put the most
# important feature at the top of the chart.
plot_order = sorted_idx[::-1]
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, importances[plot_order], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names[plot_order].tolist())
ax.set_xlabel("Importance")
ax.set_title("Top 15 Feature Importances - Random Forest")
fig.tight_layout()
plt.show()