# Capstone Project - Predicting 30-day Readmission for Diabetic Patients

## Step 1: Load and Prepare the Data

In [None]:
import pandas as pd
import numpy as np

# Load the raw encounters.
# NOTE(review): the default NA markers of recent pandas versions include the
# literal string "None". The public release of this dataset appears to use
# "None" as a genuine category (test not performed) in columns such as
# `max_glu_serum` and `A1Cresult` -- confirm against the CSV. With default NA
# parsing those cells become NaN and the dropna() below would discard almost
# every row, so default NA parsing is disabled here and only "?" (handled
# explicitly below) is treated as missing.
df = pd.read_csv("diabetic_data.csv", keep_default_na=False)

# Binary target: 1 when `readmitted` is exactly "<30", 0 for every other value.
df["readmitted_binary"] = (df["readmitted"] == "<30").astype(int)

# Drop irrelevant or high-missing columns. `readmitted` is dropped as well
# because the target was derived from it and it would leak the label.
df_clean = df.drop(
    columns=[
        "encounter_id",
        "patient_nbr",
        "weight",
        "payer_code",
        "medical_specialty",
        "readmitted",
    ]
)

# "?" marks a missing value; convert it to NaN and keep complete rows only.
df_clean.replace("?", np.nan, inplace=True)
df_clean.dropna(inplace=True)

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

# Every remaining object-dtype column is replaced by integer codes. The fitted
# encoder for each column is kept in `label_encoders` so codes can be mapped
# back to the original labels with `inverse_transform`.
label_cols = df_clean.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

df_clean.head()

## Step 2: Feature Selection (Correlation Matrix)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise Pearson correlation between all columns of the encoded frame
# (this includes the `readmitted_binary` target column).
corr = df_clean.corr(method="pearson")

# Render the matrix as an unannotated heatmap on a 12x10 inch figure.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=corr,
    annot=False,
    cmap="coolwarm",
)
plt.title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

## Step 3: Addressing Class Imbalance with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the binary target.
X = df_clean.drop(columns=["readmitted_binary"])
y = df_clean["readmitted_binary"]

# Split before SMOTE so that synthetic samples never reach the test set.
# `stratify=y` keeps the class proportions of the target the same in the
# train and test partitions, which matters because the classes are imbalanced
# (the reason SMOTE is applied below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training set only; the test set keeps its original
# class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Report the class counts before and after resampling.
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_res).value_counts())

## Step 4: Train and Compare Multiple Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# classification_report is used in the loop below; without this import the
# loop raises NameError on its first iteration.
from sklearn.metrics import classification_report

# Candidate classifiers keyed by display name. The tree-based models get a
# fixed seed, matching the random_state=42 used for the split and for SMOTE,
# so the printed reports are reproducible between runs.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Fit each model on the SMOTE-resampled training data and evaluate it on the
# untouched test split.
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)
    print(f"\n{name}")
    print(classification_report(y_test, y_pred))

## Step 5: Visualize Confusion Matrix of the Best Model (Random Forest)

In [None]:
from sklearn.metrics import confusion_matrix

# Refit a Random Forest on the resampled training data. The seed is fixed so
# the confusion matrix is reproducible between runs instead of changing with
# every execution of this cell.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_res, y_train_res)
y_pred_rf = rf_model.predict(X_test)

# Pin the label order explicitly (0 = not readmitted within 30 days,
# 1 = readmitted within 30 days) so the matrix is always 2x2 and its
# rows/columns line up with the tick labels below.
class_names = ["Not Readmitted", "Readmitted"]
conf_matrix = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])

# Rows are the actual classes, columns the predicted classes.
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.tight_layout()
plt.show()