In [None]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so every draw below is reproducible.
np.random.seed(42)

# Number of synthetic patient records to generate.
N = 5000

# One row per patient. Keyword arguments are evaluated left to right, so the
# RNG draws happen in exactly this order; reordering the columns would change
# the generated values even with the same seed.
data = pd.DataFrame(
    dict(
        age=np.random.randint(18, 90, N),  # upper bound is exclusive: 18..89
        sex=np.random.choice(["Male", "Female"], N),
        length_of_stay=np.random.poisson(lam=5, size=N) + 1,  # +1 keeps it >= 1
        num_comorbidities=np.random.poisson(lam=2, size=N),
        has_diabetes=np.random.binomial(1, 0.25, N),
        has_hypertension=np.random.binomial(1, 0.35, N),
        prior_admissions=np.random.poisson(lam=1, size=N),
        icu_stay=np.random.binomial(1, 0.15, N),
    )
)

# Readmission risk on the log-odds scale: intercept -3 plus a fixed weight per
# driver. Terms are accumulated in a fixed order so the floating-point result
# is deterministic. Sex, length of stay and hypertension carry no weight.
logit = (
    data["age"].mul(0.03).add(-3)
    .add(data["num_comorbidities"].mul(0.4))
    .add(data["prior_admissions"].mul(0.6))
    .add(data["icu_stay"].mul(0.8))
    .add(data["has_diabetes"].mul(0.3))
)

# Logistic (sigmoid) transform maps log-odds to a probability in (0, 1).
prob = 1.0 / (1.0 + np.exp(-logit))

# Draw the binary 30-day readmission outcome, one Bernoulli trial per patient.
data["readmitted_30d"] = np.random.binomial(n=1, p=prob)

data.head()

In [None]:
# Basic structure: column dtypes and non-null counts (info() prints directly).
data.info()

# Missing values per column. A notebook cell only renders its LAST expression,
# so this mid-cell result has to be printed explicitly or it is silently lost.
print(data.isna().sum())

# Logical checks: fail loudly if the frame violates basic domain invariants.
assert (data["age"] >= 18).all(), "age below 18 found"
assert (data["length_of_stay"] > 0).all(), "non-positive length_of_stay found"

# Convert categorical. Re-running this cell is harmless: casting an existing
# category column to "category" leaves it unchanged.
data["sex"] = data["sex"].astype("category")

In [None]:
# Class balance of the target: fraction of rows in each readmitted_30d class
# (normalize=True returns proportions instead of raw counts).
target_share = data["readmitted_30d"].value_counts(normalize=True)
target_share

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) using describe()'s
# default column selection.
summary_stats = data.describe()
summary_stats

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mean length of stay, one bar per value of the
# readmitted_30d flag. Drawn on an explicit Axes rather than the implicit
# pyplot "current figure".
fig, ax = plt.subplots()
mean_length_of_stay = data.groupby("readmitted_30d")["length_of_stay"].mean()
mean_length_of_stay.plot(kind="bar", ax=ax)
ax.set_title("Average Length of Stay by Readmission Status")
ax.set_ylabel("Days")
plt.show()

In [None]:
# Bar chart of the mean comorbidity count, one bar per value of the
# readmitted_30d flag.
plt.figure()
data.groupby("readmitted_30d")["num_comorbidities"].mean().plot(kind="bar")
plt.title("Average Comorbidities by Readmission Status")
# Label the y-axis so the figure stands alone, matching the length-of-stay plot.
plt.ylabel("Number of comorbidities")
plt.show()

In [None]:
# Work on a copy so the feature engineering below never mutates `data`.
df = data.copy()

# Age bands (clinically interpretable).
# pd.cut builds right-closed bins by default: (18, 40], (40, 60], ... That
# would turn age 18 into NaN and put ages 40, 60 and 75 in the band BELOW the
# one their label names. right=False gives left-closed bins
# [18, 40), [40, 60), [60, 75), [75, 100), which match the labels.
df["age_group"] = pd.cut(
    df["age"],
    bins=[18, 40, 60, 75, 100],
    labels=["18-39", "40-59", "60-74", "75+"],
    right=False,
)
# Guard: every patient must land in a band. An age outside [18, 100) would
# become NaN and then be encoded as all-zero dummies, indistinguishable from
# the dropped reference band.
assert df["age_group"].notna().all(), "age outside the [18, 100) banding range"

# High risk flag: 1 when the patient has three or more comorbidities.
df["high_risk_comorbidity"] = (df["num_comorbidities"] >= 3).astype(int)

# Encode categorical. drop_first=True drops one level per variable, which
# becomes the reference category and avoids perfectly collinear dummies.
df = pd.get_dummies(df, columns=["sex", "age_group"], drop_first=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Separate the binary target from the feature matrix (every other column).
y = df["readmitted_30d"]
X = df.drop("readmitted_30d", axis=1)

# Hold out 25% of the rows for evaluation. Stratifying on y keeps the
# readmission rate the same in both partitions; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# max_iter is raised to 1000 to give the solver more iterations to converge.
# fit() returns the estimator itself, so construction and fitting can be
# chained; the trailing expression shows the fitted model as the cell output.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

# Score the held-out set two ways: probabilities for the second class
# (column 1 of predict_proba) and hard class predictions.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# The classification report is built from the hard labels; ROC AUC is
# computed from the predicted probabilities.
report = classification_report(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(report)
print("ROC AUC:", auc)

In [None]:
# Exponentiating a logistic-regression coefficient gives the odds ratio for a
# one-unit increase in that feature. No feature scaling is applied in the
# visible cells, so these values are on each feature's own unit scale.
odds_ratios = np.exp(model.coef_[0])

# Label each odds ratio with its feature name and rank from largest to smallest.
importance = pd.Series(data=odds_ratios, index=X.columns)
importance = importance.sort_values(ascending=False)

importance.head(10)