In [14]:
import numpy as np
import pandas as pd

# Synthetic patient-readmission dataset.
# The legacy global NumPy RNG is seeded once here, so the order of the
# np.random calls below fixes the data: moving any draw changes every value
# generated after it.
np.random.seed(42)

N = 50_000  # number of synthetic patients; edit to resize (e.g., 50000 or 100000)

# --- Feature columns: one independent draw of length N per column ---
age = np.random.randint(low=18, high=90, size=N)  # integers 18..89 (high is exclusive)
gender = np.random.choice(["M", "F"], size=N)

diagnosis_labels = ["D" + str(k) for k in range(1, 11)]  # "D1" .. "D10"
diagnosis_code = np.random.choice(diagnosis_labels, size=N)

procedures = np.random.poisson(lam=2, size=N)
lab_result = np.random.normal(loc=100, scale=15, size=N)

length_of_stay = np.random.randint(low=1, high=15, size=N)  # integers 1..14
prev_admissions = np.random.poisson(lam=1.5, size=N)

discharge_type = np.random.choice(
    ["Home", "Rehab", "Nursing"], size=N, p=[0.6, 0.25, 0.15]
)

# --- Additive risk score ---
# Boolean flags become 0/1 when multiplied by their integer weight.
is_nursing = discharge_type == "Nursing"
has_high_lab = lab_result > 120

# Terms are accumulated in a fixed order so the floating-point sum is
# reproducible bit for bit.
risk_score = 0.03 * age
risk_score = risk_score + 0.5 * prev_admissions
risk_score = risk_score + 0.2 * length_of_stay
risk_score = risk_score + 0.05 * procedures
risk_score = risk_score + 2 * is_nursing
risk_score = risk_score + 1 * has_high_lab

# --- Readmission label ---
# Logistic transform with slope 0.08 centred on a score of 18. The run
# recorded in this cell's output (N = 50000) gave ~25% positives.
# NOTE(review): the slope is shallow -- the minimum possible score (0.74) maps
# to p ~= 0.20 and a score of 10 only to p ~= 0.35 -- so the features carry
# weak signal; confirm that is intended.
log_odds = 0.08 * (risk_score - 18)
probability = 1 / (1 + np.exp(-log_odds))

# One Bernoulli trial per patient using that patient's own probability.
readmitted = np.random.binomial(n=1, p=probability)

# --- Assemble the table (column order is the order written to the CSV) ---
df = pd.DataFrame(
    dict(
        age=age,
        gender=gender,
        diagnosis_code=diagnosis_code,
        procedures=procedures,
        lab_result=lab_result,
        length_of_stay=length_of_stay,
        prev_admissions=prev_admissions,
        discharge_type=discharge_type,
        readmitted=readmitted,
    )
)

# --- Persist to the working directory (overwrites any existing file) ---
df.to_csv("patient_data.csv", index=False)

# --- Quick summary ---
readmit_share = df["readmitted"].value_counts(normalize=True)
print("New dataset created!")
print("Shape:", df.shape)
print("Class balance:\n", readmit_share)


New dataset created!
Shape: (50000, 9)
Class balance:
 readmitted
0    0.74752
1    0.25248
Name: proportion, dtype: float64
