In [1]:
import pandas as pd
import numpy as np

# Read the raw CSV (path is relative to the notebook's working directory).
raw_csv_path = "../data/diabetic_data.csv"
df = pd.read_csv(raw_csv_path)
print("✅ Data loaded")


✅ Data loaded


In [2]:
# Treat the literal "?" placeholder as a missing value (NaN) and remove the
# encounter_id / patient_nbr columns.
#
# Written as a rebinding instead of inplace=True, with errors="ignore" on the
# drop, so the cell can be re-run on an already-cleaned frame: the original
# in-place drop raised KeyError on the second execution.
df = (
    df.replace("?", np.nan)
      .drop(columns=["encounter_id", "patient_nbr"], errors="ignore")
)


In [3]:
# Map each 10-year age bracket label to the midpoint of the bracket so that
# age can be used as a numeric feature.
age_map = {
    '[0-10)': 5, '[10-20)': 15, '[20-30)': 25,
    '[30-40)': 35, '[40-50)': 45, '[50-60)': 55,
    '[60-70)': 65, '[70-80)': 75, '[80-90)': 85,
    '[90-100)': 95
}

# Only map while the column still holds the bracket labels. Re-running the
# cell on an already-converted (numeric) column would look up numbers in a
# string-keyed dict and silently replace every age with NaN.
if not pd.api.types.is_numeric_dtype(df["age"]):
    df["age"] = df["age"].map(age_map)



In [4]:
# Build the binary target: a row is labelled 1 when at least one of the three
# conditions below holds, otherwise 0. Missing values compare as False.
a1c_flag = df["A1Cresult"].isin([">8", ">7"])        # A1Cresult is ">8" or ">7"
glucose_flag = df["max_glu_serum"] == ">300"         # max_glu_serum is ">300"
diagnoses_flag = df["number_diagnoses"] >= 7         # seven or more diagnoses

any_flag = a1c_flag | glucose_flag | diagnoses_flag
df["high_risk"] = np.where(any_flag, 1, 0)



In [5]:
# Input columns for the model, one per line. The order is kept as-is because
# it determines the column order of X.
#
# NOTE(review): "number_diagnoses" is also one of the inputs used to derive
# the "high_risk" label (number_diagnoses >= 7), so the model can partly read
# the target straight from this feature — confirm this is intended.
features = [
    "age",
    "gender",
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_outpatient",
    "number_emergency",
    "number_inpatient",
    "number_diagnoses",
    "insulin",
    "diabetesMed",
]

# Feature matrix and target vector.
y = df["high_risk"]
X = df[features]



In [6]:
# One-hot encode the non-numeric feature columns; drop_first=True removes the
# first level of each encoded column so k categories yield k-1 indicators.
X = pd.get_dummies(data=X, drop_first=True)
print("✅ Encoding done")


✅ Encoding done


In [7]:
from pathlib import Path

import joblib

# Persist the encoded column names (in training order) next to the model
# artifacts.
# joblib.dump does not create missing directories, so make sure ../model
# exists first; otherwise a fresh checkout fails with FileNotFoundError.
Path("../model").mkdir(parents=True, exist_ok=True)
joblib.dump(list(X.columns), "../model/feature_names.pkl")
print("✅ Feature names saved")


✅ Feature names saved


In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio the
# same in both splits, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [9]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so no test-set statistics enter the transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)


In [10]:
from sklearn.neural_network import MLPClassifier

# Network configuration: three hidden layers (128 -> 64 -> 32 units), at most
# 100 training iterations, a fixed seed, and per-iteration loss logging.
mlp_settings = {
    "hidden_layer_sizes": (128, 64, 32),
    "max_iter": 100,
    "random_state": 42,
    "verbose": True,
}
model = MLPClassifier(**mlp_settings)

# Train on the scaled training split; left as the final expression so the
# fitted estimator is displayed as the cell output.
model.fit(X_train_scaled, y_train)


Iteration 1, loss = 0.16805895
Iteration 2, loss = 0.11057981
Iteration 3, loss = 0.10948447
Iteration 4, loss = 0.10884334
Iteration 5, loss = 0.10803516
Iteration 6, loss = 0.10821769
Iteration 7, loss = 0.10806715
Iteration 8, loss = 0.10829542
Iteration 9, loss = 0.10720095
Iteration 10, loss = 0.10745361
Iteration 11, loss = 0.10679572
Iteration 12, loss = 0.10733641
Iteration 13, loss = 0.10631256
Iteration 14, loss = 0.10632612
Iteration 15, loss = 0.10609460
Iteration 16, loss = 0.10580776
Iteration 17, loss = 0.10572824
Iteration 18, loss = 0.10547112
Iteration 19, loss = 0.10498786
Iteration 20, loss = 0.10512246
Iteration 21, loss = 0.10471355
Iteration 22, loss = 0.10449760
Iteration 23, loss = 0.10418091
Iteration 24, loss = 0.10407481
Iteration 25, loss = 0.10376225
Iteration 26, loss = 0.10355186
Iteration 27, loss = 0.10272766
Iteration 28, loss = 0.10295450
Iteration 29, loss = 0.10252381
Iteration 30, loss = 0.10229215
Iteration 31, loss = 0.10278234
Iteration 32, los



In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Predicted labels feed accuracy and the per-class report; the second
# predict_proba column (probability of class 1) feeds ROC AUC.
y_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, y_proba)

print("Accuracy:", test_accuracy)
print("ROC AUC:", test_auc)
print(classification_report(y_test, y_pred))


Accuracy: 0.9494939569617765
ROC AUC: 0.9824890056755295
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      5342
           1       0.98      0.95      0.97     15012

    accuracy                           0.95     20354
   macro avg       0.93      0.95      0.94     20354
weighted avg       0.95      0.95      0.95     20354



In [12]:
# Write the trained classifier and the fitted scaler to the model directory,
# in the same order as before (model first, then scaler).
artifact_targets = (
    (model, "../model/ann_model.pkl"),
    (scaler, "../model/scaler.pkl"),
)
for artifact, target_path in artifact_targets:
    joblib.dump(artifact, target_path)

print("✅ All model artifacts saved")


✅ All model artifacts saved
