In [1]:
import os

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [2]:
# Locate the dataset without tying the notebook to a single machine.
# Resolution order:
#   1. the HEART_DATA_PATH environment variable, when set and non-empty;
#   2. "heartdataset.csv" in the current working directory;
#   3. the original absolute location, kept as a last resort so existing setups still run.
_DATA_PATH_FALLBACKS = [
    "heartdataset.csv",
    "/Users/user/Documents/GitHub/Heart-Disease-Risk---Streamlit-App/heartdataset.csv",
]
DATA_PATH = os.environ.get("HEART_DATA_PATH") or next(
    (p for p in _DATA_PATH_FALLBACKS if os.path.exists(p)),
    _DATA_PATH_FALLBACKS[0],  # nothing found: report the portable default in the error below
)

# Fail early with the resolved path in the message rather than a pandas traceback.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"CSV not found at: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
# Normalise headers (trim, lower-case, spaces -> underscores) so later
# look-ups by column name do not depend on the CSV's exact spelling.
df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

print(f"✅ Loaded dataset with shape: {df.shape}")
df.head()


✅ Loaded dataset with shape: (303, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Column names that may hold the label, listed in priority order:
# the first one present in the loaded frame wins.
TARGET_CANDIDATES = ["target", "heartdisease", "num"]

target_col = None
for candidate in TARGET_CANDIDATES:
    if candidate in df.columns:
        target_col = candidate
        break

# Stop here if the CSV carries none of the known label columns.
if target_col is None:
    raise ValueError("No target column found. Add a binary 'target' column to your CSV.")

print(f"🎯 Target column detected: {target_col}")


🎯 Target column detected: target


In [4]:
# Feature matrix = every column except the label; infinities are turned
# into NaN so they are handled by the same fill step as missing values.
X = df.drop(columns=[target_col]).replace([np.inf, -np.inf], np.nan)
y = df[target_col].astype(int)

# Only numeric columns are imputed (and reported as usable features).
num_cols = [col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])]

# Fill gaps column by column with that column's median.
# NOTE(review): the median is taken over every row of X, i.e. before any
# train/test split is made — confirm this is acceptable for evaluation.
for col in num_cols:
    column = X[col]
    X[col] = column.fillna(column.median())

print(f"📊 Using {len(num_cols)} numeric features")


📊 Using 13 numeric features


In [5]:
# Split settings, named so they are easy to find and tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Numeric preprocessing: median-impute, then standardise.
# The imputer is part of the pipeline so that its medians are fitted on the
# training split only and travel with the saved model — the persisted
# pipeline can therefore score rows containing missing values at inference
# time instead of failing inside LogisticRegression.
numeric_steps = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
])

# Columns not listed in num_cols are dropped before the classifier sees them.
pre = ColumnTransformer([("num", numeric_steps, num_cols)], remainder="drop")

model = Pipeline([
    ("pre", pre),
    ("clf", LogisticRegression(max_iter=300, solver="lbfgs"))
])

# Stratify on the label so both splits keep the same class balance.
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
model.fit(Xtr, ytr)

# Hard labels for the report; positive-class probability for ROC-AUC.
pred = model.predict(Xte)
proba = model.predict_proba(Xte)[:, 1]

print(classification_report(yte, pred))
print("ROC-AUC:", round(roc_auc_score(yte, proba), 4))


              precision    recall  f1-score   support

           0       0.86      0.68      0.76        28
           1       0.77      0.91      0.83        33

    accuracy                           0.80        61
   macro avg       0.82      0.79      0.80        61
weighted avg       0.81      0.80      0.80        61

ROC-AUC: 0.869


In [6]:
# Metadata stored next to the fitted pipeline: the column order of X,
# the numeric columns fed to the model, and the name of the label column.
meta = dict(
    feature_order=list(X.columns),
    num_cols=num_cols,
    target=target_col,
)

# Persist model and metadata together as a single joblib artifact.
artifact = {"model": model, "meta": meta}
dump(artifact, "model.pkl")
print("💾 Model saved as model.pkl")


💾 Model saved as model.pkl


In [7]:
# Verify saved model: reload the artifact from disk and confirm it behaves
# like the pipeline that is still in memory.
# NOTE: joblib.load unpickles arbitrary objects — only load artifacts that
# you produced yourself or otherwise trust.
obj = load("model.pkl")

# A successful load alone does not prove the round trip preserved the model;
# the reloaded pipeline must reproduce the held-out predictions exactly.
reloaded_pred = obj["model"].predict(Xte)
assert np.array_equal(reloaded_pred, pred), "Reloaded model predictions differ from the fitted model"

print("✅ Model reloaded successfully with features:", len(obj["meta"]["feature_order"]))


✅ Model reloaded successfully with features: 13
