In [4]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import joblib
import argparse
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [8]:
# Load the raw heart-disease table; the path is relative to the notebook's working directory.
df = pd.read_csv(Path("heart.csv"))

In [22]:
# Make sure a binary "target" label exists. When the CSV only carries "num",
# any value > 0 becomes class 1 and everything else class 0.
if "target" not in df.columns:
    if "num" not in df.columns:
        raise ValueError("No 'target' (or 'num') column found in CSV.")
    # A missing "num" compares as False against 0, which would silently label
    # the row as class 0. Keep it missing instead so the later null-target
    # filter can discard it; the label is cast to int when `y` is built.
    df["target"] = (df["num"] > 0).astype(float).where(df["num"].notna())

In [10]:
# Drop exact duplicate rows, renumber the index, then keep only rows that
# actually carry a label.
df = (
    df.drop_duplicates()
      .reset_index(drop=True)
      .loc[lambda frame: frame["target"].notna()]
)


In [12]:
# Candidate feature columns; only those present in the loaded frame are used.
possible_cols = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]
present = [name for name in possible_cols if name in df.columns]
missing = [name for name in possible_cols if name not in present]
if len(missing) > 0:
    print(f"[Info] Missing columns not found in your CSV (skipped): {missing}")

# Feature matrix (copied so later edits cannot touch `df`) and integer label vector.
X = df.loc[:, present].copy()
y = df.loc[:, "target"].astype(int)

# Split the available features into continuous measurements ...
numeric_cols = [
    name for name in present
    if name in ("age", "trestbps", "chol", "thalach", "oldpeak")
]
# ... and coded / ordinal-like fields, which are one-hot encoded downstream.
categorical_cols = [
    name for name in present
    if name in ("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal")
]


In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipe, numeric_cols),
    ("cat", cat_pipe, categorical_cols),
])


In [15]:
# Toggle: True selects the logistic-regression baseline, False (default) the
# random forest. Both use balanced class weights.
fast_mode = False

if not fast_mode:
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2,
        class_weight="balanced",
        n_jobs=-1,
        random_state=42,
    )
else:
    clf = LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        n_jobs=None,
    )


In [16]:
# Full modelling pipeline: preprocess -> SMOTE oversampling -> classifier.
# NOTE(review): SMOTE runs on the one-hot encoded matrix here, so synthetic
# rows may presumably hold fractional indicator values - confirm this is intended.
pipe = ImbPipeline(
    steps=[
        ("pre", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("clf", clf),
    ]
)


In [17]:
# Hold out 20% of the rows for testing, stratified on the label with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

# Train the whole pipeline on the training split only.
pipe.fit(X_train, y_train)

  File "C:\Users\verma\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Program Files\Python311\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python311\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Program Files\Python311\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [18]:
# Evaluate on the held-out split: hard labels feed the report and confusion
# matrix, probabilities feed the ranking metrics.
y_pred = pipe.predict(X_test)
# Column 1 of predict_proba is used as the positive-class score
# (assumes the fitted classes are [0, 1] - TODO confirm for other label sets).
y_proba = pipe.predict_proba(X_test)[:, 1]

print("\n=== Test Metrics ===")
print(classification_report(y_test, y_pred, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
try:
    print("ROC AUC:", roc_auc_score(y_test, y_proba))
except Exception as err:
    # Still best-effort so the remaining metrics print, but say why the
    # number is missing instead of dropping it silently.
    print(f"ROC AUC: not available ({type(err).__name__}: {err})")
print("Average Precision (PR AUC):", average_precision_score(y_test, y_proba))



=== Test Metrics ===
              precision    recall  f1-score   support

           0     0.8182    0.6429    0.7200        28
           1     0.7436    0.8788    0.8056        33

    accuracy                         0.7705        61
   macro avg     0.7809    0.7608    0.7628        61
weighted avg     0.7778    0.7705    0.7663        61

Confusion Matrix:
 [[18 10]
 [ 4 29]]
ROC AUC: 0.8847402597402597
Average Precision (PR AUC): 0.9094872179120643


In [19]:
# 5-fold stratified cross-validation of the full pipeline, scored by ROC AUC.
cv = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)
cv_scores = cross_val_score(
    estimator=pipe,
    X=X,
    y=y,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
)
print(
    "\n5-Fold ROC AUC:",
    np.round(cv_scores, 4),
    " | mean:",
    np.round(cv_scores.mean(), 4),
)


5-Fold ROC AUC: [0.9751 0.8923 0.8434 0.9287 0.9174]  | mean: 0.9114


In [None]:
# Destination file for the serialized model bundle.
model_out = "heart_disease_model.pkl"

# Store the fitted pipeline together with the column lists used to build it.
joblib.dump(
    dict(
        pipeline=pipe,
        feature_order=present,
        numeric_cols=numeric_cols,
        categorical_cols=categorical_cols,
    ),
    model_out,
)

print(f"\nSaved model pipeline to: {model_out}")



Saved model pipeline to: heart_disease_model.pkl
