In [None]:
import pandas as pd
import numpy as np



from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [5]:
# Read the raw heart-disease table from disk and preview its first rows.
raw_csv_path = "../data/raw/Heart_Disease_Prediction.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence


In [None]:
# Column groups consumed by the preprocessing step below.
ratio_features = ["Age", "BP", "Cholesterol", "Max HR", "ST depression"]
categorical_features = [
    "Sex",
    "Chest pain type",
    "FBS over 120",
    "EKG results",
    "Exercise angina",
    "Slope of ST",
    "Number of vessels fluro",
    "Thallium",
]

# Binary target: 1 where the label equals "Presence", 0 for any other value.
y = df["Heart Disease"].eq("Presence").astype(int)

# Feature frame: every column except the label.
X = df.drop(columns="Heart Disease")

# Standardize the numeric group and one-hot encode the categorical group.
# Columns listed in neither group are discarded (remainder="drop");
# handle_unknown="ignore" keeps transform from raising on a category that
# was not present when the encoder was fitted.
column_steps = [
    ("num", StandardScaler(), ratio_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
]
preprocess = ColumnTransformer(transformers=column_steps, remainder="drop")

In [9]:
# Classifier: logistic regression with the iteration cap set to 2000.
logreg = LogisticRegression(max_iter=2000)

# Preprocessing and classifier chained into one estimator, so the
# transformers are fitted on exactly the rows handed to `pipe.fit`.
pipe = Pipeline([("preprocess", preprocess), ("model", logreg)])

In [12]:
# One seed drives both the hold-out split and the fold shuffling.
SPLIT_SEED = 55

# Stratified hold-out split: 25% of the rows are kept aside as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SPLIT_SEED
)

# Stability check: shuffled, stratified 5-fold CV on the training part only.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SPLIT_SEED)
cv_res = cross_validate(
    pipe,
    X_train,
    y_train,
    scoring=["roc_auc", "accuracy"],
    cv=cv,
    return_train_score=False,
)

# Report the mean and spread of each metric across the folds.
for label, key in (("CV ROC-AUC", "test_roc_auc"), ("CV Accuracy", "test_accuracy")):
    fold_scores = cv_res[key]
    print(label + ": mean =", fold_scores.mean().round(3),
          "std =", fold_scores.std().round(3))

CV ROC-AUC: mean = 0.909 std = 0.024
CV Accuracy: mean = 0.857 std = 0.042


The low standard deviation of both metrics indicates that the model's performance is stable across folds.

In [14]:
# Fit on the whole training split, then score the held-out test rows.
# Column 1 of predict_proba holds the probability of the positive class (label 1).
y_proba = pipe.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Hard labels at the 0.5 cutoff.
y_pred = (y_proba >= 0.5).astype(int)

# Compute the test metrics first, then print them in a fixed order.
test_auc = roc_auc_score(y_test, y_proba)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred, digits=3)

print("Test ROC-AUC:", test_auc)
print(test_confusion)
print(test_report)

Test ROC-AUC: 0.9324561403508771
[[34  4]
 [ 5 25]]
              precision    recall  f1-score   support

           0      0.872     0.895     0.883        38
           1      0.862     0.833     0.847        30

    accuracy                          0.868        68
   macro avg      0.867     0.864     0.865        68
weighted avg      0.868     0.868     0.867        68



True negatives: 34
False positives: 4
False negatives: 5
True positives: 25

For threshold = 0.5
Specificity (true negative rate):
TN / (TN + FP) = 34 / 38 = 0.895

Sensitivity (recall for disease):
TP / (TP + FN) = 25 / 30 = 0.833

Because these two numbers (0.895 vs 0.833) are close, the model does not strongly favor class 1 (disease) or class 0 (no disease).

In [15]:
# Sweep several decision cutoffs and print the confusion matrix for each.
# NOTE(review): the cutoffs are compared on the test split (y_test / y_proba),
# so a cutoff chosen from this output is tuned on test data.
for threshold in (0.3, 0.4, 0.5, 0.6):
    print(f"Threshold {threshold}")
    y_pred_t = (y_proba >= threshold).astype(int)
    print(confusion_matrix(y_test, y_pred_t))

Threshold 0.3
[[26 12]
 [ 3 27]]
Threshold 0.4
[[32  6]
 [ 4 26]]
Threshold 0.5
[[34  4]
 [ 5 25]]
Threshold 0.6
[[34  4]
 [ 6 24]]


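Logistic regression provides stable discrimination between patients with and without heart disease. A threshold of 0.4 provides a good balance between sensitivity and specificity (26/30 = 0.867 and 32/38 = 0.842), while 0.5 favors reducing false positives (4 instead of 6) at the cost of one more false negative. Note that these thresholds were compared on the test set, so the choice should be confirmed on separate validation data.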