# Logistic Regression for Heart Disease

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [6]:
# Load preprocessed data from the preprocessing folder
X = pd.read_csv("../preprocessing/preprocessed_heart_disease_X.csv")
y = pd.read_csv("../preprocessing/preprocessed_heart_disease_y.csv").squeeze()

print(X.shape, y.shape)

(297, 17) (297,)


In [7]:
# Split into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Build a pipeline: Scaling → PCA → Logistic Regression
log_reg = LogisticRegression(max_iter=10000, class_weight='balanced', solver='newton-cg')
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  
    ('log_reg', log_reg)
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate on test data
y_pred = pipeline.predict(X_test)

# Evaluate performance
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.5333333333333333

Confusion Matrix:
 [[35 11  1  0  1]
 [ 5  4  3  1  3]
 [ 2  1  3  1  4]
 [ 0  4  1  5  1]
 [ 0  2  1  0  1]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.73      0.78        48
           1       0.18      0.25      0.21        16
           2       0.33      0.27      0.30        11
           3       0.71      0.45      0.56        11
           4       0.10      0.25      0.14         4

    accuracy                           0.53        90
   macro avg       0.43      0.39      0.40        90
weighted avg       0.61      0.53      0.56        90



## Evaluation

In [8]:
k = 5
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=1)
cv_scores = cross_val_score(log_reg, X, y, cv=kfold, scoring='accuracy')
print(f"{k}-Fold Cross Validation Results:")
print("Accuracy for each fold:", np.round(cv_scores, 3))
print("Mean Accuracy:", np.round(cv_scores.mean(), 3))
print("Standard Deviation:", np.round(cv_scores.std(), 3))

5-Fold Cross Validation Results:
Accuracy for each fold: [0.55  0.533 0.61  0.525 0.508]
Mean Accuracy: 0.545
Standard Deviation: 0.035
