In [None]:
# heart_prediction_with_download.py
# End-to-end heart disease prediction (downloads data, preprocessing, modeling, evaluation)
# Requires: pandas, numpy, scikit-learn, joblib
# Optional: imbalanced-learn (enables SMOTE; without it, random oversampling is used)
# pip install pandas numpy scikit-learn imbalanced-learn joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.utils import resample
import joblib
import os

# imbalanced-learn is an optional dependency: when it cannot be imported the
# script falls back to plain random oversampling in balance_data().
try:
    from imblearn.over_sampling import SMOTE
    _SMOTE_AVAILABLE = True
except Exception:
    # Deliberately broader than ImportError so that a present-but-broken
    # install still triggers the fallback, yet (unlike a bare ``except:``)
    # KeyboardInterrupt and SystemExit are not swallowed.
    _SMOTE_AVAILABLE = False

# Single seed shared by every randomized step so runs are reproducible.
RANDOM_STATE = 42

def download_data():
    """Fetch the processed Cleveland heart-disease file and cache it locally.

    The remote file has no header row, so the 14 column names are supplied
    here; "?" entries are read as missing values. The ``target`` column is
    collapsed to a binary label (1 for any value above 0, otherwise 0).

    Side effects:
        Writes the resulting table to ``heart.csv`` in the working directory
        and prints a confirmation line.

    Returns:
        pandas.DataFrame: the downloaded table with the binarized ``target``.
    """
    column_names = [
        "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach",
        "exang", "oldpeak", "slope", "ca", "thal", "target",
    ]
    source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
    frame = pd.read_csv(source_url, names=column_names, header=None, na_values="?")

    # Collapse the multi-valued label to binary: anything above 0 becomes 1.
    frame['target'] = (frame['target'] > 0).astype('int64')

    # Keep a local copy for later use.
    frame.to_csv("heart.csv", index=False)
    print("✅ Downloaded and saved dataset to heart.csv")
    return frame

def preprocess(df):
    """Split *df* into features/label and fit the preprocessing transformer.

    Columns of dtype int64/float64 are median-imputed and standardized.
    Every other feature column is only imputed with its most frequent value
    (no encoding is applied to it).

    Args:
        df: DataFrame containing the feature columns plus a ``target`` column.

    Returns:
        tuple: ``(X_pre, y, preprocessor)`` where ``X_pre`` is the transformed
        feature matrix, ``y`` is the ``target`` column, and ``preprocessor``
        is the ColumnTransformer fitted on the features of *df*.
    """
    features = df.drop(columns=['target'])
    labels = df['target']

    num_cols = list(features.select_dtypes(include=['int64', 'float64']).columns)
    other_cols = [col for col in features.columns if col not in num_cols]

    preprocessor = ColumnTransformer(
        transformers=[
            (
                'num',
                Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='median')),
                    ('scaler', StandardScaler()),
                ]),
                num_cols,
            ),
            (
                'cat',
                Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='most_frequent')),
                ]),
                other_cols,
            ),
        ],
        remainder='passthrough',
    )

    transformed = preprocessor.fit_transform(features)
    return transformed, labels, preprocessor

def balance_data(X, y):
    """Oversample the training data so every class matches the largest one.

    Uses SMOTE when imbalanced-learn was importable. Otherwise falls back to
    random oversampling with replacement: the most frequent class is kept
    as-is and every smaller class is resampled up to its size. The majority
    class is determined from the label counts rather than assumed to be 0.

    Args:
        X: feature matrix (array-like, rows are samples).
        y: labels aligned with the rows of *X* (Series, list or array).

    Returns:
        tuple: ``(X_balanced, y_balanced)``. In the fallback path
        ``X_balanced`` is a NumPy array and ``y_balanced`` is a pandas Series
        named ``target`` with a fresh 0..n-1 index.
    """
    if _SMOTE_AVAILABLE:
        return SMOTE(random_state=RANDOM_STATE).fit_resample(X, y)

    features = np.asarray(X)
    # np.asarray accepts Series, lists and arrays alike (no reliance on .values).
    labels = np.asarray(y)

    classes, counts = np.unique(labels, return_counts=True)
    if classes.size == 0:
        # Nothing to balance.
        return features, pd.Series(labels, name='target')

    majority_pos = int(counts.argmax())
    majority_label = classes[majority_pos]
    target_size = int(counts[majority_pos])

    # Work with row positions so features and labels stay aligned.
    row_groups = [np.flatnonzero(labels == majority_label)]
    for label, count in zip(classes, counts):
        if label == majority_label:
            continue
        rows = np.flatnonzero(labels == label)
        if count < target_size:
            # Draw with replacement until this class is as large as the majority.
            rows = resample(rows, replace=True, n_samples=target_size,
                            random_state=RANDOM_STATE)
        row_groups.append(rows)

    order = np.concatenate(row_groups)
    return features[order], pd.Series(labels[order], name='target')

def train_and_evaluate(X_train, y_train, X_test, y_test):
    """Fit three classifiers and score each one on the held-out data.

    Models: logistic regression, a random forest tuned with a 5-fold grid
    search scored by ROC AUC, and an SVC with probability estimates enabled.

    Args:
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.

    Returns:
        dict: keyed by ``'logistic'``, ``'random_forest'`` and ``'svc'``.
        Each value holds ``model``, ``accuracy``, ``roc_auc``, ``report``
        (classification report as a dict) and ``confusion``; the random
        forest entry additionally carries ``best_params``.
    """
    def summarize(fitted):
        # Score one already-fitted classifier on the held-out split.
        predicted = fitted.predict(X_test)
        positive_proba = fitted.predict_proba(X_test)[:, 1]
        return {
            'model': fitted,
            'accuracy': accuracy_score(y_test, predicted),
            'roc_auc': roc_auc_score(y_test, positive_proba),
            'report': classification_report(y_test, predicted, output_dict=True),
            'confusion': confusion_matrix(y_test, predicted),
        }

    results = {}

    # Logistic Regression
    logistic = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
    logistic.fit(X_train, y_train)
    results['logistic'] = summarize(logistic)

    # Random Forest (small grid over tree count and depth)
    forest_grid = {'n_estimators': [100, 200], 'max_depth': [None, 6, 10]}
    search = GridSearchCV(
        RandomForestClassifier(random_state=RANDOM_STATE),
        forest_grid,
        cv=5,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train)
    forest_summary = summarize(search.best_estimator_)
    forest_summary['best_params'] = search.best_params_
    results['random_forest'] = forest_summary

    # SVM
    svm = SVC(probability=True, random_state=RANDOM_STATE)
    svm.fit(X_train, y_train)
    results['svc'] = summarize(svm)

    return results

def save_best_model(results, preprocessor):
    """Persist the highest-ROC-AUC model together with its preprocessor.

    Args:
        results: mapping of model name to a dict with at least ``'model'``
            and ``'roc_auc'`` entries.
        preprocessor: fitted transformer placed in front of the chosen model.

    Side effects:
        Dumps the combined pipeline to ``best_heart_model.joblib`` and prints
        which model was saved. On ties the earliest entry wins.
    """
    # Start from a placeholder that any real ROC AUC beats.
    winner = ("", {'model': None, 'roc_auc': -1})
    for entry in results.items():
        if entry[1]['roc_auc'] > winner[1]['roc_auc']:
            winner = entry

    best_name, best_metrics = winner
    best_score = best_metrics['roc_auc']

    joblib.dump(
        make_pipeline(preprocessor, best_metrics['model']),
        "best_heart_model.joblib",
    )
    print(f"💾 Saved best model ({best_name}) with ROC AUC: {best_score:.4f}")

def main():
    """Run the full workflow: download, split, preprocess, balance, train, save.

    The train/test split happens on the raw table *before* the preprocessor
    is fitted, so imputation and scaling statistics are learned from the
    training rows only and the test rows are merely transformed. Fitting on
    the full table first would leak test-set statistics into training.
    """
    df = download_data()

    # Split raw rows first; stratify on the label to keep class ratios.
    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df['target'], random_state=RANDOM_STATE
    )

    # Fit the preprocessing on the training portion only...
    X_train, y_train, preprocessor = preprocess(train_df)
    # ...and apply the already-fitted transformer to the held-out rows.
    X_test = preprocessor.transform(test_df.drop(columns=['target']))
    y_test = test_df['target']

    # Only the training data is rebalanced; the test set keeps its natural mix.
    X_train_bal, y_train_bal = balance_data(X_train, y_train)
    results = train_and_evaluate(X_train_bal, y_train_bal, X_test, y_test)

    for name, res in results.items():
        print(f"\n=== {name.upper()} ===")
        print(f"Accuracy: {res['accuracy']:.4f}, ROC AUC: {res['roc_auc']:.4f}")
        print("Confusion Matrix:")
        print(res['confusion'])

    save_best_model(results, preprocessor)

if __name__ == "__main__":
    main()


✅ Downloaded and saved dataset to heart.csv

=== LOGISTIC ===
Accuracy: 0.8689, ROC AUC: 0.9448
Confusion Matrix:
[[27  6]
 [ 2 26]]

=== RANDOM_FOREST ===
Accuracy: 0.9016, ROC AUC: 0.9524
Confusion Matrix:
[[28  5]
 [ 1 27]]

=== SVC ===
Accuracy: 0.8525, ROC AUC: 0.9448
Confusion Matrix:
[[27  6]
 [ 3 25]]
💾 Saved best model (random_forest) with ROC AUC: 0.9524
