In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path
import yaml

from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, RocCurveDisplay
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.svm import SVC

import shap
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Basic Exploration

In [None]:
# Load the heart-disease CSV into a DataFrame and display it.
data_path = Path("../data/heart.csv")
df = pd.read_csv(filepath_or_buffer=data_path)
df

In [None]:
# Print pandas' summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# One histogram per numeric column, 30 bins each, on a single large figure.
df.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()

In [None]:
# Visualize categorical columns: one bar chart of value counts per
# object/category column.
plot_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
print("Plotting columns:", plot_cols)

if plot_cols:
    # Size the grid from the number of columns instead of assuming at most six.
    # At least two rows are kept so the layout stays 2x3 for six or fewer columns.
    n_grid_cols = 3
    n_grid_rows = max(2, -(-len(plot_cols) // n_grid_cols))  # ceiling division
    fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 4 * n_grid_rows))
    axes = axes.flatten()

    for ax, col in zip(axes, plot_cols):
        # dropna=False so missing values get their own bar.
        vc = df[col].value_counts(dropna=False)
        ax.bar(vc.index.astype(str), vc.values, color="C0")
        ax.set_title(f"{col} — counts")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")
        ax.tick_params(axis='x', rotation=45)

    # Turn off any unused subplot axes
    for ax in axes[len(plot_cols):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Baseline models

In [None]:
# One-hot encode the frame with pandas' get_dummies and display the result.
df_dm = pd.get_dummies(data=df)
df_dm

In [None]:
# Separate the features from the HeartDisease target.
# The features are cast to float explicitly: when the dummy columns are boolean
# (the default in recent pandas) and sit next to numeric columns, `.values`
# falls back to an object-dtype array.
X = df_dm.drop(columns='HeartDisease').to_numpy(dtype=float)
y = df_dm['HeartDisease'].to_numpy()

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def _positive_class_scores(clf, X, pred):
    """Return the best available ranking scores for the positive class.

    Preference order: column 1 of ``predict_proba``, then
    ``decision_function``, then the hard class predictions ``pred`` as a
    last resort.
    """
    try:
        return clf.predict_proba(X)[:, 1]
    except (AttributeError, NotImplementedError):
        # Only "no probability support" is treated as recoverable; any other
        # error is allowed to surface instead of being silently swallowed.
        print("*** probability prediction is not available")

    if hasattr(clf, "decision_function"):
        # Continuous scores give a proper ROC AUC, unlike 0/1 labels.
        return clf.decision_function(X)
    return pred


def simple_model(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` and report its performance on the hold-out split.

    Prints the hold-out accuracy, the mean ROC AUC from repeated stratified
    10-fold cross-validation on the training split, and the hold-out ROC AUC,
    then plots the ROC curve.

    Args:
        clf: Classifier exposing ``fit`` and ``predict``; it is fitted in
            place on the training data.
        X_train: Training features, used for fitting and cross-validation.
        y_train: Training labels.
        X_test: Hold-out features.
        y_test: Hold-out labels.

    Returns:
        None. Results are printed and the ROC curve is shown.
    """
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    acc = accuracy_score(y_test, pred)
    # A fixed random_state makes the repeated folds reproducible.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=99)
    cv_score = cross_val_score(clf, X_train, y_train, cv=cv, scoring='roc_auc').mean()

    print(f"Accuracy: {acc:.2%}")
    print(f"Cross Validation Score: {cv_score:.2%}\n")

    y_score = _positive_class_scores(clf, X_test, pred)
    auc = roc_auc_score(y_test, y_score)
    print(f"AUC Score: {auc:.2%}")

    fig, ax = plt.subplots(figsize=(4, 3))
    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
    ax.set_title("ROC_AUC_Plot")
    plt.show()

In [None]:
# Baseline 1: XGBoost classifier with log-loss as its evaluation metric.
simple_model(
    clf=xgb.XGBClassifier(eval_metric='logloss'),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# Baseline 2: linear-kernel SVM with probability estimates enabled.
# random_state seeds the shuffling behind those estimates so the reported AUC
# is reproducible; 99 matches the seed used elsewhere in this notebook.
simple_model(SVC(kernel='linear', probability=True, random_state=99), X_train, y_train, X_test, y_test)

# SHAP explanations

In [None]:
# Fit a fresh XGBoost classifier and explain its test-set predictions with
# SHAP's TreeExplainer.
clf = xgb.XGBClassifier(eval_metric='logloss')
clf.fit(X_train, y_train)

explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X_test)

# Label the plot with every column of the encoded frame except the target.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=df_dm.columns.drop('HeartDisease'),
)

In [None]:
# Explain a linear SVM with SHAP's LinearExplainer.
# The SVC inside the pipeline is trained on the output of the pipeline's own
# scaler, so the explainer is given data in that same transformed space rather
# than the pipeline's raw input.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="linear", probability=True))
])
pipeline.fit(X_train, y_train)

svc_scaler = pipeline.named_steps["scaler"]
X_train_svc = svc_scaler.transform(X_train)
X_test_svc = svc_scaler.transform(X_test)

# NOTE(review): newer shap releases reportedly deprecate `feature_perturbation`
# in favour of maskers — confirm against the installed version.
explainer = shap.LinearExplainer(pipeline.named_steps["svc"], X_train_svc, feature_perturbation="correlation_dependent")
shap_values = explainer.shap_values(X_test_svc)

# Visualize
shap.summary_plot(shap_values, X_test_svc, feature_names=df_dm.columns.drop('HeartDisease'))