In [None]:
import numpy as np
import pandas as pd

from pathlib import Path
import yaml

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Model tuning

In [None]:
# Relative location of the input CSV; resolved against the current working directory
# when the file is read.
data_path = Path("../data/heart.csv")

In [None]:
# Load the dataset and separate the predictors from the 'HeartDisease' target column.
df = pd.read_csv(data_path)

X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# Hold out 20% of the rows for the final evaluation. Stratifying on the target keeps the
# class proportions the same in both partitions (consistent with the StratifiedKFold used
# during tuning); the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=99, stratify=y
)

In [None]:

def build_preprocessing_pipeline(X: pd.DataFrame) -> ColumnTransformer:
    """Create an unfitted ColumnTransformer tailored to the dtypes found in ``X``.

    Columns are split by dtype:

    * numeric columns are standardised with ``StandardScaler``;
    * ``object`` / ``category`` columns are one-hot encoded, with categories unseen
      during fitting ignored at transform time and dense output.

    Columns of any other dtype are dropped by the transformer (``remainder="drop"``).

    Args:
        X: Feature frame whose column dtypes decide which transformer each column gets.

    Returns:
        A ColumnTransformer containing only the transformer groups that have at least
        one matching column.
    """
    numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

    # Dense one-hot output keeps the result compatible with estimators that expect
    # plain arrays.
    candidates = [
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
    ]

    # Keep only the groups that actually matched columns.
    active = [spec for spec in candidates if spec[2]]

    return ColumnTransformer(transformers=active, remainder="drop")

In [None]:


# Build pipeline (uses the preprocessing builder defined above).
# Column types are detected from the training split only; the transformers themselves are
# re-fitted on each CV training fold because they live inside the pipeline being searched.
preprocessor = build_preprocessing_pipeline(X_train)
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("svc", SVC())   # hyperparameters (including probability) are supplied by the grid below
])

# Candidate gamma values. `.tolist()` yields builtin Python floats rather than np.float64,
# so the winning value can later be serialised with yaml.safe_dump.
gamma_grid = ["scale", "auto"] + np.logspace(-5, 1, num=10, endpoint=True).tolist()

# Settings searched for every kernel.
# probability=True makes predict_proba available on the fitted model but slows training.
shared_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__class_weight": [None, "balanced"],
    "svc__probability": [True],
}

# gamma only affects the rbf kernel here; the linear kernel ignores it, so searching gamma
# for linear models would just repeat identical fits. Use one sub-grid per kernel instead.
param_grid = [
    {**shared_grid, "svc__kernel": ["rbf"], "svc__gamma": gamma_grid},
    {**shared_grid, "svc__kernel": ["linear"]},
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=99)

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True   # refit the best configuration on the whole training split
)

# Fit on training data
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best cross-val ROC-AUC:", gs.best_score_)

# Evaluate on test set
best_pipe = gs.best_estimator_
y_pred = best_pipe.predict(X_test)

# Prefer predict_proba for ROC AUC if available, else use decision_function.
# No blanket try/except here: a failure while scoring should surface instead of being
# silently replaced by a different kind of score.
if hasattr(best_pipe, "predict_proba"):
    y_proba = best_pipe.predict_proba(X_test)[:, 1]
else:
    y_proba = best_pipe.decision_function(X_test)

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print(f"Test Accuracy: {acc:.4f}")
print(f"Test ROC-AUC: {auc:.4f}")

In [None]:
best_params = gs.best_params_

# Strip the pipeline step prefix so the keys are plain SVC argument names, and convert any
# NumPy scalar (e.g. an np.float64 gamma taken from np.logspace) to its builtin Python
# equivalent: yaml.safe_dump cannot represent NumPy scalar types.
clean_params = {
    k.replace("svc__", ""): (v.item() if isinstance(v, np.generic) else v)
    for k, v in best_params.items()
}

# Layout of the model configuration that gets written to disk.
config = {
    "model": {
        "type": "svc",
        "params": clean_params
    }
}

In [None]:
# Persist the tuned configuration as YAML, creating the target directory if it is missing.
config_path = Path("../config/model_config.yaml")
config_path.parent.mkdir(parents=True, exist_ok=True)

# sort_keys=False keeps the keys in the order they were inserted into `config`.
with config_path.open("w") as stream:
    yaml.safe_dump(config, stream, sort_keys=False)

print(f"Saved best parameters to {config_path}")