In [3]:
# 06_hyperparameter_tuning_fixed.py

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# -----------------------------
# 1. Load dataset
# -----------------------------
DATA_PATH = "data/processed_cleaned.csv"
df = pd.read_csv(DATA_PATH)
X = df.drop(columns=['target']).values
y = df['target'].values

# Detect if binary classification
binary_class = len(np.unique(y)) == 2

# -----------------------------
# 2. Train-test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# -----------------------------
# 3. RandomForest GridSearch
# -----------------------------
rf_pipe = Pipeline([('scaler', StandardScaler()), ('clf', RandomForestClassifier(random_state=42))])
rf_param_grid = {
    'clf__n_estimators': [100, 200, 500],
    'clf__max_depth': [None, 5, 10],
    'clf__min_samples_split': [2, 5],
    'clf__max_features': ['auto', 0.8, 0.5]
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf_grid = GridSearchCV(rf_pipe, rf_param_grid, cv=cv, scoring='roc_auc_ovr' if not binary_class else 'roc_auc', n_jobs=-1, verbose=2)
rf_grid.fit(X_train, y_train)

print("Best RF params:", rf_grid.best_params_)
print("Best RF AUC (CV):", rf_grid.best_score_)

best_rf = rf_grid.best_estimator_

# Compute test AUC
if binary_class:
    y_proba_rf = best_rf.predict_proba(X_test)[:,1]
else:
    y_proba_rf = best_rf.predict_proba(X_test)
auc_rf = roc_auc_score(y_test, y_proba_rf, multi_class='ovr' if not binary_class else None)
print("RF Test AUC:", auc_rf)

# -----------------------------
# 4. SVM RandomizedSearch
# -----------------------------
svm_pipe = Pipeline([('scaler', StandardScaler()), ('clf', SVC(probability=True, random_state=42))])
svm_param_dist = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': ['scale', 'auto'],
    'clf__kernel': ['rbf', 'poly']
}
svm_search = RandomizedSearchCV(
    svm_pipe, svm_param_dist, n_iter=6, cv=cv,
    scoring='roc_auc_ovr' if not binary_class else 'roc_auc', n_jobs=-1, random_state=42
)
svm_search.fit(X_train, y_train)

print("Best SVM params:", svm_search.best_params_)
print("Best SVM AUC (CV):", svm_search.best_score_)

best_svm = svm_search.best_estimator_

if binary_class:
    y_proba_svm = best_svm.predict_proba(X_test)[:,1]
else:
    y_proba_svm = best_svm.predict_proba(X_test)
auc_svm = roc_auc_score(y_test, y_proba_svm, multi_class='ovr' if not binary_class else None)
print("SVM Test AUC:", auc_svm)

# -----------------------------
# 5. Select final model
# -----------------------------
final_model = best_rf if auc_rf >= auc_svm else best_svm
print("Selected final model:", final_model)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best RF params: {'clf__max_depth': 5, 'clf__max_features': 0.8, 'clf__min_samples_split': 5, 'clf__n_estimators': 100}
Best RF AUC (CV): 0.791053332525619
RF Test AUC: 0.7662321407838649
Best SVM params: {'clf__kernel': 'rbf', 'clf__gamma': 'scale', 'clf__C': 0.1}
Best SVM AUC (CV): 0.7874781372315427
SVM Test AUC: 0.7999719692823141
Selected final model: Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', SVC(C=0.1, probability=True, random_state=42))])


90 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\MsterX\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\MsterX\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\MsterX\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **la