# Imports

In [1]:
import numpy as np
import pandas as pd

import os
import pickle
import threading

from sklearn.feature_selection import RFECV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.base import clone

from xgboost import XGBClassifier

import optuna

# Demo dataset

In [2]:
# Generate some data (just for demonstration purposes)
X, y = make_classification(n_samples=1000, n_features=30, n_informative=10, random_state=42)

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Simple variant
**Do a hyperparameter optimization and then refit an RFECV + XGB with the best hyperparameters again.**

In [3]:
def objective(trial, X_train, y_train):

    # XGB hyperparameters (for better list of hyperparameters see https://github.com/optuna/optuna-examples/tree/main/xgboost)
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 10)
    }

    xgb_model = XGBClassifier(**params)

    # Combine the classifier with the scaler. This has the advantage that the scaler will be fit
    # automatically with the correct training data in every fold of the cross-validation in the RFECV
    classifier_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", xgb_model)
    ])

    # If you're doing this with a pipeline, you need to give RFECV the importance_getter
    importance_getter = "named_steps.classifier.feature_importances_"

    classifier = RFECV(classifier_pipeline, cv=3, n_jobs=4, scoring="f1", importance_getter=importance_getter)
    classifier.fit(X_train, y_train)

    # The RFECV already performs a cross-validation, so no custom validation set is needed!
    # This way the model used in the RFECV is exactly the same (same hyperparams) as the model used for evaluation.
    # We can just get the mean score for the best number of features (which is stored in n_features_ after fitting).
    # This line only works if rfecv.step==1 (I think, havent tested it with other values)
    score = classifier.cv_results_["mean_test_score"][classifier.n_features_ - 1] 
    
    trial.set_user_attr("n_features", classifier.n_features_) # Just for logging
    return score

In [4]:
# Do the hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, y_train), n_jobs=4, n_trials=50)

[I 2024-10-12 17:34:39,491] A new study created in memory with name: no-name-230f6837-1331-4589-9c76-54e0db442183
[I 2024-10-12 17:34:47,110] Trial 3 finished with value: 0.8460200957085044 and parameters: {'learning_rate': 0.05243446457557887, 'n_estimators': 279, 'max_depth': 1}. Best is trial 3 with value: 0.8460200957085044.
[I 2024-10-12 17:34:53,980] Trial 1 finished with value: 0.8421207139703824 and parameters: {'learning_rate': 0.0015335301046056164, 'n_estimators': 303, 'max_depth': 8}. Best is trial 3 with value: 0.8460200957085044.
[I 2024-10-12 17:34:54,862] Trial 0 finished with value: 0.9002548463326043 and parameters: {'learning_rate': 0.028566888124578346, 'n_estimators': 602, 'max_depth': 2}. Best is trial 0 with value: 0.9002548463326043.
[I 2024-10-12 17:34:56,766] Trial 2 finished with value: 0.8160865853895296 and parameters: {'learning_rate': 0.0072765397472806855, 'n_estimators': 850, 'max_depth': 1}. Best is trial 0 with value: 0.9002548463326043.
[I 2024-10-12

In [5]:
# Refit the model and RFECV with the best hyperparameters on all training data
best_params = study.best_params
xgb_model = XGBClassifier(**best_params)
classifier_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", xgb_model)
])

# The RFECV itself is an sklearn estimator! It can be used a normal classifier.
classifier = RFECV(classifier_pipeline, cv=3, n_jobs=4, scoring="f1", importance_getter="named_steps.classifier.feature_importances_")
classifier.fit(X_train, y_train)

# Evaluate the model on the test set
# Again: The RFECV object can be used similarly to a normal model
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"Number of features: {classifier.n_features_}")

Test score: 0.924
Number of features: 17


# Fancy variant
I've found a way to optimize the refit part: The exact RFECV refit in the cell above has already been done in one of the optuna trials.  
**So we can save the best classifier during the optimization and just access it later.**  
You can do this using a global variable or by saving the object to disk as a file. I first thought that a global variable doesn't work because of multithreading and implemented it with files. But global variables should actually work too and are simpler.  
As mentioned, optuna usually runs multithreaded, i.e. with multiple trials running at the same time.  
So you have to synchronize access to the file/variable with a [`threading.Lock`](https://docs.python.org/3/library/threading.html#lock-objects).

## With global variables

In [None]:
best_classifier = None
best_score = -1

In [None]:
def objective(trial, X_train, y_train, lock):

    global best_classifier
    global best_score

    # XGB hyperparameters (for better list of hyperparameters see https://github.com/optuna/optuna-examples/tree/main/xgboost)
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 10)
    }

    xgb_model = XGBClassifier(**params)

    # Combine the classifier with the scaler. This has the advantage that the scaler will be fit
    # automatically with the correct training data in every fold of the cross-validation in the RFECV
    classifier_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", xgb_model)
    ])
    # If you're doing this with a pipeline, you need to give RFECV the importance_getter
    importance_getter = "named_steps.classifier.feature_importances_"

    classifier = RFECV(classifier_pipeline, cv=3, n_jobs=1, scoring="f1", importance_getter=importance_getter)
    classifier.fit(X_train, y_train)

    # The RFECV already performs a cross-validation, so we can just get the mean score 
    # for the best number of features (which is the one stored in n_features_ after fitting)
    # This line only works if rfecv.step==1 (I think, havent tested it with other values)
    score = classifier.cv_results_["mean_test_score"][classifier.n_features_ - 1] 

    with lock: # Exclusive to avoid race conditions and lost updates with the other optuna threads
        if score > best_score:
            # Save best classifier
            best_classifier = classifier
            best_score = score
    
    trial.set_user_attr("n_features", classifier.n_features_)
    return score

In [None]:
# Do the hyperparameter optimization
study = optuna.create_study(direction="maximize")
lock = threading.Lock()
study.optimize(lambda trial: objective(trial, X_train, y_train, lock), n_jobs=16, n_trials=50)

[I 2024-10-12 18:33:13,694] A new study created in memory with name: no-name-139224cb-e533-432f-bfef-b19c649d4fad
[I 2024-10-12 18:34:22,460] Trial 6 finished with value: 0.6898220962091953 and parameters: {'learning_rate': 0.0005989028123519016, 'n_estimators': 462, 'max_depth': 1}. Best is trial 6 with value: 0.6898220962091953.
[I 2024-10-12 18:34:31,289] Trial 10 finished with value: 0.7973090503003516 and parameters: {'learning_rate': 0.0001025082173794712, 'n_estimators': 226, 'max_depth': 3}. Best is trial 10 with value: 0.7973090503003516.
[I 2024-10-12 18:34:48,163] Trial 12 finished with value: 0.9105669703617049 and parameters: {'learning_rate': 0.05527663090631521, 'n_estimators': 148, 'max_depth': 10}. Best is trial 12 with value: 0.9105669703617049.
[I 2024-10-12 18:34:51,882] Trial 9 finished with value: 0.8977680421174047 and parameters: {'learning_rate': 0.019275236694670378, 'n_estimators': 339, 'max_depth': 3}. Best is trial 12 with value: 0.9105669703617049.
[I 2024

In [None]:
# Evaluate the model on the test set
y_pred = best_classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"Number of features: {best_classifier.n_features_}")

              precision    recall  f1-score   support

           0       0.93      0.90      0.92       123
           1       0.91      0.94      0.92       127

    accuracy                           0.92       250
   macro avg       0.92      0.92      0.92       250
weighted avg       0.92      0.92      0.92       250

Number of features: 18


## With files

In [3]:
def objective(trial, X_train, y_train, temp_results_path, lock):

    # XGB hyperparameters (for better list of hyperparameters see https://github.com/optuna/optuna-examples/tree/main/xgboost)
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 1, 10)
    }

    xgb_model = XGBClassifier(**params)

    # Combine the classifier with the scaler. This has the advantage that the scaler will be fit
    # automatically with the correct training data in every fold of the cross-validation in the RFECV
    classifier_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", xgb_model)
    ])
    # If you're doing this with a pipeline, you need to give RFECV the importance_getter
    importance_getter = "named_steps.classifier.feature_importances_"

    classifier = RFECV(classifier_pipeline, cv=3, n_jobs=4, scoring="f1", importance_getter=importance_getter)
    classifier.fit(X_train, y_train)

    # The RFECV already performs a cross-validation, so we can just get the mean score 
    # for the best number of features (which is the one stored in n_features_ after fitting)
    # This line only works if rfecv.step==1 (I think, havent tested it with other values)
    score = classifier.cv_results_["mean_test_score"][classifier.n_features_ - 1] 

    with lock: # Exclusive to avoid race conditions and lost updates with the other optuna threads
        # Get current best score
        files = os.listdir(temp_results_path)
        if len(files) == 1:
            # Convert file name to score
            best_score = float(files[0].split("_")[-1].split(".")[0])
        elif len(files) == 0:
            best_score = -1
        else:
            raise ValueError(f"More than one file in {temp_results_path}! {files}")

        if score > best_score:
            if best_score != -1:
                # Delete old best estimator
                os.remove(os.path.join(temp_results_path, files[0]))
            # Save best estimator
            path = os.path.join(temp_results_path, f"temp_best_estimator_{score}.pkl")
            with open(path, 'wb') as f:
                # Saves the thing that we would retrain ourselves in the simpler variant
                pickle.dump(classifier, f)
    
    trial.set_user_attr("n_features", classifier.n_features_)
    return score

In [4]:
# Setup
temp_results_path = "temp_results" # Your path here
os.makedirs(temp_results_path, exist_ok=True)
lock = threading.Lock()

# Do the hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, X_train, y_train, temp_results_path, lock), n_jobs=4, n_trials=50)

[I 2024-10-12 18:02:27,307] A new study created in memory with name: no-name-13e8378d-b9f6-4922-9b6b-4b372e72b383
[I 2024-10-12 18:02:42,536] Trial 0 finished with value: 0.9076228333955191 and parameters: {'learning_rate': 0.04539687366328101, 'n_estimators': 594, 'max_depth': 2}. Best is trial 0 with value: 0.9076228333955191.
[I 2024-10-12 18:02:49,846] Trial 1 finished with value: 0.7837392249185079 and parameters: {'learning_rate': 0.0007979452630045476, 'n_estimators': 482, 'max_depth': 2}. Best is trial 0 with value: 0.9076228333955191.
[I 2024-10-12 18:02:53,753] Trial 2 finished with value: 0.8959800183365397 and parameters: {'learning_rate': 0.005892803155073984, 'n_estimators': 815, 'max_depth': 3}. Best is trial 0 with value: 0.9076228333955191.
[I 2024-10-12 18:03:00,768] Trial 3 finished with value: 0.9137847222222222 and parameters: {'learning_rate': 0.01954442022129439, 'n_estimators': 930, 'max_depth': 5}. Best is trial 3 with value: 0.9137847222222222.
[I 2024-10-12 1

In [5]:
# Now just load the best classifier, no new RFECV necessary
path = os.path.join(temp_results_path, os.listdir(temp_results_path)[0])
with open(path, 'rb') as f:
    classifier = pickle.load(f)

# Evaluate the model on the test set
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))
print(f"Number of features: {classifier.n_features_}")

Test score: 0.94
Number of features: 14
