## Goal: Compare four Python-based hyperparameter optimization (HPO) libraries

1. Optuna - Bayesian optimization

2. Hyperopt - Bayesian optimization with TPE

3. Optunity - Evolutionary/nature-inspired algorithms

4. SMAC - Sequential Model-Based Algorithm Configuration

### Benchmarking Criteria:

1. CASH problem (Combined Algorithm Selection and Hyperparameter optimization)

2. NeurIPS Black-box Optimization Challenge


In [None]:
pip install openml

In [None]:
import openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import optuna
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from hyperopt import fmin, tpe, hp, Trials

In [None]:
# OpenML dataset IDs, keyed by the short name used for lookups in `datasets`.
dataset_ids = {
    "dna": 40670,
    "electricity": 151,    
    "gas_drift": 1476,     
    "nomao": 1486,
    "pendigits": 32,
    "semeion": 1501,      
}
# Downloaded data: short name -> DataFrame holding the features plus the target.
datasets = {}

for name, did in dataset_ids.items():
    print(f"Downloading: {name}")
    d = openml.datasets.get_dataset(did)
    # get_data() returns four values; only the features and target are kept.
    X, y, _, _ = d.get_data(target=d.default_target_attribute)
    # Column-wise concat, so the target ends up as the last column of the frame.
    df = pd.concat([X, y], axis=1)
    datasets[name] = df
    print(f"{name} → shape: {df.shape}")

In [None]:
def info(m):
    """Print a quick overview of the dataset stored under key ``m``.

    Shows the first rows, the ``DataFrame.info()`` report and the distinct
    values of the target column.

    Parameters
    ----------
    m : str
        Key into the module-level ``datasets`` dict.

    Raises
    ------
    KeyError
        If ``m`` is not a key of ``datasets``.
    """
    df = datasets[m]
    print(df.head())
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() would emit a stray "None".
    df.info()
    class_cols = [col for col in df.columns if str(col).lower() == "class"]
    # The download loop appends the target as the last column, so fall back to
    # that column when none is literally named "class".
    col_name = class_cols[0] if class_cols else df.columns[-1]
    print('Total Class:', df[col_name].unique())

info('electricity')

### K-fold cross-validation

In [None]:
# K defaults to 3
def kfold_cross_validation(model, X, y, n_splits=3, random_state=42):
    """Return the mean weighted F1 score of ``model`` over shuffled K-fold CV.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted on
        every fold.
    X, y : pandas objects
        Features and target; rows are selected positionally through ``.iloc``.
    n_splits : int, default 3
        Number of folds (K).
    random_state : int, default 42
        Seed of the fold shuffling, so that every configuration is scored on
        the same splits.

    Returns
    -------
    float
        Mean of the per-fold weighted F1 scores.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    f1_scores = []

    for train_index, val_index in kf.split(X):
        model.fit(X.iloc[train_index], y.iloc[train_index])
        preds = model.predict(X.iloc[val_index])
        f1_scores.append(f1_score(y.iloc[val_index], preds, average='weighted'))

    return np.mean(f1_scores)

## Benchmark 1

1. Bernoulli naive Bayes 
2. Multinomial naive Bayes 
3. Decision Tree 
4. Extra Trees 
5. Gradient Boosting 
6. Random Forest 
7. K Nearest Neighbors 
8. Logistic Regression 
9. Linear SVM 
10. SGD Classifier 
11. XGB Classifier 
12. LGBM Classifier 

Across these classifiers there are 58 hyperparameters in total, both continuous and categorical.

### Optuna

In [None]:
def objective(trial, X, y, classifier):
    """Optuna objective: sample hyperparameters for one classifier and score them.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    X, y
        Features and target, forwarded unchanged to ``kfold_cross_validation``.
    classifier : str
        One of ``'BernoulliNB'``, ``'MultinomialNB'``, ``'DecisionTree'``,
        ``'ExtraTrees'``, ``'GradientBoosting'``, ``'RandomForest'``, ``'KNN'``,
        ``'LogisticRegression'``, ``'SVC'``, ``'SGDClassifier'``,
        ``'XGBClassifier'`` or ``'LGBMClassifier'``.  The historical spellings
        ``'bernoulliNB'`` and ``'multinomialNB'`` are accepted as well.

    Returns
    -------
    float
        The value returned by ``kfold_cross_validation`` (to be maximised).

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported labels.
    optuna.TrialPruned
        If the sampled LogisticRegression configuration is invalid
        (``penalty='l1'`` together with ``solver='lbfgs'``).
    """
    # scikit-learn estimators take keyword-only constructor arguments, so every
    # hyperparameter below is passed by name.
    if classifier in ('BernoulliNB', 'bernoulliNB'):
        model = BernoulliNB(
            binarize=trial.suggest_categorical('binarize', [0.0, 0.5, 1.0]),
            alpha=trial.suggest_float('alpha', 0.0001, 1.0),
        )

    elif classifier in ('MultinomialNB', 'multinomialNB'):
        model = MultinomialNB(
            alpha=trial.suggest_float('alpha', 0.0001, 1.0),
            fit_prior=trial.suggest_categorical('fit_prior', [True, False]),
        )

    elif classifier == 'DecisionTree':
        model = DecisionTreeClassifier(
            criterion=trial.suggest_categorical("criterion", ['gini', 'entropy']),
            max_depth=trial.suggest_int('max_depth', 2, 32),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            max_features=trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            random_state=42,
        )

    elif classifier == 'ExtraTrees':
        model = ExtraTreesClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            criterion=trial.suggest_categorical("criterion", ["gini", "entropy"]),
            max_depth=trial.suggest_int("max_depth", 2, 32),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            random_state=42,
            n_jobs=-1,
        )

    elif classifier == 'GradientBoosting':
        model = GradientBoostingClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            max_depth=trial.suggest_int("max_depth", 2, 32),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            random_state=42,
        )

    elif classifier == 'RandomForest':
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            max_depth=trial.suggest_int("max_depth", 2, 32),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
            max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            random_state=42,
        )

    elif classifier == 'KNN':
        model = KNeighborsClassifier(
            n_neighbors=trial.suggest_int("n_neighbors", 3, 15),
            weights=trial.suggest_categorical("weights", ["uniform", "distance"]),
            algorithm=trial.suggest_categorical("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
        )

    elif classifier == 'LogisticRegression':
        penalty = trial.suggest_categorical("penalty", ["l2", "l1"])
        C = trial.suggest_float("C", 1e-3, 10.0)
        solver = trial.suggest_categorical("solver", ["lbfgs", "saga"])
        max_iter = trial.suggest_int("max_iter", 100, 1000)
        # lbfgs has no l1 support; fitting would raise and abort the whole
        # study, so the trial is pruned instead.
        if penalty == "l1" and solver == "lbfgs":
            raise optuna.TrialPruned()
        model = LogisticRegression(
            penalty=penalty,
            C=C,
            solver=solver,
            max_iter=max_iter,
            random_state=42,
        )

    elif classifier == 'SVC':
        model = SVC(
            C=trial.suggest_float("C", 1e-3, 10.0),
            kernel=trial.suggest_categorical("kernel", ["linear", "rbf", "poly"]),
            gamma=trial.suggest_categorical("gamma", ["scale", "auto"]),
            probability=True,
        )

    elif classifier == 'SGDClassifier':
        model = SGDClassifier(
            loss=trial.suggest_categorical("loss", ["hinge", "log_loss", "modified_huber"]),
            penalty=trial.suggest_categorical("penalty", ["l2", "l1", "elasticnet"]),
            alpha=trial.suggest_float("alpha", 1e-5, 1e-1),
            learning_rate=trial.suggest_categorical("learning_rate", ["constant", "optimal", "invscaling"]),
            eta0=trial.suggest_float("eta0", 1e-3, 1.0),
            random_state=42,
        )

    elif classifier == 'XGBClassifier':
        model = XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
        )

    elif classifier == 'LGBMClassifier':
        model = LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 50, 200),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            num_leaves=trial.suggest_int("num_leaves", 20, 100),
            subsample=trial.suggest_float("subsample", 0.5, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
            random_state=42,
        )

    else:
        # Fail loudly instead of returning None, which Optuna would record as
        # a failed trial without saying why.
        raise ValueError(f"Unknown classifier: {classifier!r}")

    return kfold_cross_validation(model, X, y)

In [None]:
import time

def run_optuna_search_TPE(X, y, classifier, n_trials=50, seed=None):
    """Tune ``classifier`` with Optuna's TPE sampler and time the search.

    Parameters
    ----------
    X, y
        Features and target, forwarded to ``objective``.
    classifier : str
        Classifier label understood by ``objective``.
    n_trials : int, default 50
        Number of Optuna trials to run.
    seed : int or None, default None
        Seed for the sampler; ``None`` keeps the search non-deterministic.

    Returns
    -------
    tuple
        ``(best_score, time_taken, best_params)`` where ``time_taken`` is the
        wall-clock duration of the search in seconds.
    """
    # Imported locally so the timing does not depend on the global name
    # `time`, which is easy to rebind in a notebook (e.g. by unpacking a
    # result into a variable called `time`).
    from time import perf_counter

    start_time = perf_counter()
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(lambda trial: objective(trial, X, y, classifier), n_trials=n_trials)
    time_taken = perf_counter() - start_time

    return study.best_value, time_taken, study.best_params

In [None]:
# Labels of the twelve Benchmark 1 classifiers.
# NOTE(review): make sure objective() and get_search_space() recognise exactly
# these spellings; their string comparisons are case-sensitive.
models = ['BernoulliNB', 'MultinomialNB', 'DecisionTree', 'ExtraTrees', 'GradientBoosting',
          'RandomForest', 'KNN', 'LogisticRegression', 'SVC', 'SGDClassifier', 'XGBClassifier', 'LGBMClassifier']

In [None]:
X = datasets['dna'].iloc[:,:-1]
y = datasets['dna'].iloc[:,-1]

score, time, params = run_optuna_search_TPE(X, y, 'DecisionTree')

In [None]:
print(score)
print(params)
print(time)

In [None]:
import time
def run_optuna_search_random(X, y, classifier, n_trials=50, seed=None):
    """Tune ``classifier`` with Optuna's random sampler and time the search.

    Parameters
    ----------
    X, y
        Features and target, forwarded to ``objective``.
    classifier : str
        Classifier label understood by ``objective``.
    n_trials : int, default 50
        Number of Optuna trials to run.
    seed : int or None, default None
        Seed for the sampler; ``None`` keeps the search non-deterministic.

    Returns
    -------
    tuple
        ``(best_score, time_taken, best_params)`` where ``time_taken`` is the
        wall-clock duration of the search in seconds.
    """
    # Imported locally so the timing does not depend on the global name
    # `time`, which is easy to rebind in a notebook.
    from time import perf_counter

    start_time = perf_counter()
    # The sampler must be given explicitly: create_study() without one falls
    # back to TPE, which would make this a second TPE run, not random search.
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.RandomSampler(seed=seed),
    )
    study.optimize(lambda trial: objective(trial, X, y, classifier), n_trials=n_trials)
    time_taken = perf_counter() - start_time

    return study.best_value, time_taken, study.best_params

score, time, params = run_optuna_search_ramdom(X, y, 'DecisionTree')



In [None]:
print(score)
print(params)
print(time)

### HyperOPT

In [None]:
def objective(params, classifier, X, y):
    """Hyperopt objective: build ``classifier`` from ``params`` and score it.

    NOTE(review): this definition reuses the name ``objective`` and therefore
    replaces the Optuna objective defined earlier in this file once the cell
    has run.

    Parameters
    ----------
    params : dict
        One sampled point of the search space for ``classifier``; it must hold
        the keys read by the matching branch below.
    classifier : str
        One of ``'BernoulliNB'``, ``'MultinomialNB'``, ``'DecisionTree'``,
        ``'ExtraTrees'``, ``'GradientBoosting'``, ``'RandomForest'``, ``'KNN'``,
        ``'LogisticRegression'``, ``'SVC'``, ``'SGDClassifier'``,
        ``'XGBClassifier'`` or ``'LGBMClassifier'``.  The historical spellings
        ``'bernoulliNB'`` and ``'multinomialNB'`` are accepted as well.
    X, y
        Features and target, forwarded unchanged to ``kfold_cross_validation``.

    Returns
    -------
    float
        The value returned by ``kfold_cross_validation``.  Higher is better,
        whereas ``hyperopt.fmin`` minimises, so callers must negate it.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported labels.
    """
    if classifier in ('BernoulliNB', 'bernoulliNB'):
        model = BernoulliNB(binarize=params['binarize'], alpha=params['alpha'])

    elif classifier in ('MultinomialNB', 'multinomialNB'):
        model = MultinomialNB(alpha=params['alpha'], fit_prior=params['fit_prior'])

    elif classifier == 'DecisionTree':
        model = DecisionTreeClassifier(
            criterion=params['criterion'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            max_features=params['max_features'],
            random_state=42,
        )

    elif classifier == 'ExtraTrees':
        model = ExtraTreesClassifier(
            n_estimators=params['n_estimators'],
            # The sampled criterion used to be read and then dropped.
            criterion=params['criterion'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            max_features=params['max_features'],
            random_state=42,
        )

    elif classifier == 'GradientBoosting':
        model = GradientBoostingClassifier(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            random_state=42,
        )

    elif classifier == 'RandomForest':
        model = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            max_features=params['max_features'],
            random_state=42,
        )

    elif classifier == 'KNN':
        model = KNeighborsClassifier(
            n_neighbors=params['n_neighbors'],
            weights=params['weights'],
            algorithm=params['algorithm'],
        )

    elif classifier == 'LogisticRegression':
        # lbfgs has no l1 support; fitting would raise and abort the search.
        # Report the worst possible F1 so the optimiser steers away instead.
        if params['penalty'] == 'l1' and params['solver'] == 'lbfgs':
            return 0.0
        model = LogisticRegression(
            penalty=params['penalty'],
            C=params['C'],
            solver=params['solver'],
            max_iter=params['max_iter'],
            random_state=42,
        )

    elif classifier == 'SVC':
        model = SVC(
            C=params['C'],
            kernel=params['kernel'],
            gamma=params['gamma'],
            probability=True,
        )

    elif classifier == 'SGDClassifier':
        model = SGDClassifier(
            loss=params['loss'],
            penalty=params['penalty'],
            alpha=params['alpha'],
            learning_rate=params['learning_rate'],
            eta0=params['eta0'],
            random_state=42,
        )

    elif classifier == 'XGBClassifier':
        model = XGBClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            use_label_encoder=False,
            eval_metric='logloss',
            random_state=42,
        )

    elif classifier == 'LGBMClassifier':
        model = LGBMClassifier(
            n_estimators=params['n_estimators'],
            learning_rate=params['learning_rate'],
            max_depth=params['max_depth'],
            num_leaves=params['num_leaves'],
            subsample=params['subsample'],
            colsample_bytree=params['colsample_bytree'],
            random_state=42,
        )

    else:
        # Fail loudly instead of silently returning None.
        raise ValueError(f"Unknown classifier: {classifier!r}")

    return kfold_cross_validation(model, X, y)

In [None]:
def _hp_int(label, low, high):
    """Integer hyperparameter drawn from ``low``..``high``, both inclusive."""
    # hp.randint excludes its upper bound; +1 keeps the range identical to the
    # inclusive suggest_int(low, high) ranges of the Optuna search space.
    return hp.randint(label, low, high + 1)


def get_search_space(classifier):
    """Return the Hyperopt search space for ``classifier``.

    Parameters
    ----------
    classifier : str
        One of ``'BernoulliNB'``, ``'MultinomialNB'``, ``'DecisionTree'``,
        ``'ExtraTrees'``, ``'GradientBoosting'``, ``'RandomForest'``, ``'KNN'``,
        ``'LogisticRegression'``, ``'SVC'``, ``'SGDClassifier'``,
        ``'XGBClassifier'`` or ``'LGBMClassifier'``.  The historical spellings
        ``'bernoulliNB'`` and ``'multinomialNB'`` are accepted as well.

    Returns
    -------
    dict
        Hyperparameter name -> Hyperopt expression.  Continuous parameters use
        ``hp.uniform``, integers ``hp.randint`` and categoricals ``hp.choice``.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported labels.
    """
    if classifier in ('BernoulliNB', 'bernoulliNB'):
        return {
            'binarize': hp.choice('binarize', [0.0, 0.5, 1.0]),
            'alpha': hp.uniform('alpha', 0.0001, 1.0),
        }

    elif classifier in ('MultinomialNB', 'multinomialNB'):
        return {
            'alpha': hp.uniform('alpha', 0.0001, 1.0),
            'fit_prior': hp.choice('fit_prior', [True, False]),
        }

    elif classifier == 'DecisionTree':
        return {
            'criterion': hp.choice('criterion', ['gini', 'entropy']),
            'max_depth': _hp_int('max_depth', 2, 32),
            'min_samples_split': _hp_int('min_samples_split', 2, 10),
            'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
        }

    elif classifier == 'ExtraTrees':
        return {
            'n_estimators': _hp_int('n_estimators', 50, 200),
            'criterion': hp.choice('criterion', ['gini', 'entropy']),
            'max_depth': _hp_int('max_depth', 2, 32),
            'min_samples_split': _hp_int('min_samples_split', 2, 10),
            'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
        }

    elif classifier == 'GradientBoosting':
        return {
            'n_estimators': _hp_int('n_estimators', 50, 200),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'max_depth': _hp_int('max_depth', 2, 32),
            'min_samples_split': _hp_int('min_samples_split', 2, 10),
        }

    elif classifier == 'RandomForest':
        return {
            'n_estimators': _hp_int('n_estimators', 50, 200),
            'max_depth': _hp_int('max_depth', 2, 32),
            'min_samples_split': _hp_int('min_samples_split', 2, 10),
            'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
        }

    elif classifier == 'KNN':
        return {
            'n_neighbors': _hp_int('n_neighbors', 3, 15),
            'weights': hp.choice('weights', ['uniform', 'distance']),
            'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
        }

    elif classifier == 'LogisticRegression':
        return {
            'penalty': hp.choice('penalty', ['l2', 'l1']),
            'C': hp.uniform('C', 1e-3, 10.0),
            'solver': hp.choice('solver', ['lbfgs', 'saga']),
            'max_iter': _hp_int('max_iter', 100, 1000),
        }

    elif classifier == 'SVC':
        return {
            'C': hp.uniform('C', 1e-3, 10.0),
            'kernel': hp.choice('kernel', ['linear', 'rbf', 'poly']),
            'gamma': hp.choice('gamma', ['scale', 'auto']),
        }

    elif classifier == 'SGDClassifier':
        return {
            'loss': hp.choice('loss', ['hinge', 'log_loss', 'modified_huber']),
            # Same three penalties as the Optuna search space.
            'penalty': hp.choice('penalty', ['l2', 'l1', 'elasticnet']),
            'alpha': hp.uniform('alpha', 1e-5, 1e-1),
            'learning_rate': hp.choice('learning_rate', ['constant', 'optimal', 'invscaling']),
            'eta0': hp.uniform('eta0', 1e-3, 1.0),
        }

    elif classifier == 'XGBClassifier':
        return {
            'n_estimators': _hp_int('n_estimators', 50, 200),
            'max_depth': _hp_int('max_depth', 3, 10),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
        }

    elif classifier == 'LGBMClassifier':
        return {
            'n_estimators': _hp_int('n_estimators', 50, 200),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'max_depth': _hp_int('max_depth', 3, 10),
            'num_leaves': _hp_int('num_leaves', 20, 100),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
        }

    # Fail loudly instead of handing fmin a None search space.
    raise ValueError(f"Unknown classifier: {classifier!r}")

In [None]:
import time

def hyperOPT_TPE(classifier, X, y, max_evals=50):
    """Tune ``classifier`` with Hyperopt's TPE algorithm and time the search.

    Parameters
    ----------
    classifier : str
        Classifier label understood by ``objective`` and ``get_search_space``.
    X, y
        Features and target, forwarded to ``objective``.
    max_evals : int, default 50
        Number of configurations to evaluate.

    Returns
    -------
    tuple
        ``(total_time, best)``: the wall-clock duration in seconds and the dict
        returned by ``fmin``.  For ``hp.choice`` parameters that dict holds the
        index of the chosen option; ``hyperopt.space_eval`` decodes it.
    """
    # Imported locally so the timing does not depend on the global name
    # `time`, which is easy to rebind in a notebook.
    from time import perf_counter

    start_time = perf_counter()
    trials = Trials()

    best = fmin(
        # Arguments follow objective(params, classifier, X, y).  The score is
        # negated because fmin minimises while a higher F1 is better.
        fn=lambda params: -objective(params, classifier, X, y),
        space=get_search_space(classifier),
        algo=tpe.suggest,
        max_evals=max_evals,
        trials=trials
    )
    total_time = perf_counter() - start_time
    return total_time, best

In [None]:
# `elapsed` (not `time`) keeps the `time` module reachable for later cells.
elapsed, best = hyperOPT_TPE('RandomForest',X,y)
print(elapsed)
print(best)

In [None]:
import time

def hyperOPT_Rand(classifier, X, y, max_evals=50):
    """Tune ``classifier`` with Hyperopt's random search and time the search.

    Parameters
    ----------
    classifier : str
        Classifier label understood by ``objective`` and ``get_search_space``.
    X, y
        Features and target, forwarded to ``objective``.
    max_evals : int, default 50
        Number of configurations to evaluate.

    Returns
    -------
    tuple
        ``(total_time, best)``: the wall-clock duration in seconds and the dict
        returned by ``fmin``.  For ``hp.choice`` parameters that dict holds the
        index of the chosen option; ``hyperopt.space_eval`` decodes it.
    """
    # Imported locally so the timing does not depend on the global name
    # `time`, which is easy to rebind in a notebook.
    from time import perf_counter
    # `rand` is not part of the file-level hyperopt import.
    from hyperopt import rand

    start_time = perf_counter()
    trials = Trials()

    best = fmin(
        # Arguments follow objective(params, classifier, X, y).  The score is
        # negated because fmin minimises while a higher F1 is better.
        fn=lambda params: -objective(params, classifier, X, y),
        space=get_search_space(classifier),
        algo=rand.suggest,
        max_evals=max_evals,
        trials=trials
    )
    total_time = perf_counter() - start_time
    return total_time, best

In [None]:
# Run the random-search variant defined just above (this cell used to repeat
# the TPE call); `elapsed` avoids shadowing the `time` module.
elapsed, best = hyperOPT_Rand('RandomForest',X,y)
print(elapsed)
print(best)

### Optunity