In [1]:
## Workflow
## We can first run to get the important parameters first, and then look to optimise only those specific ones

import optuna
import pandas as pd
import numpy as np
import sys, os
import matplotlib.pyplot as plt
import logging

sys.path.append(os.getcwd() + "/../src/")

from utils import create_dataset
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import SplineTransformer, PowerTransformer
from sklearn.metrics import mean_absolute_error

from sklearn.pipeline import Pipeline

btc = create_dataset()
btc = btc.reset_index(drop=True)

X_pred = btc.iloc[-1].drop(['Date', 'target'])
btc = btc.dropna()

X, y = btc.drop(columns=['target', 'Date']).astype(np.float64), btc['target'].astype(np.float64)

SEED = 2052
np.random.seed(SEED)
TS_SPLITS = 10

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
n_samples = len(X)

def objective(trial):

    def suggest_params(trial):

        return {
            'spline': {
                'n_knots': trial.suggest_int('n_knots', 5, 20),
                'degree': trial.suggest_int('degree', 2, 5)
            },

            'svm': {
                'kernel': trial.suggest_categorical('svm__kernel', ['rbf', 'sigmoid']),
                'gamma': trial.suggest_float('svm__gamma', 1e-5, 1),
                'C': trial.suggest_float('svm__C', 1e-5, 1e4),
                'epsilon': trial.suggest_float('svm__epsilon', 1e-5, 1e4),
                'max_iter': 10000,
            },

            'rf': {
                'n_estimators': trial.suggest_int('rf__n_estimators', 50, 450),
                'max_depth': trial.suggest_int('rf__max_depth', 4, 16),
                'min_samples_split': trial.suggest_int('rf__min_samples_split', 2, 15),
                'min_samples_leaf': trial.suggest_int('rf__min_samples_lead', 2, 15),
                'max_features': trial.suggest_categorical('rf__max_features', ['sqrt', 'log2', 1.0]),
                'min_impurity_decrease': trial.suggest_float('rf__min_impurity_decrease', 1e-3, 1),
                'ccp_alpha': trial.suggest_float('rf__ccp_alpha', 1e-3, 1e2),
                'random_state': SEED
            },

            'ada': {
                'n_estimators': trial.suggest_int('ada__n_estimators', 50, 350),
                'learning_rate': trial.suggest_float('ada__learning_rate', 1e-5, 1e3),
                'loss': trial.suggest_categorical('ada__loss', ['linear', 'square', 'exponential']),
                'random_state': SEED

            },

            'lgbm': {
                'n_estimators': trial.suggest_int('lgbm__n_estimators', 500, 3000),
                'max_depth': trial.suggest_int('lgbm__max_depth', 4, 16),
                'min_child_weight': trial.suggest_int('lgbm__min_child_weight', 1, 6),
                'learning_rate': trial.suggest_float('lgbm__learning_rate', 1e-5, 1),
                'reg_alpha': trial.suggest_float('lgbm__reg_alpha', 1e-5, 1e2),
                'reg_lambda': trial.suggest_float('lgbm__reg_lambda', 1e-5, 1e2),
                'verbosity': -1,
                'random_state': SEED
            }
        }

    scores = []
    for index in range(TS_SPLITS):

        ## Time series split
        train_size = n_samples - (TS_SPLITS - index)
        X_train = X.values[:train_size + 1]
        y_train = y.values[:train_size + 1]

        X_val = X.values[train_size].reshape(1, -1)
        y_val = y.values[train_size]

        ## Suggesting parameters
        params = suggest_params(trial=trial)
        spline, svm, rf, ada, lgbm = params['spline'], params['svm'], params['rf'], params['ada'], params['lgbm']

        ## Construction of model
        ensemble = VotingRegressor(
            estimators=[
                ('svm', SVR(**svm)),
                ('rf', RandomForestRegressor(**rf)),
                ('ada', AdaBoostRegressor(**ada)),
                ('lgbm', LGBMRegressor(**lgbm))
            ]
        )

        pipeline = Pipeline(
            [
                ('power-transformer', PowerTransformer()),
                ('spline', SplineTransformer(**spline)),
                ('ensemble', ensemble)
            ]
        )

        # Fit model
        pipeline.fit(X=X_train, y=y_train)
        y_pred = pipeline.predict(X=X_val)

        # Scoring
        score = np.abs(y_pred - y_val)
        scores.append(score)
        trial.report(np.mean(scores), index)

        if trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)

In [3]:
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study = optuna.create_study(study_name='Ensemble Optimisation', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=5)

[I 2024-07-13 16:23:22,763] A new study created in memory with name: Ensemble Optimisation


A new study created in memory with name: Ensemble Optimisation


[I 2024-07-13 16:23:34,307] Trial 0 finished with value: 10.672126263180104 and parameters: {'n_knots': 18, 'degree': 5, 'svm__kernel': 'rbf', 'svm__gamma': 0.8248131460549702, 'svm__C': 5703.832601832904, 'svm__epsilon': 7323.595071941993, 'rf__n_estimators': 102, 'rf__max_depth': 15, 'rf__min_samples_split': 3, 'rf__min_samples_lead': 2, 'rf__max_features': 'log2', 'rf__min_impurity_decrease': 0.5352215428489109, 'rf__ccp_alpha': 24.947646765018845, 'ada__n_estimators': 58, 'ada__learning_rate': 772.1869696986244, 'ada__loss': 'square', 'lgbm__n_estimators': 1930, 'lgbm__max_depth': 7, 'lgbm__min_child_weight': 1, 'lgbm__learning_rate': 0.47140119553830134, 'lgbm__reg_alpha': 12.314997391673566, 'lgbm__reg_lambda': 89.20635522654716}. Best is trial 0 with value: 10.672126263180104.


Trial 0 finished with value: 10.672126263180104 and parameters: {'n_knots': 18, 'degree': 5, 'svm__kernel': 'rbf', 'svm__gamma': 0.8248131460549702, 'svm__C': 5703.832601832904, 'svm__epsilon': 7323.595071941993, 'rf__n_estimators': 102, 'rf__max_depth': 15, 'rf__min_samples_split': 3, 'rf__min_samples_lead': 2, 'rf__max_features': 'log2', 'rf__min_impurity_decrease': 0.5352215428489109, 'rf__ccp_alpha': 24.947646765018845, 'ada__n_estimators': 58, 'ada__learning_rate': 772.1869696986244, 'ada__loss': 'square', 'lgbm__n_estimators': 1930, 'lgbm__max_depth': 7, 'lgbm__min_child_weight': 1, 'lgbm__learning_rate': 0.47140119553830134, 'lgbm__reg_alpha': 12.314997391673566, 'lgbm__reg_lambda': 89.20635522654716}. Best is trial 0 with value: 10.672126263180104.


[I 2024-07-13 16:23:42,375] Trial 1 finished with value: 13.388204643584618 and parameters: {'n_knots': 9, 'degree': 3, 'svm__kernel': 'rbf', 'svm__gamma': 0.22105604442554946, 'svm__C': 4051.6170243029237, 'svm__epsilon': 5147.022016374605, 'rf__n_estimators': 186, 'rf__max_depth': 16, 'rf__min_samples_split': 5, 'rf__min_samples_lead': 4, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.8939631727403385, 'rf__ccp_alpha': 99.75273828711435, 'ada__n_estimators': 60, 'ada__learning_rate': 911.3968680968782, 'ada__loss': 'square', 'lgbm__n_estimators': 2999, 'lgbm__max_depth': 16, 'lgbm__min_child_weight': 4, 'lgbm__learning_rate': 0.8836221187482044, 'lgbm__reg_alpha': 40.04743529553038, 'lgbm__reg_lambda': 22.553064119775087}. Best is trial 0 with value: 10.672126263180104.


Trial 1 finished with value: 13.388204643584618 and parameters: {'n_knots': 9, 'degree': 3, 'svm__kernel': 'rbf', 'svm__gamma': 0.22105604442554946, 'svm__C': 4051.6170243029237, 'svm__epsilon': 5147.022016374605, 'rf__n_estimators': 186, 'rf__max_depth': 16, 'rf__min_samples_split': 5, 'rf__min_samples_lead': 4, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.8939631727403385, 'rf__ccp_alpha': 99.75273828711435, 'ada__n_estimators': 60, 'ada__learning_rate': 911.3968680968782, 'ada__loss': 'square', 'lgbm__n_estimators': 2999, 'lgbm__max_depth': 16, 'lgbm__min_child_weight': 4, 'lgbm__learning_rate': 0.8836221187482044, 'lgbm__reg_alpha': 40.04743529553038, 'lgbm__reg_lambda': 22.553064119775087}. Best is trial 0 with value: 10.672126263180104.


[I 2024-07-13 16:23:51,123] Trial 2 finished with value: 7.246637418278513 and parameters: {'n_knots': 11, 'degree': 5, 'svm__kernel': 'rbf', 'svm__gamma': 0.07288156647545967, 'svm__C': 2053.6808544166006, 'svm__epsilon': 8823.026572428462, 'rf__n_estimators': 239, 'rf__max_depth': 8, 'rf__min_samples_split': 4, 'rf__min_samples_lead': 4, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.844796400797656, 'rf__ccp_alpha': 77.29323670325599, 'ada__n_estimators': 289, 'ada__learning_rate': 370.23123598706485, 'ada__loss': 'linear', 'lgbm__n_estimators': 962, 'lgbm__max_depth': 7, 'lgbm__min_child_weight': 3, 'lgbm__learning_rate': 0.5986711551178933, 'lgbm__reg_alpha': 74.56474838929846, 'lgbm__reg_lambda': 90.87481874671016}. Best is trial 2 with value: 7.246637418278513.


Trial 2 finished with value: 7.246637418278513 and parameters: {'n_knots': 11, 'degree': 5, 'svm__kernel': 'rbf', 'svm__gamma': 0.07288156647545967, 'svm__C': 2053.6808544166006, 'svm__epsilon': 8823.026572428462, 'rf__n_estimators': 239, 'rf__max_depth': 8, 'rf__min_samples_split': 4, 'rf__min_samples_lead': 4, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.844796400797656, 'rf__ccp_alpha': 77.29323670325599, 'ada__n_estimators': 289, 'ada__learning_rate': 370.23123598706485, 'ada__loss': 'linear', 'lgbm__n_estimators': 962, 'lgbm__max_depth': 7, 'lgbm__min_child_weight': 3, 'lgbm__learning_rate': 0.5986711551178933, 'lgbm__reg_alpha': 74.56474838929846, 'lgbm__reg_lambda': 90.87481874671016}. Best is trial 2 with value: 7.246637418278513.


[I 2024-07-13 16:24:01,416] Trial 3 finished with value: 5.359505551590854 and parameters: {'n_knots': 18, 'degree': 5, 'svm__kernel': 'sigmoid', 'svm__gamma': 0.9957981636569726, 'svm__C': 9870.011594535108, 'svm__epsilon': 5197.280771629869, 'rf__n_estimators': 416, 'rf__max_depth': 12, 'rf__min_samples_split': 10, 'rf__min_samples_lead': 7, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.872662051401799, 'rf__ccp_alpha': 51.71170468544542, 'ada__n_estimators': 249, 'ada__learning_rate': 341.54109351633184, 'ada__loss': 'linear', 'lgbm__n_estimators': 963, 'lgbm__max_depth': 13, 'lgbm__min_child_weight': 6, 'lgbm__learning_rate': 0.6035756136220481, 'lgbm__reg_alpha': 49.206650526786525, 'lgbm__reg_lambda': 3.046962307327141}. Best is trial 3 with value: 5.359505551590854.


Trial 3 finished with value: 5.359505551590854 and parameters: {'n_knots': 18, 'degree': 5, 'svm__kernel': 'sigmoid', 'svm__gamma': 0.9957981636569726, 'svm__C': 9870.011594535108, 'svm__epsilon': 5197.280771629869, 'rf__n_estimators': 416, 'rf__max_depth': 12, 'rf__min_samples_split': 10, 'rf__min_samples_lead': 7, 'rf__max_features': 'sqrt', 'rf__min_impurity_decrease': 0.872662051401799, 'rf__ccp_alpha': 51.71170468544542, 'ada__n_estimators': 249, 'ada__learning_rate': 341.54109351633184, 'ada__loss': 'linear', 'lgbm__n_estimators': 963, 'lgbm__max_depth': 13, 'lgbm__min_child_weight': 6, 'lgbm__learning_rate': 0.6035756136220481, 'lgbm__reg_alpha': 49.206650526786525, 'lgbm__reg_lambda': 3.046962307327141}. Best is trial 3 with value: 5.359505551590854.


[I 2024-07-13 16:24:18,126] Trial 4 finished with value: 7.63606653681034 and parameters: {'n_knots': 17, 'degree': 4, 'svm__kernel': 'rbf', 'svm__gamma': 0.5574602601902808, 'svm__C': 1736.066525415641, 'svm__epsilon': 4851.958474213238, 'rf__n_estimators': 272, 'rf__max_depth': 16, 'rf__min_samples_split': 15, 'rf__min_samples_lead': 10, 'rf__max_features': 'log2', 'rf__min_impurity_decrease': 0.24848689713903432, 'rf__ccp_alpha': 86.76270192596559, 'ada__n_estimators': 264, 'ada__learning_rate': 872.1601189722477, 'ada__loss': 'square', 'lgbm__n_estimators': 2176, 'lgbm__max_depth': 13, 'lgbm__min_child_weight': 5, 'lgbm__learning_rate': 0.08630800568243575, 'lgbm__reg_alpha': 31.60190511532843, 'lgbm__reg_lambda': 71.06670533551456}. Best is trial 3 with value: 5.359505551590854.


Trial 4 finished with value: 7.63606653681034 and parameters: {'n_knots': 17, 'degree': 4, 'svm__kernel': 'rbf', 'svm__gamma': 0.5574602601902808, 'svm__C': 1736.066525415641, 'svm__epsilon': 4851.958474213238, 'rf__n_estimators': 272, 'rf__max_depth': 16, 'rf__min_samples_split': 15, 'rf__min_samples_lead': 10, 'rf__max_features': 'log2', 'rf__min_impurity_decrease': 0.24848689713903432, 'rf__ccp_alpha': 86.76270192596559, 'ada__n_estimators': 264, 'ada__learning_rate': 872.1601189722477, 'ada__loss': 'square', 'lgbm__n_estimators': 2176, 'lgbm__max_depth': 13, 'lgbm__min_child_weight': 5, 'lgbm__learning_rate': 0.08630800568243575, 'lgbm__reg_alpha': 31.60190511532843, 'lgbm__reg_lambda': 71.06670533551456}. Best is trial 3 with value: 5.359505551590854.
