In [1]:
## Workflow
## We can run a first pass to identify the important parameters, and then optimise only those specific ones

import optuna
import pandas as pd
import numpy as np
import sys, os
import matplotlib.pyplot as plt
import logging

sys.path.append(os.getcwd() + "/../src/")

from utils import create_dataset
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm.sklearn import LGBMRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import SplineTransformer, PowerTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from sklearn.pipeline import Pipeline

# Build the working frame and replace its index with a fresh 0..n-1 range.
btc = create_dataset().reset_index(drop=True)

# Features of the final row, captured before rows with missing values are dropped.
# NOTE(review): presumably the latest observation, whose target is not yet known - confirm.
X_pred = btc.iloc[-1].drop(labels=['Date', 'target'])

# Discard every row that still contains a missing value.
btc = btc.dropna()

# Feature matrix and target vector, both cast to float64.
X = btc.drop(columns=['target', 'Date']).astype(np.float64)
y = btc['target'].astype(np.float64)

# Reproducibility and search-budget settings.
SEED = 2052
np.random.seed(SEED)
TS_SPLITS = 10  # number of folds passed to TimeSeriesSplit
NCALLS = 1000   # number of trials passed to study.optimize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _suggest_params(trial):
    """Sample one set of hyper-parameters for every pipeline component.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the values.

    Returns
    -------
    dict
        Keyword arguments grouped by component: 'spline', 'svm', 'rf',
        'ada' and 'lgbm'.
    """
    return {
        'spline': {
            'n_knots': trial.suggest_int('n_knots', 5, 20),
            'degree': trial.suggest_int('degree', 2, 5)
        },

        'svm': {
            'kernel': trial.suggest_categorical('svm__kernel', ['rbf', 'sigmoid']),
            # Range spans five orders of magnitude -> sample on a log scale,
            # otherwise almost every draw falls in the top decade.
            'gamma': trial.suggest_float('svm__gamma', 1e-5, 1, log=True),
            'C': trial.suggest_float('svm__C', 1, 1e2),
            'epsilon': trial.suggest_float('svm__epsilon', 1e-1, 1e1),
            'max_iter': 20000,
        },

        'rf': {
            'n_estimators': trial.suggest_int('rf__n_estimators', 100, 450),
            'max_depth': trial.suggest_int('rf__max_depth', 4, 50),
            'min_samples_split': trial.suggest_int('rf__min_samples_split', 2, 15),
            'min_samples_leaf': trial.suggest_int('rf__min_samples_leaf', 2, 15),
            'max_features': trial.suggest_categorical('rf__max_features', ['sqrt', 'log2', 1.0]),
            'min_impurity_decrease': trial.suggest_float('rf__min_impurity_decrease', 0, 1),
            'ccp_alpha': trial.suggest_float('rf__ccp_alpha', 0, 10),
            'random_state': SEED
        },

        'ada': {
            'n_estimators': trial.suggest_int('ada__n_estimators', 50, 350),
            # Eight orders of magnitude: log-uniform sampling is required for
            # small learning rates to be explored at all.
            'learning_rate': trial.suggest_float('ada__learning_rate', 1e-5, 1e3, log=True),
            'loss': trial.suggest_categorical('ada__loss', ['linear', 'square', 'exponential']),
            'random_state': SEED
        },

        'lgbm': {
            'n_estimators': trial.suggest_int('lgbm__n_estimators', 500, 3000),
            'max_depth': trial.suggest_int('lgbm__max_depth', 4, 50),
            'min_child_weight': trial.suggest_int('lgbm__min_child_weight', 1, 6),
            'learning_rate': trial.suggest_float('lgbm__learning_rate', 1e-5, 1, log=True),
            'reg_alpha': trial.suggest_float('lgbm__reg_alpha', 0, 1e1),
            'reg_lambda': trial.suggest_float('lgbm__reg_lambda', 0, 1e1),
            'verbosity': -1,
            'random_state': SEED
        },
    }


def _build_pipeline(params):
    """Build an unfitted power-transform -> spline -> voting-ensemble pipeline.

    Parameters
    ----------
    params : dict
        Output of ``_suggest_params``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A fresh, unfitted pipeline.
    """
    ensemble = VotingRegressor(
        estimators=[
            ('svm', SVR(**params['svm'])),
            ('rf', RandomForestRegressor(**params['rf'])),
            ('ada', AdaBoostRegressor(**params['ada'])),
            ('lgbm', LGBMRegressor(**params['lgbm']))
        ]
    )

    return Pipeline(
        [
            ('power-transformer', PowerTransformer()),
            ('spline', SplineTransformer(**params['spline'])),
            ('ensemble', ensemble)
        ]
    )


def objective(trial):
    """Optuna objective: mean absolute error over a time-series split.

    Reads the module-level ``X``, ``y`` and ``TS_SPLITS``. For each of the
    ``TS_SPLITS`` folds a fresh pipeline is fitted on the training indices and
    evaluated on the held-out sample; the running mean error is reported to
    the trial after every fold so the pruner can stop the trial early.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial supplying the hyper-parameters and receiving intermediate values.

    Returns
    -------
    float
        Mean of the per-fold absolute errors.

    Raises
    ------
    optuna.TrialPruned
        When the pruner decides the trial should stop after a fold.
    """
    # Hyper-parameters do not depend on the fold, so draw them once.
    params = _suggest_params(trial)

    ts_cv = TimeSeriesSplit(
        n_splits=TS_SPLITS,
        test_size=1
    )

    X_values, y_values = X.values, y.values
    scores = []

    for index, (train_index, test_index) in enumerate(ts_cv.split(X)):

        # A new pipeline per fold so nothing fitted leaks between folds.
        pipeline = _build_pipeline(params)
        pipeline.fit(X=X_values[train_index], y=y_values[train_index])

        # Indexing with an index array keeps the result 2-D, so no reshape is
        # needed and the code stays correct if test_size is ever raised.
        y_pred = pipeline.predict(X=X_values[test_index])

        scores.append(mean_absolute_error(y_values[test_index], y_pred))
        trial.report(float(np.mean(scores)), index)

        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(scores))

In [3]:
# Mirror Optuna's log records into ./logs/optuna_run.log.
log_path = "./logs/"
file = "optuna_run"

# FileHandler does not create missing directories; do it up front.
os.makedirs(log_path, exist_ok=True)
log_file = os.path.join(log_path, "{0}.log".format(file))

optuna_logger = optuna.logging.get_logger("optuna")

# Re-running this cell must not attach a second handler for the same file,
# otherwise every record would be written twice.
fileHandler = next(
    (
        handler
        for handler in optuna_logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(log_file)
    ),
    None,
)
if fileHandler is None:
    fileHandler = logging.FileHandler(log_file)
    optuna_logger.addHandler(fileHandler)

In [4]:
# Seed the sampler explicitly: np.random.seed does not affect Optuna's sampler,
# so without this the suggested parameters differ on every run.
# TPESampler is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    study_name='Ensemble Optimisation',
    sampler=optuna.samplers.TPESampler(seed=SEED),
    pruner=optuna.pruners.MedianPruner(),
    direction='minimize'
)
study.optimize(objective, n_trials=NCALLS)

[I 2024-07-14 17:15:19,771] A new study created in memory with name: Ensemble Optimisation
[I 2024-07-14 17:15:27,204] Trial 0 finished with value: 49.110581250475356 and parameters: {'n_knots': 8, 'degree': 2, 'svm__kernel': 'sigmoid', 'svm__gamma': 0.8292058019437462, 'svm__C': 48.60366630403845, 'svm__epsilon': 7.756276157359219, 'rf__n_estimators': 201, 'rf__max_depth': 27, 'rf__min_samples_split': 12, 'rf__min_samples_leaf': 4, 'rf__max_features': 'log2', 'rf__min_impurity_decrease': 0.30251181536816996, 'rf__ccp_alpha': 4.557522545004648, 'ada__n_estimators': 251, 'ada__learning_rate': 311.7426911194335, 'ada__loss': 'square', 'lgbm__n_estimators': 1087, 'lgbm__max_depth': 11, 'lgbm__min_child_weight': 6, 'lgbm__learning_rate': 0.9920348026810295, 'lgbm__reg_alpha': 6.44334066653067, 'lgbm__reg_lambda': 5.081835954797844}. Best is trial 0 with value: 49.110581250475356.
[I 2024-07-14 17:15:37,181] Trial 1 finished with value: 10.708713225578423 and parameters: {'n_knots': 15, 'de