In [1]:
import optuna
import joblib
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from stesml.model_tools import build_train_test_model

  from pandas import MultiIndex, Int64Index


In [3]:
def objective(trial):
    """Optuna objective: sample hyperparameters, train/test a model, return its score.

    Relies on notebook-level globals set in the configuration cell:
    ``study``, ``model_type``, ``data_dir``, ``target``, ``metric`` and
    ``n_repeats``.

    Args:
        trial: The Optuna trial used to sample hyperparameters for the
            currently selected ``model_type``.

    Returns:
        The first value returned by ``build_train_test_model``; Optuna
        optimizes this value.

    Raises:
        ValueError: If ``model_type`` is not 'NN', 'XGBoost' or 'RandomForest'.
            (Previously this surfaced as an UnboundLocalError on ``result``.)
    """
    # Save the study before running the next trial.
    # The file name is stamped to the hour, so dumps within the same hour
    # overwrite each other.
    joblib.dump(study, "../studies/study_" + model_type + "_" + datetime.datetime.now().strftime("%Y%m%d-%H") + ".pkl")

    if model_type == 'NN':
        scale = True
        n_layers = trial.suggest_int("n_layers", 1, 5)
        n_hidden_units = trial.suggest_int("n_hidden_units", 10, 100)
        batch_size = trial.suggest_int("batch_size", 10, 10000, log=True)
        parameters = {'n_layers': n_layers, 'n_hidden_units': n_hidden_units, 'batch_size': batch_size, 'epochs': 200}
        # NOTE(review): this branch passes t_max=360 while the XGBoost branch
        # passes t_min=360 -- confirm the asymmetry is intentional.
        result, _ = build_train_test_model(data_dir, model_type, target, metric, scale, parameters, n_repeats, t_max=360)
    elif model_type == 'XGBoost':
        scale = False
        learning_rate = trial.suggest_float("learning_rate", 0.01, 1)
        subsample = trial.suggest_float("subsample", 0.01, 1)
        colsample_bytree = trial.suggest_float("colsample_bytree", 0.01, 1)
        num_boost_round = 10000 # Set this as a maximum, model will stop with early stopping
        parameters = {'learning_rate': learning_rate, 'subsample': subsample, 'colsample_bytree': colsample_bytree, 'num_boost_round': num_boost_round}
        result, _ = build_train_test_model(data_dir, model_type, target, metric, scale, parameters, n_repeats, t_min=360)
    elif model_type == 'RandomForest':
        scale = False
        n_estimators = trial.suggest_int("n_estimators", 1, 200, log=True)
        max_depth = trial.suggest_int("max_depth", 1, 100)
        max_samples = trial.suggest_float("max_samples", .01, 1, log=True)
        parameters = {'n_estimators': n_estimators, 'max_depth': max_depth, 'max_samples': max_samples}
        result, _ = build_train_test_model(data_dir, model_type, target, metric, scale, parameters, n_repeats)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'NN', 'XGBoost' or 'RandomForest'."
        )

    return result

In [4]:
# Run configuration: everything a reader might tune for a study.
data_dir = "../data/Sulfur_Models/heating/heating_all"
model_type = 'XGBoost' # Options: NN, XGBoost, RandomForest
target = 'h' # Options: Tavg, h
metric = 'rmse' # Options: rmse, r2
n_repeats = 1 # Number of times to repeat 5-fold CV. Each repeat gives a different shuffle.

# Optimization direction implied by each supported metric. A lookup table
# replaces the earlier if/elif, whose 'r2' branch assigned a misspelled name
# (`directon`) and therefore never defined `direction`.
direction_by_metric = {'rmse': 'minimize', 'r2': 'maximize'}
if metric not in direction_by_metric:
    raise ValueError(
        f"Unsupported metric {metric!r}; expected one of {sorted(direction_by_metric)}."
    )
# Only consumed when a new study is created (load_study = False).
direction = direction_by_metric[metric]

# Set load_study to False to start a fresh study instead of resuming study_name.
load_study = True
study_name = "study_XGBoost_20220708-19.pkl"

In [5]:
if not load_study:
    # Start a brand-new study optimizing in the configured direction.
    study = optuna.create_study(direction=direction)
else:
    # Resume a previously saved study. joblib.load unpickles the file, so
    # only load study files from a trusted source.
    study = joblib.load("../studies/" + study_name)
    print("Best trial until now:")
    best_so_far = study.best_trial
    print(" Value: ", best_so_far.value)
    print(" Params: ")
    for key, value in best_so_far.params.items():
        print(f"    {key}: {value}")

Best trial until now:
 Value:  5.524612415643093
 Params: 
    learning_rate: 0.7303137624182325
    subsample: 0.7460404902385005
    colsample_bytree: 0.8555578185452548


In [None]:
# Run 250 further trials of `objective` on `study` (long-running cell).
# `objective` dumps the study to ../studies/ at the start of every trial,
# so an interrupted run can be resumed via `load_study` / `study_name`.
study.optimize(objective, n_trials=250)

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	test-rmse:32.26539
[20]	test-rmse:8.18279
[28]	test-rmse:8.63249
Split #0, This Result: 8.8979, Average Result: 8.8979
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	test-rmse:28.44063
[20]	test-rmse:9.57957
[40]	test-rmse:9.62026
[45]	test-rmse:9.61449
Split #1, This Result: 9.6149, Average Result: 9.2564
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, wi

[32m[I 2022-07-08 23:20:20,260][0m Trial 9 finished with value: 7.156570678081183 and parameters: {'learning_rate': 0.31184190124088107, 'subsample': 0.6107729263035355, 'colsample_bytree': 0.18476491504401746}. Best is trial 2 with value: 5.524612415643093.[0m


Split #4, This Result: 5.0902, Average Result: 7.1566
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	test-rmse:23.40025
[20]	test-rmse:9.61493
[23]	test-rmse:9.42959
Split #0, This Result: 9.5413, Average Result: 9.5413
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	test-rmse:21.76374
[20]	test-rmse:9.04227
[40]	test-rmse:9.66384
[42]	test-rmse:9.66759
Split #1, This Result: 9.5823, Average Result: 9.5618
Parameters: { "num_boost_round" }

[32m[I 2022-07-08 23:22:32,953][0m Trial 10 finished with value: 7.477252481847981 and parameters: {'learning_rate': 0.530736353445202, 'subsample': 0.5955359879866121, 'colsample_bytree': 0.5024268524729334}. Best is trial 2 with value: 5.524612415643093.[0m


Split #4, This Result: 5.9517, Average Result: 7.4773
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	test-rmse:44.22303
[20]	test-rmse:28.71423
[40]	test-rmse:19.63630
[60]	test-rmse:14.82174
[80]	test-rmse:12.35599
[100]	test-rmse:11.02347
[120]	test-rmse:10.09757
[140]	test-rmse:9.62545
[160]	test-rmse:9.35514
[180]	test-rmse:9.16467
[200]	test-rmse:8.92783
[220]	test-rmse:8.77007
[240]	test-rmse:8.61751
[260]	test-rmse:8.58850
[280]	test-rmse:8.41936
[300]	test-rmse:8.38986
[320]	test-rmse:8.37398
[340]	test-rmse:8.36272
[360]	test-rmse:8.34879
[380]	test-rmse:8.20878
[400]	test-rmse:8.17097
[414]	test-rmse:8.18536
Split #0, This Result: 8.1856, Average Result: 8.1856
Parameters: { "num_boost_round" } might n

In [None]:
# Hyperparameters of the best trial recorded in the study so far.
best_params = study.best_params

In [18]:
# Display the best hyperparameters.
# NOTE(review): the saved output of this cell lists NN hyperparameters
# (n_layers, n_hidden_units, ...) although the config cell selects XGBoost;
# it looks stale from an earlier run -- re-run after Restart & Run All.
best_params

{'n_layers': 2, 'n_hidden_units': 79, 'batch_size': 339, 'epochs': 7}

In [21]:
# Objective value (the configured `metric`) achieved by the best trial.
best_value = study.best_value

In [22]:
# Display the best objective value.
# NOTE(review): the saved output (about 1.01) does not match the best value
# printed when the XGBoost study was loaded (about 5.52); it appears to come
# from a different run -- re-run to refresh.
best_value

1.0137617092656621