In [1]:
import optuna
import joblib
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from stesml.model_tools import train_and_validate_model

  from pandas import MultiIndex, Int64Index


In [3]:
def objective(trial):
    """Optuna objective: sample hyperparameters for ``model_type`` and score them.

    Reads the notebook-level settings ``model_type``, ``data_dir``, ``target``,
    ``metric``, ``n_repeats`` and ``split_test_data`` as well as the ``study``
    object, so the configuration and study cells must have been run first.

    Args:
        trial: Optuna trial used to draw the hyperparameter values.

    Returns:
        The first element of the pair returned by ``train_and_validate_model``;
        this is the value Optuna optimizes.

    Raises:
        ValueError: If ``model_type`` is not 'NN', 'XGBoost' or 'RandomForest'.
    """
    # Fail fast on an unsupported model type. Without this check the function
    # would fall through every branch and die with an UnboundLocalError on
    # `result`, after having already written a study file.
    if model_type not in ('NN', 'XGBoost', 'RandomForest'):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'NN', 'XGBoost' or 'RandomForest'"
        )

    # Save the study before running the next trial
    # (the file name has hour resolution, so saves within the same hour overwrite each other).
    joblib.dump(study, "../studies/study_" + model_type + "_" + datetime.datetime.now().strftime("%Y%m%d-%H") + ".pkl")

    # Keyword arguments that only some model types pass to the training helper.
    extra_kwargs = {}

    if model_type == 'NN':
        scale = True
        parameters = {
            'n_layers': trial.suggest_int("n_layers", 1, 5),
            'n_hidden_units': trial.suggest_int("n_hidden_units", 10, 100),
            'batch_size': trial.suggest_int("batch_size", 1, 100, log=True),
            'epochs': 200,
        }
        # Only the NN run passes t_max to the helper.
        extra_kwargs['t_max'] = 360
    elif model_type == 'XGBoost':
        scale = False
        parameters = {
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 1),
            # Set this as a maximum, model will stop with early stopping.
            # NOTE(review): the logged XGBoost warning 'Parameters: { "num_boost_round" }
            # might not be used' suggests this key reaches the core params dict —
            # confirm how stesml.model_tools consumes it.
            'num_boost_round': 10000,
        }
    else:  # 'RandomForest' (guaranteed by the validation above)
        scale = False
        parameters = {
            'n_estimators': trial.suggest_int("n_estimators", 1, 200, log=True),
            'max_depth': trial.suggest_int("max_depth", 1, 100),
            'max_samples': trial.suggest_float("max_samples", .01, 1, log=True),
        }

    # The second element of the returned pair is not used here.
    result, _ = train_and_validate_model(
        data_dir, model_type, target, metric, scale, parameters, n_repeats,
        split_test_data=split_test_data, **extra_kwargs
    )
    return result

In [4]:
# Run configuration read by `objective` and by the study cells below.
data_dir = "../data/Sulfur_Models/heating/heating_all"
model_type = 'XGBoost' # Options: NN, XGBoost, RandomForest
target = 'h' # Options: Tavg, h
metric = 'rmse' # Options: rmse, r2
n_repeats = 1 # Number of times to repeat 5-fold CV. Each repeat gives a different shuffle.
split_test_data = True # Split data into train (64%), val (16%), and test (20%) (True) or just train (80%) and val (20%) (False)

# Optimization direction for each supported metric: an error metric is
# minimized, a goodness-of-fit score is maximized.
DIRECTION_BY_METRIC = {'rmse': 'minimize', 'r2': 'maximize'}
if metric not in DIRECTION_BY_METRIC:
    raise ValueError(
        f"Unsupported metric {metric!r}; expected one of {sorted(DIRECTION_BY_METRIC)}"
    )
direction = DIRECTION_BY_METRIC[metric]

# When load_study is True the study is resumed from ../studies/<study_name>;
# otherwise a new study is created with `direction`.
# NOTE(review): study_name is not derived from model_type — keep the two in sync by hand.
load_study = True
study_name = "study_XGBoost_20220718-14.pkl"

In [5]:
# Either start a fresh study or resume a previously saved one and report
# the best trial it has recorded so far.
if not load_study:
    study = optuna.create_study(direction=direction)
else:
    # joblib.load unpickles the file: only load study files you created yourself.
    study = joblib.load("../studies/" + study_name)
    print("Best trial until now:")
    best_trial = study.best_trial
    print(" Value: ", best_trial.value)
    print(" Params: ")
    for param_name, param_value in best_trial.params.items():
        print(f"    {param_name}: {param_value}")

Best trial until now:
 Value:  4.304502234734034
 Params: 
    learning_rate: 0.028326389959607595


In [6]:
# Run the hyperparameter search: request 250 trials of `objective` on `study`.
# `objective` saves the study at the start of each trial.
study.optimize(objective, n_trials=250)

Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	val-rmse:103.75775
[20]	val-rmse:78.71982
[40]	val-rmse:59.95507
[60]	val-rmse:45.74416
[80]	val-rmse:35.08567
[100]	val-rmse:27.08765
[120]	val-rmse:21.00374
[140]	val-rmse:16.65107
[160]	val-rmse:13.43546
[180]	val-rmse:11.16351
[200]	val-rmse:9.57466
[220]	val-rmse:8.47469
[240]	val-rmse:7.74722
[260]	val-rmse:7.29557
[280]	val-rmse:7.00392
[300]	val-rmse:6.84176
[320]	val-rmse:6.75688
[340]	val-rmse:6.67866
[360]	val-rmse:6.62701
[380]	val-rmse:6.59599
[400]	val-rmse:6.58846
[420]	val-rmse:6.56684
[438]	val-rmse:6.57442
Split #0, This Result: 6.5737, Average Result: 6.5737
Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with 

KeyboardInterrupt: 

In [None]:
# Hyperparameters of the best trial recorded in the study.
best_params = study.best_params

In [18]:
# Display the best hyperparameters as the cell output.
best_params

{'n_layers': 2, 'n_hidden_units': 79, 'batch_size': 339, 'epochs': 7}

In [21]:
# Objective value of the best trial recorded in the study.
best_value = study.best_value

In [22]:
# Display the best objective value as the cell output.
best_value

1.0137617092656621