In [1]:
import optuna
import joblib
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from stesml.model_tools import build_train_test_model

  from pandas import MultiIndex, Int64Index


In [3]:
def objective(trial):
    """Optuna objective: sample hyperparameters, then train and score one model.

    Reads the notebook-level settings ``model_type``, ``data_dir``, ``target``,
    ``metric``, ``n_repeats`` and the ``study`` object, so those must be
    defined before ``study.optimize`` is called.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameters for this evaluation.

    Returns
    -------
    The first value returned by ``build_train_test_model``; this is the score
    Optuna optimizes. The second returned value is discarded.

    Raises
    ------
    ValueError
        If ``model_type`` is not 'NN', 'XGBoost' or 'RandomForest'. Previously
        this case fell through every branch and failed with an opaque
        UnboundLocalError on ``return result``.
    """
    # Save the study before running the next trial. The timestamp only has
    # hour resolution, so dumps made within the same hour overwrite each other.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H")
    joblib.dump(study, "../studies/study_" + model_type + "_" + timestamp + ".pkl")

    # Each branch only decides the `scale` flag and the search space; the
    # training call is identical for every model type and is made once below.
    if model_type == 'NN':
        scale = True
        parameters = {
            'n_layers': trial.suggest_int("n_layers", 1, 5),
            'n_hidden_units': trial.suggest_int("n_hidden_units", 10, 100),
            'batch_size': trial.suggest_int("batch_size", 10, 10000, log=True),
        }
    elif model_type == 'XGBoost':
        scale = False
        parameters = {
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 1),
            'subsample': trial.suggest_float("subsample", 0.01, 1),
        }
    elif model_type == 'RandomForest':
        scale = False
        parameters = {
            'n_estimators': trial.suggest_int("n_estimators", 1, 200, log=True),
        }
    else:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            "expected 'NN', 'XGBoost' or 'RandomForest'."
        )

    result, _ = build_train_test_model(data_dir, model_type, target, metric, scale, parameters, n_repeats)
    return result

In [4]:
# Run configuration: everything a reader might tune lives in this cell.
data_dir = "../data/Sulfur_Models/"
model_type = 'XGBoost' # Options: NN, XGBoost, RandomForest
target = 'h' # Options: Tavg, h
metric = 'rmse' # Options: rmse, r2
n_repeats = 1 # Number of times to repeat 5-fold CV. Each repeat gives a different shuffle.

# Optimisation direction handed to optuna.create_study for each supported
# metric: an error metric is minimized, a goodness-of-fit score is maximized.
# (The previous if/elif assigned to a misspelled name in the 'r2' branch, which
# left `direction` undefined for that metric.)
METRIC_DIRECTIONS = {'rmse': 'minimize', 'r2': 'maximize'}
if metric not in METRIC_DIRECTIONS:
    raise ValueError(
        f"Unsupported metric {metric!r}; expected one of {sorted(METRIC_DIRECTIONS)}."
    )
direction = METRIC_DIRECTIONS[metric]

# Set load_study to True to resume from the pickled study named below (the
# file is looked up under ../studies/); otherwise a new study is created.
# NOTE(review): the default file name refers to an NN study while model_type
# above is XGBoost - confirm the name before enabling load_study.
load_study = False
study_name = "study_NN_20220630-18.pkl"

In [5]:
# Default path: start a brand-new in-memory study with the configured
# direction. Only when load_study is set do we resume from a pickled study
# and report the best trial it already contains.
if not load_study:
    study = optuna.create_study(direction=direction)
else:
    study = joblib.load("../studies/" + study_name)
    print("Best trial until now:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    # One indented "name: value" line per hyperparameter of the best trial.
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2022-07-01 13:23:05,469][0m A new study created in memory with name: no-name-eaa77fb7-ee41-491c-b886-895ebe2e41dc[0m


In [None]:
# Run the hyperparameter search: 50 trials of `objective` on the study
# created or loaded above.
study.optimize(func=objective, n_trials=50)

[0]	test-rmse:35.43438
[20]	test-rmse:3.16562
[40]	test-rmse:3.03699
[60]	test-rmse:2.96915
[80]	test-rmse:2.94310
[100]	test-rmse:2.92716
[120]	test-rmse:2.90888
[140]	test-rmse:2.90643
[160]	test-rmse:2.89698
[180]	test-rmse:2.89407
[200]	test-rmse:2.89461
[206]	test-rmse:2.89601
Split #0, This Result: 2.8960, Average Result: 2.8960
[0]	test-rmse:34.71487
[20]	test-rmse:4.86566
[32]	test-rmse:4.85124
Split #1, This Result: 4.8512, Average Result: 3.8736
[0]	test-rmse:37.07577
[20]	test-rmse:2.90944
[40]	test-rmse:2.68084
[60]	test-rmse:2.61667
[80]	test-rmse:2.59508
[100]	test-rmse:2.58445
[120]	test-rmse:2.57161
[131]	test-rmse:2.56919
Split #2, This Result: 2.5692, Average Result: 3.4388
[0]	test-rmse:38.60198
[20]	test-rmse:6.10948
[40]	test-rmse:6.04255
[60]	test-rmse:6.00368
[80]	test-rmse:5.98754
[100]	test-rmse:5.97862
[120]	test-rmse:5.97060
[140]	test-rmse:5.96933
[160]	test-rmse:5.96845
[180]	test-rmse:5.97014
[183]	test-rmse:5.96871
Split #3, This Result: 5.9687, Average R

[32m[I 2022-07-01 13:32:18,982][0m Trial 0 finished with value: 4.53370750359884 and parameters: {'learning_rate': 0.48277875344568977, 'subsample': 0.8711264251051485}. Best is trial 0 with value: 4.53370750359884.[0m


[0]	test-rmse:34.48186
[20]	test-rmse:3.92867
[40]	test-rmse:3.88196
[60]	test-rmse:3.88998
[75]	test-rmse:3.86691
Split #0, This Result: 3.8669, Average Result: 3.8669
[0]	test-rmse:32.33478
[20]	test-rmse:5.26821
[40]	test-rmse:5.38090
[43]	test-rmse:5.37188
Split #1, This Result: 5.3728, Average Result: 4.6199
[0]	test-rmse:35.40485
[20]	test-rmse:3.80706
[40]	test-rmse:3.74139
[60]	test-rmse:3.69773
[80]	test-rmse:3.73448
[81]	test-rmse:3.74214
Split #2, This Result: 3.7421, Average Result: 4.3273
[0]	test-rmse:37.12159
[20]	test-rmse:6.37744
[40]	test-rmse:6.26271
[60]	test-rmse:6.22249
[80]	test-rmse:6.21757
[100]	test-rmse:6.19866
[120]	test-rmse:6.14898
[140]	test-rmse:6.13860
[148]	test-rmse:6.14068
Split #3, This Result: 6.1404, Average Result: 4.7806
[0]	test-rmse:32.39539
[20]	test-rmse:6.55092
[40]	test-rmse:6.43380
[60]	test-rmse:6.49694
[64]	test-rmse:6.49969


[32m[I 2022-07-01 13:36:27,584][0m Trial 1 finished with value: 5.124469194550841 and parameters: {'learning_rate': 0.5167270301273807, 'subsample': 0.2438273694274839}. Best is trial 0 with value: 4.53370750359884.[0m


Split #4, This Result: 6.5001, Average Result: 5.1245
[0]	test-rmse:61.64845
[20]	test-rmse:10.12393
[40]	test-rmse:3.64425
[60]	test-rmse:3.29407
[80]	test-rmse:3.21854
[100]	test-rmse:3.15216
[120]	test-rmse:3.07960
[140]	test-rmse:3.06126
[160]	test-rmse:3.05812
[180]	test-rmse:3.03813
[200]	test-rmse:3.03485
[220]	test-rmse:3.03435
[226]	test-rmse:3.03375
Split #0, This Result: 3.0338, Average Result: 3.0338
[0]	test-rmse:59.95872
[20]	test-rmse:10.20899
[40]	test-rmse:4.73945
[60]	test-rmse:4.82384
[66]	test-rmse:4.82228
Split #1, This Result: 4.8223, Average Result: 3.9281
[0]	test-rmse:63.65412
[20]	test-rmse:11.06096
[40]	test-rmse:4.06471
[60]	test-rmse:2.97902
[80]	test-rmse:2.82283
[100]	test-rmse:2.76173
[120]	test-rmse:2.74258
[140]	test-rmse:2.69482
[160]	test-rmse:2.68050
[171]	test-rmse:2.68836
Split #2, This Result: 2.6867, Average Result: 3.5143
[0]	test-rmse:64.13353
[20]	test-rmse:14.52848
[40]	test-rmse:7.70284
[60]	test-rmse:6.92540
[80]	test-rmse:6.64081
[100]	te

[32m[I 2022-07-01 13:52:24,286][0m Trial 2 finished with value: 4.609374186843963 and parameters: {'learning_rate': 0.09007663646258796, 'subsample': 0.5675611471851605}. Best is trial 0 with value: 4.53370750359884.[0m


Split #4, This Result: 6.2107, Average Result: 4.6094
[0]	test-rmse:12.40833
[20]	test-rmse:4.80148
[40]	test-rmse:4.67581
[51]	test-rmse:4.70889
Split #0, This Result: 4.7089, Average Result: 4.7089
[0]	test-rmse:11.96041
[20]	test-rmse:6.16521
[21]	test-rmse:6.16537
Split #1, This Result: 6.1654, Average Result: 5.4372
[0]	test-rmse:14.00516
[20]	test-rmse:5.76684
[31]	test-rmse:5.52869
Split #2, This Result: 5.5287, Average Result: 5.4677
[0]	test-rmse:16.67754
[20]	test-rmse:6.99967
[40]	test-rmse:6.98941
[60]	test-rmse:6.96849
[78]	test-rmse:6.97156
Split #3, This Result: 6.9726, Average Result: 5.8439
[0]	test-rmse:12.80421
[20]	test-rmse:6.87436
[40]	test-rmse:6.77105
[60]	test-rmse:6.79914
[73]	test-rmse:6.81487


[32m[I 2022-07-01 14:02:56,785][0m Trial 3 finished with value: 6.0379612705021675 and parameters: {'learning_rate': 0.8763129003951626, 'subsample': 0.5610945203443286}. Best is trial 0 with value: 4.53370750359884.[0m


Split #4, This Result: 6.8142, Average Result: 6.0380
[0]	test-rmse:44.57804
[20]	test-rmse:3.43827


In [None]:
# Hyperparameters of the best trial recorded in the study so far.
best_params = study.best_params

In [18]:
# Display the best hyperparameters.
# NOTE(review): the saved output of this cell lists NN-style keys (n_layers,
# n_hidden_units, batch_size, epochs) although the configuration cell selects
# XGBoost; it looks like a leftover from an earlier run - re-run to refresh.
best_params

{'n_layers': 2, 'n_hidden_units': 79, 'batch_size': 339, 'epochs': 7}

In [21]:
# Objective value achieved by the best trial recorded in the study so far.
best_value = study.best_value

In [22]:
# Display the best objective value.
# NOTE(review): the execution counts of these result cells are not sequential
# with the cells above, so the saved value may come from a different run -
# confirm by re-running the notebook top to bottom.
best_value

1.0137617092656621