In [1]:
import optuna
import joblib
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from stesml.model_tools import build_train_test_model

  from pandas import MultiIndex, Int64Index


In [3]:
def objective(trial):
    """Optuna objective: sample hyperparameters for `model_type` and score them.

    Reads the notebook-level globals `study`, `model_type`, `data_dir`,
    `target`, `metric` and `n_repeats`.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The first element of the pair returned by `build_train_test_model`;
        this is the value the study optimizes.

    Raises:
        ValueError: If `model_type` is not 'NN', 'XGBoost' or 'RandomForest'.
    """
    # Save the study before running the next trial. The timestamp only has
    # hour resolution, so dumps made within the same hour overwrite each other.
    joblib.dump(study, "../studies/study_" + model_type + "_" + datetime.datetime.now().strftime("%Y%m%d-%H") + ".pkl")

    # Each branch only decides whether to scale and which hyperparameters to
    # sample; the model is built and evaluated once, after the branch.
    if model_type == 'NN':
        scale = True
        parameters = {
            'n_layers': trial.suggest_int("n_layers", 1, 5),
            'n_hidden_units': trial.suggest_int("n_hidden_units", 10, 100),
            'batch_size': trial.suggest_int("batch_size", 10, 10000, log=True),
            # Neglect optimizing # of epochs, allow early stopping to determine # of epochs
            'epochs': 20,
        }
    elif model_type == 'XGBoost':
        scale = False
        parameters = {
            'num_boost_round': trial.suggest_int("num_boost_round", 10, 2000, log=True),
            'learning_rate': trial.suggest_float("learning_rate", 0.01, 1),
            'subsample': trial.suggest_float("subsample", 0.01, 1),
        }
    elif model_type == 'RandomForest':
        scale = False
        parameters = {
            'n_estimators': trial.suggest_int("n_estimators", 1, 200, log=True),
        }
    else:
        # Previously an unknown model type fell through to `return result`
        # with `result` unbound; fail with a clear message instead.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'NN', 'XGBoost' or 'RandomForest'"
        )

    # The second element of the returned pair is not used by the objective.
    result, _ = build_train_test_model(data_dir, model_type, target, metric, scale, parameters, n_repeats)
    return result

In [4]:
# --- Run configuration -------------------------------------------------------
data_dir = "../data/Sulfur_Models/"
model_type = 'XGBoost' # Options: NN, XGBoost, RandomForest
target = 'h' # Options: Tavg, h
metric = 'rmse' # Options: rmse, r2
n_repeats = 1 # Number of times to repeat 5-fold CV. Each repeat gives a different shuffle.

# Optimization direction for each supported metric: an error metric is
# minimized, a goodness-of-fit score is maximized.
DIRECTION_BY_METRIC = {'rmse': 'minimize', 'r2': 'maximize'}
if metric not in DIRECTION_BY_METRIC:
    # Fail here rather than with a NameError on `direction` when the study is created.
    raise ValueError(f"Unknown metric {metric!r}; expected one of {sorted(DIRECTION_BY_METRIC)}")
direction = DIRECTION_BY_METRIC[metric]

# Set load_study to True to resume the pickled study named by study_name
# instead of creating a new one; study_name is only read in that case.
load_study = False
study_name = "study_NN_20220630-18.pkl"

In [5]:
# Start a new study, or resume a previously pickled one and report its best trial.
if not load_study:
    study = optuna.create_study(direction=direction)
else:
    # NOTE: joblib.load unpickles the file, so only load studies from a trusted source.
    study = joblib.load("../studies/" + study_name)
    print("Best trial until now:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for param_name, param_value in study.best_trial.params.items():
        print(f"    {param_name}: {param_value}")

[32m[I 2022-07-01 12:58:19,242][0m A new study created in memory with name: no-name-e816322b-dca0-482e-b61c-5881e40d4e48[0m


In [None]:
# Optimize `objective` for 50 trials. `objective` pickles the study at the
# start of every trial, so progress is checkpointed during this long run.
study.optimize(objective, n_trials=50)

[0]	test-rmse:42.00425
[20]	test-rmse:4.05371
[40]	test-rmse:3.87025
[60]	test-rmse:3.81885
[80]	test-rmse:3.80827
[100]	test-rmse:3.78953
[120]	test-rmse:3.77736
[140]	test-rmse:3.74959
[160]	test-rmse:3.74683
[180]	test-rmse:3.74111
[200]	test-rmse:3.74231
Split #0, This Result: 3.7423, Average Result: 3.7423
[0]	test-rmse:40.89905
[20]	test-rmse:4.39948
[40]	test-rmse:4.30621
[59]	test-rmse:4.38127
Split #1, This Result: 4.3813, Average Result: 4.0618
[0]	test-rmse:43.64683
[20]	test-rmse:2.91550
[40]	test-rmse:2.83054
[56]	test-rmse:2.86552
Split #2, This Result: 2.8646, Average Result: 3.6627
[0]	test-rmse:44.92471
[20]	test-rmse:6.59071
[40]	test-rmse:6.41923
[60]	test-rmse:6.39549
[80]	test-rmse:6.38345
[100]	test-rmse:6.36455
[118]	test-rmse:6.36429
Split #3, This Result: 6.3643, Average Result: 4.3381
[0]	test-rmse:41.14594
[20]	test-rmse:6.42169
[40]	test-rmse:6.35088
[60]	test-rmse:6.34314
[80]	test-rmse:6.29696
[100]	test-rmse:6.30743
[120]	test-rmse:6.29448
[136]	test-rmse

[32m[I 2022-07-01 13:06:21,286][0m Trial 0 finished with value: 4.728907386525946 and parameters: {'num_boost_round': 518, 'learning_rate': 0.38448171739473397, 'subsample': 0.6494435242693423}. Best is trial 0 with value: 4.728907386525946.[0m


Split #4, This Result: 6.2920, Average Result: 4.7289
[0]	test-rmse:29.43078
[20]	test-rmse:3.94879
[40]	test-rmse:3.84034
[42]	test-rmse:3.83426
Split #0, This Result: 3.8343, Average Result: 3.8343
[0]	test-rmse:28.18161
[20]	test-rmse:5.25564
[25]	test-rmse:5.25342
Split #1, This Result: 5.2509, Average Result: 4.5426
[0]	test-rmse:30.95565
[20]	test-rmse:3.41296


In [None]:
# Keep the best hyperparameter set found by the study so far.
best_params = study.best_params

In [18]:
# Display the best hyperparameters.
# NOTE(review): the saved output of this cell lists NN hyperparameters
# (n_layers, n_hidden_units, batch_size, epochs) although the config cell sets
# model_type = 'XGBoost', and the execution counts are out of sequence.
# The output looks stale from an earlier run; restart and re-run to confirm.
best_params

{'n_layers': 2, 'n_hidden_units': 79, 'batch_size': 339, 'epochs': 7}

In [21]:
# Keep the best objective value found by the study so far.
best_value = study.best_value

In [22]:
# Display the best objective value (in units of the configured `metric`).
best_value

1.0137617092656621