## This notebook is used to run hyperparameter optimization studies with Optuna.

In [1]:
import optuna
import joblib
import datetime
import os
import pandas as pd

import xgboost as xgb
from xgboost import XGBRegressor

from eagle_jobs.operation_support import train_test_split
from eagle_jobs.data_preprocessing import label_encode_columns

from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def get_dfs(data_dir='../data/', filename='eagle_data.parquet', n_splits=20,
            training_window=100, testing_window=1):
    """Load the Eagle job data and build a series of train/test splits.

    The parquet file is read into a DataFrame, the ``user`` and ``partition``
    columns are handed to ``label_encode_columns``, and ``n_splits`` split
    times are placed at evenly spaced interior points between the earliest
    and latest ``submit_time``. Each split time is passed to
    ``train_test_split`` to obtain one (train, test) pair.

    Parameters
    ----------
    data_dir : str
        Directory that contains the parquet file.
    filename : str
        Name of the parquet file inside ``data_dir``.
    n_splits : int
        Number of split times (and therefore train/test pairs) to generate.
    training_window, testing_window
        Forwarded unchanged to ``train_test_split``.
        NOTE(review): units are defined by that helper (presumably days) — confirm.

    Returns
    -------
    tuple[list, list]
        ``(train_dfs, test_dfs)``; element ``k`` of each list belongs to the
        ``k``-th split time.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # Read the job records from the parquet file.
    filepath = os.path.join(data_dir, filename)
    eagle_df = pd.read_parquet(filepath)

    categorical_features = ['user', 'partition']
    # The return value is discarded, so the helper presumably encodes the
    # columns in place — TODO confirm against eagle_jobs.data_preprocessing.
    label_encode_columns(eagle_df, categorical_features)

    start_time = eagle_df.submit_time.min()
    end_time = eagle_df.submit_time.max()
    # n_splits + 2 evenly spaced timestamps; the first and last (the data's
    # own start and end) are dropped so only interior points remain.
    split_times = pd.date_range(start_time, end_time, periods=n_splits + 2)[1:-1]

    train_dfs = []
    test_dfs = []
    for split_time in split_times:
        train_df, test_df = train_test_split(
            eagle_df,
            split_time,
            training_window=training_window,
            testing_window=testing_window,
        )
        train_dfs.append(train_df)
        test_dfs.append(test_df)

    return train_dfs, test_dfs

In [5]:
# Build the splits once up front; fit_model reads train_dfs and test_dfs
# as notebook-level globals rather than receiving them as arguments.
train_dfs, test_dfs = get_dfs()

In [6]:
def load_data(i, train_dfs, test_dfs):
    """Return the feature/target views of the ``i``-th train/test split.

    Parameters
    ----------
    i : int
        Position of the split in both lists.
    train_dfs, test_dfs : sequence of DataFrame
        Parallel collections of training and testing frames.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the ``X`` items hold the
        seven feature columns and the ``y`` items hold the ``run_time`` column.
    """
    target = 'run_time'
    features = [
        'wallclock_req',
        'processors_req',
        'mem_req',
        'nodes_req',
        'gpus_req',
        'user',
        'partition',
    ]

    # Training side first, then testing side, mirroring the lookup order
    # callers have always seen.
    train_fold = train_dfs[i]
    X_train, y_train = train_fold[features], train_fold[target]

    test_fold = test_dfs[i]
    X_test, y_test = test_fold[features], test_fold[target]

    return X_train, y_train, X_test, y_test

In [7]:
# Fit and evaluate a model with the given hyperparameters on every train/test split.
def fit_model(params):
    """Train an XGBRegressor on each split and return the average test RMSE.

    The splits are taken from the notebook-level ``train_dfs`` and
    ``test_dfs`` lists. For every split a fresh model is fitted on the
    training data and scored on the test data.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBRegressor``. The dict is copied, so the
        caller's object is left unchanged; ``eval_metric`` is always set to
        ``'rmse'`` on the copy.

    Returns
    -------
    float
        Mean of the per-split RMSE values divided by 3600
        (presumably seconds converted to hours — confirm the unit of
        ``run_time``).

    Raises
    ------
    ValueError
        If there are no splits to evaluate.
    """
    n_splits = len(train_dfs)
    if n_splits == 0:
        raise ValueError("train_dfs is empty; build the splits with get_dfs() first")

    # Copy so that adding eval_metric does not leak into the caller's dict.
    model_params = dict(params)
    model_params['eval_metric'] = 'rmse'

    rmse_list = []
    for i in range(n_splits):
        X_train, y_train, X_test, y_test = load_data(i, train_dfs, test_dfs)
        model = XGBRegressor(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Take the square root of the MSE explicitly: the `squared=False`
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        rmse_list.append(rmse)

    return sum(rmse_list) / len(rmse_list) / 3600

### Define the objective function.
See https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/001_first.html#sphx-glr-tutorial-10-key-features-001-first-py for an introduction to using Optuna for hyperparameter optimization.

In [8]:
def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and score them.

    Before sampling, the notebook-level ``study`` object is pickled to
    ``../results/optuna_studies/`` under a name stamped with the current
    date and hour. Because the dump happens at the start of a trial, the
    file reflects the trials completed *before* this one.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the hyperparameter suggestions.

    Returns
    -------
    float
        The value returned by ``fit_model`` for the sampled parameters.
    """
    # Checkpoint the study as it stands before this trial starts.
    hour_stamp = datetime.datetime.now().strftime("%Y%m%d-%H")
    checkpoint_path = "../results/optuna_studies/study_" + hour_stamp + ".pkl"
    joblib.dump(study, checkpoint_path)

    # Draw the suggestions in a fixed order so the sampler sees the same
    # sequence of requests on every trial.
    n_trees = trial.suggest_int("n_estimators", 50, 200)
    tree_depth = trial.suggest_int("max_depth", 3, 7)
    step_size = trial.suggest_float("learning_rate", 0.1, 0.5)
    gamma_value = trial.suggest_float("gamma", 0, 1)
    row_fraction = trial.suggest_float("subsample", 0.5, 1)
    column_fraction = trial.suggest_float("colsample_bytree", 0.5, 1)

    params = {
        'learning_rate': step_size,
        'subsample': row_fraction,
        'colsample_bytree': column_fraction,
        'gamma': gamma_value,
        'max_depth': tree_depth,
        'n_estimators': n_trees,
        'random_state': 42,  # fixed seed for the model itself
    }
    return fit_model(params)

### If continuing a previous study, set `load_study` to `True` and provide the study name.
*Note:* Studies are saved in the `../results/optuna_studies/` directory.

In [22]:
# True: resume from the saved study named below; False: create a new study.
load_study = True
# File name of the saved study, looked up under ../results/optuna_studies/.
study_name = "study.pkl"

### Either load the previous study or create a new study.

In [23]:
if not load_study:
    # Start from scratch; the objective is an error measure, so lower is better.
    study = optuna.create_study(direction='minimize')
else:
    # Resume a previously saved study. joblib.load unpickles the file, so
    # only load study files that come from a trusted source.
    study = joblib.load("../results/optuna_studies/" + study_name)

    # Summarise the best result recorded so far.
    print("Best trial until now:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

Best trial until now:
 Value:  10.062161645354056
 Params: 
    n_estimators: 149
    max_depth: 6
    learning_rate: 0.2727776229229609
    gamma: 0.020592249476861557
    subsample: 0.9072211939661046
    colsample_bytree: 0.7581516666988656


### Run the study.
(Increase `n_trials` to achieve better hyperparameters)

In [24]:
# Run 2 trials of the objective; when the study was loaded from disk these
# are added on top of the trials it already contains.
study.optimize(objective, n_trials=2)

[32m[I 2023-05-09 22:34:39,919][0m Trial 2 finished with value: 10.693527701830137 and parameters: {'n_estimators': 124, 'max_depth': 5, 'learning_rate': 0.46500367062942416, 'gamma': 0.7971318540434682, 'subsample': 0.685912161746016, 'colsample_bytree': 0.7608341747243939}. Best is trial 0 with value: 10.062161645354056.[0m
[32m[I 2023-05-09 22:36:32,644][0m Trial 3 finished with value: 10.128774109507294 and parameters: {'n_estimators': 110, 'max_depth': 3, 'learning_rate': 0.24514684699626466, 'gamma': 0.855343913396993, 'subsample': 0.8394807982777521, 'colsample_bytree': 0.895700308265232}. Best is trial 0 with value: 10.062161645354056.[0m


### Print the hyperparameter and objective function values for the best trial in the study.

In [25]:
# Hyperparameters of the best trial recorded in the study.
print(study.best_params)

{'n_estimators': 149, 'max_depth': 6, 'learning_rate': 0.2727776229229609, 'gamma': 0.020592249476861557, 'subsample': 0.9072211939661046, 'colsample_bytree': 0.7581516666988656}


In [26]:
# Objective value of the best trial, i.e. what fit_model returned for it
# (mean RMSE across the splits divided by 3600).
print(study.best_value)

10.062161645354056
