## This notebook is used to run hyperparameter optimization studies with Optuna.

In [1]:
import optuna
import joblib
import datetime
import os
import pandas as pd

import xgboost as xgb
from xgboost import XGBRegressor

from hpc_runtime_prediction.operation_support import train_test_split
from hpc_runtime_prediction.data_preprocessing import label_encode_columns

from sklearn.metrics import mean_squared_error

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_dfs(data_dir='../data/', filename='eagle_data_anonymized_20230222.pkl',
            n_splits=20, training_window=100, testing_window=1,
            categorical_features=('user', 'partition')):
    """Load the job dataset and build a series of time-based train/test splits.

    The pickle file is read into a DataFrame, the categorical columns are
    passed to ``label_encode_columns``, and ``n_splits`` split timestamps are
    placed evenly between the earliest and latest ``submit_time`` (both
    endpoints excluded). For each split timestamp ``train_test_split`` is
    called to produce one training and one testing frame.

    Parameters
    ----------
    data_dir : str
        Directory containing the pickled dataset.
    filename : str
        File name of the pickled dataset inside ``data_dir``.
    n_splits : int
        Number of train/test splits to generate (must be >= 1).
    training_window, testing_window :
        Forwarded unchanged to ``train_test_split``.
        NOTE(review): units are defined by that helper (presumably days) — confirm.
    categorical_features : iterable of str
        Column names handed to ``label_encode_columns``.

    Returns
    -------
    (list, list)
        ``train_dfs`` and ``test_dfs``, index-aligned, one entry per split.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be at least 1")

    # NOTE(review): unpickling can execute arbitrary code; only read files
    # from a trusted source.
    filepath = os.path.join(data_dir, filename)
    eagle_df = pd.read_pickle(filepath)

    # The return value is not used, so the helper is presumably encoding the
    # columns of eagle_df in place — TODO confirm.
    label_encode_columns(eagle_df, list(categorical_features))

    start_time = eagle_df.submit_time.min()
    end_time = eagle_df.submit_time.max()
    # n_splits + 2 evenly spaced timestamps include both endpoints; dropping
    # the first and last leaves exactly n_splits interior split points.
    split_times = pd.date_range(start_time, end_time, periods=n_splits + 2)[1:-1]

    train_dfs = []
    test_dfs = []
    for split_time in split_times:
        train_df, test_df = train_test_split(
            eagle_df,
            split_time,
            training_window=training_window,
            testing_window=testing_window,
        )
        train_dfs.append(train_df)
        test_dfs.append(test_df)

    return train_dfs, test_dfs

In [4]:
# Build the train/test splits once at notebook level; fit_model reads these
# two names as globals on every trial.
train_dfs, test_dfs = get_dfs()

In [5]:
def load_data(i, train_dfs, test_dfs):
    """Return the feature matrix and target column for split number ``i``.

    Parameters
    ----------
    i : int
        Index of the split to extract.
    train_dfs, test_dfs : sequence of DataFrame
        Index-aligned training and testing frames.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` entries hold
        the model input columns and the ``y_*`` entries hold ``run_time``.
    """
    label_column = 'run_time'
    input_columns = [
        'wallclock_req',
        'processors_req',
        'mem_req',
        'nodes_req',
        'gpus_req',
        'user',
        'partition',
    ]

    def _inputs_and_label(frame):
        # Select the model inputs first, then the label, from one frame.
        return frame[input_columns], frame[label_column]

    X_train, y_train = _inputs_and_label(train_dfs[i])
    X_test, y_test = _inputs_and_label(test_dfs[i])
    return X_train, y_train, X_test, y_test

In [6]:
# Define the function that fits and evaluates a model with the given hyperparameters on every train/test split
def fit_model(params):
    """Train and score an XGBRegressor on every train/test split.

    For each index in the module-level ``train_dfs`` / ``test_dfs`` lists a
    fresh ``XGBRegressor`` is fitted on the training frame and evaluated on
    the matching testing frame. No early stopping is used.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``XGBRegressor``. The dict is not modified;
        ``eval_metric`` is forced to ``'rmse'`` on an internal copy.

    Returns
    -------
    float
        Mean RMSE over all splits divided by 3600.
        NOTE(review): the division presumably converts seconds to hours —
        confirm the unit of ``run_time``.

    Raises
    ------
    ValueError
        If ``train_dfs`` is empty, since no average can be computed.
    """
    if not train_dfs:
        raise ValueError("train_dfs is empty; run get_dfs() before calling fit_model")

    # Work on a copy so the caller's dict is left untouched.
    model_params = {**params, 'eval_metric': 'rmse'}

    rmse_list = []
    for i in range(len(train_dfs)):
        X_train, y_train, X_test, y_test = load_data(i, train_dfs, test_dfs)
        model = XGBRegressor(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Take the square root explicitly instead of passing squared=False,
        # which is deprecated and later removed in newer scikit-learn releases.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        rmse_list.append(rmse)

    return sum(rmse_list) / len(rmse_list) / 3600

In [14]:
def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and score them.

    Before sampling, the study that owns ``trial`` is pickled to the studies
    directory so that progress from earlier trials survives an interruption.
    The file name carries a timestamp with hour resolution, so checkpoints
    written within the same hour overwrite each other.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        The value returned by ``fit_model`` for the sampled parameters
        (the quantity the study minimizes).
    """
    # Save the study before running the next trial. trial.study is used
    # instead of a notebook-level global so the function does not depend on
    # a variable defined in a later cell.
    study_dir = "../results/hyper_opt/optuna_studies"
    os.makedirs(study_dir, exist_ok=True)  # joblib.dump fails if the directory is missing
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H")
    checkpoint_path = os.path.join(
        study_dir, "study_xgb_no_earlystopping_prime_" + timestamp + ".pkl"
    )
    joblib.dump(trial.study, checkpoint_path)

    # Search space; the suggest calls keep their original order.
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.1, 0.5)
    gamma = trial.suggest_float("gamma", 0, 1)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1)
    random_state = 42  # fixed seed so trials differ only by their hyperparameters

    params = {
        'learning_rate': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'random_state': random_state,
    }
    return fit_model(params)

### The hyperparameter search ranges used for model training are defined inside the objective function above.

### Define the objective function.
See https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/001_first.html#sphx-glr-tutorial-10-key-features-001-first-py for an introduction to using Optuna for hyperparameter optimization.

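### If continuing a previous study, set `load_study` to `True` and set `study_name` to the file name of the saved study (including the `.pkl` extension).
*Note:* Studies are saved in and loaded from the `../results/hyper_opt/optuna_studies/` directory.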

### Note:
The study reached an objective value (mean RMSE across splits, as returned by `fit_model`) of about 8.8 within 30 minutes.

In [15]:
# True: resume the pickled study named by study_name. False: create a new study.
load_study = True
# File name of the saved study (with extension); it is appended to the
# optuna_studies directory path when the study is loaded.
study_name = "study_xgb_no_earlystopping_20230302-23.pkl"

### Either load the previous study or create a new study.

In [16]:
# Start a fresh minimization study unless an earlier one should be resumed.
if not load_study:
    study = optuna.create_study(direction='minimize')
else:
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code — only load study files from a trusted source.
    study = joblib.load("../results/hyper_opt/optuna_studies/" + study_name)
    # Summarize the best result recorded so far in the resumed study.
    print("Best trial until now:")
    best_trial = study.best_trial
    print(" Value: ", best_trial.value)
    print(" Params: ")
    for param_name, param_value in best_trial.params.items():
        print(f"    {param_name}: {param_value}")

Best trial until now:
 Value:  8.514990033892648
 Params: 
    n_estimators: 168
    max_depth: 7
    learning_rate: 0.3968571956999504
    gamma: 0.640232768439118
    subsample: 0.747747407403972
    colsample_bytree: 0.6280085182287491


### Run the study.

In [17]:
# Run 250 trials of `objective` on the study. This is long-running: the logged
# trials below take several minutes each, and the recorded run ends in a
# KeyboardInterrupt.
study.optimize(objective, n_trials=250)

[32m[I 2023-03-03 00:19:26,209][0m Trial 88 finished with value: 8.70028669130951 and parameters: {'n_estimators': 130, 'max_depth': 7, 'learning_rate': 0.38392194201022134, 'gamma': 0.6231499928907753, 'subsample': 0.8152492076837146, 'colsample_bytree': 0.6630512349951684}. Best is trial 81 with value: 8.514990033892648.[0m
[32m[I 2023-03-03 00:22:33,886][0m Trial 89 finished with value: 8.889830834577081 and parameters: {'n_estimators': 117, 'max_depth': 7, 'learning_rate': 0.36599444671494896, 'gamma': 0.581910476445027, 'subsample': 0.8087919090745046, 'colsample_bytree': 0.6934003239926613}. Best is trial 81 with value: 8.514990033892648.[0m
[32m[I 2023-03-03 00:26:38,081][0m Trial 90 finished with value: 8.758695616559024 and parameters: {'n_estimators': 161, 'max_depth': 7, 'learning_rate': 0.37463229322292996, 'gamma': 0.6193772163317779, 'subsample': 0.8495965117624804, 'colsample_bytree': 0.6625293007158504}. Best is trial 81 with value: 8.514990033892648.[0m
[32m[

KeyboardInterrupt: 

### Print the hyperparameter and objective function values for the best trial in the study.

In [75]:
# Hyperparameters of the best (lowest-value) trial recorded in the study.
print(study.best_params)

{'n_estimators': 168, 'max_depth': 7, 'learning_rate': 0.3968571956999504, 'gamma': 0.640232768439118, 'subsample': 0.747747407403972, 'colsample_bytree': 0.6280085182287491}


In [76]:
# Objective value of the best trial, i.e. the value fit_model returned for it.
print(study.best_value)

8.514990033892648
