## This notebook is used to run hyperparameter optimization studies with Optuna.

In [2]:
from sbatch_pred.runtime_prediction.operation_support import train_test_split

In [10]:
import optuna
import joblib
import datetime
import os
import pandas as pd

import xgboost as xgb
from xgboost import XGBRegressor

from eagle_jobs.operation_support import train_test_split
from eagle_jobs.data_preprocessing import label_encode_columns

from sklearn.metrics import mean_squared_error

In [11]:
def get_dfs():
    """Build the rolling train/test splits used to score every Optuna trial.

    Reads ``../data/eagle_data.parquet``, keeps only the rows whose ``state``
    is ``COMPLETED`` or ``TIMEOUT``, label-encodes the ``user`` and
    ``partition`` columns, and calls ``train_test_split`` once for each of 20
    evenly spaced split times between the earliest and latest ``submit_time``.

    Returns:
        tuple[list, list]: ``(train_dfs, test_dfs)``. Element ``i`` of each
        list is the train (resp. test) object returned by ``train_test_split``
        for the i-th split time (presumably DataFrames -- ``load_data``
        indexes them by column name).
    """
    # Load the job records (the file is Parquet, not a pickle).
    filepath = os.path.join('../data/', 'eagle_data.parquet')
    eagle_df = pd.read_parquet(filepath)

    # Take an explicit copy of the filtered rows: the encoder below is called
    # for its side effect, and writing into a filtered slice is what triggers
    # pandas' chained-assignment warning.
    eagle_df = eagle_df[eagle_df.state.isin(['COMPLETED', 'TIMEOUT'])].copy()

    categorical_features = ['user', 'partition']
    # The return value is discarded, so the encoding is presumably applied to
    # eagle_df in place -- TODO confirm against label_encode_columns.
    label_encode_columns(eagle_df, categorical_features)

    start_time = eagle_df.submit_time.min()
    end_time = eagle_df.submit_time.max()
    # 22 evenly spaced timestamps with both endpoints dropped -> 20 interior
    # split times.
    split_times = pd.date_range(start_time, end_time, periods=22)[1:21]

    train_dfs = []
    test_dfs = []
    for split_time in split_times:
        # NOTE(review): the units of training_window / testing_window are
        # defined by train_test_split (not visible here) -- confirm.
        train_df, test_df = train_test_split(
            eagle_df, split_time, training_window=100, testing_window=1
        )
        train_dfs.append(train_df)
        test_dfs.append(test_df)

    return train_dfs, test_dfs

In [12]:
# Build the splits once; fit_model reads these two module-level lists on every trial.
train_dfs, test_dfs = get_dfs()

In [13]:
def load_data(i, train_dfs, test_dfs):
    """Return the feature matrix and target column for split number ``i``.

    Args:
        i: Index of the split to select from both lists.
        train_dfs: Sequence of training frames, indexable by column name.
        test_dfs: Sequence of testing frames, indexable by column name.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the ``X`` parts
        hold the five feature columns and the ``y`` parts hold ``run_time``.
    """
    feature_cols = ['wallclock_req', 'processors_req', 'mem_req', 'user', 'partition']
    target_col = 'run_time'

    # Pick the i-th split from each list, then slice features/target from it.
    train_frame = train_dfs[i]
    test_frame = test_dfs[i]

    return (
        train_frame[feature_cols],
        train_frame[target_col],
        test_frame[feature_cols],
        test_frame[target_col],
    )

In [14]:
# Fit and evaluate a model with the given hyperparameters on every train/test split.
def fit_model(params):
    """Train an XGBoost regressor on each split and return the averaged error.

    For every index of the module-level ``train_dfs`` / ``test_dfs`` lists, a
    fresh ``XGBRegressor`` is fit on the training part and scored by RMSE on
    the testing part.

    Args:
        params (dict): Keyword arguments for ``XGBRegressor``. The dict is not
            modified; ``eval_metric='rmse'`` is added to a private copy
            (overriding any ``eval_metric`` the caller supplied).

    Returns:
        float: Mean of the per-split RMSE values, divided by 3600.

    Raises:
        ZeroDivisionError: If ``train_dfs`` is empty.
    """
    # Copy instead of writing into the caller's dict.
    model_params = {**params, 'eval_metric': 'rmse'}

    rmse_list = []
    for i in range(len(train_dfs)):
        X_train, y_train, X_test, y_test = load_data(i, train_dfs, test_dfs)
        model = XGBRegressor(**model_params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        # Take the square root of the MSE directly: the `squared=False`
        # keyword is deprecated and absent from newer scikit-learn releases.
        rmse = mean_squared_error(y_test, y_pred) ** 0.5
        rmse_list.append(rmse)

    # Presumably run_time is in seconds, which would make the result hours --
    # TODO confirm the unit of the target column.
    return sum(rmse_list) / len(rmse_list) / 3600

### Define the objective function.
See https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/001_first.html#sphx-glr-tutorial-10-key-features-001-first-py for an introduction to using Optuna for hyperparameter optimization.

In [15]:
def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and score them.

    Before sampling, the study that owns ``trial`` is pickled to
    ``../results/optuna_studies/study_<YYYYMMDD-HH>.pkl``. The timestamp has
    hour resolution, so checkpoints written within the same hour overwrite
    each other.

    Args:
        trial (optuna.trial.Trial): Trial used to draw the hyperparameters.

    Returns:
        The value returned by ``fit_model`` for the sampled parameters; the
        study is created with ``direction='minimize'``.
    """
    # Save the study before running the next trial. `trial.study` is used
    # rather than a notebook-level `study` variable so the function does not
    # depend on a name that is only assigned in a later cell.
    study_dir = "../results/optuna_studies/"
    os.makedirs(study_dir, exist_ok=True)  # joblib.dump does not create directories
    checkpoint_name = "study_" + datetime.datetime.now().strftime("%Y%m%d-%H") + ".pkl"
    joblib.dump(trial.study, study_dir + checkpoint_name)

    # Keep this suggestion order stable: each call draws from the sampler.
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 7)
    learning_rate = trial.suggest_float("learning_rate", 0.1, 0.5)
    gamma = trial.suggest_float("gamma", 0, 1)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1)
    random_state = 42  # fixed seed passed to the model in every trial

    params = {
        'learning_rate': learning_rate,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'gamma': gamma,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'random_state': random_state,
    }
    return fit_model(params)

### If continuing a previous study, set `load_study` to `True` and provide the study name.
*Note:* Studies are saved in the `../results/optuna_studies/` directory, with file names of the form `study_<YYYYMMDD-HH>.pkl`.

In [23]:
# True: resume from the saved study named below; False: start a new study.
load_study = True
# File name of the saved study inside ../results/optuna_studies/ (only read when load_study is True).
study_name = "study.pkl"

### Either load the previous study or create a new study.

In [24]:
# Start a fresh minimization study unless an earlier one should be resumed.
if not load_study:
    study = optuna.create_study(direction='minimize')
else:
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary
    # code -- only load study files from a trusted source.
    study = joblib.load("../results/optuna_studies/" + study_name)

    # Summarize the best trial recorded in the loaded study.
    print("Best trial until now:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

Best trial until now:
 Value:  9.270566975521128
 Params: 
    n_estimators: 50
    max_depth: 6
    learning_rate: 0.386232604942757
    gamma: 0.9000526173345429
    subsample: 0.5724346830456464
    colsample_bytree: 0.6279917874378335


### Run the study.
(Increase `n_trials` for better results)

In [25]:
# Run 20 trials of `objective` on `study`; each trial refits the model on every split.
study.optimize(objective, n_trials=20)

[32m[I 2023-05-10 13:55:56,103][0m Trial 10 finished with value: 10.156270852704752 and parameters: {'n_estimators': 143, 'max_depth': 4, 'learning_rate': 0.20555038779640641, 'gamma': 0.718721418030296, 'subsample': 0.822486113548387, 'colsample_bytree': 0.8204176872498461}. Best is trial 0 with value: 9.270566975521128.[0m
[32m[I 2023-05-10 13:57:04,409][0m Trial 11 finished with value: 9.935124534726027 and parameters: {'n_estimators': 54, 'max_depth': 6, 'learning_rate': 0.3403906566172594, 'gamma': 0.6741634281618086, 'subsample': 0.5105333649160274, 'colsample_bytree': 0.5137889549351082}. Best is trial 0 with value: 9.270566975521128.[0m
[32m[I 2023-05-10 13:58:20,131][0m Trial 12 finished with value: 9.178962558649168 and parameters: {'n_estimators': 51, 'max_depth': 7, 'learning_rate': 0.3740972645086412, 'gamma': 0.978533535354144, 'subsample': 0.845190065929861, 'colsample_bytree': 0.6009457519119111}. Best is trial 12 with value: 9.178962558649168.[0m
[32m[I 2023-

### Print the hyperparameter and objective function values for the best trial in the study.

In [26]:
# Hyperparameter values of the best trial in the study.
print(study.best_params)

{'n_estimators': 51, 'max_depth': 7, 'learning_rate': 0.3740972645086412, 'gamma': 0.978533535354144, 'subsample': 0.845190065929861, 'colsample_bytree': 0.6009457519119111}


In [27]:
# Objective value of the best trial, i.e. what `objective` returned for it.
print(study.best_value)

9.178962558649168
