In [1]:
import numpy as np
import pandas as pd

from sbatch_pred.queuetime_prediction.model_training import (get_model_data,
                                                             get_feature_correlation,
                                                             get_input_features,
                                                             partition_params,
                                                             assign_cluster,
                                                             get_partition_results,
                                                             get_results_df,
                                                             save_results)

import optuna
from sklearn.metrics import mean_absolute_error

# Let pandas print up to 100 rows before truncating displayed frames
pd.options.display.max_rows = 100

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Knowledge depth to model; one of: cluster, partition, node, combined
knowledge_depth = "combined"

# Source of wallclock knowledge; one of: user, pred, perfect
wallclock_knowledge = "pred"

# Fetch the model data that corresponds to the two settings above
model_data_df = get_model_data(knowledge_depth, wallclock_knowledge)

Loaded model data at path ../../data/model_data/model_data_partition_pred.parquet
Loaded model data at path ../../data/model_data/model_data_node_pred.parquet
Finished combinining partition- and node-level data


In [3]:
# Display the column labels of the loaded model data as the cell output
model_data_df.columns

Index(['priority', 'qos_num', 'mem_req', 'gpus_req', 'processors_req',
       'nodes_req', 'wallclock_req_log', 'array_pos', 'queue_depth_log',
       'queue_size_log', 'queue_mem_log', 'queue_avg_size', 'queue_avg_mem',
       'nodes_available', 'runtime_remaining_min_log', 'mem_remaining_min_log',
       'partition', 'state', 'start_time', 'submit_time', 'queue_wait',
       'queue_wait_log', 'queue_depth_min_log_NODE', 'queue_size_min_log_NODE',
       'queue_mem_min_log_NODE', 'queue_avg_size_min_log_NODE',
       'queue_avg_mem_min_log_NODE', 'nodes_available_NODE'],
      dtype='object')

In [None]:
# Number of days of history used to train each model.
# Training data is collected from [split time - train_window, split time).
train_window = 120

# Number of days of data used to test each model.
# Testing data is collected from [split time, split time + test_window).
test_window = 1

# Initial split time; the split starts at 00:00 on this date.
start_date = "2023-06-01"

# Number of days to test, i.e. the number of split times (100-200 days recommended).
n_days = 200

# Target feature: 'queue_wait' or 'queue_wait_log' (log scaled) for regression,
# 'cluster' for classification.
target = "cluster"

# Model type: 'xgb_reg' for regression or 'xgb_cls' for classification.
model_type = "xgb_cls"

# A classification model needs a cluster/class label for every job, so derive the
# 'cluster' column by applying assign_cluster to each row.
if model_type == 'xgb_cls':
    model_data_df['cluster'] = model_data_df.apply(assign_cluster, axis=1)

# Input features for the knowledge depth being used
features = get_input_features(knowledge_depth)

In [None]:
# Get the correlation between the input features and the target feature.
# As the last expression in the cell, the returned value is shown as the cell output.
get_feature_correlation(model_data_df, target)

In [6]:
# Run get_partition_results once per partition and collect the outcome, keyed by partition name.
partition_results = {}
for partition in model_data_df.partition.unique():
    if partition == 'off3':
        # There is no node-level information for the off3 partition
        # so we skip this partition to ensure results for all knowledge-depths are comparable.
        continue

    # Use the partition-specific parameters when they exist and fall back to an
    # empty dict otherwise. Only a missing key triggers the fallback, so unrelated
    # failures (a typo, an interrupt, ...) are no longer silently swallowed.
    try:
        params = partition_params[partition]
    except KeyError:
        params = {}

    print(f'Getting results for {partition} partition')
    partition_results[partition] = get_partition_results(
        model_data_df, train_window, test_window, start_date,
        features, target, partition,
        model_type=model_type, params=params, n_days=n_days, verbose=False,
    )

Getting results for standard partition
Getting results for short-stdby partition
Getting results for gpu partition
Getting results for debug partition
Getting results for short partition
Getting results for bigmem partition
Getting results for off2 partition
Getting results for long-stdby partition
Getting results for long partition
Getting results for standard-stdby partition
Getting results for off1 partition
Getting results for debug-stdby partition
Getting results for bigscratch partition


Traceback (most recent call last):
  File "/Users/kmenear/Projects/sbatch_pred/src/sbatch_pred/queuetime_prediction/model_training.py", line 674, in xgb_classification
    predictions = le.inverse_transform(predictions)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kmenear/Projects/sbatch_pred/environment/lib/python3.12/site-packages/sklearn/preprocessing/_label.py", line 160, in inverse_transform
    raise ValueError("y contains previously unseen labels: %s" % str(diff))
ValueError: y contains previously unseen labels: [0]


Exception type: ValueError
Error message: y contains previously unseen labels: [0]
Something went wrong. Returning None
Getting results for off1-stdby partition
Exception type: ValueError
Error message: y contains previously unseen labels: [0]
Something went wrong. Returning None


Traceback (most recent call last):
  File "/Users/kmenear/Projects/sbatch_pred/src/sbatch_pred/queuetime_prediction/model_training.py", line 674, in xgb_classification
    predictions = le.inverse_transform(predictions)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kmenear/Projects/sbatch_pred/environment/lib/python3.12/site-packages/sklearn/preprocessing/_label.py", line 160, in inverse_transform
    raise ValueError("y contains previously unseen labels: %s" % str(diff))
ValueError: y contains previously unseen labels: [0]


Getting results for gpul partition
Getting results for gpu-stdby partition
Getting results for bigmem-stdby partition
Exception type: ValueError
Error message: y contains previously unseen labels: [0]
Something went wrong. Returning None


Traceback (most recent call last):
  File "/Users/kmenear/Projects/sbatch_pred/src/sbatch_pred/queuetime_prediction/model_training.py", line 674, in xgb_classification
    predictions = le.inverse_transform(predictions)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/kmenear/Projects/sbatch_pred/environment/lib/python3.12/site-packages/sklearn/preprocessing/_label.py", line 160, in inverse_transform
    raise ValueError("y contains previously unseen labels: %s" % str(diff))
ValueError: y contains previously unseen labels: [0]


In [7]:
# Hand the per-partition results to save_results and keep what it returns as results_df
results_df = save_results(
    partition_results,
    knowledge_depth,
    wallclock_knowledge,
    model_type=model_type,
)

In [8]:
# Display results_df (the value returned by save_results) as the cell output
results_df

Unnamed: 0,priority,qos_num,mem_req,gpus_req,processors_req,nodes_req,wallclock_req_log,array_pos,queue_depth_log,queue_size_log,...,queue_depth_min_log_NODE,queue_size_min_log_NODE,queue_mem_min_log_NODE,queue_avg_size_min_log_NODE,queue_avg_mem_min_log_NODE,nodes_available_NODE,cluster,split_time,cluster_pred,feature_set
0,120405400,2,767232.0,0,324,9,11.589887,0.0,4.220977,16.610102,...,8.667353,18.170355,29.526944,9.503027,20.859608,104.0,6,2023-06-01,8,combined_pred
1,120405400,2,767232.0,0,324,9,11.589887,0.0,2.406945,15.529131,...,8.654186,18.016671,29.373801,9.362511,20.719632,115.0,4,2023-06-01,8,combined_pred
2,120405400,2,767232.0,0,324,9,11.589887,0.0,2.406945,15.529131,...,8.654186,18.016671,29.373801,9.362511,20.719632,115.0,4,2023-06-01,8,combined_pred
3,120405400,2,767232.0,0,324,9,11.589887,0.0,2.572612,15.656289,...,8.654535,18.027878,29.384966,9.373369,20.730448,115.0,5,2023-06-01,8,combined_pred
4,120405400,2,767232.0,0,324,9,11.589887,0.0,2.572612,15.656289,...,8.654535,18.027878,29.384966,9.373369,20.730448,115.0,5,2023-06-01,8,combined_pred
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053595,181728490,1,751616.0,0,36,1,10.134603,0.0,-2.302585,-2.302585,...,7.987558,13.975232,25.328552,5.987958,17.341027,4.0,0,2023-12-11,0,combined_pred
1053596,181728490,1,751616.0,0,36,1,10.134603,0.0,0.095310,9.272579,...,7.982450,13.982194,25.403560,6.000025,17.421136,2.0,0,2023-12-11,0,combined_pred
1053597,181976050,1,751616.0,0,36,1,10.134603,0.0,0.095310,9.272579,...,8.006068,18.053510,29.408030,10.047480,21.401996,5.0,0,2023-12-11,0,combined_pred
1053598,181976050,1,751616.0,0,36,1,10.134603,0.0,0.095310,9.272579,...,8.006068,18.053510,29.408030,10.047480,21.401996,5.0,0,2023-12-11,0,combined_pred


## Hyperparameter Optimization

In [None]:
# Get accuracy for classification models
def get_accuracy(df, adjacent=False):
    """Return the fraction of rows whose predicted cluster counts as correct.

    Parameters
    ----------
    df : DataFrame
        Must provide the ``cluster`` and ``cluster_pred`` columns.
    adjacent : bool, default False
        When True, a prediction exactly one cluster below or above
        ``cluster`` is counted as correct in addition to exact matches.

    Returns
    -------
    float
        Number of rows counted as correct divided by the total number of rows.
    """
    actual = df.cluster
    predicted = df.cluster_pred

    hits = predicted == actual
    if adjacent:
        # Also accept the neighbouring cluster on either side
        hits = (predicted == actual - 1) | hits | (predicted == actual + 1)

    return int(hits.sum()) / len(actual)

In [None]:
# Optuna objective function
def objective(trial):
    """Optuna objective: score one sampled set of XGBoost hyperparameters.

    Samples hyperparameters from ``trial``, runs ``get_partition_results`` for
    every partition present in the notebook-level ``partition_data_df``,
    combines the outcomes with ``get_results_df`` and returns a single score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameter values.

    Returns
    -------
    float
        Mean absolute error between ``wait_time_act_hours`` and
        ``wait_time_pred_hours`` when ``model_type`` is 'xgb_reg', or the
        exact-match accuracy from ``get_accuracy`` when it is 'xgb_cls'.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'xgb_reg' nor 'xgb_cls'.
    """
    # Fail fast, before any training: the original fell through and returned
    # None for an unknown model_type, and only after all the work was done.
    if model_type not in ('xgb_reg', 'xgb_cls'):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'xgb_reg' or 'xgb_cls'"
        )

    params = {
        'verbosity': 0,
        # NOTE(review): a regression objective is passed even when model_type is
        # 'xgb_cls' -- confirm get_partition_results overrides it for classification.
        'objective': 'reg:absoluteerror',
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eta': trial.suggest_float('eta', 0.01, 0.5),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0)
    }

    # Optimize the models for the partition(s) contained in partition_data_df
    partition_results = {}
    for partition in partition_data_df.partition.unique():
        print(f'Getting results for {partition} partition')
        partition_results[partition] = get_partition_results(
            partition_data_df, train_window, test_window, start_date,
            features, target, partition,
            model_type=model_type, params=params, n_days=n_days, verbose=False,
        )

    # Combine all results into one DataFrame
    results_df = get_results_df(partition_results, model_type, f'{knowledge_depth}_{wallclock_knowledge}')

    # Evaluate the results: MAE for regression, accuracy for classification
    if model_type == 'xgb_reg':
        return mean_absolute_error(results_df['wait_time_act_hours'], results_df['wait_time_pred_hours'])
    return get_accuracy(results_df)

In [None]:
# Choose the partition to optimize
partition = 'standard'
partition_data_df = model_data_df[model_data_df.partition == partition].copy()

In [None]:
# Study storage (a SQLite file named after the model type and partition) and run settings
storage_url = f"sqlite:///study_{model_type}_{partition}.db"
study_name = f'{model_type} Optimization: {partition} Partition'
n_trials = 100

# objective() returns a mean absolute error for regression (lower is better) and an
# accuracy for classification (higher is better), so the direction depends on model_type.
# These must be assignments: the previous `direction == ...` comparisons never
# defined `direction`, which raised a NameError.
if model_type == 'xgb_reg':
    direction = 'minimize'
elif model_type == 'xgb_cls':
    direction = 'maximize'
else:
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected 'xgb_reg' or 'xgb_cls'"
    )

# Create (or, with load_if_exists=True, reuse an existing) Optuna study and run it
study = optuna.create_study(direction=direction, storage=storage_url, study_name=study_name, load_if_exists=True)
study.optimize(objective, n_trials=n_trials)