## This notebook is used to optimize the runtime prediction model.
Note that this process is computationally expensive and takes a long time to run for a significant number of split times. For that reason, the example shown here uses only two split times. However, because the results for each split time are independent, the work can be parallelized. We achieved this by using the Python scripts in the python_scripts directory, submitting each split time as a separate job to the HPC system (Eagle), and combining the results in post-processing. Future updates to this repo will include parallelized code so that this inherent parallelism can be exploited directly within this notebook.

# Import necessary packages

In [1]:
import pandas as pd
import datetime
import os

from eagle_jobs.model_optimization import optimize_training_window
from eagle_jobs.model_optimization import optimize_testing_window
from eagle_jobs.model_optimization import optimize_numerical_features
from eagle_jobs.model_optimization import optimize_categorical_features

2023-05-09 22:38:55.241293: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Import Eagle data

In [2]:
# Read the Eagle data set from its parquet file into a DataFrame.
filepath = os.path.join('../data/', 'eagle_data.parquet')
eagle_df = pd.read_parquet(path=filepath)

# Get split times

In [9]:
# Bounds of the evaluation period. Stepping 6 days across this 6-day span
# yields exactly two split times: the start and the end.
start_time, end_time = pd.Timestamp('2022-01-01'), pd.Timestamp('2022-01-07')
split_times = pd.date_range(start=start_time, end=end_time, freq='6D')

# Optimize Training Window

In [10]:
model_type = 'XGBoost'  # Options are 'XGBoost', 'NN', and 'TFIDF'

# Make sure the output directory exists *before* the expensive sweep, so a
# missing '../results' folder cannot make the save fail after the run.
results_dir = os.path.join('..','results')
os.makedirs(results_dir, exist_ok=True)

# Both results are mappings keyed by training window. The sum()/len() below
# implies each value is a sized sequence of numeric scores
# (presumably one per split time -- TODO confirm against optimize_training_window).
r2_train, rmse_train = optimize_training_window(eagle_df, split_times, model_type)

optimize_training_window_df = pd.DataFrame({
    'training_window': list(r2_train.keys()),
    'r2': list(r2_train.values()),
    'rmse': list(rmse_train.values()),
})

# Save the raw scores under a file name stamped to the minute.
now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
filename = 'optimize_training_window_df_'+now+'.pkl'
filepath = os.path.join(results_dir, filename)
optimize_training_window_df.to_pickle(filepath)

# Choose the window with the lowest mean RMSE. Windows with no scores are
# skipped, since averaging an empty sequence would raise ZeroDivisionError.
mean_rmse_train = {
    window: sum(scores) / len(scores)
    for window, scores in rmse_train.items()
    if len(scores) > 0
}
optimal_training_window = min(mean_rmse_train, key=mean_rmse_train.get)
print('Optimal Training Window:', optimal_training_window)

Split time: 2022-01-01 00:00:00, training window: 1, r2: -0.492, rmse: 45352
Split time: 2022-01-01 00:00:00, training window: 5, r2: -0.272, rmse: 41881
Split time: 2022-01-01 00:00:00, training window: 10, r2: -0.132, rmse: 39507
Split time: 2022-01-01 00:00:00, training window: 15, r2: -0.016, rmse: 37426
Split time: 2022-01-01 00:00:00, training window: 20, r2: -0.141, rmse: 39663
Split time: 2022-01-01 00:00:00, training window: 25, r2: -0.216, rmse: 40948
Split time: 2022-01-01 00:00:00, training window: 30, r2: -0.010, rmse: 37317
Split time: 2022-01-01 00:00:00, training window: 35, r2: 0.096, rmse: 35297
Split time: 2022-01-01 00:00:00, training window: 40, r2: 0.100, rmse: 35232
Split time: 2022-01-01 00:00:00, training window: 45, r2: 0.140, rmse: 34437
Split time: 2022-01-01 00:00:00, training window: 50, r2: 0.095, rmse: 35311
Split time: 2022-01-01 00:00:00, training window: 55, r2: 0.141, rmse: 34408
Split time: 2022-01-01 00:00:00, training window: 60, r2: 0.157, rmse: 

# Optimize Testing Window

In [11]:
# Make sure the output directory exists *before* the expensive sweep, so a
# missing '../results' folder cannot make the save fail after the run.
results_dir = os.path.join('..','results')
os.makedirs(results_dir, exist_ok=True)

# Both results are mappings keyed by testing window. The sum()/len() below
# implies each value is a sized sequence of numeric scores.
# NOTE(review): model_type is not passed here -- confirm that the default model
# of optimize_testing_window matches the one used for the training-window sweep.
r2_test, rmse_test = optimize_testing_window(eagle_df, split_times, optimal_training_window)

optimize_testing_window_df = pd.DataFrame({
    'testing_window': list(r2_test.keys()),
    'r2': list(r2_test.values()),
    'rmse': list(rmse_test.values()),
})

# Save the raw scores under a file name stamped to the minute.
now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
filename = 'optimize_testing_window_df_'+now+'.pkl'
filepath = os.path.join(results_dir, filename)
optimize_testing_window_df.to_pickle(filepath)

# Choose the window with the lowest mean RMSE. Windows with no scores are
# skipped, since averaging an empty sequence would raise ZeroDivisionError.
mean_rmse_test = {
    window: sum(scores) / len(scores)
    for window, scores in rmse_test.items()
    if len(scores) > 0
}
optimal_testing_window = min(mean_rmse_test, key=mean_rmse_test.get)
print('Optimal Testing Window:', optimal_testing_window)

Split time: 2022-01-01 00:00:00, testing window: 1, r2: -0.252, rmse: 61426
Split time: 2022-01-01 00:00:00, testing window: 2, r2: -0.174, rmse: 33207
Split time: 2022-01-01 00:00:00, testing window: 3, r2: -1.197, rmse: 51599
Split time: 2022-01-01 00:00:00, testing window: 4, r2: 0.649, rmse: 31270
Split time: 2022-01-01 00:00:00, testing window: 5, r2: 0.369, rmse: 19353
Split time: 2022-01-01 00:00:00, testing window: 6, r2: 0.294, rmse: 33422
Split time: 2022-01-01 00:00:00, testing window: 7, r2: -0.131, rmse: 23218
Split time: 2022-01-01 00:00:00, testing window: 8, r2: -0.016, rmse: 22537
Split time: 2022-01-01 00:00:00, testing window: 9, r2: -0.093, rmse: 16686
Split time: 2022-01-01 00:00:00, testing window: 10, r2: -1.403, rmse: 4688
Split time: 2022-01-01 00:00:00, testing window: 11, r2: 0.000, rmse: 346
Split time: 2022-01-01 00:00:00, testing window: 12, r2: -29855.298, rmse: 256
Split time: 2022-01-01 00:00:00, testing window: 14, r2: -0.426, rmse: 10235
Split time: 2

# Optimize Feature Set

### Optimize Numerical Features
* Assume we should use Wallclock Req as a minimum feature set.
* Use optimal number of training & testing days
* In addition to wallclock, run all combinations of Nodes, Processors, GPUs, and Mem (2^4 = 16 combinations)

In [12]:
# Make sure the output directory exists *before* the expensive sweep, so a
# missing '../results' folder cannot make the save fail after the run.
results_dir = os.path.join('..','results')
os.makedirs(results_dir, exist_ok=True)

# Both results are mappings keyed by feature combination. The sum()/len() below
# implies each value is a sized sequence of numeric scores.
# NOTE(review): model_type is not passed here -- confirm that the default model
# of optimize_numerical_features matches the one used for the earlier sweeps.
r2_num_feat, rmse_num_feat = optimize_numerical_features(
    eagle_df, split_times, optimal_training_window, optimal_testing_window
)

optimize_numerical_features_df = pd.DataFrame({
    'features': list(r2_num_feat.keys()),
    'r2': list(r2_num_feat.values()),
    'rmse': list(rmse_num_feat.values()),
})

# Save the raw scores under a file name stamped to the minute.
now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
filename = 'optimize_numerical_features_df_'+now+'.pkl'
filepath = os.path.join(results_dir, filename)
optimize_numerical_features_df.to_pickle(filepath)

# Choose the feature set with the lowest mean RMSE. Feature sets with no scores
# are skipped, since averaging an empty sequence would raise ZeroDivisionError.
mean_rmse_num_feat = {
    features: sum(scores) / len(scores)
    for features, scores in rmse_num_feat.items()
    if len(scores) > 0
}
optimal_numerical_features = min(mean_rmse_num_feat, key=mean_rmse_num_feat.get)
print('Optimal Numerical Features:', optimal_numerical_features)

Split time: 2022-01-01 00:00:00, features: ('wallclock_req',), r2: -0.285, rmse: 38427
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'nodes_req'), r2: -0.184, rmse: 36889
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req'), r2: 0.288, rmse: 28596
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'gpus_req'), r2: -0.296, rmse: 38587
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'mem_req'), r2: 0.087, rmse: 32379
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'nodes_req', 'processors_req'), r2: 0.229, rmse: 29763
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'nodes_req', 'gpus_req'), r2: -0.190, rmse: 36978
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'nodes_req', 'mem_req'), r2: 0.211, rmse: 30113
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req'), r2: 0.363, rmse: 27060
Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'proc

### Optimize Categorical Features + Encoding
* With optimal train days, test days, and numerical feature set, test the following encodings:
    * Label encoding: Label all categorical features
    * Target encoding: Get target encoding for all categorical features
    * Onehot (top n) Encoding: One-hot encode top n instances of categorical features (group the rest as 'other')
        * Top 20 users
        * Top 6 partitions
        * Top 15 accounts
        * Top 20 names
        * Top 20 work_dirs
* Run all combinations of User, Account, Partition, QOS, Name, and Work Dir (2^6 = 64 combinations)

In [13]:
# Make sure the output directory exists once, *before* the expensive sweeps, so
# a missing '../results' folder cannot make a save fail after a long run.
results_dir = os.path.join('..','results')
os.makedirs(results_dir, exist_ok=True)

# Best feature set found for each categorical encoding, keyed by encoding name.
optimal_features = dict()
for encoding in ['label','onehot','target']:
    # Both results are mappings keyed by feature combination. The sum()/len()
    # below implies each value is a sized sequence of numeric scores.
    # NOTE(review): model_type is not passed here -- confirm that the default
    # model of optimize_categorical_features matches the earlier sweeps.
    r2_cat_feat, rmse_cat_feat = optimize_categorical_features(
        eagle_df, split_times, optimal_training_window, optimal_testing_window,
        optimal_numerical_features, encoding
    )

    optimize_categorical_features_df = pd.DataFrame({
        'features': list(r2_cat_feat.keys()),
        'r2': list(r2_cat_feat.values()),
        'rmse': list(rmse_cat_feat.values()),
    })

    # Save this encoding's raw scores; the encoding name keeps the files distinct.
    now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
    filename = 'optimize_categorical_features_df_'+encoding+'_'+now+'.pkl'
    filepath = os.path.join(results_dir, filename)
    optimize_categorical_features_df.to_pickle(filepath)

    # Choose the feature set with the lowest mean RMSE. Feature sets with no
    # scores are skipped, since averaging an empty sequence would raise
    # ZeroDivisionError.
    mean_rmse_cat_feat = {
        features: sum(scores) / len(scores)
        for features, scores in rmse_cat_feat.items()
        if len(scores) > 0
    }
    optimal_features[encoding] = min(mean_rmse_cat_feat, key=mean_rmse_cat_feat.get)
    print(f'Optimal Features with {encoding} Encoding:', optimal_features[encoding])

Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req'), r2: 0.363, rmse: 27060
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'user'), r2: -0.001, rmse: 33918
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'account'), r2: -3.537, rmse: 72193
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'partition'), r2: 0.321, rmse: 27931
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'qos'), r2: -0.160, rmse: 36506
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'work_dir'), r2: 0.336, rmse: 27613
Encoding: label, Split time: 2022-01-01 00:00:00, features: ('wallclock_req', 'processors_req', 'gpus_req', 'name'), r2: 0.320, rmse: 27952
Encoding: labe

In [14]:
# Recap of every optimum selected by the earlier cells, one line each.
summary_lines = [
    ('Optimal Training Window:', optimal_training_window),
    ('Optimal Testing Window:', optimal_testing_window),
    ('Optimal Numerical Features:', optimal_numerical_features),
]
for summary_label, summary_value in summary_lines:
    print(summary_label, summary_value)

# One line per categorical encoding, in the order the encodings were evaluated.
for encoding in ('label', 'onehot', 'target'):
    print(f'Optimal Features with {encoding} Encoding:', optimal_features[encoding])

Optimal Training Window: 60
Optimal Testing Window: 11
Optimal Numerical Features: ('wallclock_req', 'processors_req', 'gpus_req')
Optimal Features with label Encoding: ('wallclock_req', 'processors_req', 'gpus_req', 'partition', 'name')
Optimal Features with onehot Encoding: ('wallclock_req', 'processors_req', 'gpus_req', 'partition')
Optimal Features with target Encoding: ('wallclock_req', 'processors_req', 'gpus_req', 'user', 'account', 'work_dir')
