### Use this notebook to combine the results of optimizations performed in parallel (i.e., when using Python scripts to independently get results for different split times)

This notebook assumes a specific organization and naming convention for the results files. If your organization or naming convention is different, make the necessary changes to the `files = ...` line of the corresponding section, and to the `df['date'] = ...` line that extracts the split date from each file name.

In [1]:
import glob
import os

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Categorical Feature Optimization

In [37]:
# Combine the label-encoding result pickles of the categorical feature
# optimization into one DataFrame, tagged with the date parsed from each file name.
files = glob.glob('../results/categorical_features/raw/XGBoost/*label.pkl')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No label-encoding result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    df['encoding'] = 'label'
    dfs.append(df)
label_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
label_df['r2'] = label_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
label_df['rmse'] = label_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
label_df = label_df.sort_values(['date', 'features'])
# Uncomment to persist the combined results.
#label_df.to_pickle('categorical_features_optimization_label_XGBoost.pkl')
label_df

Unnamed: 0,features,r2,rmse,date,encoding
0,"(wallclock_req, nodes_req, processors_req, gpu...",0.263236,33061.421597,2019-01-01,label
2,"(wallclock_req, nodes_req, processors_req, gpu...",0.345554,31159.760974,2019-01-01,label
15,"(wallclock_req, nodes_req, processors_req, gpu...",0.421529,29295.308439,2019-01-01,label
12,"(wallclock_req, nodes_req, processors_req, gpu...",0.347551,31112.186290,2019-01-01,label
34,"(wallclock_req, nodes_req, processors_req, gpu...",0.417514,29396.793151,2019-01-01,label
...,...,...,...,...,...
51,"(wallclock_req, nodes_req, processors_req, gpu...",-0.438730,18384.372457,2022-12-30,label
10,"(wallclock_req, nodes_req, processors_req, gpu...",-0.138540,16354.352707,2022-12-30,label
31,"(wallclock_req, nodes_req, processors_req, gpu...",-0.393375,18092.276991,2022-12-30,label
5,"(wallclock_req, nodes_req, processors_req, gpu...",-0.010824,15409.800774,2022-12-30,label


In [38]:
# Combine the one-hot-encoding result pickles of the categorical feature
# optimization into one DataFrame, tagged with the date parsed from each file name.
files = glob.glob('../results/categorical_features/raw/XGBoost/*onehot.pkl')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No one-hot-encoding result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    df['encoding'] = 'onehot'
    dfs.append(df)
onehot_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
onehot_df['r2'] = onehot_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
onehot_df['rmse'] = onehot_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
onehot_df = onehot_df.sort_values(['date', 'features'])
# Uncomment to persist the combined results.
#onehot_df.to_pickle('categorical_features_optimization_onehot_XGBoost.pkl')
onehot_df

Unnamed: 0,features,r2,rmse,date,encoding
0,"(wallclock_req, nodes_req, processors_req, gpu...",0.263236,33061.421597,2019-01-01,onehot
2,"(wallclock_req, nodes_req, processors_req, gpu...",0.263236,33061.421597,2019-01-01,onehot
15,"(wallclock_req, nodes_req, processors_req, gpu...",0.227144,33861.525924,2019-01-01,onehot
12,"(wallclock_req, nodes_req, processors_req, gpu...",0.330322,31520.291706,2019-01-01,onehot
34,"(wallclock_req, nodes_req, processors_req, gpu...",0.308109,32038.804769,2019-01-01,onehot
...,...,...,...,...,...
51,"(wallclock_req, nodes_req, processors_req, gpu...",-0.324997,17642.763189,2022-12-30,onehot
10,"(wallclock_req, nodes_req, processors_req, gpu...",-0.382319,18020.356930,2022-12-30,onehot
31,"(wallclock_req, nodes_req, processors_req, gpu...",-0.378409,17994.848868,2022-12-30,onehot
5,"(wallclock_req, nodes_req, processors_req, gpu...",0.175527,13917.058496,2022-12-30,onehot


In [39]:
# Combine the target-encoding result pickles of the categorical feature
# optimization into one DataFrame, tagged with the date parsed from each file name.
files = glob.glob('../results/categorical_features/raw/XGBoost/*target.pkl')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No target-encoding result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    df['encoding'] = 'target'
    dfs.append(df)
target_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
target_df['r2'] = target_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
target_df['rmse'] = target_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
target_df = target_df.sort_values(['date', 'features'])
# Uncomment to persist the combined results.
#target_df.to_pickle('categorical_features_optimization_target_XGBoost.pkl')
target_df

Unnamed: 0,features,r2,rmse,date,encoding
0,"(wallclock_req, nodes_req, processors_req, gpu...",0.263236,33061.421597,2019-01-01,target
2,"(wallclock_req, nodes_req, processors_req, gpu...",0.322462,31704.736809,2019-01-01,target
15,"(wallclock_req, nodes_req, processors_req, gpu...",0.337718,31345.767891,2019-01-01,target
12,"(wallclock_req, nodes_req, processors_req, gpu...",0.337055,31361.438362,2019-01-01,target
34,"(wallclock_req, nodes_req, processors_req, gpu...",0.327132,31595.280978,2019-01-01,target
...,...,...,...,...,...
51,"(wallclock_req, nodes_req, processors_req, gpu...",-0.262022,17218.397635,2022-12-30,target
10,"(wallclock_req, nodes_req, processors_req, gpu...",-0.147042,16415.299586,2022-12-30,target
31,"(wallclock_req, nodes_req, processors_req, gpu...",-0.272393,17288.999879,2022-12-30,target
5,"(wallclock_req, nodes_req, processors_req, gpu...",-0.174937,16613.707116,2022-12-30,target


# Numerical Feature Optimization

In [13]:
# Combine the numerical feature optimization result pickles into one DataFrame,
# tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/numerical_features/raw/XGBoost/numerical*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No numerical-feature result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    dfs.append(df)
numerical_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
numerical_df['r2'] = numerical_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
numerical_df['rmse'] = numerical_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
numerical_df = numerical_df.sort_values(['date', 'features'])
# Uncomment to persist the combined results.
#numerical_df.to_pickle('numerical_features_optimization_XGBoost.pkl')
numerical_df

Unnamed: 0,features,r2,rmse,date
0,"(wallclock_req, nodes_req, processors_req, gpu...",0.263236,33061.421597,2019-01-01
2,"(wallclock_req, nodes_req, processors_req, gpu...",0.322462,31704.736809,2019-01-01
15,"(wallclock_req, nodes_req, processors_req, gpu...",0.337718,31345.767891,2019-01-01
12,"(wallclock_req, nodes_req, processors_req, gpu...",0.337055,31361.438362,2019-01-01
34,"(wallclock_req, nodes_req, processors_req, gpu...",0.327132,31595.280978,2019-01-01
...,...,...,...,...
51,"(wallclock_req, nodes_req, processors_req, gpu...",-0.262022,17218.397635,2022-12-30
10,"(wallclock_req, nodes_req, processors_req, gpu...",-0.147042,16415.299586,2022-12-30
31,"(wallclock_req, nodes_req, processors_req, gpu...",-0.272393,17288.999879,2022-12-30
5,"(wallclock_req, nodes_req, processors_req, gpu...",-0.174937,16613.707116,2022-12-30


# Testing Window Optimization

In [42]:
# Combine the testing window optimization result pickles into one DataFrame,
# tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/testing_window/raw/XGBoost/testing*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No testing-window result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    dfs.append(df)
testing_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
testing_df['r2'] = testing_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
testing_df['rmse'] = testing_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
testing_df = testing_df.sort_values(['date', 'testing_window'])
# Uncomment to persist the combined results.
#testing_df.to_pickle('testing_window_optimization_XGBoost.pkl')
testing_df

Unnamed: 0,testing_window,r2,rmse,date
0,1,0.295153,32337.372358,2019-01-01
1,2,0.118667,18493.460121,2019-01-01
2,3,0.387812,11866.173818,2019-01-01
3,4,-0.005460,163069.047507,2019-01-01
4,5,-0.052807,40712.568978,2019-01-01
...,...,...,...,...
55,56,,,2022-12-30
56,57,,,2022-12-30
57,58,,,2022-12-30
58,59,,,2022-12-30


# Training Window Optimization

## XGBoost Model

In [44]:
# Combine the XGBoost training window optimization result pickles into one
# DataFrame, tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/training_window/raw/xgboost/training*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No XGBoost training-window result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    dfs.append(df)
training_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
training_df['r2'] = training_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
training_df['rmse'] = training_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
training_df = training_df.sort_values(['date', 'training_window'])
# Uncomment to persist the combined results.
#training_df.to_pickle('training_window_optimization_XGBoost.pkl')
training_df

Unnamed: 0,training_window,r2,rmse,date
0,1,-1.116175,36283.683912,2019-07-01
1,5,-1.306889,37883.403604,2019-07-01
2,10,-1.381235,38489.007301,2019-07-01
3,15,-0.278657,28204.123998,2019-07-01
4,20,0.132430,23232.067750,2019-07-01
...,...,...,...,...
32,160,0.511208,24143.262615,2022-12-30
33,165,0.502681,24352.947497,2022-12-30
34,170,0.496848,24495.345473,2022-12-30
35,175,0.498100,24464.850548,2022-12-30


## Neural Network Model

In [46]:
# Combine the neural-network training window optimization result pickles into
# one DataFrame, tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/training_window/raw/nn/training*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No NN training-window result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # Here the date is the FOURTH '_'-separated token of the file name (index 3),
    # minus the extension. basename() avoids hard-coding the directory depth.
    df['date'] = os.path.basename(f).split('_')[3].split('.')[0]
    dfs.append(df)
training_nn_df = pd.concat(dfs)

# Unwrap each r2/rmse entry to its first element; empty entries become NaN.
training_nn_df['r2'] = training_nn_df.r2.apply(lambda x: x[0] if len(x) > 0 else np.nan)
training_nn_df['rmse'] = training_nn_df.rmse.apply(lambda x: x[0] if len(x) > 0 else np.nan)
training_nn_df = training_nn_df.sort_values(['date', 'training_window'])
# Uncomment to persist the combined results.
#training_nn_df.to_pickle('training_window_optimization_NN.pkl')
training_nn_df

Unnamed: 0,training_window,r2,rmse,date
0,1,0.198056,48863.684971,2019-01-01
1,5,0.201766,48750.525222,2019-01-01
2,10,0.256332,47054.802677,2019-01-01
3,15,0.280876,46271.760701,2019-01-01
4,20,0.281401,46254.887547,2019-01-01
...,...,...,...,...
32,160,0.252164,30847.189643,2022-12-29
33,165,0.257576,30735.367566,2022-12-29
34,170,0.260532,30674.120102,2022-12-29
35,175,0.250968,30871.849681,2022-12-29


## TFIDF Model

In [48]:
# Locate the raw TFIDF training-window result files.
tfidf_pattern = '../results/training_window/raw/tfidf/training*'
files = glob.glob(tfidf_pattern)

In [25]:
def convert_values(x):
    """Unwrap a list-valued metric entry to a single value.

    Parameters
    ----------
    x : object
        A cell value. Only ``list`` instances are unwrapped.

    Returns
    -------
    object
        ``x[0]`` when ``x`` is a non-empty list, ``np.nan`` when ``x`` is an
        empty list, and ``x`` itself for any other type.
    """
    if isinstance(x, list):
        # An empty list carries no result, so it is reported as missing.
        return x[0] if x else np.nan
    return x

In [49]:
# Combine the TFIDF training window optimization result pickles into one
# DataFrame, tagged with the date parsed from each file name.
# The glob is repeated inside this cell so that a `files` list left over from
# another section can never be combined here by accident.
files = glob.glob('../results/training_window/raw/tfidf/training*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No TFIDF training-window result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # Here the date is the FOURTH '_'-separated token of the file name (index 3),
    # minus the extension. basename() avoids hard-coding the directory depth.
    df['date'] = os.path.basename(f).split('_')[3].split('.')[0]
    dfs.append(df)
training_tfidf_df = pd.concat(dfs)

# convert_values unwraps list entries and passes every other value through unchanged.
training_tfidf_df['r2'] = training_tfidf_df.r2.apply(convert_values)
training_tfidf_df['rmse'] = training_tfidf_df.rmse.apply(convert_values)
training_tfidf_df = training_tfidf_df.sort_values(['date', 'training_window'])
# Uncomment to persist the combined results.
#training_tfidf_df.to_pickle('training_window_optimization_TFIDF.pkl')
training_tfidf_df

Unnamed: 0,training_window,r2,rmse,date
0,1.0,0.134387,35835.974553,2019-01-01
1,5.0,0.246452,33435.892946,2019-01-01
2,10.0,-0.025021,38996.333621,2019-01-01
3,15.0,0.113322,36269.390075,2019-01-01
4,20.0,0.314941,31880.223314,2019-01-01
...,...,...,...,...
32,160.0,0.293503,29982.481355,2022-12-29
33,165.0,0.294374,29964.003178,2022-12-29
34,170.0,0.297847,29890.166047,2022-12-29
35,175.0,0.297541,29896.687403,2022-12-29


# Recent Jobs Optimization

In [51]:
# Combine the recent jobs optimization result pickles into one DataFrame,
# tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/recent_jobs/raw/recent*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No recent-jobs result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    dfs.append(df)
recent_df = pd.concat(dfs)

# r2/rmse are used exactly as stored; no first-element unwrapping is applied here.
# NOTE(review): presumably these pickles already hold scalar metrics - confirm.
recent_df = recent_df.sort_values(['date', 'n'])
# Uncomment to persist the combined results.
#recent_df.to_pickle('recent_jobs_optimization.pkl')
recent_df

Unnamed: 0,n,r2,rmse,date
0,1,0.069543,37153.992518,2019-01-01
1,2,0.206052,34320.479621,2019-01-01
2,3,0.243140,33509.290348,2019-01-01
3,4,0.241663,33541.966349,2019-01-01
4,5,0.226025,33886.042887,2019-01-01
...,...,...,...,...
195,196,0.141663,43717.953849,2022-12-29
196,197,0.141275,43727.831283,2022-12-29
197,198,0.141271,43727.935396,2022-12-29
198,199,0.140848,43738.700858,2022-12-29


# Similar Jobs Optimization

In [53]:
# Combine the similar jobs optimization result pickles into one DataFrame,
# tagged with the date parsed from each file name.
# The glob lives in the same cell as the load so that a `files` list left over
# from another section can never be combined here by accident.
files = glob.glob('../results/similar_jobs/raw/similar*')
if not files:
    # Fail early with a readable message; pd.concat([]) would otherwise raise
    # the much less helpful "No objects to concatenate".
    raise FileNotFoundError(
        "No similar-jobs result files found; check the `files = ...` pattern in this cell."
    )

dfs = []
for f in files:
    df = pd.read_pickle(f)
    # The date is the third '_'-separated token of the file name, minus the
    # extension. basename() avoids hard-coding how deep the directory is nested.
    df['date'] = os.path.basename(f).split('_')[2].split('.')[0]
    dfs.append(df)
similar_df = pd.concat(dfs)

# r2/rmse are used exactly as stored; no unwrapping is applied in this section.
similar_df = similar_df.sort_values(['date', 'n'])
# Uncomment to persist the combined results.
#similar_df.to_pickle('similar_jobs_optimization.pkl')
similar_df

Unnamed: 0,n,r2,rmse,date
0,1.0,0.210754,34218.696466,2019-01-01
1,2.0,0.249452,33369.252338,2019-01-01
2,3.0,0.247905,33403.627472,2019-01-01
3,4.0,0.231551,33764.847675,2019-01-01
4,5.0,0.207866,34281.244564,2019-01-01
...,...,...,...,...
195,196.0,0.270352,40307.657094,2022-12-29
196,197.0,0.269423,40333.306646,2022-12-29
197,198.0,0.268513,40358.427918,2022-12-29
198,199.0,0.267604,40383.488637,2022-12-29
