In [107]:
import os
from os import walk
import torch
import pandas as pd
import numpy as np
from evaluation_scenario import EvaluationScenario, EvaluationScenarioCollection, EvaluationUtils, EvaluationMetrics

In [108]:
# Root folders that are scanned further below for '.pt' model files and '.csv' evaluation files.
model_paths: str = '../models'
evaluations_paths: str = '../evaluation/all'

In [120]:
# Load the test dataset CSV; index_col=0 turns the first CSV column into the index
# (shown as 'start_date' in the output below).
test_df = pd.read_csv('../datasets/test_df.csv', index_col=0)
test_df.head(3)

Unnamed: 0_level_0,inst_num,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,plan_cpu,plan_mem,plan_gpu,...,OpenmpiWorker,OssToVolumeWorker,PyTorchWorker,TVMTuneMain,chief,evaluator,ps,tensorflow,worker,xComputeWorker
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-03-03 01:10:34+08:00,50.0,30.347826,0.192308,3.253242,4.530273,0.556227,1.723633,600.0,29.296875,100.0,...,0,0,0,0,0,0,0,0,1,0
1970-03-03 01:10:34+08:00,50.0,31.73913,0.16,3.4434,4.883789,0.614375,1.719727,600.0,29.296875,100.0,...,0,0,0,0,0,0,0,0,1,0
1970-03-03 01:10:34+08:00,50.0,30.52381,0.0,3.147928,4.001953,0.61381,5.731445,600.0,29.296875,100.0,...,0,0,0,0,0,0,0,0,1,0


In [121]:
# Select the series by column label rather than by position, so that a changed
# column order in the CSV cannot silently pick a different series.
# In the loaded frame position 1 is 'cpu_usage' and position 4 is 'max_mem'.
actual_cpu_series: pd.Series = test_df['cpu_usage']
actual_mem_series: pd.Series = test_df['max_mem']

start_date
1970-03-03 01:10:34+08:00     30.347826
1970-03-03 01:10:34+08:00     31.739130
1970-03-03 01:10:34+08:00     30.523810
1970-03-03 01:10:34+08:00     29.478261
1970-03-03 01:10:34+08:00     28.565217
                                ...    
1970-03-16 23:55:21+08:00     64.200000
1970-03-16 23:55:22+08:00     76.125000
1970-03-16 23:56:10+08:00    117.071429
1970-03-16 23:56:37+08:00    111.200000
1970-03-16 23:58:10+08:00     47.333333
Name: cpu_usage, Length: 586041, dtype: float64

In [110]:
def get_all_files_in_path(path: str, file_extension: str = '', exclude_substring: str = 'train_') -> list[str]:
    """Recursively collect the files found below ``path``.

    Args:
        path: Root directory that is walked recursively.
        file_extension: Only file names ending with this suffix are returned;
            the default ``''`` matches every file name.
        exclude_substring: File names containing this substring are skipped.
            Defaults to ``'train_'``, the filter that was previously hard-coded.
            Pass ``''`` or ``None`` to disable the exclusion.

    Returns:
        The matching files as ``'<dir_path>/<file name>'`` strings, in the order
        in which ``os.walk`` yields them. Empty if ``path`` does not exist.
    """
    all_files: list[str] = []
    for dir_path, _dir_names, file_names in walk(path):
        all_files.extend(
            f'{dir_path}/{file_name}'
            for file_name in file_names
            if file_name.endswith(file_extension)
            # An empty / None exclude_substring disables the exclusion filter.
            and not (exclude_substring and exclude_substring in file_name)
        )

    return all_files

In [111]:
# Collect the '.pt' files below the model folder and the '.csv' files below the evaluation folder
# (get_all_files_in_path skips file names that contain 'train_').
model_file_paths: list[str] = get_all_files_in_path(model_paths, '.pt')
evaluation_file_paths: list[str] = get_all_files_in_path(evaluations_paths, '.csv')

In [112]:
model_file_paths

['../models/utilization_lstm_model_with_rmse_loss_function--Wed Mar 29 12:39:05 2023.pt',
 '../models/utilization_lstm_model_with_medium_batch_size--Wed Mar 29 12:33:56 2023.pt',
 '../models/utilization_lstm_model_with_small_batch_size--Wed Mar 29 12:28:35 2023.pt',
 '../models/utilization_lstm_model_with_penalty_loss_function--Wed Mar 29 12:44:31 2023.pt',
 '../models/utilization_lstm_model_with_tasks--Wed Mar 29 12:55:18 2023.pt',
 '../models/utilization_lstm_model_with_large_batch_size--Wed Mar 29 12:22:33 2023.pt',
 '../models/utilization_lstm_model_without_tasks--Wed Mar 29 12:49:50 2023.pt',
 '../models/utilization_lstm_model_with_instances--Wed Mar 29 13:00:38 2023.pt']

In [113]:
evaluation_file_paths

['../evaluation/all/loss_progression_with_tasks.csv',
 '../evaluation/all/loss_progression_with_no_tasks.csv',
 '../evaluation/all/loss_progression_with_small_batch_size.csv',
 '../evaluation/all/util_lstm_test_with_small_batch_size.csv',
 '../evaluation/all/loss_progression_with_medium_batch_size.csv',
 '../evaluation/all/loss_progression_with_penalty_loss.csv',
 '../evaluation/all/util_lstm_test_with_tasks.csv',
 '../evaluation/all/loss_progression_with_instances.csv',
 '../evaluation/all/util_lstm_test_with_instances.csv',
 '../evaluation/all/util_lstm_test_with_no_tasks.csv',
 '../evaluation/all/loss_progression_with_rmse_loss.csv',
 '../evaluation/all/util_lstm_test_with_rmse_loss.csv',
 '../evaluation/all/util_lstm_test_with_large_batch_size.csv',
 '../evaluation/all/loss_progression_with_large_batch_size.csv',
 '../evaluation/all/util_lstm_test_with_medium_batch_size.csv',
 '../evaluation/all/util_lstm_test_with_penalty_loss.csv']

In [114]:
# Build the scenario collection from the evaluation CSV paths; evaluation_collection is used
# as a dict of EvaluationScenario objects keyed by scenario name (see the output of the next cell).
collection = EvaluationScenarioCollection(evaluation_file_paths)
evaluations: dict[str, EvaluationScenario] = collection.evaluation_collection

In [115]:
evaluations

{'with_penalty_loss': EvaluationScenario(name='with_penalty_loss'),
 'with_tasks': EvaluationScenario(name='with_tasks'),
 'with_no_tasks': EvaluationScenario(name='with_no_tasks'),
 'with_small_batch_size': EvaluationScenario(name='with_small_batch_size'),
 'with_medium_batch_size': EvaluationScenario(name='with_medium_batch_size'),
 'with_instances': EvaluationScenario(name='with_instances'),
 'with_rmse_loss': EvaluationScenario(name='with_rmse_loss'),
 'with_large_batch_size': EvaluationScenario(name='with_large_batch_size')}

In [119]:
# Scenario 'with_no_tasks'. Its CPU test frame has the columns 'actual cpu usage',
# 'predicted cpu usage' and 'allocated cpu' (see the output below).
no_tasks_eval: EvaluationScenario = evaluations['with_no_tasks']
no_tasks_eval.get_cpu_test_df()

Unnamed: 0,actual cpu usage,predicted cpu usage,allocated cpu
0,30.347824,838.999695,682.624023
1,31.739130,838.999695,682.624023
2,30.523808,838.999695,682.624023
3,29.478258,838.999695,682.624023
4,28.565216,838.999695,682.624023
...,...,...,...
4995,445.396210,466.580170,682.624023
4996,517.638245,466.580170,682.624023
4997,554.960754,466.580170,682.624023
4998,479.877533,466.580170,682.624023


# User Utilisation

This section evaluates the resources that users requested themselves (`plan_cpu` / `plan_mem`) against the usage that was actually measured.

In [101]:
def prepare_user_utilisation_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return an independent frame holding the first and the third column of ``df``.

    Args:
        df: Frame with at least three columns; it is not modified.

    Returns:
        A new DataFrame consisting of columns 0 and 2 of ``df`` (selected by position).
    """
    # Copy after selecting: only the two kept columns are duplicated instead of the
    # whole input, and the result is a standalone frame that callers can safely
    # assign columns to.
    return df.iloc[:, [0, 2]].copy()

In [102]:
# Frames for the user baseline: keep the first and third column (actual usage and allocation)
# of the 'with_no_tasks' CPU and memory test frames.
user_cpu_test_df = prepare_user_utilisation_df(no_tasks_eval.get_cpu_test_df())
user_mem_test_df = prepare_user_utilisation_df(no_tasks_eval.get_mem_test_df())

In [103]:
user_mem_test_df

Unnamed: 0,max_mem_x,allocated mem
0,4.530273,37.405022
1,4.883789,37.405022
2,4.001953,37.405022
3,4.129883,37.405022
4,4.417969,37.405022
...,...,...
4995,9.834961,13.980206
4996,9.862305,13.980206
4997,9.208984,13.980206
4998,9.929688,13.980206


In [104]:
# Usage and user-requested plan values from the raw test data, limited to the first
# 5000 rows and re-indexed from 0 so that they align with the evaluation frames.
eval_cpu_correcter = test_df.loc[:, ['cpu_usage', 'plan_cpu']].head(5000).reset_index(drop=True)
eval_mem_correcter = test_df.loc[:, ['max_mem', 'plan_mem']].head(5000).reset_index(drop=True)

In [105]:
eval_cpu_correcter

Unnamed: 0,cpu_usage,plan_cpu
0,30.347826,600.0
1,31.739130,600.0
2,30.523810,600.0
3,29.478261,600.0
4,28.565217,600.0
...,...,...
4995,445.396226,600.0
4996,517.638298,600.0
4997,554.960784,600.0
4998,479.877551,600.0


In [15]:
# Overwrite the allocation columns with the user-requested plan values from the raw test data.
# NOTE(review): this mutates frames that an earlier cell displayed (user_mem_test_df), so that
# recorded output no longer reflects the current content.
user_cpu_test_df['allocated cpu'] = eval_cpu_correcter['plan_cpu']
user_mem_test_df['allocated mem'] = eval_mem_correcter['plan_mem']

In [89]:
user_cpu_test_df

Unnamed: 0,actual cpu usage,allocated cpu
0,30.347824,600.0
1,31.739130,600.0
2,30.523808,600.0
3,29.478258,600.0
4,28.565216,600.0
...,...,...
4995,445.396210,600.0
4996,517.638245,600.0
4997,554.960754,600.0
4998,479.877533,600.0


In [16]:
# Actual CPU usage as stored in the evaluation frame; merge_describe_with_actual reads this
# global as the CPU reference series.
# NOTE(review): actual_mem is taken from the raw test data instead — confirm the difference is intended.
actual_cpu = user_cpu_test_df['actual cpu usage']
# print(actual_cpu.describe().round(3).to_latex())

In [17]:
# Actual memory usage from the raw test data; merge_describe_with_actual reads this global
# as the memory reference series. Series.rename without inplace returns a renamed Series,
# so the column object obtained from eval_mem_correcter is not mutated, and the cell
# displays the series instead of evaluating to None.
actual_mem = eval_mem_correcter['max_mem'].rename('actual mem usage')
actual_mem

0       4.530273
1       4.883789
2       4.001953
3       4.129883
4       4.417969
          ...   
4995    9.834961
4996    9.862305
4997    9.208984
4998    9.929688
4999    9.517578
Name: actual mem usage, Length: 5000, dtype: float64

In [18]:
# print(actual_mem.describe().round(3).to_latex())

In [19]:
# print(actual_cpu.describe().to_latex())

In [43]:
def merge_describe_with_actual(df: pd.DataFrame, column: int, is_cpu: bool) -> pd.DataFrame:
    """Put the summary statistics of one column next to those of the actual usage.

    Args:
        df: Frame that contains the column to summarise.
        column: Position of that column in ``df``.
        is_cpu: ``True`` compares against the notebook global ``actual_cpu``,
            ``False`` against the notebook global ``actual_mem``.

    Returns:
        A two-column frame (selected column, reference series) with the
        ``describe()`` statistics rounded to two decimals. The ``count`` row is
        replaced by a ``percentage`` row: the selected column's count relative
        to the reference count, with the reference itself fixed at 100.
    """
    column_stats = df.iloc[:, column].describe().round(2)

    reference: pd.Series = actual_cpu if is_cpu else actual_mem
    reference_stats = reference.describe().round(2)

    # Share of rows in the selected column relative to the reference series.
    column_stats['percentage'] = column_stats['count'] / reference_stats['count'] * 100
    reference_stats['percentage'] = 100

    side_by_side: pd.DataFrame = pd.concat([column_stats, reference_stats], axis=1)
    return side_by_side.drop(index=['count'])

def print_latex_table(df: pd.DataFrame, column: int, is_cpu: bool) -> None:
    """Print the statistics comparison for one column of ``df`` as LaTeX source.

    Args:
        df: Frame that contains the column to summarise.
        column: Position of that column in ``df``.
        is_cpu: Forwarded to ``merge_describe_with_actual`` to choose the CPU
            or the memory reference series.
    """
    comparison: pd.DataFrame = merge_describe_with_actual(df, column, is_cpu)
    latex_source: str = comparison.to_latex()
    print(latex_source)
    
def print_metrics(actual: pd.Series, predicted: pd.Series, table_name: str = 'metrics') -> None:
    """Print the evaluation metrics of a prediction as LaTeX source.

    Args:
        actual: Observed values.
        predicted: Values to compare against ``actual``.
        table_name: Passed on to ``EvaluationMetrics.get_all_metrics``; it shows
            up as the row label in the recorded outputs.
    """
    metrics = EvaluationMetrics.get_all_metrics(actual, predicted, table_name)
    rounded_metrics = metrics.round(3)
    print(rounded_metrics.to_latex())

In [44]:
# Scratch check: second column of the user CPU frame ('allocated cpu' in the output below).
test = user_cpu_test_df.iloc[:, 1]
test

0       600.0
1       600.0
2       600.0
3       600.0
4       600.0
        ...  
4995    600.0
4996    600.0
4997    600.0
4998    600.0
4999    600.0
Name: allocated cpu, Length: 5000, dtype: float64

In [45]:
pd.concat([test, user_cpu_test_df.iloc[:, 0]], axis=1)

Unnamed: 0,allocated cpu,actual cpu usage
0,600.0,30.347824
1,600.0,31.739130
2,600.0,30.523808
3,600.0,29.478258
4,600.0,28.565216
...,...,...
4995,600.0,445.396210
4996,600.0,517.638245
4997,600.0,554.960754
4998,600.0,479.877533


In [46]:
# Split the user CPU frame into over- and under-allocated rows.
# NOTE(review): the integer arguments are presumably the positions of the actual (0) and
# allocated (1) columns — confirm against EvaluationUtils.
user_cpu_test_df_over_alloc = EvaluationUtils.get_over_allocated_df(user_cpu_test_df, 0, 1)
user_cpu_test_df_under_alloc = EvaluationUtils.get_under_allocated_df(user_cpu_test_df, 0, 1)

In [47]:
print_metrics(actual_cpu, user_cpu_test_df.iloc[:, 1], 'User CPU')

\begin{tabular}{lrrrrr}
\toprule
{} &     RMSE &     MAPE &   SMAPE &    OA &    UA \\
\midrule
User CPU &  597.431 &  944.986 &  99.682 &  78.6 &  21.4 \\
\bottomrule
\end{tabular}



  print(EvaluationMetrics.get_all_metrics(actual, predicted, table_name).round(3).to_latex())


In [48]:
print_metrics(actual_mem, user_mem_test_df.iloc[:, 1], 'User Memory')

\begin{tabular}{lrrrrr}
\toprule
{} &    RMSE &     MAPE &   SMAPE &     OA &     UA \\
\midrule
User Memory &  18.654 &  545.366 &  90.441 &  75.46 &  24.54 \\
\bottomrule
\end{tabular}



  print(EvaluationMetrics.get_all_metrics(actual, predicted, table_name).round(3).to_latex())


In [49]:
print_latex_table(user_cpu_test_df, 1, True)

\begin{tabular}{lrr}
\toprule
{} &  allocated cpu &  actual cpu usage \\
\midrule
mean       &         569.86 &            364.39 \\
std        &         300.24 &            559.70 \\
min        &           5.00 &              0.00 \\
25\%        &         400.00 &             89.34 \\
50\%        &         600.00 &            165.37 \\
75\%        &         600.00 &            455.05 \\
max        &        3200.00 &           7133.87 \\
percentage &         100.00 &            100.00 \\
\bottomrule
\end{tabular}



  print(df.to_latex())


In [50]:
# NOTE(review): same call as the preceding cell (In [49]); the recorded output is identical.
print_latex_table(user_cpu_test_df, 1, True)

\begin{tabular}{lrr}
\toprule
{} &  allocated cpu &  actual cpu usage \\
\midrule
mean       &         569.86 &            364.39 \\
std        &         300.24 &            559.70 \\
min        &           5.00 &              0.00 \\
25\%        &         400.00 &             89.34 \\
50\%        &         600.00 &            165.37 \\
75\%        &         600.00 &            455.05 \\
max        &        3200.00 &           7133.87 \\
percentage &         100.00 &            100.00 \\
\bottomrule
\end{tabular}



  print(df.to_latex())


In [63]:
print_latex_table(user_mem_test_df, 1, False)

\begin{tabular}{lrr}
\toprule
{} &  allocated mem &  actual mem usage \\
\midrule
mean       &          20.53 &              8.84 \\
std        &          12.72 &             10.55 \\
min        &           1.95 &              0.02 \\
25\%        &           7.81 &              3.01 \\
50\%        &          29.30 &              4.68 \\
75\%        &          29.30 &             11.85 \\
max        &         117.19 &            251.58 \\
percentage &         100.00 &            100.00 \\
\bottomrule
\end{tabular}



  print(df.to_latex())


In [52]:
# Split the user memory frame into over- and under-allocated rows.
# NOTE(review): the integer arguments are presumably the positions of the actual (0) and
# allocated (1) columns — confirm against EvaluationUtils.
user_mem_test_df_over_alloc = EvaluationUtils.get_over_allocated_df(user_mem_test_df, 0, 1)
user_mem_test_df_under_alloc = EvaluationUtils.get_under_allocated_df(user_mem_test_df, 0, 1)

In [53]:
user_mem_test_df_over_alloc

Unnamed: 0,max_mem_x,allocated mem
0,4.530273,29.296875
1,4.883789,29.296875
2,4.001953,29.296875
3,4.129883,29.296875
4,4.417969,29.296875
...,...,...
4985,9.586914,9.765625
4987,9.734375,9.765625
4988,9.188477,9.765625
4997,9.208984,9.765625


# No Task Usage

This section contains the evaluation of the base LSTM model (the `with_no_tasks` scenario).

In [64]:
no_tasks_eval.get_cpu_train_df()

Unnamed: 0,actual cpu usage,predicted cpu usage,allocated cpu
0,773.050842,233.303329,400.000031
1,608.671021,901.894348,600.000000
2,608.102478,901.894348,600.000000
3,385.129120,901.894348,600.000000
4,614.094116,901.894348,600.000000
...,...,...,...
4995,23.076925,97.586464,600.000000
4996,22.769234,97.586464,600.000000
4997,19.000000,97.586464,600.000000
4998,23.812502,97.586464,600.000000


In [66]:
print_latex_table(no_tasks_eval.get_cpu_test_df(), 1, True)

\begin{tabular}{lrr}
\toprule
{} &  predicted cpu usage &  actual cpu usage \\
\midrule
mean       &               358.55 &            364.39 \\
std        &               510.83 &            559.70 \\
min        &                 5.93 &              0.00 \\
25\%        &               153.19 &             89.34 \\
50\%        &               195.87 &            165.37 \\
75\%        &               363.64 &            455.05 \\
max        &              5553.30 &           7133.87 \\
percentage &               100.00 &            100.00 \\
\bottomrule
\end{tabular}



  print(df.to_latex())


In [74]:
actual_mem

0       4.530273
1       4.883789
2       4.001953
3       4.129883
4       4.417969
          ...   
4995    9.834961
4996    9.862305
4997    9.208984
4998    9.929688
4999    9.517578
Name: actual mem usage, Length: 5000, dtype: float64

In [81]:
# Give the memory columns readable names: 'max_mem_x' becomes 'actual mem' and
# 'max_mem_y' becomes 'predicted mem'.
no_task_eval_mem: pd.DataFrame = no_tasks_eval.get_mem_test_df().rename(columns={'max_mem_x': 'actual mem', 'max_mem_y': 'predicted mem'})
no_task_eval_mem

Unnamed: 0,actual mem,predicted mem,allocated mem
0,4.530273,7.355535,37.405022
1,4.883789,7.355535,37.405022
2,4.001953,7.355535,37.405022
3,4.129883,7.355535,37.405022
4,4.417969,7.355535,37.405022
...,...,...,...
4995,9.834961,6.465895,13.980206
4996,9.862305,6.465895,13.980206
4997,9.208984,6.465895,13.980206
4998,9.929688,6.465895,13.980206


In [83]:
print_metrics(no_task_eval_mem['actual mem'], no_task_eval_mem['predicted mem'])

\begin{tabular}{lrrrrr}
\toprule
{} &    RMSE &     MAPE &   SMAPE &     OA &     UA \\
\midrule
metrics &  14.725 &  128.113 &  87.045 &  36.28 &  63.72 \\
\bottomrule
\end{tabular}



  print(EvaluationMetrics.get_all_metrics(actual, predicted, table_name).round(3).to_latex())


In [65]:
# NOTE(review): this uses the un-renamed memory frame, so the table header in the
# output below shows 'max_mem_y' rather than a readable column name.
print_latex_table(no_tasks_eval.get_mem_test_df(), 1, False)

\begin{tabular}{lrr}
\toprule
{} &  max\_mem\_y &  actual mem usage \\
\midrule
mean       &       4.39 &              8.84 \\
std        &       9.22 &             10.55 \\
min        &       0.13 &              0.02 \\
25\%        &       1.94 &              3.01 \\
50\%        &       1.94 &              4.68 \\
75\%        &       4.87 &             11.85 \\
max        &     115.85 &            251.58 \\
percentage &     100.00 &            100.00 \\
\bottomrule
\end{tabular}



  print(df.to_latex())


In [56]:
# Summary statistics of the frame returned by get_over_allocated_cpu_test_df, printed as LaTeX.
# NOTE(review): the recorded output shows a negative minimum for 'allocated cpu' —
# verify that negative allocations are expected.
print(no_tasks_eval.get_over_allocated_cpu_test_df().describe().to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  actual cpu usage &  predicted cpu usage &  allocated cpu \\
\midrule
count &       2867.000000 &          2867.000000 &    2867.000000 \\
mean  &        131.462852 &           459.559058 &     648.267166 \\
std   &        263.816777 &           616.023805 &     596.064678 \\
min   &          0.000000 &            44.983273 &    -300.816711 \\
25\%   &         50.073023 &           153.186890 &     352.055664 \\
50\%   &         99.442307 &           200.163757 &     682.624023 \\
75\%   &        141.599152 &           466.580170 &     682.624023 \\
max   &       3738.677979 &          5553.298828 &    4980.012207 \\
\bottomrule
\end{tabular}



  print(no_tasks_eval.get_over_allocated_cpu_test_df().describe().to_latex())


In [57]:
print(no_tasks_eval.get_over_allocated_cpu_test_df().iloc[:, [0, 1]].describe().to_latex())

\begin{tabular}{lrr}
\toprule
{} &  actual cpu usage &  predicted cpu usage \\
\midrule
count &       2867.000000 &          2867.000000 \\
mean  &        131.462852 &           459.559058 \\
std   &        263.816777 &           616.023805 \\
min   &          0.000000 &            44.983273 \\
25\%   &         50.073023 &           153.186890 \\
50\%   &         99.442307 &           200.163757 \\
75\%   &        141.599152 &           466.580170 \\
max   &       3738.677979 &          5553.298828 \\
\bottomrule
\end{tabular}



  print(no_tasks_eval.get_over_allocated_cpu_test_df().iloc[:, [0, 1]].describe().to_latex())


# With Task Usage

This section contains the evaluation of the LSTM model that includes task knowledge (the `with_tasks` scenario).

In [58]:
with_tasks_eval: EvaluationScenario = evaluations['with_tasks']

In [59]:
with_tasks_eval.get_cpu_train_df()

Unnamed: 0,actual cpu usage,predicted cpu usage,allocated cpu
0,773.050842,525.474304,400.000031
1,608.671021,897.155884,600.000000
2,608.102478,897.155884,600.000000
3,385.129120,897.155884,600.000000
4,614.094116,897.155884,600.000000
...,...,...,...
4995,23.076925,129.883926,600.000000
4996,22.769234,129.883926,600.000000
4997,19.000000,129.883926,600.000000
4998,23.812502,129.883926,600.000000


In [86]:
# Statistics table for the predicted CPU usage (column 1) and error metrics for the
# 'with_tasks' scenario; print_metrics uses its default row label 'metrics'.
# NOTE(review): get_cpu_test_df() is called three times — presumably it returns the same frame each time; confirm.
print_latex_table(with_tasks_eval.get_cpu_test_df(), 1, True)
print_metrics(with_tasks_eval.get_cpu_test_df()['actual cpu usage'], with_tasks_eval.get_cpu_test_df()['predicted cpu usage'])

\begin{tabular}{lrr}
\toprule
{} &  predicted cpu usage &  actual cpu usage \\
\midrule
mean       &               273.94 &            364.39 \\
std        &               463.12 &            559.70 \\
min        &                 3.86 &              0.00 \\
25\%        &                74.80 &             89.34 \\
50\%        &               219.60 &            165.37 \\
75\%        &               253.62 &            455.05 \\
max        &              4999.80 &           7133.87 \\
percentage &               100.00 &            100.00 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrr}
\toprule
{} &     RMSE &     MAPE &   SMAPE &     OA &     UA \\
\midrule
metrics &  648.602 &  399.351 &  88.734 &  42.68 &  57.32 \\
\bottomrule
\end{tabular}



  print(df.to_latex())
  print(EvaluationMetrics.get_all_metrics(actual, predicted, table_name).round(3).to_latex())


In [88]:
# Statistics table for the predicted memory usage (column 1) and error metrics for the
# 'with_tasks' scenario; print_metrics uses its default row label 'metrics'.
# NOTE(review): get_mem_test_df() is called three times — presumably it returns the same frame each time; confirm.
print_latex_table(with_tasks_eval.get_mem_test_df(), 1, False)
print_metrics(with_tasks_eval.get_mem_test_df()['actual mem usage'], with_tasks_eval.get_mem_test_df()['predicted mem usage'])

\begin{tabular}{lrr}
\toprule
{} &  predicted mem usage &  actual mem usage \\
\midrule
mean       &                 3.78 &              8.84 \\
std        &                 4.20 &             10.55 \\
min        &                 0.14 &              0.02 \\
25\%        &                 0.74 &              3.01 \\
50\%        &                 2.02 &              4.68 \\
75\%        &                 8.54 &             11.85 \\
max        &                85.90 &            251.58 \\
percentage &               100.00 &            100.00 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrrrrr}
\toprule
{} &    RMSE &    MAPE &    SMAPE &     OA &     UA \\
\midrule
metrics &  11.993 &  198.76 &  118.231 &  23.62 &  76.38 \\
\bottomrule
\end{tabular}



  print(df.to_latex())
  print(EvaluationMetrics.get_all_metrics(actual, predicted, table_name).round(3).to_latex())
