In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.model_selection import GridSearchCV
from google.colab import drive
drive.mount('/content/drive')
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [3]:
!unzip -q '/content/drive/My Drive/GWA-T-13_Materna-Workload-Traces.zip'

In [4]:
def readInputData(paths):
  for path in paths:
    all_files = glob.glob(path + "/*.csv")

    data = []
    for filename in all_files:
        df = pd.read_csv(filename, delimiter=';', index_col=None, header=0)
        data.append(df)
    dataframe = pd.concat(data, axis=0, ignore_index=True)
  return dataframe


In [5]:
Materna_Traces = ['/content/Materna-Trace-1','/content/Materna-Trace-2','/content/Materna-Trace-3']


In [6]:
inputData = readInputData(Materna_Traces)


In [7]:
 inputData.head()

Unnamed: 0,Timestamp,CPU cores,CPU capacity provisioned [MHZ],CPU usage [MHZ],CPU usage [%],Memory capacity provisioned [KB],Memory usage [KB],Memory usage [%],Disk read throughput [KB/s],Disk write throughput [KB/s],Disk size [GB],Network received throughput [KB/s],Network transmitted throughput [KB/s]
0,04.01.2016 00:00:00,6,0,1619,119,20971520,2472542,1179,498,307,709,951,1255
1,04.01.2016 00:05:00,6,0,2167,1593,20971520,3493855,1666,660,336,709,1311,1701
2,04.01.2016 00:10:00,6,0,2093,1538,20971520,3925868,1872,3448,839,709,1216,2626
3,04.01.2016 00:15:00,6,0,1622,1193,20971520,3772776,1799,592,293,709,951,1274
4,04.01.2016 00:20:00,6,0,1688,1241,20971520,3143631,1499,677,349,709,1141,1250


In [8]:
inputData.shape
df = inputData

In [9]:
df.describe()

Unnamed: 0,CPU cores,CPU capacity provisioned [MHZ],CPU usage [MHZ],Memory capacity provisioned [KB],Memory usage [KB],Disk read throughput [KB/s],Disk write throughput [KB/s],Network received throughput [KB/s],Network transmitted throughput [KB/s]
count,5329730.0,5329730.0,5329730.0,5329730.0,5329730.0,5329730.0,5329730.0,5329730.0,5329730.0
mean,2.637684,3.982851,259.608,8873539.0,705879.0,535.2213,179.006,92.93955,139.6144
std,1.628044,90.22759,636.5165,10095090.0,1135087.0,6929.329,1533.413,912.1657,1434.829
min,1.0,0.0,1.0,1048576.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,24.0,4194304.0,80531.0,0.0,1.0,0.0,0.0
50%,2.0,0.0,69.0,8388608.0,223137.0,0.0,8.0,1.0,1.0
75%,4.0,0.0,184.0,10485760.0,871996.0,0.0,39.0,15.0,17.0
max,8.0,2048.0,15528.0,134217700.0,23652520.0,377917.0,144988.0,112679.0,114219.0


In [10]:

def dbscan(eps, min_samples, data, start, end):

  model = DBSCAN(eps, min_samples)

  yhat = model.fit_predict(data)

  clusters = unique(yhat)

  for cluster in clusters:

    row_ix = where(yhat == cluster)

    plt.scatter(data[row_ix, start], data[row_ix, end])
    plt.title("eps="+str(eps)+"    "+"min_samples="+str(min_samples))
  plt.show()

In [11]:
print(df.columns)

Index(['Timestamp', 'CPU cores', 'CPU capacity provisioned [MHZ]',
       'CPU usage [MHZ]', 'CPU usage [%]', 'Memory capacity provisioned [KB]',
       'Memory usage [KB]', 'Memory usage [%]', 'Disk read throughput [KB/s]',
       'Disk write throughput [KB/s]', 'Disk size [GB]',
       'Network received throughput [KB/s]',
       'Network transmitted throughput [KB/s]'],
      dtype='object')


In [12]:
df = df.drop('Timestamp', axis=1)

df = df.replace(to_replace=r',', value='.', regex=True)

In [13]:
features = ['CPU cores', 'CPU capacity provisioned [MHZ]', 'Memory capacity provisioned [KB]',
            'Disk read throughput [KB/s]', 'Disk write throughput [KB/s]',
            'Disk size [GB]', 'Network received throughput [KB/s]',
            'Network transmitted throughput [KB/s]']


In [14]:
target = 'CPU usage [MHZ]'

In [15]:
data = df[features + [target]]

In [16]:
scaler = StandardScaler()
data_std = scaler.fit_transform(data)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(data_std, data[target], test_size=0.2, random_state=42)

In [18]:
gbm = GradientBoostingRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
catboost = CatBoostRegressor(random_state=42, verbose=0)

In [19]:
xgb.fit(X_train, y_train)

In [20]:
y_pred_xgb = xgb.predict(X_test)

In [21]:
rmse_xgb = sqrt(mean_squared_error(y_test, y_pred_xgb))
mape_xgb = np.mean(np.abs((np.exp(y_test) - np.exp(y_pred_xgb)) / np.exp(y_test))) * 100
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  mape_xgb = np.mean(np.abs((np.exp(y_test) - np.exp(y_pred_xgb)) / np.exp(y_test))) * 100


In [22]:
print("Performance Metrics for XGBoost:")
print(f"RMSE: {rmse_xgb:.2f}")
print(f"MAPE: {mape_xgb:.2f}%")
print(f"MAE: {mae_xgb:.2f}")
print(f"MSE: {mse_xgb:.2f}")
print(f"R-squared (R2): {r2_xgb:.2f}")
print("\n")

Performance Metrics for XGBoost:
RMSE: 31.97
MAPE: inf%
MAE: 4.29
MSE: 1021.84
R-squared (R2): 1.00




In [23]:
gbm.fit(X_train, y_train)
catboost.fit(X_train, y_train)

<catboost.core.CatBoostRegressor at 0x7e54aec2bc10>

In [24]:
y_pred_gbm = gbm.predict(X_test)
y_pred_catboost = catboost.predict(X_test)

In [25]:
rmse_gbm = sqrt(mean_squared_error(y_test, y_pred_gbm))
mae_gbm = mean_absolute_error(y_test, y_pred_gbm)
mse_gbm = mean_squared_error(y_test, y_pred_gbm)
r2_gbm = r2_score(y_test, y_pred_gbm)

In [26]:
rmse_catboost = sqrt(mean_squared_error(y_test, y_pred_catboost))
mae_catboost = mean_absolute_error(y_test, y_pred_catboost)
mse_catboost = mean_squared_error(y_test, y_pred_catboost)
r2_catboost = r2_score(y_test, y_pred_catboost)

In [27]:
def calculate_mape(y_true, y_pred):
    non_zero_indices = (y_true != 0)
    y_true_non_zero = y_true[non_zero_indices]
    y_pred_non_zero = y_pred[non_zero_indices]
    percentage_errors = np.abs((y_true_non_zero - y_pred_non_zero) / y_true_non_zero)
    mape = np.mean(percentage_errors) * 100
    return mape
mape_gbm = calculate_mape(y_test, y_pred_gbm)
print(f"MAPE: {mape_gbm:.2f}%")


MAPE: 1.86%


In [28]:
print("Performance Metrics for GBM:")
print(f"RMSE: {rmse_gbm:.2f}")
print(f"MAPE: {mape_gbm:.2f}%")
print(f"MAE: {mae_gbm:.2f}")
print(f"MSE: {mse_gbm:.2f}")
print(f"R-squared (R2): {r2_gbm:.2f}")
print("\n")

Performance Metrics for GBM:
RMSE: 4.94
MAPE: 1.86%
MAE: 2.32
MSE: 24.44
R-squared (R2): 1.00




In [29]:
def calculate_mape(y_true, y_pred):
    non_zero_indices = (y_true != 0)
    y_true_non_zero = y_true[non_zero_indices]
    y_pred_non_zero = y_pred[non_zero_indices]
    percentage_errors = np.abs((y_true_non_zero - y_pred_non_zero) / y_true_non_zero)
    mape = np.mean(percentage_errors) * 100
    return mape
mape_catboost = calculate_mape(y_test, y_pred_gbm)
print(f"MAPE: {mape_catboost:.2f}%")

MAPE: 1.86%


In [30]:
print("Performance Metrics for CatBoost:")
print(f"RMSE: {rmse_catboost:.2f}")
print(f"MAPE: {mape_catboost:.2f}%")
print(f"MAE: {mae_catboost:.2f}")
print(f"MSE: {mse_catboost:.2f}")
print(f"R-squared (R2): {r2_catboost:.2f}")

Performance Metrics for CatBoost:
RMSE: 23.88
MAPE: 1.86%
MAE: 3.06
MSE: 570.40
R-squared (R2): 1.00


In [31]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression


In [32]:
base_models = [
    ('gbm', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42)),
    ('catboost', CatBoostRegressor(random_state=42, verbose=0))
]


In [33]:
meta_learner = LinearRegression()

In [34]:
stacking_model = StackingRegressor(estimators=base_models, final_estimator=meta_learner)

In [35]:
stacking_model.fit(X_train, y_train)

In [36]:
y_pred_stacking = stacking_model.predict(X_test)

In [37]:
rmse_stacking = sqrt(mean_squared_error(y_test, y_pred_stacking))
mape_stacking = np.mean(np.abs((y_test - y_pred_stacking) / y_test)) * 100
mae_stacking = mean_absolute_error(y_test, y_pred_stacking)
mse_stacking = mean_squared_error(y_test, y_pred_stacking)
r2_stacking = r2_score(y_test, y_pred_stacking)

In [38]:
print("Performance Metrics for STEL (Stacking Ensemble Learning):")
print(f"RMSE: {rmse_stacking:.2f}")
print(f"MAPE: {mape_stacking:.2f}%")
print(f"MAE: {mae_stacking:.2f}")
print(f"MSE: {mse_stacking:.2f}")
print(f"R-squared (R2): {r2_stacking:.2f}")

Performance Metrics for STEL (Stacking Ensemble Learning):
RMSE: 30.82
MAPE: 1.13%
MAE: 4.15
MSE: 949.88
R-squared (R2): 1.00
