In [77]:
import pandas as pd
import write_csv
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor
from xgboost import XGBRegressor

In [2]:
# Global Variables
whitelist_cols_e = ['Resource_List.ncpus', 'Resource_List.mem', 'resources_used.cput', 'queue', 
                      'resources_used.mem', 'dept', 'resources_used.walltime', 'Resource_List.mpiprocs', 
                  'job_id', 'user']
whitelist_cols_q = ['job_id', 'datetime']
dept_encoder = preprocessing.LabelEncoder() # Encoding for the dept attribute: Perform fit_labels function to load labels
queue_encoder = preprocessing.LabelEncoder() #Encoding for the queue attribute: Perform fit_labels funciton to load labels

In [3]:
# Extracts the necessary columns from the dataset
def extract_cols(df, whitelist_cols):
    return df[whitelist_cols]

In [4]:
# Extracts necessary queues for the dataset
# Extracted queues are as in the whitelist_queues attribute
def extract_queues(df):
    whitelist_queues = ['parallel12', 'serial', 'parallel20', 'parallel8', 'short', 
                        'parallel24', 'openmp', 'serial']
    return df[df['queue'].isin(whitelist_queues)]

In [5]:
# Creates new features to identify relations in data
def feature_eng(df):
    # CPU Efficiency
    # Created by taking CPU time over total time, giving estimate of number of cores used. 
    # Then, divide by CPUs requested
    cpu_efficiency = (df['resources_used.cput']/df['resources_used.walltime'])/df['Resource_List.ncpus']
    df['cpu_efficiency_eng'] = cpu_efficiency
    
    # CPU usage
    # Gauged using the CPU usage of the process
    # Current implementation uses cpupercent/100 -> derive the estimated number of cores used
    cpu_cores_used = df['resources_used.cput']/df['resources_used.walltime']
    df['estimated_cores_used_eng'] = cpu_cores_used
    
    # Remove NaN from feature engineering
    df['cpu_efficiency_eng'].fillna(0, inplace=True)
    df['estimated_cores_used_eng'].fillna(0, inplace=True)
    
    return df

In [6]:
# Load labels for LabelEncoder objects
def fit_labels(df):
    # Fit categorical attributes into numerical labels
    dept_encoder.fit(df['dept'])
    queue_encoder.fit(df['queue'])

In [53]:
# Function that performs the necessary transformation on the data
def feature_transform(df):
    # Requested memory scaling
    # Requested memory observed to have long tail distribution
    # Use logarithmic scaling
    df['Resource_List.mem'] = df['Resource_List.mem'].apply(lambda x: np.log2(x))
    df['resources_used.mem'] = df['resources_used.mem'].apply(lambda x: np.log2(x) if x > 0 else 0) # account for no cpus used
    
    # Request mpiprocs
    # Requested mpiprocs observed to have long tail distribution
    # Square root scaling performed due to presence of 0 valued attributes
    df['Resource_List.mpiprocs'] = df['Resource_List.mpiprocs'].apply(lambda x : np.sqrt(x))
    
    # Transform dept and queue attributes to numerical encoding.
    # Preconditions: Performed fit_labels function before this function call
    df['queue'] = queue_encoder.transform(df['queue'])
    df['dept'] = dept_encoder.transform(df['dept'])
    
    return df

In [54]:
# Data Extraction
df_q = pd.DataFrame()
df_q = df_q.append(write_csv.read_pkl('q_20190509v3.pkl')[0], sort=True)
df_q = extract_cols(df_q, whitelist_cols_q)
df_e = pd.DataFrame()
df_e = df_e.append(write_csv.read_pkl('e_20190509v3.pkl')[0], sort=True)
df_e = extract_cols(df_e, whitelist_cols_e)

df = df_e.merge(df_q, on='job_id', how='inner') # Merge Q and E logs

In [55]:
# Data Transformation and Engineering
df = feature_eng(df)
df = extract_queues(df)
fit_labels(df)
df = feature_transform(df)

In [10]:
# Exploratory Modelling
# Training/Test Split
whitelist_cols_x = ['Resource_List.ncpus', 'Resource_List.mem', 'queue', 'dept', 
                       'Resource_List.mpiprocs']
whitelist_cols_y = ['estimated_cores_used_eng']
x = df[whitelist_cols_x]
y = df[whitelist_cols_y]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2468)

### Linear Regression
Used as performance benchmark for the rest of the models tested

In [11]:
# Linear Regression - Used as benchmark
linear_regressor = LinearRegression()
lr_r2 = linear_regressor.fit(x_train, y_train).score(x_train, y_train) # R2 Score
print('Linear Regression R2 Training score: ', lr_r2)

y_pred = linear_regressor.predict(x_train)
print('Linear Regression Training MSE: ', metrics.mean_squared_error(y_pred, y_train)) # Training error

Linear Regression R2 Training score:  0.4851355682008523
Linear Regression Training MSE:  27.300621143455825


In [12]:
y_pred = linear_regressor.predict(x_test)
lr_r2 = linear_regressor.score(x_test, y_test)
print('Linear Regression R2 Training score: ', lr_r2)
print('Linear Regression Test MSE: ', metrics.mean_squared_error(y_pred, y_test))

Linear Regression R2 Training score:  0.45431821754790774
Linear Regression Test MSE:  29.860654289301387


### Support Vector Machines
Kernels used:
- Linear Kernel
- RBF Kernel
- Sigmoid Kernel

In [13]:
# SVR
# Linear Kernel
svr_linear = SVR(kernel='linear', C=100, gamma=0.1)
svr_linear = svr_linear.fit(x_train, y_train)
svr_lin_r2 = svr_linear.score(x_train, y_train)
print('Linear SVR R2 Training score: ', svr_lin_r2)

  y = column_or_1d(y, warn=True)


Linear SVR R2 Training score:  -0.7445956867339016


In [14]:
# RBF Kernel
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1)
svr_rbf = svr_rbf.fit(x_train, y_train)
svr_rbf_r2 = svr_rbf.score(x_train, y_train)
print('RBF SVR R2 Training score: ', svr_rbf_r2)

y_pred = svr_rbf.predict(x_train)
print('RBF SVR Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

  y = column_or_1d(y, warn=True)


RBF SVR R2 Training score:  0.6514104992662247
RBF SVR R2 Training MSE:  18.483914029298788


In [15]:
y_pred = svr_rbf.predict(x_test)
svr_rbf_r2 = svr_rbf.score(x_test, y_test)
print('RBF SVR R2 Test score: ', svr_rbf_r2)
print('RBF SVR Training MSE: ', metrics.mean_squared_error(y_pred, y_test))

RBF SVR R2 Test score:  0.5288172638764143
RBF SVR R2 Training MSE:  25.783937164346813


In [16]:
# Sigmoid Kernel
svr_sigmoid = SVR(kernel='sigmoid', C=100, gamma=0.1)
svr_sigmoid = svr_sigmoid.fit(x_train, y_train)
svr_sigmoid_r2 = svr_sigmoid.score(x_train, y_train)
print('Sigmoid SVR R2 Training score: ', svr_sigmoid_r2)

y_pred = svr_sigmoid.predict(x_train)
print('Sigmoid SVR Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

  y = column_or_1d(y, warn=True)


Sigmoid SVR R2 Training score:  -0.41016277610358043
Sigmoid SVR Training MSE:  74.77370220832469


### Bagged Decison Trees

In [17]:
# Bagging Regressor
bag_reg = BaggingRegressor(n_estimators=50)
bag_reg = bag_reg.fit(x_train, y_train)
bag_reg_r2 = bag_reg.score(x_train, y_train)
print('Bagging Regressor R2 Training score: ', bag_reg_r2)

y_pred = bag_reg.predict(x_train)
print('Bagging Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

  return column_or_1d(y, warn=True)


Bagging Regressor R2 Training score:  0.753533369085635
Bagging Regressor Training MSE:  13.06886181976919


In [18]:
bag_reg_r2 = bag_reg.score(x_test, y_test)
y_pred = bag_reg.predict(x_test)
print('Bagging Regressor R2 Test score: ', bag_reg_r2)
print('Bagging Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_test))

Bagging Regressor R2 Test score:  0.6643870323779733
Bagging Regressor Training MSE:  18.36532412010233


### Random Forest

In [79]:
# RandomForestRegressor
rf = RandomForestRegressor(n_estimators=50)
rf = rf.fit(x_train, y_train)
rf_r2 = rf.score(x_train, y_train)
print('Random Forest Regressor R2 Training score: ', rf_r2)

y_pred = rf.predict(x_train)
print('Random Forest Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

  This is separate from the ipykernel package so we can avoid doing imports until


Random Forest Regressor R2 Training score:  0.7536869444546193
Random Forest Regressor Training MSE:  13.0607185053224


In [80]:
y_pred = rf.predict(x_test)
rf_r2 = rf.score(x_test, y_test)
print('Random Forest Regressor R2 Test score: ', rf_r2)
print('Random Forest Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_test))

Random Forest Regressor R2 Test score:  0.6659621550101082
Random Forest Regressor Training MSE:  18.279130675692134


### AdaBoost

In [19]:
# AdaBoost
adaboost = AdaBoostRegressor(n_estimators=100)
adaboost = adaboost.fit(x_train, y_train)
adaboost_r2 = adaboost.score(x_train, y_train)
print('AdaBoost R2 Training score: ', adaboost_r2)

y_pred = adaboost.predict(x_train)
print('AdaBoost Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

AdaBoost R2 Training score:  0.5435879221244648
AdaBoost Training MSE:  24.20119249612159


  y = column_or_1d(y, warn=True)


In [20]:
adaboost_r2 = adaboost.score(x_test, y_test)
y_pred = adaboost.predict(x_test)
print('AdaBoost R2 Test score: ', adaboost_r2)
print('AdaBoost Test MSE: ', metrics.mean_squared_error(y_pred, y_test))

AdaBoost R2 Test score:  0.49661923835507593
AdaBoost Test MSE:  27.54586900779295


### XGBoost

In [21]:
# XGBoost
xgb = XGBRegressor(objective='reg:linear', n_estimators=100, learning_rate=0.1)
xgb = xgb.fit(x_train, y_train)
xgb_r2 = xgb.score(x_train, y_train)
print('XGBoost R2 Training score: ', xgb_r2)

y_pred = xgb.predict(x_train)
print('XGBoost Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

XGBoost R2 Training score:  0.6765942894173518
XGBoost Training MSE:  17.14854675316031


In [22]:
xgb_r2 = xgb.score(x_test, y_test)
y_pred = xgb.predict(x_test)
print('XGBoost R2 Test score: ', xgb_r2)
print('XGBoost Test MSE: ', metrics.mean_squared_error(y_pred, y_test))

XGBoost R2 Test score:  0.6146875348562035
XGBoost Test MSE:  21.08496688915491


## Hyperparameter Tuning
Performed on Bagged Decision Trees Classifier, XGBoost and RBF SVR

### SVR with RBF Kernel
Hyperparameters tuned:
- C parameter: Parameter that influences the weight of the hinge loss. Higher C parameter means heavier penalty on the hinge loss value, and hence smaller margin.
- Gamma: Parameter influencing the RBF kernel function. High gamma denotes low variance of the kernel function, and hence high bias
- Epsilon: Margin for Support Vector Regressor

In [25]:
nfold=10
# Initial Grid Search
# C = [0.01, 0.1, 1, 10, 100]
# gamma = [0.001, 0.01, 0.1, 1]
# epsilon = [0.01, 0.1, 1, 10]

C = [5, 10, 15, 25]
gamma = [1.0, 2.5, 5.0, 10]
epsilon = [0.5, 1.0, 1.5, 2.5]
param_grid = {'C':C, 'gamma':gamma, 'epsilon':epsilon}

grid_search_svr = GridSearchCV(svr_rbf, param_grid, cv=nfold)
grid_search_svr.fit(x, y)
gs_params_svr = grid_search_svr.best_params_
print('SVR with RBF kernel best parameters: ')
print(gs_params_svr)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)


SVR with RBF kernel initial best parameters: 
{'C': 10, 'epsilon': 1, 'gamma': 1}


### BaggingRegressor
Hyperparameters tuned:
- n_estimators: Number of trees bagged.
- max_features(if bootstrap_features=True): How much of the original dataset features are sampled to be used in each tree. Similar to feature sampling in RandomForests
- max_samples(if bootstrap=True): How much of the original dataset is sampled for each decision tree.
- n_jobs: parallel processing. Set to -1 to use all available threads.

In [31]:
# Initial Grid Search
nfold=10
n_estimators = [50, 100, 250, 500]
max_features = [0.2, 0.4, 0.6, 0.8, 1.0]
max_samples = [0.2, 0.4, 0.6, 0.8, 1.0]
bootstrap = [True]
bootstrap_features = [True]
n_jobs = [-1]

param_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'bootstrap': bootstrap,
              'max_samples': max_samples, 'bootstrap_features': bootstrap_features}
grid_search_br = GridSearchCV(bag_reg, param_grid=param_grid, cv=nfold, n_jobs=-1)
grid_search_br.fit(x, y)
gs_best_params_br = grid_search_br.best_params_
print('Bagging Regressor best parameters: ')
print(gs_best_params_br)

KeyboardInterrupt: 

### XGBoost
Hyperparameters tuned:
- n_estimators: Number of trees to be used for boosting
- max_depth: Similar to max_depth in Bagging Regressor
- min_child_weight: Determines the minimum sum of weights assigned to instances at this node. If the sum of instance weights at the node is less than this amount, the tree stops partitioning at this leaf node.
- reg_alpha: L1 regularization term
- reg_lambda: L2 regularization term
- gamma: Decides the minimum reduction value in the loss to allow partitioning by that attribute
- subsample: Similar to max_samples in Bagging Regressor
- colsample_by_tree: Similar to max_features in Bagging Regressor

In [32]:
# Initial Grid Search
nfold=10
n_estimators = [50, 100, 250, 500]
max_depth = [3, 5, 7, 9]
min_child_weight = [1, 3, 5, 7]
gamma = [0.0, 0.1, 0.2, 0.3, 0.4] # Default 0.0 means greedily build until negative reduction in loss
subsample = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
colsample_bytree = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
reg_alpha = [0.1, 0.2, 0.4, 0.8, 1]
reg_lambda = [0.1, 0.2, 0.4, 0.8, 1]

param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'min_child_weight': min_child_weight, 
             'gamma': gamma, 'subsample': subsample, 'colsample_bytree': colsample_bytree, 'reg_alpha': reg_alpha, 
             'reg_lambda': reg_lambda}
grid_search_xgb = GridSearchCV(xgb, param_grid, cv=nfold, n_jobs=-1)
grid_search_xgb.fit(x, y)
gs_best_params_xgb = grid_search_xgb.best_params_
print('XGBoost best parameters: ')
print(gs_best_params_xgb)

KeyboardInterrupt: 

### Predicting Memory Usage

Following the predictive analysis, we also aim to predict the memory utilization of a program. During the data visualisation and exploration phase, the number of CPUs used and the memory utilized shared a strong  positive correlation. Hence, we will attempt to predict the memory utilization based on the models that managed to fit the dataset for CPU efficiency best.

In [56]:
# Data preparation
whitelist_cols_x_mem = ['Resource_List.ncpus', 'Resource_List.mem', 'queue', 'dept', 
                       'Resource_List.mpiprocs']
whitelist_cols_y_mem = ['resources_used.mem']
x_mem = df[whitelist_cols_x_mem]
y_mem = df[whitelist_cols_y_mem]
x_train_mem, x_test_mem, y_train_mem, y_test_mem = train_test_split(x_mem, y_mem, test_size=0.2, random_state=2468)

### Linear Regression

In [62]:
# Linear Regression - Used as benchmark
linear_regressor_mem = LinearRegression()
lr_mem_r2 = linear_regressor_mem.fit(x_train_mem, y_train_mem).score(x_train_mem, y_train_mem) # R2 Score
print('Linear Regression R2 Training score: ', lr_mem_r2)

y_pred = linear_regressor_mem.predict(x_train_mem)
print('Linear Regression Training MSE: ', metrics.mean_squared_error(y_pred, y_train_mem)) # Training error

Linear Regression R2 Training score:  0.08480425544484116
Linear Regression Training MSE:  16.040130019461746


In [63]:
y_pred = linear_regressor_mem.predict(x_test_mem)
lr_mem_r2 = linear_regressor_mem.score(x_test_mem, y_test_mem)
print('Linear Regression R2 Training score: ', lr_mem_r2)
print('Linear Regression Test MSE: ', metrics.mean_squared_error(y_pred, y_test_mem))

Linear Regression R2 Training score:  0.08822289010041007
Linear Regression Test MSE:  16.29644234230616


### Support Vector Regressor (RBF Kernel)

In [67]:
svr_rbf_mem = SVR(kernel='rbf', C=100, gamma=0.1)
svr_rbf_mem = svr_rbf_mem.fit(x_train_mem, y_train_mem)
svr_rbf_mem_r2 = svr_rbf_mem.score(x_train_mem, y_train_mem)
print('RBF SVR R2 Training score: ', svr_rbf_mem_r2)

y_pred = svr_rbf_mem.predict(x_train_mem)
print('RBF SVR Training MSE: ', metrics.mean_squared_error(y_pred, y_train_mem))

  y = column_or_1d(y, warn=True)


RBF SVR R2 Training score:  0.41004211773376753
RBF SVR Training MSE:  10.339865754245034


In [68]:
y_pred = svr_rbf_mem.predict(x_test_mem)
svr_rbf_mem_r2 = svr_rbf_mem.score(x_test_mem, y_test_mem)
print('RBF SVR R2 Test score: ', svr_rbf_mem_r2)
print('RBF SVR Training MSE: ', metrics.mean_squared_error(y_pred, y_test_mem))

RBF SVR R2 Test score:  0.3658349763982566
RBF SVR Training MSE:  11.334605388120726


### Bagging Regressor

In [70]:
br_mem = BaggingRegressor(n_estimators=50)
br_mem = br_mem.fit(x_train_mem, y_train_mem)
br_mem_r2 = br_mem.score(x_train_mem, y_train_mem)
print('Bagging Regressor R2 Training score: ', br_mem_r2)

y_pred = br_mem.predict(x_train_mem)
print('Bagging Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_train_mem))

  return column_or_1d(y, warn=True)


Bagging Regressor R2 Training score:  0.5236827150946799
Bagging Regressor Training MSE:  8.348149809319688


In [71]:
br_mem_r2 = br_mem.score(x_test_mem, y_test_mem)
y_pred = br_mem.predict(x_test_mem)
print('Bagging Regressor R2 Test score: ', br_mem_r2)
print('Bagging Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_test_mem))

Bagging Regressor R2 Test score:  0.49295980480477786
Bagging Regressor Training MSE:  9.062468465720269


### XGBoost

In [72]:
xgb_mem = XGBRegressor(objective='reg:linear', n_estimators=100, learning_rate=0.1)
xgb_mem = xgb_mem.fit(x_train_mem, y_train_mem)
xgb_mem_r2 = xgb_mem.score(x_train_mem, y_train_mem)
print('XGBoost R2 Training score: ', xgb_mem_r2)

y_pred = xgb_mem.predict(x_train_mem)
print('XGBoost Training MSE: ', metrics.mean_squared_error(y_pred, y_train_mem))

XGBoost R2 Training score:  0.39922017605868987
XGBoost Training MSE:  10.529535945091093


In [73]:
xgb_mem_r2 = xgb_mem.score(x_test_mem, y_test_mem)
y_pred = xgb_mem.predict(x_test_mem)
print('XGBoost R2 Test score: ', xgb_mem_r2)
print('XGBoost Test MSE: ', metrics.mean_squared_error(y_pred, y_test_mem))

XGBoost R2 Test score:  0.38826922511763806
XGBoost Test MSE:  10.93363190811242


### Random Forest Regressor

In [87]:
rf_mem = RandomForestRegressor(n_estimators=50)
rf_mem = rf_mem.fit(x_train_mem, y_train_mem)
rf_mem_r2 = rf_mem.score(x_train_mem, y_train_mem)
print('Random Forest Regressor R2 Training score: ', rf_mem_r2)

y_pred = rf_mem.predict(x_train_mem)
print('Random Forest Regressor Training MSE: ', metrics.mean_squared_error(y_pred, y_train_mem))

  


Random Forest Regressor R2 Training score:  0.5236160497072719
Random Forest Regressor Training MSE:  8.349318216721256


In [86]:
y_pred = rf_mem.predict(x_test_mem)
rf_mem_r2 = rf_mem.score(x_test_mem, y_test_mem)
print('Random Forest Regressor R2 Test score: ', rf_mem_r2)
print('Random Forest Regressor Test MSE: ', metrics.mean_squared_error(y_pred, y_test_mem))

Random Forest Regressor R2 Test score:  0.49182467304089705
Random Forest Regressor Test MSE:  9.082756987048734
