In [31]:
import pandas as pd
import write_csv
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [2]:
# Global Variables
# Columns kept from the E log frame (see the data-extraction cell).
whitelist_cols_e = [
    'Resource_List.ncpus',
    'Resource_List.mem',
    'resources_used.cput',
    'queue',
    'resources_used.mem',
    'dept',
    'resources_used.walltime',
    'Resource_List.mpiprocs',
    'job_id',
    'user',
]
# Columns kept from the Q log frame.
whitelist_cols_q = [
    'job_id',
    'datetime',
]
# Label encoders for the categorical attributes.
# Both must be fitted via fit_labels() before feature_transform() uses them.
dept_encoder = preprocessing.LabelEncoder()   # encodes the 'dept' attribute
queue_encoder = preprocessing.LabelEncoder()  # encodes the 'queue' attribute

In [3]:
# Keeps only the whitelisted columns of a frame
def extract_cols(df, whitelist_cols):
    """Return the subset of ``df`` made up of the columns in ``whitelist_cols``.

    Parameters:
        df: source DataFrame.
        whitelist_cols: list of column labels to keep, in the desired order.

    Returns:
        The result of indexing ``df`` with ``whitelist_cols``.
    """
    selected = df[whitelist_cols]
    return selected

In [4]:
# Extracts necessary queues for the dataset
# Extracted queues are as in the whitelist_queues attribute
def extract_queues(df):
    """Return the rows of ``df`` whose 'queue' value is in the whitelist.

    Parameters:
        df: DataFrame with a 'queue' column holding queue names.

    Returns:
        A new, independent DataFrame with only the whitelisted rows; the
        original index labels of the kept rows are preserved.
    """
    # 'serial' was listed twice in the original whitelist; once is enough.
    whitelist_queues = ['parallel12', 'serial', 'parallel20', 'parallel8', 'short',
                        'parallel24', 'openmp']
    # .copy() makes the result an independent frame, so later column
    # assignments on it do not trigger SettingWithCopyWarning.
    return df[df['queue'].isin(whitelist_queues)].copy()

In [5]:
# Creates new features to identify relations in data
def feature_eng(df):
    """Add the engineered CPU-usage columns to ``df`` and return it.

    Columns added (``df`` is modified in place and also returned):
        cpu_efficiency_eng: (cput / walltime) / ncpus, i.e. the estimated
            number of cores used divided by the number of CPUs requested.
        estimated_cores_used_eng: cput / walltime, i.e. CPU time over total
            time, an estimate of the number of cores used.

    NaN results (e.g. 0 / 0) are replaced by 0 in both columns.

    Parameters:
        df: DataFrame with 'resources_used.cput', 'resources_used.walltime'
            and 'Resource_List.ncpus' columns.

    Returns:
        The same DataFrame object with the two columns added.
    """
    # Estimated cores used: CPU time over total (wall) time. Computed once
    # and reused for both features.
    cpu_cores_used = df['resources_used.cput'] / df['resources_used.walltime']

    # CPU efficiency: estimated cores used relative to CPUs requested.
    # fillna is applied to the Series before assignment; the previous
    # df[col].fillna(..., inplace=True) is chained in-place assignment, which
    # is deprecated and has no effect under pandas copy-on-write.
    df['cpu_efficiency_eng'] = (cpu_cores_used / df['Resource_List.ncpus']).fillna(0)
    df['estimated_cores_used_eng'] = cpu_cores_used.fillna(0)

    # NOTE(review): a non-zero numerator over a zero denominator yields inf,
    # which is not replaced here -- confirm such rows cannot occur.
    return df

In [6]:
# Fits the module-level LabelEncoder objects on the categorical columns
def fit_labels(df):
    """Fit ``dept_encoder`` and ``queue_encoder`` on the matching columns.

    Parameters:
        df: DataFrame with 'dept' and 'queue' columns.

    Returns:
        None; the two module-level encoders are fitted as a side effect.
    """
    # Pair each encoder with the column it learns its labels from.
    for encoder, column in ((dept_encoder, 'dept'), (queue_encoder, 'queue')):
        encoder.fit(df[column])

In [7]:
# Applies the scaling and label-encoding transformations to the data
def feature_transform(df):
    """Scale the requested-resource columns and encode the categorical ones.

    Precondition: fit_labels() has been called, so that ``queue_encoder`` and
    ``dept_encoder`` are fitted.

    Parameters:
        df: DataFrame with 'Resource_List.mem', 'Resource_List.mpiprocs',
            'queue' and 'dept' columns. The columns are overwritten in place.

    Returns:
        The same DataFrame object with the four columns transformed.
    """
    mem_col = 'Resource_List.mem'
    mpiprocs_col = 'Resource_List.mpiprocs'

    # Requested memory has a long-tailed distribution -> log2 scaling.
    df[mem_col] = df[mem_col].map(np.log2)

    # Requested mpiprocs is long-tailed too, but contains zeros, so a square
    # root is used instead of a logarithm.
    df[mpiprocs_col] = df[mpiprocs_col].map(np.sqrt)

    # Replace the categorical labels with their fitted numerical codes.
    df['queue'] = queue_encoder.transform(df['queue'])
    df['dept'] = dept_encoder.transform(df['dept'])

    return df

In [8]:
# Data Extraction
# DataFrame.append was removed in pandas 2.0, so each frame is now built
# directly from the loaded payload instead of being appended to an empty frame.
# NOTE(review): assumes element 0 of read_pkl's result is a DataFrame or a
# list of records -- confirm against write_csv.read_pkl.
# NOTE(review): read_pkl presumably unpickles the file; only load trusted files.
df_q = pd.DataFrame(write_csv.read_pkl('q_20190509v3.pkl')[0])
df_q = extract_cols(df_q, whitelist_cols_q)
df_e = pd.DataFrame(write_csv.read_pkl('e_20190509v3.pkl')[0])
df_e = extract_cols(df_e, whitelist_cols_e)

# Merge Q and E logs: keep only jobs that appear in both (inner join on job_id)
df = df_e.merge(df_q, on='job_id', how='inner')

In [9]:
# Data Transformation and Engineering
# Engineer the CPU features first, then keep only the whitelisted queues.
df = df.pipe(feature_eng).pipe(extract_queues)
# The encoders must be fitted on the filtered frame before feature_transform runs.
fit_labels(df)
df = df.pipe(feature_transform)

In [10]:
# Exploratory Modelling
# Training/Test Split
# Predictors: requested resources plus the encoded queue and dept.
whitelist_cols_x = ['Resource_List.ncpus', 'Resource_List.mem', 'queue', 'dept',
                       'Resource_List.mpiprocs']
# Target: the engineered estimate of cores actually used.
whitelist_cols_y = ['estimated_cores_used_eng']
x = df[whitelist_cols_x]
# Squeeze the single target column to a 1-D Series: passing a one-column
# DataFrame as y makes several estimators emit a DataConversionWarning.
y = df[whitelist_cols_y].squeeze(axis=1)
# 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2468)

### Linear Regression
Used as the performance benchmark for the other models tested.

In [11]:
# Linear Regression - Used as benchmark
linear_regressor = LinearRegression()
linear_regressor.fit(x_train, y_train)
lr_r2 = linear_regressor.score(x_train, y_train) # R2 score on the training split
print('Linear Regression R2 Training score: ', lr_r2)

y_pred = linear_regressor.predict(x_train)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('Linear Regression Training MSE: ', metrics.mean_squared_error(y_train, y_pred)) # Training error

Linear Regression R2 Training score:  0.4851355682008523
Linear Regression Training MSE:  27.300621143455825


In [12]:
# Linear Regression - evaluation on the held-out test split
y_pred = linear_regressor.predict(x_test)
lr_r2 = linear_regressor.score(x_test, y_test)
# Label fixed: this is the test-set R2, previously printed as a "Training score".
print('Linear Regression R2 Test score: ', lr_r2)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('Linear Regression Test MSE: ', metrics.mean_squared_error(y_test, y_pred))

Linear Regression R2 Training score:  0.45431821754790774
Linear Regression Test MSE:  29.860654289301387


### Support Vector Machines
Kernels used:
- Linear Kernel
- RBF Kernel
- Sigmoid Kernel

In [13]:
# SVR
# Linear Kernel
# gamma is not passed: scikit-learn documents it as the coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels only, so it has no effect here.
svr_linear = SVR(kernel='linear', C=100)
# np.ravel gives SVR the 1-D target it expects and avoids the
# column-vector DataConversionWarning.
svr_linear = svr_linear.fit(x_train, np.ravel(y_train))
svr_lin_r2 = svr_linear.score(x_train, y_train)
print('Linear SVR R2 Training score: ', svr_lin_r2)

  y = column_or_1d(y, warn=True)


Linear SVR R2 Training score:  -0.7445956867339016


In [16]:
# RBF Kernel
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1)
# np.ravel gives SVR the 1-D target it expects and avoids the
# column-vector DataConversionWarning.
svr_rbf = svr_rbf.fit(x_train, np.ravel(y_train))
svr_rbf_r2 = svr_rbf.score(x_train, y_train)
print('RBF SVR R2 Training score: ', svr_rbf_r2)

y_pred = svr_rbf.predict(x_train)
# Label fixed (was "R2 Training MSE"); arguments passed as (y_true, y_pred).
print('RBF SVR Training MSE: ', metrics.mean_squared_error(y_train, y_pred))

  y = column_or_1d(y, warn=True)


RBF SVR R2 Training score:  0.6514104992662247
RBF SVR R2 Training MSE:  18.483914029298788


In [17]:
# RBF SVR - evaluation on the held-out test split
y_pred = svr_rbf.predict(x_test)
svr_rbf_r2 = svr_rbf.score(x_test, y_test)
print('RBF SVR R2 Test score: ', svr_rbf_r2)
# Label fixed: this is the test-set MSE, previously printed as "R2 Training MSE".
# Arguments passed in the documented (y_true, y_pred) order.
print('RBF SVR Test MSE: ', metrics.mean_squared_error(y_test, y_pred))

RBF SVR R2 Test score:  0.5288172638764143
RBF SVR R2 Training MSE:  25.783937164346813


In [18]:
# Sigmoid Kernel
svr_sigmoid = SVR(kernel='sigmoid', C=100, gamma=0.1)
# np.ravel gives SVR the 1-D target it expects and avoids the
# column-vector DataConversionWarning.
svr_sigmoid = svr_sigmoid.fit(x_train, np.ravel(y_train))
svr_sigmoid_r2 = svr_sigmoid.score(x_train, y_train)
print('Sigmoid SVR R2 Training score: ', svr_sigmoid_r2)

y_pred = svr_sigmoid.predict(x_train)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('Sigmoid SVR Training MSE: ', metrics.mean_squared_error(y_train, y_pred))

  y = column_or_1d(y, warn=True)


Sigmoid SVR R2 Training score:  -0.41016277610358043
Sigmoid SVR Training MSE:  74.77370220832469


### Bagged Decision Trees

In [21]:
# Bagging Regressor
# random_state pins the bootstrap sampling so the scores are reproducible on
# re-run (same seed as the train/test split).
bag_reg = BaggingRegressor(n_estimators=50, random_state=2468)
# np.ravel gives the estimator a 1-D target and avoids the
# column-vector DataConversionWarning.
bag_reg = bag_reg.fit(x_train, np.ravel(y_train))
bag_reg_r2 = bag_reg.score(x_train, y_train)
print('Bagging Regressor R2 Training score: ', bag_reg_r2)

y_pred = bag_reg.predict(x_train)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('Bagging Regressor Training MSE: ', metrics.mean_squared_error(y_train, y_pred))

  return column_or_1d(y, warn=True)


Bagging Regressor R2 Training score:  0.7537269581793223
Bagging Regressor Training MSE:  13.058596782649031


In [22]:
# Bagging Regressor - evaluation on the held-out test split
bag_reg_r2 = bag_reg.score(x_test, y_test)
y_pred = bag_reg.predict(x_test)
print('Bagging Regressor R2 Test score: ', bag_reg_r2)
# Label fixed: this is the test-set MSE, previously printed as "Training MSE".
# Arguments passed in the documented (y_true, y_pred) order.
print('Bagging Regressor Test MSE: ', metrics.mean_squared_error(y_test, y_pred))

Bagging Regressor R2 Test score:  0.6663909049103394
Bagging Regressor Training MSE:  18.25566873695956


### AdaBoost

In [28]:
# AdaBoost
# random_state pins the boosting resampling so the scores are reproducible on
# re-run (same seed as the train/test split).
adaboost = AdaBoostRegressor(n_estimators=100, random_state=2468)
# np.ravel gives the estimator a 1-D target and avoids the
# column-vector DataConversionWarning.
adaboost = adaboost.fit(x_train, np.ravel(y_train))
adaboost_r2 = adaboost.score(x_train, y_train)
print('AdaBoost R2 Training score: ', adaboost_r2)

y_pred = adaboost.predict(x_train)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('AdaBoost Training MSE: ', metrics.mean_squared_error(y_train, y_pred))

AdaBoost R2 Training score:  0.5329475884202431
AdaBoost Training MSE:  24.765394840190726


  y = column_or_1d(y, warn=True)


In [29]:
# AdaBoost - evaluation on the held-out test split
adaboost_r2 = adaboost.score(x_test, y_test)
y_pred = adaboost.predict(x_test)
print('AdaBoost R2 Test score: ', adaboost_r2)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('AdaBoost Test MSE: ', metrics.mean_squared_error(y_test, y_pred))

AdaBoost R2 Test score:  0.49104357436085283
AdaBoost Test MSE:  27.850979019376336


### XGBoost

In [34]:
# XGBoost
# Gradient-boosted trees with a squared-error objective.
xgb = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
xgb = xgb.fit(x_train, y_train)
xgb_r2 = xgb.score(x_train, y_train)
print('XGBoost R2 Training score: ', xgb_r2)

y_pred = xgb.predict(x_train)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('XGBoost Training MSE: ', metrics.mean_squared_error(y_train, y_pred))

XGBoost R2 Training score:  0.6765942894173518
XGBoost Training MSE:  17.14854675316031


In [35]:
# XGBoost - evaluation on the held-out test split
xgb_r2 = xgb.score(x_test, y_test)
y_pred = xgb.predict(x_test)
print('XGBoost R2 Test score: ', xgb_r2)
# mean_squared_error is documented as (y_true, y_pred); pass them in that order.
print('XGBoost Test MSE: ', metrics.mean_squared_error(y_test, y_pred))

XGBoost R2 Test score:  0.6146875348562035
XGBoost Test MSE:  21.08496688915491
