# Hyper Parameter Tuning with XGBoost Regression on Today_Energy Data

Based on the XGBoostV1 file, this will add some basic hyperparameter tuning to see if we can improve our model based on MAE.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

df_inv01_2020 = pd.read_csv("inv01_2020.csv")
thisFileName = "07b.RegressionXGboostTuning"

print(df_inv01_2020.shape)
print(df_inv01_2020.info())
df_inv01_2020.head()

(12962, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12962 entries, 0 to 12961
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date_Hour_Quarter         12962 non-null  object 
 1   Inv01_Temp                12962 non-null  float64
 2   Wms01_Irr                 12962 non-null  float64
 3   Wms01_Temp                12962 non-null  float64
 4   Inv01_Today_Energy_Start  12962 non-null  int64  
 5   Inv01_Today_Energy_End    12962 non-null  int64  
 6   Time_Past                 12962 non-null  float64
 7   Quarterly_Average_Energy  12912 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 810.2+ KB
None


Unnamed: 0,Date_Hour_Quarter,Inv01_Temp,Wms01_Irr,Wms01_Temp,Inv01_Today_Energy_Start,Inv01_Today_Energy_End,Time_Past,Quarterly_Average_Energy
0,2020-03-15_12_3,36.981818,943.272727,36.909091,1,6,10.0,0.5
1,2020-03-15_12_4,41.857143,939.785714,35.785714,7,14,13.0,0.538462
2,2020-03-15_13_1,45.11875,940.3125,35.0125,14,22,15.0,0.533333
3,2020-03-15_13_2,47.273333,928.466667,35.206667,23,30,14.0,0.5
4,2020-03-15_13_3,48.64,905.933333,35.46,31,38,14.0,0.5


In [2]:
df_inv01_2020 = df_inv01_2020[df_inv01_2020["Quarterly_Average_Energy"].notna()]

### Feature Engineering

In [3]:
# df_inv01_2020.loc[:, 'Year'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[0]))
# df_inv01_2020.loc[:, 'Month'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[1]))
# df_inv01_2020.loc[:, 'Day'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[2]))
df_inv01_2020.loc[:, 'Hour'] = df_inv01_2020.Date_Hour_Quarter.apply(lambda x: int(str(x).split('_')[1]))
df_inv01_2020.loc[:, 'Quarter'] = df_inv01_2020.Date_Hour_Quarter.apply(lambda x: int(str(x).split('_')[2]))

df_inv01_2020 = df_inv01_2020.drop(['Date_Hour_Quarter'], axis=1)
df_inv01_2020 = df_inv01_2020.drop(['Inv01_Today_Energy_Start'], axis=1)
df_inv01_2020 = df_inv01_2020.drop(['Inv01_Today_Energy_End'], axis=1)

In [4]:
df_inv01_2020

Unnamed: 0,Inv01_Temp,Wms01_Irr,Wms01_Temp,Time_Past,Quarterly_Average_Energy,Hour,Quarter
0,36.981818,943.272727,36.909091,10.0,0.500000,12,3
1,41.857143,939.785714,35.785714,13.0,0.538462,12,4
2,45.118750,940.312500,35.012500,15.0,0.533333,13,1
3,47.273333,928.466667,35.206667,14.0,0.500000,13,2
4,48.640000,905.933333,35.460000,14.0,0.500000,13,3
...,...,...,...,...,...,...,...
12957,41.621429,68.714286,19.335714,13.0,0.538462,8,4
12958,42.287500,98.125000,19.406250,15.0,0.866667,9,1
12959,42.946667,166.066667,19.726667,14.0,1.500000,9,2
12960,43.553333,96.266667,19.846667,14.0,0.857143,9,3


With Hyperparameter Tuning, we're not doing K-Folds, instead, we'll use Gridsearch CV to try different permutations of parameters

In [5]:
# Split the columns between the target and features
targetColName = "Quarterly_Average_Energy"
col_names = df_inv01_2020.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = df_inv01_2020[feature_cols]
trainTargets = df_inv01_2020[targetColName]


In [6]:
# Configure basic XGBoost Regression object and the grid of possible parameter settings to test
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", seed=randomSeed)
gbm_param_grid = { 'colsample_bytree': [0.3, 0.7], 'n_estimators': [10, 100, 200, 500, 1000], 'max_depth': [2, 4, 6, 8, 10], "eta": [0.3, 0.1, 0.01] }

x_train, x_vali, y_train, y_vali = train_test_split(trainFeatures, trainTargets, test_size=0.2, random_state=databasic.get_random_seed())

tuned_mse = GridSearchCV(param_grid=gbm_param_grid, estimator=model, scoring="neg_mean_absolute_error", cv=5, verbose=1)
tuned_mse.fit(x_train, y_train)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", tuned_mse.best_params_)
print("Lowest MAE found: ", tuned_mse.best_score_)

Fitting 5 folds for each of 150 candidates, totalling 750 fits
Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 4, 'n_estimators': 500}
Lowest MAE found:  -0.3491235814751967


### Run 1
- Fitting 5 folds for each of 150 candidates, totalling 750 fits
- Time to run: 8m 2.1s
- Best parameters found:  {'colsample_bytree': 0.7, 'eta': 0.1, 'max_depth': 4, 'n_estimators': 500}
- Lowest MAE found:  -0.3491235814751967