# Experiment with an XGBoost Regression on today_energy data

Updated to use the 2020 inv01 data

This will use XGBoost Regression as the model. It will use 5x K-Folds Cross Validation to train then fit the model and evaluate the MAE and RMSE. For each fold, it will write out the data with the predictions to the /predictions folder so we can look at what the model is predicting vs the actual today_energy

In [1]:
# pip install xgboost

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

from utilities import data_basic_utility as databasic
from utilities import dataframe_utility as dfutil

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

df_inv01_2020 = pd.read_csv("inv01_2020.csv")
thisFileName = "01a.RegressionXGboostV1"

print(df_inv01_2020.shape)
print(df_inv01_2020.info())
df_inv01_2020.head()

(12962, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12962 entries, 0 to 12961
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date_Hour_Quarter         12962 non-null  object 
 1   Inv01_Temp                12962 non-null  float64
 2   Wms01_Irr                 12962 non-null  float64
 3   Wms01_Temp                12962 non-null  float64
 4   Inv01_Today_Energy_Start  12962 non-null  int64  
 5   Inv01_Today_Energy_End    12962 non-null  int64  
 6   Time_Past                 12962 non-null  float64
 7   Quarterly_Average_Energy  12912 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 810.2+ KB
None


Unnamed: 0,Date_Hour_Quarter,Inv01_Temp,Wms01_Irr,Wms01_Temp,Inv01_Today_Energy_Start,Inv01_Today_Energy_End,Time_Past,Quarterly_Average_Energy
0,2020-03-15_12_3,36.981818,943.272727,36.909091,1,6,10.0,0.5
1,2020-03-15_12_4,41.857143,939.785714,35.785714,7,14,13.0,0.538462
2,2020-03-15_13_1,45.11875,940.3125,35.0125,14,22,15.0,0.533333
3,2020-03-15_13_2,47.273333,928.466667,35.206667,23,30,14.0,0.5
4,2020-03-15_13_3,48.64,905.933333,35.46,31,38,14.0,0.5


In [3]:
df_inv01_2020 = df_inv01_2020[df_inv01_2020["Quarterly_Average_Energy"].notna()]

### Feature Engineering

In [4]:
# df_inv01_2020.loc[:, 'Year'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[0]))
# df_inv01_2020.loc[:, 'Month'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[1]))
# df_inv01_2020.loc[:, 'Day'] = df_inv01_2020.Date.apply(lambda x: int(str(x).split('-')[2]))
df_inv01_2020.loc[:, 'Hour'] = df_inv01_2020.Date_Hour_Quarter.apply(lambda x: int(str(x).split('_')[1]))
df_inv01_2020.loc[:, 'Quarter'] = df_inv01_2020.Date_Hour_Quarter.apply(lambda x: int(str(x).split('_')[2]))

df_inv01_2020 = df_inv01_2020.drop(['Date_Hour_Quarter'], axis=1)
df_inv01_2020 = df_inv01_2020.drop(['Inv01_Today_Energy_Start'], axis=1)
df_inv01_2020 = df_inv01_2020.drop(['Inv01_Today_Energy_End'], axis=1)


In [5]:
df_inv01_2020

Unnamed: 0,Inv01_Temp,Wms01_Irr,Wms01_Temp,Time_Past,Quarterly_Average_Energy,Hour,Quarter
0,36.981818,943.272727,36.909091,10.0,0.500000,12,3
1,41.857143,939.785714,35.785714,13.0,0.538462,12,4
2,45.118750,940.312500,35.012500,15.0,0.533333,13,1
3,47.273333,928.466667,35.206667,14.0,0.500000,13,2
4,48.640000,905.933333,35.460000,14.0,0.500000,13,3
...,...,...,...,...,...,...,...
12957,41.621429,68.714286,19.335714,13.0,0.538462,8,4
12958,42.287500,98.125000,19.406250,15.0,0.866667,9,1
12959,42.946667,166.066667,19.726667,14.0,1.500000,9,2
12960,43.553333,96.266667,19.846667,14.0,0.857143,9,3


Do a K-Folds Cross Validation using XGBoost and get an MAE and an RMSE for mean error and indication of variance

In [6]:
# Test a basic XGBoost Regression with KFolds Cross Validation
randomSeed = databasic.get_random_seed()
model = xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", n_estimators=10, seed=randomSeed)
modellingLog = ""   

targetColName = "Quarterly_Average_Energy"
col_names = df_inv01_2020.columns
feature_cols = col_names.drop([targetColName])
trainFeatures = df_inv01_2020[feature_cols]
trainTargets = df_inv01_2020[targetColName]


In [7]:

lstMae = []
lstRmse = []
kfolds = KFold(n_splits=5, random_state=randomSeed, shuffle=True)
for k, (train_index, test_index) in enumerate(kfolds.split(df_inv01_2020)):
    # x_train = trainFeatures.loc[train_index, ]
    # x_vali = trainFeatures.loc[test_index, ]

    # y_train = trainTargets.loc[train_index, ]
    # y_vali = trainTargets.loc[test_index, ]
    x_train = trainFeatures.loc[trainFeatures.index.intersection(train_index)]
    x_vali = trainFeatures.loc[trainFeatures.index.intersection(test_index)]
    
    y_train = trainTargets.loc[trainTargets.index.intersection(train_index)]
    y_vali = trainTargets.loc[trainTargets.index.intersection(test_index)]
        
    model.fit(x_train, y_train)
    y_pred = model.predict(x_vali)

    # Compute the mae
    mae = mean_absolute_error(y_pred, y_vali)
    lstMae.append(mae)

    # Compute the rmse
    rmse = np.sqrt(mean_squared_error(y_pred, y_vali))
    lstRmse.append(rmse)
    
    print("Fold {0} MAE: {1}, RMSE: {2}".format(str(k), str(mae), str(rmse)))

    dfPredicted = x_vali
    dfPredicted["Quarterly_Average_Energy"] = y_vali
    dfPredicted["Quarterly_Average_Energy_predicted"] = y_pred
    dfPredicted.to_csv("./predictions/" + thisFileName+"_KFold" + str(k) + ".csv", index=False)

print("Final Result")
print("----------")
print("Average Mean Absolute Error (MAE): " + str(np.mean(lstMae)))
print("Average Root Mean Squared Error (RMSE): " + str(np.mean(lstRmse)))


Fold 0 MAE: 0.3771401649670248, RMSE: 0.6973191848259866
Fold 1 MAE: 0.36168907006838935, RMSE: 0.6512801214549696
Fold 2 MAE: 0.3896097697329342, RMSE: 0.7179750745019388
Fold 3 MAE: 0.3555560222289694, RMSE: 0.6053411554515096
Fold 4 MAE: 0.3856128228835308, RMSE: 0.7171455542423462
Final Result
----------
Average Mean Absolute Error (MAE): 0.3739215699761697
Average Root Mean Squared Error (RMSE): 0.6778122180953502


Run 1:
- Average Mean Absolute Error (MAE): 0.3731403835952224
- Average Root Mean Squared Error (RMSE): 0.6835358129954484

Run 2:
- Average Mean Absolute Error (MAE): 0.37383787672946706
- Average Root Mean Squared Error (RMSE): 0.6793045464083918

Run 3:
- Average Mean Absolute Error (MAE): 0.3753467753441989
- Average Root Mean Squared Error (RMSE): 0.6822454913430949

In [8]:
realTimeEnergy = np.mean(df_inv01_2020["Quarterly_Average_Energy"])
avgMae = np.mean([ 0.3731403835952224, 0.37383787672946706, 0.3753467753441989 ])
avgRmse = np.mean([ 0.6835358129954484, 0.6793045464083918, 0.6822454913430949 ])

predictionAccuracy = 100 - np.round((avgMae / realTimeEnergy) * 100, 2)
percentAvgAccuracyError = np.round((avgRmse / realTimeEnergy) * 100, 2)

print("Predictions made to an accuracy of: " + str(predictionAccuracy) + "%")
print("Predictions Error: +/-" + str(percentAvgAccuracyError) + "%")

Predictions made to an accuracy of: 87.37%
Predictions Error: +/-23.02%
