### Model Prediction for Each Validation Fold

In [22]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor

In [23]:
# Read the dataset CSV (path is relative to the notebook's working directory)
dataset_path = '../data/ENB2012_data_with_columns.csv'
data = pd.read_csv(dataset_path)

In [24]:
# Treat +/-inf as missing values, then drop every row that contains a missing value
data = data.replace([np.inf, -np.inf], np.nan).dropna()

In [25]:
# Define input and output variables.
# The features are every column except the two targets. Dropping the targets
# by name (rather than slicing off the last two columns by position) keeps a
# target from leaking into X if the column order of the CSV ever differs.
TARGET_COLUMNS = ['Heating_Load', 'Cooling_Load']
X = data.drop(columns=TARGET_COLUMNS)
y_hl = data['Heating_Load']
y_cl = data['Cooling_Load']

In [26]:
# Define custom scoring functions
def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred.

    Computed as the square root of sklearn's mean_squared_error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are converted to float NumPy arrays before any arithmetic so
    that errors are paired by position. Without the conversion, two pandas
    Series with different indexes would be aligned by label, yielding NaN or
    mismatched pairs, and plain Python lists would not support subtraction.

    Parameters
    ----------
    y_true : array-like
        Observed target values. Each error is divided by its true value, so a
        zero entry makes the result inf or NaN.
    y_pred : array-like
        Predicted values, same length as y_true.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [27]:
# Metric label -> scorer handed to cross_validate. make_scorer is called with
# its defaults, so the error metrics are reported with their natural
# (positive) sign rather than negated.
_metric_functions = (
    ('MAE', mean_absolute_error),
    ('RMSE', rmse),
    ('MAPE', mape),
    ('R2', r2_score),
)
scoring = {label: make_scorer(metric) for label, metric in _metric_functions}

In [28]:
# Hyper-parameters selected by the grid search, one set per target
best_params_hl = dict(
    colsample_bytree=0.9,
    gamma=0,
    learning_rate=0.2,
    max_depth=5,
    min_child_weight=1,
    n_estimators=300,
    reg_alpha=0,
    reg_lambda=1,
    subsample=1,
)
best_params_cl = dict(
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.2,
    max_depth=5,
    min_child_weight=2,
    n_estimators=300,
    reg_alpha=0,
    reg_lambda=1,
    subsample=0.8,
)

In [29]:
# Build one XGBoost regressor per target from its tuned hyper-parameters
xgb_model_hl, xgb_model_cl = (
    XGBRegressor(**params) for params in (best_params_hl, best_params_cl)
)

In [30]:
# Settings shared by both cross-validation runs: 10 folds, all metrics in
# `scoring`, and scores on the training folds as well as the held-out fold.
# NOTE(review): an integer cv on a regressor uses KFold without shuffling; if
# the rows of the CSV are ordered, consider KFold(shuffle=True, random_state=...).
cv_options = dict(cv=10, scoring=scoring, return_train_score=True)

# Cross-validation for heating load
scores_hl = cross_validate(xgb_model_hl, X, y_hl, **cv_options)
# Cross-validation for cooling load
scores_cl = cross_validate(xgb_model_cl, X, y_cl, **cv_options)

In [31]:
# Collect the cross_validate results into DataFrames (one row per fold)
hl_scores_df, cl_scores_df = (
    pd.DataFrame(fold_scores) for fold_scores in (scores_hl, scores_cl)
)

In [32]:
# Write the per-fold score tables to CSV.
# The output directory is created first: to_csv does not create missing parent
# directories, so a fresh checkout without ../results/tables would fail here.
tables_dir = Path('../results/tables')
tables_dir.mkdir(parents=True, exist_ok=True)
hl_scores_df.to_csv(tables_dir / 'heating_load_scores.csv', index=False)
cl_scores_df.to_csv(tables_dir / 'cooling_load_scores.csv', index=False)

In [33]:
# Report the mean and standard deviation of every score column over the folds
hl_mean = hl_scores_df.mean()
hl_std = hl_scores_df.std()
print("Heating Load Cross-Validation Scores (Avg and Std):", hl_mean, hl_std, sep="\n")

Heating Load Cross-Validation Scores (Avg and Std):
fit_time      0.079162
score_time    0.003701
test_MAE      0.664526
train_MAE     0.053769
test_RMSE     0.881368
train_RMSE    0.077518
test_MAPE     4.534199
train_MAPE    0.271833
test_R2       0.968914
train_R2      0.999941
dtype: float64
fit_time      0.004218
score_time    0.000482
test_MAE      1.190295
train_MAE     0.002931
test_RMSE     1.501862
train_RMSE    0.003276
test_MAPE     9.715305
train_MAPE    0.015656
test_R2       0.092461
train_R2      0.000005
dtype: float64


In [34]:
# Same report for the cooling-load scores: column means first, then column stds
cl_mean = cl_scores_df.mean()
cl_std = cl_scores_df.std()
print("\nCooling Load Cross-Validation Scores (Avg and Std):", cl_mean, cl_std, sep="\n")


Cooling Load Cross-Validation Scores (Avg and Std):
fit_time      0.081013
score_time    0.003551
test_MAE      0.690026
train_MAE     0.103027
test_RMSE     0.893710
train_RMSE    0.145793
test_MAPE     3.166651
train_MAPE    0.454252
test_R2       0.985746
train_R2      0.999764
dtype: float64
fit_time      0.003542
score_time    0.000497
test_MAE      0.555177
train_MAE     0.004946
test_RMSE     0.660693
train_RMSE    0.006440
test_MAPE     3.038289
train_MAPE    0.019164
test_R2       0.026684
train_R2      0.000021
dtype: float64


In [35]:
# Save averages and standard deviations to a plain-text report.
# For each target the file holds a heading, the per-column means, then the
# per-column standard deviations, in the same layout as the printed output.
summary_path = Path('../results/tables/average_std_scores.txt')
# open() does not create missing parent directories, so make sure the output
# folder exists before writing (needed on a fresh checkout).
summary_path.parent.mkdir(parents=True, exist_ok=True)
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write("Heating Load Cross-Validation Scores (Avg and Std):\n")
    f.write(str(hl_scores_df.mean()) + "\n")
    f.write(str(hl_scores_df.std()) + "\n\n")
    f.write("Cooling Load Cross-Validation Scores (Avg and Std):\n")
    f.write(str(cl_scores_df.mean()) + "\n")
    f.write(str(cl_scores_df.std()) + "\n")