In [1]:
#Notebook used to prototype different xgboost models

from xgboost import XGBRegressor
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import seaborn
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures


from skopt.space import Real, Integer
from skopt import BayesSearchCV
import requests
import json
from entsoe import EntsoePandasClient

from skforecast.ForecasterBaseline import ForecasterEquivalentDate
from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.model_selection import bayesian_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import check_y

from astral.sun import sun
from astral import LocationInfo
import data

from sklearn.linear_model import LinearRegression,Ridge
import xgbmodel as xgb
import pickle
from datetime import timedelta,datetime


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load and preprocess the energy dataset used throughout the notebook.
# Alternative (2015 - 2018) dataset: "../datasets/energy_dataset.csv"

df2 = data.load_data(dataset = "../datasets/energy_updated.csv")
df2 = data.preprocessing(df2)
# Rename the load column to the target name used by the rest of the notebook.
# Assignment instead of inplace=True keeps the cell chainable.
df2 = df2.rename(columns={"Actual Load": "total load actual"})
# head must be *called*; a bare `df2.head` only displays the bound-method repr.
df2.head()

In [None]:
# Split the data into train/test features and targets around the two cut-off dates.
# Split dates for the other (2015 - 2018) dataset: "2017-06-30 23:00:00",'2018-03-31 23:00:00'

trainx,testx,trainy,testy, end_validation = data.split_data(df2,"total load actual","2023-03-01 23:00:00",'2023-06-30 23:00:00')

# head must be *called*; a bare `testx.head` only displays the bound-method repr.
testx.head()

In [42]:
# Fourier encoding for calendar features
def fourier_features(feature,cycle_length,order):
    """Encode a cyclical feature as sine/cosine harmonics.

    Parameters
    ----------
    feature : named array-like (e.g. a pandas Series)
        Values to encode; its ``name`` is used in the output column names.
    cycle_length : number
        Length of one full cycle of ``feature`` (the value that maps to 2*pi).
    order : int
        Number of harmonics to generate (1..order).

    Returns
    -------
    pandas.DataFrame
        Columns ``sin_<name>_<i>`` and ``cos_<name>_<i>`` for each harmonic i,
        in that order. Empty when ``order`` is below 1.
    """
    # Angle of the first harmonic; higher harmonics are integer multiples of it.
    base_angle = 2 * np.pi * feature/cycle_length

    harmonics = {}
    for n in range(1, order+1):
        harmonics[f"sin_{feature.name}_{n}"] = np.sin(n*base_angle)
        harmonics[f"cos_{feature.name}_{n}"] = np.cos(n*base_angle)
    return pd.DataFrame(harmonics)

In [None]:
# Load the weather features and keep only the Madrid observations.
weather_df = pd.read_csv("../datasets/weather_features.csv")
weather_df = weather_df.loc[weather_df["city_name"] == "Madrid"]

# Keep a single row per timestamp (dt_iso).
weather_df = weather_df.drop_duplicates(subset="dt_iso")

# Sanity check that the de-duplication worked. The previous bare expression
# `dups.loc[dups == False]` was neither displayed nor asserted, so it verified nothing.
dups = weather_df["dt_iso"].duplicated()
assert not dups.any(), "duplicate dt_iso timestamps remain after drop_duplicates"

# head must be *called*; a bare `weather_df.head` only displays the bound-method repr.
weather_df.head()

In [None]:
# Rolling-window temperature features: mean, max and min over the previous
# 24 and 24*7 observations. closed='left' excludes the current row from each
# window, so a feature never contains the value it sits next to.
temp_features = weather_df["temp"].copy()
temp_features = temp_features.to_frame()

# Build each rolling window once and reuse it for all three aggregates.
roll_1_day = temp_features['temp'].rolling(24, closed='left')
roll_7_day = temp_features['temp'].rolling(24*7, closed='left')

temp_features['temp_roll_mean_1_day'] = roll_1_day.mean()
temp_features['temp_roll_mean_7_day'] = roll_7_day.mean()
temp_features['temp_roll_max_1_day'] = roll_1_day.max()
temp_features['temp_roll_min_1_day'] = roll_1_day.min()
temp_features['temp_roll_max_7_day'] = roll_7_day.max()
temp_features['temp_roll_min_7_day'] = roll_7_day.min()

# NOTE(review): this overwrites the index positionally. It raises if the row
# counts differ and silently relabels rows if the weather data does not cover
# the same timestamps as df2 — confirm both datasets span the same period.
temp_features.index = df2.index

# head must be *called*; a bare `temp_features.head` only displays the bound-method repr.
temp_features.head()



In [None]:
# Location used for the sunrise/sunset calculation.
# The coordinates must be given explicitly: when latitude/longitude are omitted,
# astral's LocationInfo uses its built-in default coordinates, so the sunlight
# features would not describe Spain. Madrid is used as the reference point
# (assumes the load data is Spanish, as the 'Spain' region and the Madrid
# weather filter suggest — TODO confirm).
location = LocationInfo(
    name='Madrid',
    region='Spain',
    timezone='CET',
    latitude=40.4168,
    longitude=-3.7038)

# Calendar features derived from the DatetimeIndex (week_day and hour_day are 1-based).
calendar_features = pd.DataFrame(index=df2.index)
calendar_features['month'] = calendar_features.index.month
calendar_features['week_of_year'] = calendar_features.index.isocalendar().week
calendar_features['week_day'] = calendar_features.index.day_of_week + 1
calendar_features['hour_day'] = calendar_features.index.hour + 1

# One astral call per timestamp; sunrise and sunset are both read from the same result
# instead of evaluating sun() twice for every row.
sun_times = [
    sun(location.observer, date=date, tzinfo=location.timezone)
    for date in df2.index
]
sunrise_hour = [times['sunrise'].hour for times in sun_times]
sunset_hour = [times['sunset'].hour for times in sun_times]

sun_light_features = pd.DataFrame({
                         'sunrise_hour': sunrise_hour,
                         'sunset_hour': sunset_hour}, 
                         index = df2.index
                     )
sun_light_features['daylight_hours'] = (
    sun_light_features['sunset_hour'] - sun_light_features['sunrise_hour']
)
# 1 while the timestamp's hour lies in [sunrise_hour, sunset_hour), else 0.
# NOTE(review): df2.index.hour is taken in the index's own timezone while the
# sunrise/sunset hours are in location.timezone — confirm they match.
sun_light_features['is_daylight'] = np.where(
                                        (df2.index.hour >= sun_light_features['sunrise_hour']) & \
                                        (df2.index.hour < sun_light_features['sunset_hour']),
                                        1,
                                        0
                                    )
exo_features = pd.concat([
                            calendar_features,
                            sun_light_features,
                         
                        ], axis=1)

# First-order Fourier (sin/cos) encoding of each cyclical calendar feature.
month_encoded = fourier_features(exo_features["month"], 12,1)
week_of_year_encoded = fourier_features(exo_features['week_of_year'], 52,1)
week_day_encoded = fourier_features(exo_features['week_day'], 7,1)
hour_day_encoded = fourier_features(exo_features['hour_day'], 24,1)
cyclical_features = pd.concat([
                        month_encoded,
                        week_of_year_encoded,
                        week_day_encoded,
                        hour_day_encoded,
                    ], axis=1)

#cyclical_features = pd.concat([cyclical_features,temp_features], axis = 1)
exo_features = pd.concat([exo_features, cyclical_features], axis=1)
# head must be *called*; a bare `exo_features.head` only displays the bound-method repr.
exo_features.head()

In [None]:
# Pairwise interaction terms (degree 2, products only, no bias column),
# returned as a pandas DataFrame.
transformer_poly = PolynomialFeatures(
                       degree           = 2,
                       interaction_only = True,
                       include_bias     = False,
                   ).set_output(transform="pandas")

# Columns currently left out of the interaction set (previously kept in a bare
# string literal, which is an evaluated expression rather than a comment):
#     'sin_sunrise_hour_1',
#     'cos_sunrise_hour_1',
#     'sin_sunset_hour_1',
#     'cos_sunset_hour_1',
poly_cols = [
    'sin_month_1', 
    'cos_month_1',
    'sin_week_of_year_1',
    'cos_week_of_year_1',
    'sin_week_day_1', 
    'cos_week_day_1',
    'sin_hour_day_1',
    'cos_hour_day_1',
    'daylight_hours',
    'is_daylight',
    #'temp_roll_mean_1_day',
    #'temp_roll_mean_7_day',
    #'temp_roll_max_1_day',
    #'temp_roll_min_1_day',
    #'temp_roll_max_7_day',
    #'temp_roll_min_7_day',
    #'temp'

]

poly_features = transformer_poly.fit_transform(exo_features[poly_cols].dropna())
# Keep only the generated interaction columns; the inputs are already in exo_features.
poly_features = poly_features.drop(columns=poly_cols)
# Prefix the generated names and replace the spaces they contain.
poly_features.columns = [f"poly_{col}" for col in poly_features.columns]
poly_features.columns = poly_features.columns.str.replace(" ", "__")

# Drop interaction columns left over from a previous run of this cell so that
# re-running it does not append duplicate poly_* columns.
exo_features = pd.concat(
    [exo_features.drop(columns=poly_features.columns, errors="ignore"), poly_features],
    axis=1,
)
# head must be *called*; a bare `exo_features.head` only displays the bound-method repr.
exo_features.head()


In [None]:

# Exogenous columns handed to the forecaster: every Fourier-encoded (sin_/cos_)
# column followed by every temp_* column found in exo_features.
# (To use all exogenous columns instead: exo_features.columns.tolist())
features = [
    *exo_features.filter(regex='^sin_|^cos_').columns,
    *exo_features.filter(regex='^temp_.*').columns,
]

# Toggle: keep this filter to exclude temperature features, remove it to include them.
features = [name for name in features if "temp" not in name]

# Attach the exogenous features to the target column on the shared index.
df2 = pd.merge(
    df2[["total load actual"]],
    exo_features,
    how='left',
    left_index=True,
    right_index=True,
)

# NOTE(review): the result of dropna() is only displayed, not assigned, so df2
# keeps any rows containing NaN — confirm that this is intended.
df2.dropna()

In [49]:
# skforecast baseline: autoregressive XGBoost forecaster on the last 168 lags,
# fitted on the training target only (no exogenous variables).
forecast = ForecasterAutoreg(
    regressor=XGBRegressor(random_state=1543),
    lags=168,
)
forecast.fit(y=trainy)


In [None]:

# Backtest the baseline forecaster in 24-step folds over everything after the
# training window, without refitting between folds.
metric, predictions = backtesting_forecaster(
    forecaster=forecast,
    y=df2["total load actual"],
    initial_train_size=len(trainy),
    steps=24,
    refit=False,
    metric='mean_absolute_error',
    n_jobs='auto',
    verbose=True,
    show_progress=True,
)

print(f'Backtest error (MAE): {metric}')

In [None]:
# Hyperparameter search for the forecaster without exogenous variables.

# Lag configurations to try: the last 48 lags, the last 72 lags, or an explicit lag list.
lags_grid = [48, 72, [1, 2, 3, 23, 24, 25, 167, 168, 169]]  

def search_space(trial):
    """Return the regressor hyperparameters sampled for one search trial.

    Parameters
    ----------
    trial : object exposing ``suggest_int`` / ``suggest_float``
        Presumably an Optuna trial supplied by the search — TODO confirm.

    Returns
    -------
    dict
        Sampled values for n_estimators, max_depth, learning_rate,
        reg_alpha and reg_lambda.
    """
    params = {
        'n_estimators'  : trial.suggest_int('n_estimators', 400, 1200, step=100),
        'max_depth'     : trial.suggest_int('max_depth', 3, 10, step=1),
        'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.5),
        'reg_alpha'     : trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        'reg_lambda'    : trial.suggest_float('reg_lambda', 0, 1, step=0.1),
    } 
    return params

# NOTE(review): search space built from skopt Real/Integer with "reg__" prefixed
# names; it is not passed to the search below.
search_params = {
    "reg__max_depth": Integer(2,8),
    "reg__learning_rate": Real(0.001,0.1,prior="log-uniform"),
    "reg__subsample": Real(0.5,1.0),
    "reg__reg_alpha": Real(0.0,10.0),
    "reg__reg_lambda": Real(0.0,10.0),
    "reg__gamma": Real(0.0,10.0)

}

# Tune on data up to end_validation only. Searching on the full series (with
# return_best=True refitting on everything passed as y) would let the hold-out
# period influence both the chosen hyperparameters and the final fit. This
# matches the slicing used by the search with exogenous features.
results_search, frozen_trial = bayesian_search_forecaster(
                                   forecaster         = forecast,
                                   y                  = df2.loc[:end_validation, "total load actual"],
                                   search_space       = search_space,
                                   lags_grid          = lags_grid,
                                   steps              = 24,
                                   refit              = False,
                                   metric             = 'mean_absolute_error',
                                   initial_train_size = len(trainy),
                                   fixed_train_size   = False,
                                   n_trials           = 10, 
                                   random_state       = 123,
                                   return_best        = True,
                                   n_jobs             = 'auto',
                                   verbose            = False,
                                   show_progress      = True
                               )

In [52]:
# Number of rows in df2 up to and including the training cut-off.
df2.loc[:"2023-03-01 23:00:00"].shape[0]

10201

In [None]:
# Model trained using exogenous features

forecast = ForecasterAutoreg(
    regressor=XGBRegressor(random_state=1543),
    lags=169,
)
forecast.fit(y=trainy)

# Lags grid: a single explicit lag configuration
lags_grid = [[1, 2, 3, 23, 24, 25, 167, 168, 169]]

# Regressor hyperparameter search space
def search_space(trial):
    """Sample one set of regressor hyperparameters from ``trial``.

    ``trial`` must expose ``suggest_int`` and ``suggest_float``; the returned
    dict maps each hyperparameter name to its sampled value.
    """
    sampled = {
        'n_estimators' : trial.suggest_int('n_estimators', 800, 1400, step=100),
        'max_depth'    : trial.suggest_int('max_depth', 3, 8, step=1),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
        'reg_alpha'    : trial.suggest_float('reg_alpha', 0, 1, step=0.1),
        'reg_lambda'   : trial.suggest_float('reg_lambda', 0, 1, step=0.1),
    }
    return sampled

# Search on data up to end_validation; folds start after the training cut-off.
results_search, frozen_trial = bayesian_search_forecaster(
    forecaster=forecast,
    y=df2.loc[:end_validation, "total load actual"],
    exog=df2.loc[:end_validation, features],
    search_space=search_space,
    lags_grid=lags_grid,
    steps=36,
    refit=False,
    metric='mean_absolute_error',
    initial_train_size=len(df2.loc[: "2023-03-01 23:00:00"]),
    fixed_train_size=False,
    n_trials=20,
    random_state=123,
    return_best=True,
    n_jobs='auto',
    verbose=False,
    show_progress=True,
)

In [None]:
# Backtest the forecaster with exogenous features on everything after
# end_validation, in 7-step folds and without refitting between folds.
metric, predictions = backtesting_forecaster(
    forecaster=forecast,
    y=df2["total load actual"],
    exog=df2[features],
    initial_train_size=len(df2.loc[: end_validation]),
    steps=7,
    refit=False,
    metric='mean_absolute_error',
    n_jobs='auto',
    verbose=True,
    show_progress=True,
)

print(f"Backtest error: {metric:.2f}")

In [None]:
# Forecast the first 24 steps after the fitted period and plot them against the
# observed load.
# The window is derived from end_validation instead of hard-coded 2018
# timestamps, which belong to the older dataset and do not match the 2023 split.
# assumes `forecast` was last fitted on df2.loc[:end_validation]
# (return_best=True in the search above) — TODO confirm
horizon = 24
n_fitted = len(df2.loc[:end_validation])
holdout = df2.iloc[n_fitted:n_fitted + horizon]

predicted2 = forecast.predict(steps=horizon, exog=holdout[features])

# Observed load and prediction side by side on the shared index.
data2 = pd.concat([holdout["total load actual"], predicted2], axis=1)

fig,ax = plt.subplots(figsize=[20,5])
seaborn.lineplot(ax = ax, data = data2)
ax.set(title="Forecast vs. actual load", xlabel="Time", ylabel="Load")
plt.show()


In [None]:
# Creates an XGBoost model without temperature features using the xgboost_model class

# Same relative path as the other dataset loads in this notebook, so the cell
# resolves from the same working directory.
df2 = data.load_data(dataset = "../datasets/energy_updated.csv")
df2 = data.preprocessing(df2)
# Same target rename as the first load cell; the model below is asked for
# "total load actual". A no-op if the column already has that name.
df2 = df2.rename(columns={"Actual Load": "total load actual"})

# Calendar and daylight columns passed to model training.
poly_cols =     ['sin_month_1', 
        'cos_month_1',
        'sin_week_of_year_1',
        'cos_week_of_year_1',
        'sin_week_day_1',
        'cos_week_day_1',
        'sin_hour_day_1',
        'cos_hour_day_1',
        'daylight_hours',
        'is_daylight']

# Target column plus the two cut-off dates used for the split elsewhere in the notebook.
model2 = xgb.xgboost_model(df2,"total load actual","2023-03-01 23:00:00",'2023-06-30 23:00:00')
 
model2.train_models(poly_cols)

model2.backtesting()

model2.save_model("xgboost_v2_no_temp.joblib")
