In [12]:
import pandas as pd
import numpy as np
import utils
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Define hyperparameters and other important variables

In [70]:
target_variable = "SpotPriceEUR"

timestamp_col = "HourDK"

target_sequence_length = 7*24

test_size = 0.20

#first_day_train = "2020-01-01"

first_day_test = "2021-01-01"

last_day_test = "2021-12-31"

hyperparameters = {
    "in_length" : 14*24,
    "step_size" : 12,
    "n_estimators" : 20,
    "max_depth" : 6,
    "subsample" : 0.5,
    "min_child_weight" : 1,
    "selected_features" : [target_variable]
    }

# Prepare data

First, we load the data and split it into train and test. It is important that the shuffle argument is False, because we need to maintain the original order of the data points in the data. 

In [71]:
spotprices = utils.load_data()

In [72]:
training_data = spotprices[spotprices.index < first_day_test]
#training_data = training_data[training_data.index < first_day]
training_data

Unnamed: 0_level_0,SpotPriceEUR
HourDK,Unnamed: 1_level_1
2017-01-01 00:00:00,20.959999
2017-01-01 01:00:00,20.900000
2017-01-01 02:00:00,18.129999
2017-01-01 03:00:00,16.030001
2017-01-01 04:00:00,16.430000
...,...
2020-12-31 19:00:00,59.470001
2020-12-31 20:00:00,56.700001
2020-12-31 21:00:00,52.439999
2020-12-31 22:00:00,51.860001


In [73]:
test_data = spotprices[spotprices.index >= first_day_test]
test_data = test_data[test_data.index <= last_day_test]
test_data

Unnamed: 0_level_0,SpotPriceEUR
HourDK,Unnamed: 1_level_1
2021-01-01 00:00:00,50.869999
2021-01-01 01:00:00,48.189999
2021-01-01 02:00:00,44.680000
2021-01-01 03:00:00,42.919998
2021-01-01 04:00:00,40.389999
...,...
2021-12-30 20:00:00,54.590000
2021-12-30 21:00:00,55.000000
2021-12-30 22:00:00,53.500000
2021-12-30 23:00:00,50.130001


In [74]:
# Create indices. Must be passed to function that creates (X,Y) pairs so that
# it knows where to slice the data
training_indices = utils.get_indices_entire_sequence(
    data=training_data, 
    window_size=hyperparameters["in_length"]+target_sequence_length, 
    step_size=hyperparameters["step_size"]
    )

# Obtain (X,Y) pairs of training data
x_train, y_train = utils.get_xgboost_x_y(
    indices=training_indices, 
    data=training_data[hyperparameters["selected_features"]].to_numpy(),
    univariate=True,
    target_sequence_length=target_sequence_length,
    input_seq_len=hyperparameters["in_length"],
    exo_feature_steps=0
    )


test_indices = utils.get_indices_entire_sequence(
    data=test_data, 
    window_size=hyperparameters["in_length"]+target_sequence_length, 
    step_size=24
    )

# Obtain (X,Y) pairs of test data
x_test, y_test = utils.get_xgboost_x_y(
    indices=test_indices, 
    data=test_data[hyperparameters["selected_features"]].to_numpy(),
    univariate=True,
    target_sequence_length=target_sequence_length,
    input_seq_len=hyperparameters["in_length"],
    exo_feature_steps=0
    )

Preparing data..
Finished preparing data!
Preparing data..
Finished preparing data!


In [75]:
# Initialize model
model = xgb.XGBRegressor(
    n_estimators=hyperparameters["n_estimators"],
    max_depth=hyperparameters["max_depth"],
    subsample=hyperparameters["subsample"],
    min_child_weight=hyperparameters["min_child_weight"],
    objective="reg:squarederror",
    tree_method="hist"
    )

trained_model = MultiOutputRegressor(model).fit(x_train, y_train)

train_forecasts = trained_model.predict(x_train)

#train_mape = mean_absolute_percentage_error(y_train, train_forecasts)
#print("Overall training MAPE: {}".format(train_mape))



In [76]:
y_train

array([[29.02    , 28.540001, 27.33    , ..., 28.969999, 28.540001,
        28.129999],
       [31.950001, 31.700001, 31.5     , ..., 29.290001, 29.6     ,
        29.59    ],
       [31.01    , 30.93    , 30.9     , ..., 29.530001, 29.040001,
        28.42    ],
       ...,
       [56.      , 54.509998, 49.970001, ..., 52.700001, 52.169998,
        50.700001],
       [13.13    , 12.53    , 10.87    , ..., 45.759998, 23.129999,
        20.940001],
       [21.469999, 19.700001, 19.120001, ..., 47.5     , 43.23    ,
        43.57    ]])

In [77]:
train_forecasts

array([[27.96498 , 27.565006, 26.00282 , ..., 27.339048, 31.014221,
        29.710768],
       [31.397942, 30.369198, 32.53094 , ..., 33.148544, 32.745968,
        36.782005],
       [30.000326, 29.948568, 28.50131 , ..., 31.4451  , 30.059782,
        26.93521 ],
       ...,
       [54.327297, 53.326283, 51.761158, ..., 46.73434 , 50.904613,
        50.121037],
       [16.70731 , 11.648505, 11.531536, ..., 42.951977, 26.6587  ,
        22.669006],
       [19.554478, 18.126287, 20.470968, ..., 52.333134, 48.172523,
        46.666718]], dtype=float32)

In [78]:
from sklearn.metrics import mean_absolute_error
train_mae = mean_absolute_error(y_train, train_forecasts)
print(train_mae)
print("Mean train value: {}".format(np.mean(y_train)))

5.274390050061007
Mean train value: 36.63538420075645


In [79]:
test_forecasts = trained_model.predict(x_test)
test_mae = mean_absolute_error(y_test, test_forecasts)
print(test_mae)
print("Mean train value: {}".format(np.mean(y_test)))

51.70997033130468
Mean train value: 89.48914604419295
