In [15]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from xgboost import XGBRegressor
from mlforecast import MLForecast
from window_ops.rolling import rolling_mean, rolling_max, rolling_min

In [61]:
data = pd.read_csv('../datasets/ETT-small/ETTh1.csv', parse_dates=['date'])
#delete index column and create id column for each time series
data['unique_id'] = data.index
# transform date column into datetime format
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
data

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT,unique_id
0,2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.340,30.531000,0
1,2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001,1
2,2016-07-01 02:00:00,5.157,1.741,1.279,0.355,3.777,1.218,27.787001,2
3,2016-07-01 03:00:00,5.090,1.942,1.279,0.391,3.807,1.279,25.044001,3
4,2016-07-01 04:00:00,5.358,1.942,1.492,0.462,3.868,1.279,21.948000,4
...,...,...,...,...,...,...,...,...,...
17415,2018-06-26 15:00:00,-1.674,3.550,-5.615,2.132,3.472,1.523,10.904000,17415
17416,2018-06-26 16:00:00,-5.492,4.287,-9.132,2.274,3.533,1.675,11.044000,17416
17417,2018-06-26 17:00:00,2.813,3.818,-0.817,2.097,3.716,1.523,10.271000,17417
17418,2018-06-26 18:00:00,9.243,3.818,5.472,2.097,3.655,1.432,9.778000,17418


In [31]:
train = data[:8640]
valid = data[8304:11520]
test = data[11184:14400]

number_of_days_in_valid = valid["date"].nunique()

In [68]:
train

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT,unique_id
0,2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.340,30.531000,0
1,2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001,1
2,2016-07-01 02:00:00,5.157,1.741,1.279,0.355,3.777,1.218,27.787001,2
3,2016-07-01 03:00:00,5.090,1.942,1.279,0.391,3.807,1.279,25.044001,3
4,2016-07-01 04:00:00,5.358,1.942,1.492,0.462,3.868,1.279,21.948000,4
...,...,...,...,...,...,...,...,...,...
8635,2017-06-25 19:00:00,7.770,0.201,4.762,0.675,2.985,-0.640,21.174000,8635
8636,2017-06-25 20:00:00,9.042,0.335,5.650,0.675,3.381,-0.609,20.612000,8636
8637,2017-06-25 21:00:00,8.372,0.536,5.472,0.391,3.290,-0.670,21.034000,8637
8638,2017-06-25 22:00:00,8.975,0.737,6.254,1.031,3.107,-0.579,21.455999,8638


In [69]:
valid

Unnamed: 0,date,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT,unique_id
8304,2017-06-12 00:00:00,10.047,2.210,7.960,2.452,1.919,-0.731,15.336,8304
8305,2017-06-12 01:00:00,9.042,1.808,6.681,1.564,1.828,-0.731,15.054,8305
8306,2017-06-12 02:00:00,9.310,1.875,7.356,2.203,1.736,-0.761,14.280,8306
8307,2017-06-12 03:00:00,8.105,0.938,6.219,1.777,1.949,-0.670,13.858,8307
8308,2017-06-12 04:00:00,8.439,1.474,5.757,1.741,1.919,-0.609,14.210,8308
...,...,...,...,...,...,...,...,...,...
11515,2017-10-23 19:00:00,8.707,1.005,4.797,-0.604,3.686,1.279,10.271,11515
11516,2017-10-23 20:00:00,8.105,0.938,4.371,-0.569,3.533,1.279,9.708,11516
11517,2017-10-23 21:00:00,7.167,1.206,4.087,-0.462,3.107,1.432,8.723,11517
11518,2017-10-23 22:00:00,7.100,1.340,4.015,-0.320,2.772,1.310,8.864,11518


In [55]:
print(train.columns)
print(valid.columns)
print(test.columns)

Index(['date', 'HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT',
       'unique_id'],
      dtype='object')
Index(['date', 'HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT',
       'unique_id'],
      dtype='object')
Index(['date', 'HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL', 'OT',
       'unique_id'],
      dtype='object')


# Training

In [62]:
models = [XGBRegressor(random_state=0, n_estimators=100)]

model = MLForecast(models=models,
                   freq='H',
                   lags=[1],
                   #lag_transforms={
                   #    1: [(rolling_mean, 7), (rolling_max, 7), (rolling_min, 7)],
                   #},
                   date_features=['hour', 'dayofweek', 'month'],
                   num_threads=6)


model.fit(train, id_col='unique_id', time_col='date', target_col='OT', static_features=['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL'])

MLForecast(models=[XGBRegressor], freq=<Hour>, lag_features=['lag1'], date_features=['hour', 'dayofweek', 'month'], num_threads=6)

In [63]:
# Predict
predictions = model.predict(horizon=number_of_days_in_valid)

In [67]:
predictions[8304:11520]

Unnamed: 0,unique_id,date,XGBRegressor
8304,2,2016-09-17 03:00:00,0.0
8305,2,2016-09-17 04:00:00,0.0
8306,2,2016-09-17 05:00:00,0.0
8307,2,2016-09-17 06:00:00,0.0
8308,2,2016-09-17 07:00:00,0.0
...,...,...,...
11515,3,2016-09-16 23:00:00,0.0
11516,3,2016-09-17 00:00:00,0.0
11517,3,2016-09-17 01:00:00,0.0
11518,3,2016-09-17 02:00:00,0.0


In [64]:
# Merge predictions with actuals for validation (if desired)
merged = predictions.merge(valid[['unique_id', 'date', 'OT']], on=['unique_id', 'date'], how='left')
merged.rename(columns={'OT_x': 'Prediction', 'OT_y': 'Actual'}, inplace=True)

print(merged)

          unique_id                date  XGBRegressor  OT
0                 0 2016-07-01 01:00:00           0.0 NaN
1                 0 2016-07-01 02:00:00           0.0 NaN
2                 0 2016-07-01 03:00:00           0.0 NaN
3                 0 2016-07-01 04:00:00           0.0 NaN
4                 0 2016-07-01 05:00:00           0.0 NaN
...             ...                 ...           ...  ..
27786235       8639 2017-11-06 19:00:00           0.0 NaN
27786236       8639 2017-11-06 20:00:00           0.0 NaN
27786237       8639 2017-11-06 21:00:00           0.0 NaN
27786238       8639 2017-11-06 22:00:00           0.0 NaN
27786239       8639 2017-11-06 23:00:00           0.0 NaN

[27786240 rows x 4 columns]


In [65]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(merged['y'], merged['XGBRegressor'])

KeyError: 'y'