In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os
from xgboost import XGBRegressor
from mlforecast import MLForecast
from window_ops.rolling import rolling_mean, rolling_max, rolling_min

In [3]:
data = pd.read_csv('../datasets/ETT-small/ETTh1.csv', parse_dates=['date'])
#delete index column and create id column for each time series
data['unique_id'] = data.index
# transform date column into datetime format
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')
data.drop(['HUFL', 'HULL', 'MUFL', 'MULL', 'LUFL', 'LULL'], axis=1, inplace=True)
data

Unnamed: 0,date,OT,unique_id
0,2016-07-01 00:00:00,30.531000,0
1,2016-07-01 01:00:00,27.787001,1
2,2016-07-01 02:00:00,27.787001,2
3,2016-07-01 03:00:00,25.044001,3
4,2016-07-01 04:00:00,21.948000,4
...,...,...,...
17415,2018-06-26 15:00:00,10.904000,17415
17416,2018-06-26 16:00:00,11.044000,17416
17417,2018-06-26 17:00:00,10.271000,17417
17418,2018-06-26 18:00:00,9.778000,17418


In [4]:
train = data[:8640]
valid = data[8304:11520]
test = data[11184:14400]

number_of_days_in_valid = valid["date"].nunique()

In [5]:
train

Unnamed: 0,date,OT,unique_id
0,2016-07-01 00:00:00,30.531000,0
1,2016-07-01 01:00:00,27.787001,1
2,2016-07-01 02:00:00,27.787001,2
3,2016-07-01 03:00:00,25.044001,3
4,2016-07-01 04:00:00,21.948000,4
...,...,...,...
8635,2017-06-25 19:00:00,21.174000,8635
8636,2017-06-25 20:00:00,20.612000,8636
8637,2017-06-25 21:00:00,21.034000,8637
8638,2017-06-25 22:00:00,21.455999,8638


In [6]:
valid

Unnamed: 0,date,OT,unique_id
8304,2017-06-12 00:00:00,15.336,8304
8305,2017-06-12 01:00:00,15.054,8305
8306,2017-06-12 02:00:00,14.280,8306
8307,2017-06-12 03:00:00,13.858,8307
8308,2017-06-12 04:00:00,14.210,8308
...,...,...,...
11515,2017-10-23 19:00:00,10.271,11515
11516,2017-10-23 20:00:00,9.708,11516
11517,2017-10-23 21:00:00,8.723,11517
11518,2017-10-23 22:00:00,8.864,11518


In [7]:
print(train.columns)
print(valid.columns)
print(test.columns)

Index(['date', 'OT', 'unique_id'], dtype='object')
Index(['date', 'OT', 'unique_id'], dtype='object')
Index(['date', 'OT', 'unique_id'], dtype='object')


# Training

In [8]:
models = [XGBRegressor(objective="reg:squarederror", random_state=0, n_estimators=100)]

model = MLForecast(models=models,
                   freq='H',
                   lags=[1],
                   #lag_transforms={
                   #    1: [(rolling_mean, 7), (rolling_max, 7), (rolling_min, 7)],
                   #},
                   date_features=['hour', 'dayofweek', 'month'],
                   num_threads=6)


model.fit(train, id_col='unique_id', time_col='date', target_col='OT', static_features=[])



MLForecast(models=[XGBRegressor], freq=<Hour>, lag_features=['lag1'], date_features=['hour', 'dayofweek', 'month'], num_threads=6)

In [9]:
# Predict
predictions = model.predict(horizon=number_of_days_in_valid)

In [11]:
predictions

Unnamed: 0,unique_id,date,XGBRegressor
0,0,2016-07-01 01:00:00,0.0
1,0,2016-07-01 02:00:00,0.0
2,0,2016-07-01 03:00:00,0.0
3,0,2016-07-01 04:00:00,0.0
4,0,2016-07-01 05:00:00,0.0
...,...,...,...
27786235,8639,2017-11-06 19:00:00,0.0
27786236,8639,2017-11-06 20:00:00,0.0
27786237,8639,2017-11-06 21:00:00,0.0
27786238,8639,2017-11-06 22:00:00,0.0


In [64]:
# Merge predictions with actuals for validation (if desired)
merged = predictions.merge(valid[['unique_id', 'date', 'OT']], on=['unique_id', 'date'], how='left')
merged.rename(columns={'OT_x': 'Prediction', 'OT_y': 'Actual'}, inplace=True)

print(merged)

          unique_id                date  XGBRegressor  OT
0                 0 2016-07-01 01:00:00           0.0 NaN
1                 0 2016-07-01 02:00:00           0.0 NaN
2                 0 2016-07-01 03:00:00           0.0 NaN
3                 0 2016-07-01 04:00:00           0.0 NaN
4                 0 2016-07-01 05:00:00           0.0 NaN
...             ...                 ...           ...  ..
27786235       8639 2017-11-06 19:00:00           0.0 NaN
27786236       8639 2017-11-06 20:00:00           0.0 NaN
27786237       8639 2017-11-06 21:00:00           0.0 NaN
27786238       8639 2017-11-06 22:00:00           0.0 NaN
27786239       8639 2017-11-06 23:00:00           0.0 NaN

[27786240 rows x 4 columns]


In [65]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(merged['y'], merged['XGBRegressor'])

KeyError: 'y'