In [29]:
import pandas as pd
import numpy as np
from datetime import timedelta
from xgboost import XGBRegressor

# Load the pre-processed hourly series from parquet. Paths are relative to the
# kernel's working directory. 'tuulivoima' is Finnish for wind power, hence `wind`.
#helen = pd.read_parquet('processed_data/helen.parquet')
wind, consumption, dayahead = map(
    pd.read_parquet,
    (
        'processed_data/tuulivoima.parquet',
        'processed_data/consumption.parquet',
        'processed_data/dayahead.parquet',
    ),
)

In [30]:
# Merge the data: index-aligned joins onto the wind frame (DataFrame.join
# defaults to a left join), then drop every row that has a missing value in
# any of the joined columns.
df = (
    wind
    .join(consumption)
    .join(dayahead)
    .dropna()
)
df.head()

Unnamed: 0_level_0,Wind_MWh,Consumption_MWh,price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 00:00:00,1427.0,10152.0,10.07
2019-01-01 01:00:00,1352.0,9920.0,10.03
2019-01-01 02:00:00,1177.0,9845.0,4.56
2019-01-01 03:00:00,1001.0,9913.0,4.83
2019-01-01 04:00:00,826.0,10027.0,8.09


### Create time series features

In [31]:
def create_features(df):
    """
    Derive calendar features from the frame's datetime index.

    The input is left untouched; a new frame is returned that carries the
    original columns followed by: hour, dayofweek, quarter, month, year,
    dayofyear, dayofmonth and weekofyear (ISO week number).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes datetime accessors (e.g. a DatetimeIndex).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the calendar columns added.
    """
    stamps = df.index
    # assign() works on a copy, so the caller's frame is never modified.
    return df.assign(
        hour=stamps.hour,
        dayofweek=stamps.dayofweek,
        quarter=stamps.quarter,
        month=stamps.month,
        year=stamps.year,
        dayofyear=stamps.dayofyear,
        dayofmonth=stamps.day,
        weekofyear=stamps.isocalendar().week,
    )

### Create lag features

In [32]:
def add_lags(df):
    """
    Add lagged copies of consumption and wind production as new columns.

    For each of ``Consumption_MWh`` (prefix ``c``) and ``Wind_MWh`` (prefix
    ``w``), three columns ``<prefix>_lag1..3`` are added holding the value
    observed 364, 728 and 1092 days earlier. The offsets are multiples of
    7 days, so a lagged value falls on the same weekday as the row it is
    attached to. Rows whose lagged timestamp is not present in the index
    get NaN.

    The input frame is not modified; a copy with the extra columns is
    returned (same convention as ``create_features``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index and the columns ``Consumption_MWh``
        and ``Wind_MWh``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with columns c_lag1, c_lag2, c_lag3, w_lag1, w_lag2,
        w_lag3 appended in that order.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    df = df.copy()
    lag_days = (364, 728, 1092)
    for prefix, column in (('c', 'Consumption_MWh'), ('w', 'Wind_MWh')):
        # timestamp -> value lookup; timestamps missing from it map to NaN.
        lookup = df[column].to_dict()
        for number, days in enumerate(lag_days, start=1):
            shifted = df.index - pd.Timedelta(days=days)
            df[f'{prefix}_lag{number}'] = shifted.map(lookup)
    return df

### Train XGBoost model

In [35]:
# Enrich the merged frame with calendar columns first, then the lag columns.
df = add_lags(create_features(df))

TARGET = 'price'
# Note: create_features also produces 'dayofmonth' and 'weekofyear', which are
# not part of the model inputs below.
FEATURES = [
    'Wind_MWh', 'Consumption_MWh',
    'dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year',
    'c_lag1', 'c_lag2', 'c_lag3',
    'w_lag1', 'w_lag2', 'w_lag3',
]

# The model is fitted on every available row; there is no hold-out split here.
X_all, y_all = df[FEATURES], df[TARGET]

reg = XGBRegressor(n_estimators=1000)
# eval_set is the very same data used for fitting, so the logged
# validation_0-rmse is an in-sample (training) error, not a held-out estimate.
# verbose=100 prints the metric every 100 boosting rounds.
reg.fit(
    X_all,
    y_all,
    eval_set=[(X_all, y_all)],
    verbose=100,
)

[0]	validation_0-rmse:76.39777
[100]	validation_0-rmse:12.63755
[200]	validation_0-rmse:8.99849
[300]	validation_0-rmse:7.00982
[400]	validation_0-rmse:5.78731
[500]	validation_0-rmse:4.92792
[600]	validation_0-rmse:4.23242
[700]	validation_0-rmse:3.71362
[800]	validation_0-rmse:3.25650
[900]	validation_0-rmse:2.87331
[999]	validation_0-rmse:2.61149


### Save the model for further use

In [36]:
# Write the fitted regressor to disk. The path is relative to the kernel's
# working directory (one level above it, in `models/`).
# NOTE(review): presumably `../models` must already exist — verify before running.
reg.save_model('../models/xgboost_model.ubj')