In [3]:
import pandas as pd
import numpy as np
from datetime import timedelta
from xgboost import XGBRegressor

# Load the processed parquet datasets, in this order: wind power, consumption,
# day-ahead prices, weather.
# (disabled) helen = pd.read_parquet('processed_data/helen.parquet')
wind, consumption, dayahead, weather = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'processed_data/tuulivoima.parquet',
        'processed_data/consumption.parquet',
        'processed_data/dayahead.parquet',
        'processed_data/weather.parquet',
    )
)

In [4]:
# Combine all sources into one frame: start from the wind data and join each
# remaining dataset onto it by index (DataFrame.join defaults to a left join),
# then keep only the rows where every column has a value.
df = wind
for other in (consumption, dayahead, weather):
    df = df.join(other)
df = df.dropna()
df.head()

Unnamed: 0_level_0,Wind_MWh,Consumption_MWh,price,pressure,rain,humidity,temperature,wind
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-01-01 00:00:00,1427.0,10152.0,10.07,1000.9,0.9,97.0,0.8,7.9
2019-01-01 01:00:00,1352.0,9920.0,10.03,998.7,2.2,97.0,1.5,8.5
2019-01-01 02:00:00,1177.0,9845.0,4.56,996.6,1.7,98.0,2.0,9.8
2019-01-01 03:00:00,1001.0,9913.0,4.83,994.2,0.3,98.0,2.4,8.9
2019-01-01 04:00:00,826.0,10027.0,8.09,992.0,0.4,98.0,2.5,7.6


### Create time series features

In [5]:
def create_features(df):
    """
    Return a copy of ``df`` extended with calendar columns read from its index.

    The index must be datetime-like: it has to expose ``hour``, ``dayofweek``,
    ``quarter``, ``month``, ``year``, ``dayofyear``, ``day`` and
    ``isocalendar()``. The input frame itself is not modified.

    Added columns (in this order): ``hour``, ``dayofweek``, ``quarter``,
    ``month``, ``year``, ``dayofyear``, ``dayofmonth``, ``weekofyear``
    (the ISO calendar week).
    """
    stamps = df.index
    calendar_columns = {
        'hour': stamps.hour,
        'dayofweek': stamps.dayofweek,
        'quarter': stamps.quarter,
        'month': stamps.month,
        'year': stamps.year,
        'dayofyear': stamps.dayofyear,
        'dayofmonth': stamps.day,
        # isocalendar() yields a frame indexed like ``df``; only the week is kept.
        'weekofyear': stamps.isocalendar().week,
    }
    # assign() works on a copy, so the caller's frame stays untouched.
    return df.assign(**calendar_columns)

### Create lag features

In [6]:
def add_lags(df, colname, shortname, lag_days=(364, 728, 1092)):
    """
    Return a copy of ``df`` with lagged versions of ``colname`` appended.

    For the k-th entry of ``lag_days`` (counting from 1) a column named
    ``f"{shortname}_lag{k}"`` is added. Each row of that column holds the value
    ``colname`` had exactly that many days before the row's timestamp, or a
    missing value when that earlier timestamp is not present in the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index (``df.index - pd.Timedelta`` must work).
    colname : str
        Name of the column to take lagged values from.
    shortname : str
        Prefix for the generated column names.
    lag_days : iterable of numbers, optional
        Lag offsets in days. The defaults are 52, 104 and 156 whole weeks, so
        the lagged timestamp falls on the same day of the week.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input ``df`` is left unmodified (same convention as
        ``create_features``).
    """
    # Work on a copy so that calling this from a notebook cell never silently
    # changes a frame that an earlier cell has already displayed or reused.
    df = df.copy()
    # timestamp -> value lookup built once and shared by every lag.
    target_map = df[colname].to_dict()
    for lag_number, days in enumerate(lag_days, start=1):
        shifted_index = df.index - pd.Timedelta(days=days)
        df[f'{shortname}_lag{lag_number}'] = shifted_index.map(target_map)
    return df

### Train XGBoost model

In [7]:
# Enrich the merged frame: calendar columns first, then the lag columns for
# every signal listed below.
df = create_features(df)

# Source column -> prefix of its lag columns. The insertion order determines
# the order of the lag features in FEATURES.
LAG_PREFIXES = {
    'Consumption_MWh': 'c',
    'Wind_MWh': 'w',
    'pressure': 'pr',
    'rain': 'rain',
    'humidity': 'hum',
    'temperature': 'temp',
    'wind': 'win',
}
for source_col, prefix in LAG_PREFIXES.items():
    df = add_lags(df, source_col, prefix)

# Model inputs: raw signals, calendar columns, then <prefix>_lag1..3 per signal.
FEATURES = (
    ['Wind_MWh', 'Consumption_MWh', 'pressure', 'rain', 'humidity', 'temperature', 'wind']
    + ['dayofyear', 'hour', 'dayofweek', 'quarter', 'month', 'year']
    + [f'{prefix}_lag{n}' for prefix in LAG_PREFIXES.values() for n in (1, 2, 3)]
)
TARGET = 'price'

X_all = df[FEATURES]
y_all = df[TARGET]

# The model is fitted on the full dataset. The eval_set is that same data, so
# the RMSE printed every 100 rounds is an in-sample (training) error.
reg = XGBRegressor(n_estimators=1000)
reg.fit(
    X_all,
    y_all,
    eval_set=[(X_all, y_all)],
    verbose=100,
)

[0]	validation_0-rmse:76.35624


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


[100]	validation_0-rmse:11.69737
[200]	validation_0-rmse:8.26340
[300]	validation_0-rmse:6.17296
[400]	validation_0-rmse:4.98288
[500]	validation_0-rmse:4.10515
[600]	validation_0-rmse:3.47347
[700]	validation_0-rmse:2.93877
[800]	validation_0-rmse:2.55784
[900]	validation_0-rmse:2.23429
[999]	validation_0-rmse:1.95092


## Save the model for further use

In [8]:
# Persist the fitted regressor to disk so it can be loaded outside this notebook.
# NOTE(review): the '.ubj' suffix presumably selects XGBoost's binary JSON (UBJSON)
# format, and '../models/' presumably has to exist already relative to the working
# directory — confirm both.
reg.save_model('../models/xgboost_model.ubj')