# Hyperparameter search in non-sequential pipelines

This notebook shows how hyperparameter search for pyWATTS Pipelines can be performed. We show this on two different pipelines.

The first pipeline performs a simple load forecast. The second pipeline performs an electricity price forecast based on a historical load forecast.


In [1]:
import pandas as pd
from pywatts.callbacks import LinePlotCallback
from pywatts.modules import CalendarExtraction, CalendarFeature, SKLearnWrapper
from pywatts.modules.preprocessing.select import Select
from pywatts.summaries import RMSE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from examples.load_data import load_elec_data
from pywatts_pipeline.core.pipeline import Pipeline
from pywatts_pipeline.core.util.computation_mode import ComputationMode


## Hyperparameter tuning for a Load Forecast Pipeline

In [2]:
# Build the load-forecast pipeline. Every step gets an explicit, unique name
# because the hyperparameter grid addresses steps as "<step name>__<parameter>".
pipeline = Pipeline(path="../results")

# Extract dummy calendar features, using holidays from Germany
# NOTE: CalendarExtraction can't return multiple features.
calendar = CalendarExtraction(
    continent="Europe",
    country="Germany",
    features=[CalendarFeature.month, CalendarFeature.weekday, CalendarFeature.weekend],
    name="calendar",
)(x=pipeline["load_power_statistics"])

# Scale the data using a standard SKLearn scaler
power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler")
scale_power_statistics = power_scaler(x=pipeline["load_power_statistics"])

# Create lagged time series to later be used as regressors
lag_features = Select(start=-2, stop=0, step=1, name="lag_features")(x=scale_power_statistics)

# Scaled 24-step window: the target the regressor is fitted against.
target_multiple_output = Select(start=0, stop=24, step=1, name="sampled_data")(x=scale_power_statistics)
# Unscaled 24-step window: the ground truth for the RMSE. It previously reused
# the name "sampled_data"; a distinct name keeps step addressing unambiguous.
target = Select(start=0, stop=24, step=1, name="target")(x=pipeline["load_power_statistics"])

# Create a linear regression that uses the lagged values and calendar features
# to predict the scaled target window.
# NOTE: SKLearnWrapper has to collect all **kwargs itself and fit it against target.
#       It is also possible to implement a join/collect class
# The name is set explicitly because the search grid refers to this step as
# "LinearRegression__module".
regressor_power_statistics = SKLearnWrapper(
    module=LinearRegression(fit_intercept=True),
    name="LinearRegression",
)(
    features=lag_features,
    calendar=calendar,
    target=target_multiple_output,
)

# Rescale the predictions back to the original scale by reusing the fitted
# scaler in transform-only mode with its inverse_transform method.
inverse_power_scale = power_scaler(
    x=regressor_power_statistics,
    computation_mode=ComputationMode.Transform,
    method="inverse_transform",
    callbacks=[LinePlotCallback("rescale")],
)

# Calculate the root mean squared error (RMSE) between the rescaled forecast
# and the true values
rmse = RMSE(name="rmse")(y_hat=inverse_power_scale, y=target)

# Use the "rmse" summary as the pipeline score; lower values are better.
pipeline.set_score("rmse", direction="lower")



In [3]:
#pipeline.draw()

# Background information about hyperparameter search with scikit-learn/sktime

In [4]:
# Search space for the grid search. Each key has the form
# "<step name>__<parameter>" and refers to a named step of the pipeline above
# ("LinearRegression", "scaler", "calendar", "lag_features").
# NOTE: MLPRegressor() is created without a random_state, so its scores can
# differ from run to run.
params = {
    "LinearRegression__module": [LinearRegression(), MLPRegressor()],
    "scaler__module": [MinMaxScaler(), StandardScaler()],
    "calendar__features": [
        [CalendarFeature.month_cos, CalendarFeature.month_sine, CalendarFeature.weekend],
        [CalendarFeature.hour_cos, CalendarFeature.hour_sine, CalendarFeature.weekend],
    ],
    "lag_features__start": [-24, -1],
}

In [5]:
# Load the example data set and split it positionally: the first 6000 rows
# are used for training, the remaining rows for testing.
data = load_elec_data()
split_at = 6000
train, test = data.iloc[:split_at, :], data.iloc[split_at:, :]


In [6]:
# Time-ordered cross-validation: every test fold holds 4 * 168 samples
# (four weeks, assuming hourly data — TODO confirm the sampling rate).
tscv = TimeSeriesSplit(test_size=4 * 168)

# Exhaustively evaluate every combination in `params` on the whole pipeline.
pipeline_cv = GridSearchCV(estimator=pipeline, param_grid=params, cv=tscv)
pipeline_cv.fit(data)



In [7]:
# Show the parameter combination that the grid search ranked best.
pipeline_cv.best_params_

{'LinearRegression__module': MLPRegressor(),
 'calendar__features': [<CalendarFeature.hour_cos: 10>,
  <CalendarFeature.hour_sine: 9>,
  <CalendarFeature.weekend: 21>],
 'lag_features__start': -24,
 'scaler__module': StandardScaler()}

In [8]:
# Tabulate the search results: one row per evaluated parameter combination,
# with timings, per-split test scores, and the resulting rank.
load_cv_results = pd.DataFrame(data=pipeline_cv.cv_results_)
load_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_LinearRegression__module,param_calendar__features,param_lag_features__start,param_scaler__module,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.336295,0.071687,0.880754,0.106767,LinearRegression(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-24,MinMaxScaler(),{'LinearRegression__module': LinearRegression(...,-4562.071686,-5153.797094,-4723.804833,-4357.790354,-6571.085664,-5073.709926,793.132543,9
1,0.331599,0.040379,0.848463,0.04818,LinearRegression(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-24,StandardScaler(),{'LinearRegression__module': LinearRegression(...,-4562.071686,-5153.797094,-4723.804833,-4357.790354,-6571.085664,-5073.709926,793.132543,10
2,0.191808,0.020085,0.736825,0.062662,LinearRegression(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-1,MinMaxScaler(),{'LinearRegression__module': LinearRegression(...,-7505.839695,-7913.076257,-7902.796545,-7676.227133,-10031.515002,-8205.890926,925.324306,14
3,0.232735,0.050545,0.713987,0.022845,LinearRegression(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-1,StandardScaler(),{'LinearRegression__module': LinearRegression(...,-7505.839695,-7913.076257,-7902.796545,-7676.227133,-10031.515002,-8205.890926,925.324306,14
4,0.315117,0.101431,0.975843,0.240313,LinearRegression(),"[CalendarFeature.hour_cos, CalendarFeature.hou...",-24,MinMaxScaler(),{'LinearRegression__module': LinearRegression(...,-4633.884418,-5035.476085,-4487.768758,-4788.279849,-4760.298482,-4741.141519,181.617009,8
5,0.254851,0.027814,0.814848,0.030345,LinearRegression(),"[CalendarFeature.hour_cos, CalendarFeature.hou...",-24,StandardScaler(),{'LinearRegression__module': LinearRegression(...,-4633.884418,-5035.476085,-4487.768758,-4788.279849,-4760.298482,-4741.141519,181.617009,7
6,0.257607,0.079453,0.757027,0.023223,LinearRegression(),"[CalendarFeature.hour_cos, CalendarFeature.hou...",-1,MinMaxScaler(),{'LinearRegression__module': LinearRegression(...,-5582.536298,-5910.226909,-5748.953391,-6075.504761,-6230.633594,-5909.570991,229.506142,11
7,0.178563,0.009351,0.707619,0.043441,LinearRegression(),"[CalendarFeature.hour_cos, CalendarFeature.hou...",-1,StandardScaler(),{'LinearRegression__module': LinearRegression(...,-5582.536298,-5910.226909,-5748.953391,-6075.504761,-6230.633594,-5909.570991,229.506142,12
8,1.588793,0.190039,0.879886,0.152345,MLPRegressor(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-24,MinMaxScaler(),"{'LinearRegression__module': MLPRegressor(), '...",-3462.053374,-4400.655184,-4271.739363,-4318.43176,-7001.810782,-4690.938093,1204.079976,5
9,9.738287,0.919881,0.812768,0.056028,MLPRegressor(),"[CalendarFeature.month_cos, CalendarFeature.mo...",-24,StandardScaler(),"{'LinearRegression__module': MLPRegressor(), '...",-2373.364927,-3517.089395,-3091.455116,-2993.381131,-6556.97884,-3706.453882,1471.405712,2


## Hyperparameter search for Non-Sequential Electricity Price Forecasting Pipeline

The second pipeline combines the power of pyWATTS to model non-sequential pipelines with the GridSearch from sklearn. Thus, we define the following non-sequential pipeline that predicts the electricity price of the next day based on calendar information, historical information, and a 48 hour load forecast.


In this notebook, we consider the following simple forecasting scenario. We aim to forecast the day-ahead electricity price. Since the electricity price is dependent on the electrical demand, we create a pipeline that forecasts the electricity demand and uses this forecast as an input for the electricity price forecast. As additional information, we use calendar features.
We therefore use the following transformers in this pipeline and search for their best hyperparameters:
* CalendarExtraction
  * List of features
* Scaler for the Electricity price
* Scaler for the Electricity demand
* Forecaster for the Electricity price
* Forecaster for the Electricity demand


In [9]:
# Build the non-sequential price-forecast pipeline: a load forecast is computed
# first and then fed, together with lagged prices and calendar features, into
# the price forecaster. This rebinds `pipeline`, replacing the load-forecast
# pipeline defined earlier in the notebook.
pipeline = Pipeline(path="../results")

# Extract dummy calendar features, using holidays from Germany
# NOTE: CalendarExtraction can't return multiple features.
calendar = CalendarExtraction(continent="Europe",
                              country="Germany",
                              features=[CalendarFeature.month,CalendarFeature.weekday,
                                        CalendarFeature.weekend],
                              name="calendar"
                              )(x=pipeline["load_power_statistics"])

# Windows over the calendar features for the two forecasters.
# NOTE(review): calendar_load spans 24 steps while the load target ("load_hist")
# spans 48, and calendar_price spans 48 while the price target ("price_hist")
# spans 24 — confirm that these horizons are intended and not swapped.
calendar_load = Select(start=0, stop=24, step=1)(x=calendar)
calendar_price = Select(start=0, stop=48, step=1)(x=calendar)

# Scale load and price independently, each with its own named scaler so that
# both can be tuned separately ("scaler_power", "scaler_price").
power_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_power")
scale_power = power_scaler(x=pipeline["load_power_statistics"])

price_scaler = SKLearnWrapper(module=StandardScaler(), name="scaler_price")
scale_price = price_scaler(x=pipeline["price_day_ahead"])

# Create lagged time series (previous 24 steps) to later be used as regressors
lag_features_load = Select(start=-24, stop=0, step=1, name="lag_features_load")(x=scale_power)
lag_features_price = Select(start=-24, stop=0, step=1, name="lag_features_price")(x=scale_price)

# Scaled target windows the forecasters are fitted against:
# 48 steps of load and 24 steps of price.
target_load = Select(start=0, stop=48, step=1, name="load_hist")(x=scale_power)
target_price = Select(start=0, stop=24, step=1, name="price_hist")(x=scale_price)

# Unscaled price window, used as ground truth for the RMSE below.
target_price_unscaled = Select(start=0, stop=24, step=1, name="target")(x=pipeline["price_day_ahead"])

# Load forecaster: regresses the scaled load window on lagged load and
# calendar features.
# NOTE: SKLearnWrapper has to collect all **kwargs itself and fit it against target.
#       It is also possible to implement a join/collect class
forecast_load = SKLearnWrapper(module=LinearRegression(fit_intercept=True), name="load_forecast")(
    features=lag_features_load,
    calendar=calendar_load,
    target=target_load,
)

# Price forecaster: additionally receives the output of the load forecaster
# (`load=forecast_load`), which makes the pipeline non-sequential.
forecast_price_scaled = SKLearnWrapper(module=LinearRegression(fit_intercept=True), name="price_forecast")(
    features=lag_features_price,
    calendar=calendar_price,
    load=forecast_load,
    target=target_price,
)

# Rescale the price predictions back to the original scale by reusing the
# price scaler in transform-only mode with its inverse_transform method.
forecast_price = price_scaler(
    x=forecast_price_scaled, computation_mode=ComputationMode.Transform,
    method="inverse_transform", callbacks=[LinePlotCallback("rescale")]
)

# Calculate the root mean squared error (RMSE) between the rescaled price
# forecast and the true (unscaled) prices
rmse = RMSE(name="rmse")(y_hat=forecast_price, y=target_price_unscaled)

# Use the "rmse" summary as the pipeline score; lower values are better.
pipeline.set_score("rmse", direction="lower")



In [10]:

# Candidate calendar feature sets: cyclic month encoding vs. cyclic hour
# encoding, each combined with the weekend flag.
month_cyclic = [CalendarFeature.month_cos, CalendarFeature.month_sine, CalendarFeature.weekend]
hour_cyclic = [CalendarFeature.hour_cos, CalendarFeature.hour_sine, CalendarFeature.weekend]

# Search space for the price-forecast pipeline; keys are
# "<step name>__<parameter>" for the named steps of that pipeline.
params = dict(
    load_forecast__module=[LinearRegression(), MLPRegressor()],
    price_forecast__module=[LinearRegression(), MLPRegressor()],
    scaler_power__module=[MinMaxScaler(), StandardScaler()],
    scaler_price__module=[MinMaxScaler(), StandardScaler()],
    calendar__features=[month_cyclic, hour_cyclic],
)

In [11]:
# Baseline before the search: fit the pipeline as defined above on the
# training split.
result, summary = pipeline.train(data=train)

# Score the fitted pipeline on the held-out split. `test` is the same slice
# (rows from 6000 onwards) that was previously re-created inline here.
pipeline.score(test)


-12.726233709498345

In [12]:
# Same cross-validation setup as for the load pipeline: time-ordered splits
# with 168 * 4 samples per test fold. TimeSeriesSplit and GridSearchCV are
# already imported in the notebook's first code cell.
tscv = TimeSeriesSplit(test_size=168*4)

# Grid search over the price-forecast pipeline and its parameter grid.
pipeline_cv = GridSearchCV(pipeline, param_grid=params, cv=tscv)
pipeline_cv.fit(data)



In [13]:
# Show the parameter combination that the grid search ranked best for the
# price-forecast pipeline.
pipeline_cv.best_params_

{'calendar__features': [<CalendarFeature.month_cos: 4>,
  <CalendarFeature.month_sine: 3>,
  <CalendarFeature.weekend: 21>],
 'load_forecast__module': LinearRegression(),
 'price_forecast__module': LinearRegression(),
 'scaler_power__module': MinMaxScaler(),
 'scaler_price__module': MinMaxScaler()}

In [14]:
# Tabulate the search results for the price-forecast pipeline: one row per
# evaluated parameter combination, with timings, per-split scores, and rank.
price_cv_results = pd.DataFrame(data=pipeline_cv.cv_results_)
price_cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_calendar__features,param_load_forecast__module,param_price_forecast__module,param_scaler_power__module,param_scaler_price__module,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.095976,0.175444,1.283085,0.185376,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),LinearRegression(),MinMaxScaler(),MinMaxScaler(),{'calendar__features': [CalendarFeature.month_...,-6.256721,-14.143013,-11.909271,-10.510416,-13.983219,-11.360528,2.887677,1
1,0.907825,0.032746,1.241801,0.257661,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),LinearRegression(),MinMaxScaler(),StandardScaler(),{'calendar__features': [CalendarFeature.month_...,-6.259465,-14.144792,-11.908073,-10.51433,-13.983064,-11.361945,2.886746,4
2,0.914369,0.091208,1.088443,0.042501,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),LinearRegression(),StandardScaler(),MinMaxScaler(),{'calendar__features': [CalendarFeature.month_...,-6.259287,-14.141893,-11.909574,-10.515821,-13.982501,-11.361815,2.886117,3
3,0.900442,0.066367,1.086463,0.045719,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),LinearRegression(),StandardScaler(),StandardScaler(),{'calendar__features': [CalendarFeature.month_...,-6.260751,-14.141737,-11.909581,-10.513772,-13.982564,-11.361681,2.885702,2
4,2.436155,0.205387,1.149775,0.081855,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),MLPRegressor(),MinMaxScaler(),MinMaxScaler(),{'calendar__features': [CalendarFeature.month_...,-12.600532,-17.425032,-14.300181,-14.03163,-15.834667,-14.838408,1.651171,27
5,12.690692,4.006184,1.118261,0.081401,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),MLPRegressor(),MinMaxScaler(),StandardScaler(),{'calendar__features': [CalendarFeature.month_...,-10.861537,-14.382651,-15.372472,-19.809067,-14.47575,-14.980295,2.865196,28
6,2.649727,0.167046,1.155497,0.025594,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),MLPRegressor(),StandardScaler(),MinMaxScaler(),{'calendar__features': [CalendarFeature.month_...,-11.089482,-17.461229,-16.643316,-15.29612,-16.352356,-15.368501,2.248905,30
7,12.161681,3.284225,1.197727,0.071062,"[CalendarFeature.month_cos, CalendarFeature.mo...",LinearRegression(),MLPRegressor(),StandardScaler(),StandardScaler(),{'calendar__features': [CalendarFeature.month_...,-10.806261,-13.533408,-14.823279,-16.977639,-14.643222,-14.156762,2.012593,24
8,2.87405,0.277515,1.155328,0.04537,"[CalendarFeature.month_cos, CalendarFeature.mo...",MLPRegressor(),LinearRegression(),MinMaxScaler(),MinMaxScaler(),{'calendar__features': [CalendarFeature.month_...,-8.953317,-14.224443,-12.451935,-12.102103,-14.91444,-12.529248,2.075502,18
9,2.864568,0.320775,1.152309,0.037033,"[CalendarFeature.month_cos, CalendarFeature.mo...",MLPRegressor(),LinearRegression(),MinMaxScaler(),StandardScaler(),{'calendar__features': [CalendarFeature.month_...,-7.939068,-14.42028,-12.089059,-11.991292,-14.066967,-12.101333,2.305556,15
