# BLU06 - Exercise Notebook

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import hashlib # for grading purposes
import json
from random import seed
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import ParameterGrid
import itertools
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa import stattools
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from pandas.plotting import lag_plot

plt.rcParams['figure.figsize'] = (12, 4.2)
idx = pd.IndexSlice
import utils

Let's predict wind power production! We will use a dataset with hourly values of wind-generated electricity production from January to March 2010.

In [None]:
# Load the wind power data, parse the `date` column into timestamps and use
# it as a chronologically sorted index.
df_wp = (
    pd.read_csv('data/wind_power.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
    .sort_index()
)
df_wp.head()

Here is the plot:

In [None]:
# Draw the series against its datetime index; the title and axis labels
# below are applied to the axes created by this plot call.
df_wp.plot()
plt.title('Wind power production for the initial months of 2010')
plt.xlabel('Time')
plt.ylabel('Wind power');

We also prepare the train and test sets for you. The test set is the last day (24 h).

In [None]:
# Hold out the final 24 rows (the last day of hourly data) as the test set;
# every row before them forms the training set.
test_wp = df_wp.iloc[-24:]
train_wp = df_wp.iloc[:-24]

## Exercise 1 - One-step forecast

### Exercise 1.1 - Prepare the data
Create a dataframe with features called `df_features` from the wind power train set following the steps below. We did the first step for you.
- Create the target, the wind power production in the next hour.
- Plot the PACF and identify the three most significant lags (judge significance by absolute value, so lags with a negative partial autocorrelation count too).
- Create three lagged features a, b, c. Use the three lag values identified in the PACF plot.

In [None]:
# Exercise 1.1: build `df_features` from the train set by adding a `target`
# column and the three lag columns `lag_a`, `lag_b` and `lag_c`.
# Uncomment and complete the template below, then remove the `raise`.
#df_features = train_wp.copy()
#df_features['target'] =
#df_features['lag_a'] = 
#df_features['lag_b'] = 
#df_features['lag_c'] = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 1.1. Expected answers are stored as SHA-256
# digests of their JSON-encoded string form, so the solution is not revealed.

# The feature frame must have 2136 rows and 5 columns.
assert df_features.shape==(2136,5), 'The shape of the dataframe is not correct.'
# Column names are sorted and concatenated before hashing, so column order
# does not matter but the exact names do.
assert hashlib.sha256(json.dumps(''.join(sorted(df_features.columns))).encode()).hexdigest() == \
'0341a6d0522a5dda2a452392e5269c2ba4804b3bd6909a8897cba62f489010a0', 'The column names are not correct.'
# Index label (timestamp) of the last row.
assert hashlib.sha256(json.dumps(str(df_features.iloc[-1:].index[0])).encode()).hexdigest() == \
'4c666882f1e52aeb56bb743b24e1d45cab3ce37f2e62b9d61ab4d3f9a034857d', 'The last timestamp is not correct.'
# Values of the last row, ordered by column name and concatenated as strings.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in df_features.iloc[-1].sort_index()])).encode()).hexdigest() == \
'fd7c2340b73fa498c1e04db1f93f1d17fec13771c618132650d928dc8b11e7e8', 'The values in the last row are not correct.'

### Exercise 1.2 - Separate the train and test set
Separate the `df_features` dataframe into a train and test set and drop the null values from the train set. The test set is the last row of the `df_features` dataframe; the train set is the rest.

In [None]:
# Exercise 1.2: split `df_features` into `df_train` (all rows but the last,
# with null values dropped) and `df_test` (the last row only).
# df_train =
# df_test =

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 1.2.

# Expected shapes: 2110 rows x 5 columns for train, a single row for test.
assert df_train.shape==(2110,5), 'The shape of the train set is not correct.'
assert df_test.shape==(1,5), 'The shape of the test set is not correct.'
# Digest of every train index label, stringified and concatenated in order;
# this pins both which rows were kept and their ordering.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in df_train.index])).encode()).hexdigest() == \
'07a96943bfc21ffc95a9bcdc69f4b3556a6f143bb9d0684fa58f1fef9b673d07', 'The data in the train set is not correct.'
# Digest of the test row's index label. The expected digest is the same one
# used for the last timestamp of `df_features` in the Exercise 1.1 checks.
assert hashlib.sha256(json.dumps(str(df_test.index[0])).encode()).hexdigest() == \
'4c666882f1e52aeb56bb743b24e1d45cab3ce37f2e62b9d61ab4d3f9a034857d', 'The timestamp of the test set is not correct.'

### Exercise 1.3 - Model
Fit a sklearn linear regression model to the training set. The fitted model should be stored in the `model` variable.

In [None]:
# Exercise 1.3: fit a scikit-learn `LinearRegression` on the training set
# and store the fitted estimator in `model`.
# model =

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 1.3.

assert isinstance(model, LinearRegression), 'Did you use the correct model?'
# Digest of the sorted feature names seen during fit.
# NOTE(review): scikit-learn documents `feature_names_in_` as defined only
# when the model is fitted on input with string column names (e.g. a
# DataFrame), so fitting on a plain NumPy array would fail here - confirm.
assert hashlib.sha256(json.dumps(''.join(sorted(model.feature_names_in_))).encode()).hexdigest() == \
'1678e066cb05c24a374a598c30c5a3d754cbba8e974808046ab5b9fd79faf9f3', 'The features used in the model are not correct.'
# Intercept, rounded to 4 decimals.
assert hashlib.sha256(json.dumps(str(round(model.intercept_,4))).encode()).hexdigest() == \
'4ac126c7537977aa9306b4f75abc3d3ab0f7c665c64e0207a842958babfcc783', 'The output of the model is not correct.'
# Coefficients sorted ascending, each rounded to 4 decimals and concatenated,
# so the check does not depend on the order of the feature columns.
assert hashlib.sha256(json.dumps(''.join([str(round(i,4)) for i in sorted(model.coef_)])).encode()).hexdigest() == \
'35a1a4c2f32a49b0fb2eff0a005701ba0351c77a550631ee33da734c74a1e607', 'The output of the model is not correct.'

### Exercise 1.4 - Forecast
Make a forecast on the test set and calculate the MAE of the forecast. Store the results in the corresponding variables.

In [None]:
# Exercise 1.4: predict on the test set and store the prediction in
# `forecast_one_step` (the grader reads element 0) and its MAE in
# `mae_one_step`.
# forecast_one_step = 
# mae_one_step = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 1.4.

# First element of the forecast, rounded to 4 decimals; `forecast_one_step`
# must therefore support positional access with `[0]`.
assert hashlib.sha256(json.dumps(str(round(forecast_one_step[0],4))).encode()).hexdigest() == \
'7bebf20e9750f0c225f299cdbbd91ee290c08acb5c51343f98680bb39a8d898c', 'The forecast is not correct.'
# MAE of the forecast, rounded to 4 decimals.
assert hashlib.sha256(json.dumps(str(round(mae_one_step,4))).encode()).hexdigest() == \
'44fed59e085de65f6b766de55f4b0aa61da81040478f5a2e1cf643359ddd15b6', 'The MAE is not correct.'

## Exercise 2 - Let's go into multi-step prediction!
We will use the infrastructure defined in the learning notebooks for the forecast. All the necessary functions are in the utils file.

### Exercise 2.1 - Create a validation set
Separate the train set into a validation and train set. Use the last 24 hours for the validation set.

In [None]:
# Exercise 2.1: split the train set into `df_multistep_train` and
# `df_multistep_val`, where the validation set is the last 24 hours.
# df_multistep_train =
# df_multistep_val =


# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 2.1. Only the index labels are verified: each
# set's labels are stringified, concatenated in order and hashed.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in df_multistep_train.index])).encode()).hexdigest() == \
'073b1b40c5025aed5e04821304765b919b44d0ae8e558f8b9fb7327e8b5dd131', 'The train set is not correct.'
assert hashlib.sha256(json.dumps(''.join([str(i) for i in df_multistep_val.index])).encode()).hexdigest() == \
'd1744856f01e40a869698dd09654711669edd67180ab4f725502f81027a55c75', 'The validation set is not correct.'

### Exercise 2.2 - Hyperparameter grid
Create a dictionary called `param_grid` to use in a parameter grid for tuning a regression time series model with the following conditions. Keep the order of the parameter values as listed.
- Include the linear regression and gradient boosting regressor as `model`. For the gradient boosting regressor use `n_estimators`=20 and `random_state`=10.
- Use 3 and 26 lags (`num_lags`).
- Use 0 and 15 diffs (`num_diffs`).
- Set the `weekday`, `month` and `holidays` to False. These shouldn't affect wind power generation.
- Don't use rolling window features (you still have to include this parameter in the grid).

In [None]:
# Exercise 2.2: define `param_grid`, a dictionary with the 7 keys `model`,
# `num_lags`, `num_diffs`, `weekday`, `month`, `holidays` and `rolling`,
# each mapped to a list of candidate values in the order given above.
# param_grid = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 2.2.

assert isinstance(param_grid,dict), 'param_grid should be a dictionary'
assert len(param_grid)==7, 'The dictionary should have 7 items.'
# Key names, sorted and concatenated.
assert hashlib.sha256(json.dumps(''.join(sorted(param_grid.keys()))).encode()).hexdigest() == \
'89e9e480b892a82143688daec4068c63b7516dd75236bf25d9da41bdc54bdfe6', 'The keys of the dictionary are not correct.'
# The list values below are hashed in their given order (no sorting), so the
# order of the candidates matters. Models are compared through `str()`, i.e.
# their printed form including any non-default constructor arguments.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid['model']])).encode()).hexdigest() == \
'2bfbaf3fc1c49731221d344654f1d2fde44e74abd4f3bd43f818b3fb1a80dd62', "The 'model' item is not correct."
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid['num_lags']])).encode()).hexdigest() == \
'040c38181df1fefff00e092ae645bca29bf4d29cc919f15cf3450c6b98678c98', "The 'num_lags' item is not correct."
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid['num_diffs']])).encode()).hexdigest() == \
'e6a178c0829d9454edf2a957da734a790fdbd375079d4ad06134d82988e9cb1c', "The 'num_diffs' item is not correct."
# The three calendar flags must be equal to each other and their first
# candidate must be falsy.
assert param_grid['weekday']==param_grid['month']==param_grid['holidays'] and not param_grid['month'][0],\
"The 'weekday', 'month' or 'holidays' items are not correct."
# "No rolling features" is expressed as a single candidate: an empty list.
assert isinstance(param_grid['rolling'][0],list) and len(param_grid['rolling'][0])==0, "The 'rolling' item is not correct."

### Exercise 2.3 - Hyperparameter optimization
Perform a grid search for the multi-step forecast using the parameter grid from the previous exercise. Use the `df_multistep_train` data for training and `df_multistep_val` data for validation. Calculate MAE for each model outcome. Store the results in the list `grid_search_result`. Each entry should be a tuple of the parameter set (a dictionary) and the corresponding MAE.

Finally, find the parameter set with the smallest MAE and store it in the `best_params` dictionary.

In [None]:
%%time 
# Exercise 2.3: fill `grid_search_result` with one (parameter dict, MAE)
# tuple per parameter combination, then store the parameter dict with the
# smallest MAE in `best_params`.
# grid_search_result = []
# best_params = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 2.3.

# One result per parameter combination: 8 entries are expected.
assert isinstance(grid_search_result, list), 'The result should be a list.'
assert len(grid_search_result)==8, 'The length of the results list is not correct.'
assert isinstance(best_params, dict), 'best_params should be a dictionary.'
assert len(best_params)==7, 'The length of the best_params dictionary is not correct.'
assert hashlib.sha256(json.dumps(best_params['num_lags']).encode()).hexdigest() == \
'4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce', 'The num_lags parameter is not correct.'
# This check used to compare the num_diffs digest with itself, so it could
# never fail. The expected digest is not recorded in this notebook, so the
# check now at least requires one of the two grid values from Exercise 2.2.
# TODO: replace with the digest of the expected value once it is known.
assert best_params['num_diffs'] in (0, 15), 'The num_diffs parameter is not correct.'
# The model is compared through its printed form.
assert hashlib.sha256(json.dumps(str(best_params['model'])).encode()).hexdigest() == \
'6806292b7854910f566b0af0e8b5f8e0516decce03414eb2b84e065d72a6def7', 'The model parameter is not correct.'

### Exercise 2.4 - Best model
Merge the train and val data sets and train the best model from the grid search on this data. Then make a forecast for the test set and calculate the MAE. Store the results in the corresponding variables.

In [None]:
# Exercise 2.4: retrain the best model on train + validation data, store the
# 24-step forecast for the test set in `forecast_multi_step` and its MAE in
# `mae_multi_step`.
# forecast_multi_step = 
# mae_multi_step = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 2.4.

# One forecast value per hour of the 24-hour test set.
assert len(forecast_multi_step)==24, 'The length of the forecast is not correct.'
# MAE of the forecast, rounded to 4 decimals.
assert hashlib.sha256(json.dumps(str(round(mae_multi_step,4))).encode()).hexdigest() == \
'7e5bffaee5ae90d24740726ce68df1aac332000395ff3cea6c3f3a7f1db9162b', 'The MAE is not correct.'

Plot your forecast below.

In [None]:
# Overlay the held-out day and the multi-step forecast. The forecast is
# wrapped in a Series carrying the test index so both lines share the x axis.
plt.plot(test_wp, label='original')
plt.plot(
    pd.Series(forecast_multi_step, index=test_wp.index),
    label='multi-step forecast',
)
plt.ylabel('Wind power')
plt.xlabel('Time')
plt.legend();

## Exercise 3 - Finally, we'll add exogenous features to improve model performance!
This is our exogenous feature, the wind speed forecast:

In [None]:
# Load the wind speed forecast, parse the `date` column into timestamps and
# use it as a chronologically sorted index.
exog = (
    pd.read_csv('data/wind_speed_forecast.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
    .sort_index()
)
exog.head()

In [None]:
# Draw the exogenous series; the axis labels below are applied to the axes
# created by this plot call.
exog.plot()
plt.ylabel('Wind speed')
plt.xlabel('Time');

### Exercise 3.1 - Hyperparameter grid (again)
Create a dictionary called `param_grid_exog` to use in a parameter grid for tuning a regression time series model with the following conditions. Keep the order of the parameter values as listed.
- Include the linear regression and gradient boosting regressor as `model`. For the gradient boosting regressor use `n_estimators`=20 and `random_state`=10.
- Use 3 and 26 lags (`num_lags`).
- Use 0 for diffs (`num_diffs`).
- Set the `weekday`, `month` and `holidays` to False. These shouldn't affect wind power generation.
- Don't use rolling window features (you still have to include this parameter in the grid).
- Add the exogenous variable as the `exog` parameter.

In [None]:
# Exercise 3.1: define `param_grid_exog`, a dictionary with 8 keys: the same
# seven as in Exercise 2.2 plus `exog` for the exogenous variable.
# param_grid_exog = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 3.1.

# The messages name `param_grid_exog` and the required 8 items, matching
# what the conditions actually test.
assert isinstance(param_grid_exog,dict), 'param_grid_exog should be a dictionary'
assert len(param_grid_exog)==8, 'The dictionary should have 8 items.'
# Key names, sorted and concatenated.
assert hashlib.sha256(json.dumps(''.join(sorted(param_grid_exog.keys()))).encode()).hexdigest() == \
'8a813298b9e53dd0eaddffb9544957e44528283386d4608ee725e013ab8083ef', 'The keys of the dictionary are not correct.'
# The list values below are hashed in their given order (no sorting), so the
# order of the candidates matters.
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid_exog['model']])).encode()).hexdigest() == \
'2bfbaf3fc1c49731221d344654f1d2fde44e74abd4f3bd43f818b3fb1a80dd62', "The 'model' item is not correct."
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid_exog['num_lags']])).encode()).hexdigest() == \
'040c38181df1fefff00e092ae645bca29bf4d29cc919f15cf3450c6b98678c98', "The 'num_lags' item is not correct."
assert hashlib.sha256(json.dumps(''.join([str(i) for i in param_grid_exog['num_diffs']])).encode()).hexdigest() == \
'98089e6d36f78e9766c9ea34d5acb3611f3a92cd81c5eb102095d924ffc7d08b', "The 'num_diffs' item is not correct."
# The three calendar flags must be equal to each other and their first
# candidate must be falsy.
assert (param_grid_exog['weekday']==param_grid_exog['month']==param_grid_exog['holidays'] and 
not param_grid_exog['month'][0]), "The 'weekday', 'month' or 'holidays' items are not correct."
# "No rolling features" is expressed as a single candidate: an empty list.
assert isinstance(param_grid_exog['rolling'][0],list) and len(param_grid_exog['rolling'][0])==0, "The 'rolling' item is not correct."

### Exercise 3.2 - Hyperparameter optimization with the exogenous variable
Perform a grid search for the multi-step forecast with the exogenous variable using the parameter grid from the previous exercise. Use the `df_multistep_train` data for training and `df_multistep_val` data for validation. Calculate MAE for each model outcome. Store the results in the list `grid_search_result_exog`. Each entry should be a tuple of the parameter set (a dictionary) and the corresponding MAE.

Finally, find the parameter set with the smallest MAE and store it in the `best_params_exog` dictionary.

In [None]:
%%time 
# Exercise 3.2: fill `grid_search_result_exog` with one (parameter dict, MAE)
# tuple per parameter combination, then store the parameter dict with the
# smallest MAE in `best_params_exog`.
# grid_search_result_exog = []
# best_params_exog = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 3.2.

# One result per parameter combination: 4 entries are expected.
assert isinstance(grid_search_result_exog, list), 'The result should be a list.'
assert len(grid_search_result_exog)==4, 'The length of the results list is not correct.'
# The messages name `best_params_exog`, since `best_params` is a different
# variable from Exercise 2.3.
assert isinstance(best_params_exog, dict), 'best_params_exog should be a dictionary.'
assert len(best_params_exog)==8, 'The length of the best_params_exog dictionary is not correct.'
assert hashlib.sha256(json.dumps(best_params_exog['num_lags']).encode()).hexdigest() == \
'4e07408562bedb8b60ce05c1decfe3ad16b72230967de01f640b7e4729b49fce', 'The num_lags parameter is not correct.'
# The model is compared through its printed form.
assert hashlib.sha256(json.dumps(str(best_params_exog['model'])).encode()).hexdigest() == \
'6806292b7854910f566b0af0e8b5f8e0516decce03414eb2b84e065d72a6def7', 'The model parameter is not correct.'

### Exercise 3.3 - Best model with the exogenous variable
Merge the train and val data sets and train the best model from the grid search with the exogenous variable on this data. Then make a forecast for the test set and calculate the MAE. Store the results in the corresponding variables.

In [None]:
# Exercise 3.3: retrain the best exogenous model on train + validation data,
# store the 24-step forecast for the test set in `forecast_multi_step_exog`
# and its MAE in `mae_multi_step_exog`.
# forecast_multi_step_exog = 
# mae_multi_step_exog = 

# YOUR CODE HERE
raise NotImplementedError()

In [None]:
# Grading checks for Exercise 3.3.

# One forecast value per hour of the 24-hour test set.
assert len(forecast_multi_step_exog)==24, 'The length of the forecast is not correct.'
# MAE of the forecast, rounded to 4 decimals.
assert hashlib.sha256(json.dumps(str(round(mae_multi_step_exog,4))).encode()).hexdigest() == \
'fd2bc78e311cafe2cd9ac06c513301a67d6dbd7ee4bda02834c4b51d225b59cd', 'The MAE is not correct.'

Plot your forecast below.

In [None]:
# Overlay the held-out day and the forecast made with the exogenous variable.
# The forecast is wrapped in a Series carrying the test index so both lines
# share the x axis.
plt.plot(test_wp, label='original')
plt.plot(
    pd.Series(forecast_multi_step_exog, index=test_wp.index),
    label='multi-step forecast with exog',
)
plt.ylabel('Wind power')
plt.xlabel('Time')
plt.legend();

Definitely some improvement!

Congratulations, you mastered all the time series spec exercises! Now get some datasets and practice for the hackathon.