## Feature Engineering

In [20]:
import datetime, time, os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import glob
import json

from functools import reduce

# Make numpy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

from datetime import datetime, timezone, timedelta

import warnings

In [21]:
df_merged = pd.read_pickle('../data/pickle/df_merged_5.pickle')
df_merged.shape

(17358, 63)

In [22]:
count = np.isinf(df_merged).values.sum()
print("The data frame contains " + str(count) + " infinite values")
print("The data Frame has",df_merged.isnull().sum().sum(),"missing values.")

The data frame contains 0 infinite values
The data Frame has 0 missing values.


In [23]:
correlations = df_merged.corr(method='pearson')
print(correlations['rebap_eur_mwh'].sort_values(ascending=False).to_string())

rebap_eur_mwh                               1.000000
epex_da_de_eur_mwh                          0.243183
generationplusimbalancepriceeurmwhmbadk1    0.147773
generationimbalancepriceeurmwhmbadk1        0.147303
generationplusimbalancepriceeurmwhmbadk2    0.137709
rebap_eur_mwh_PL                            0.134192
priceforconsumptioneurmwhmbadk1             0.130247
rebap_eur_mwh_dk2                           0.123288
priceforconsumptioneurmwhmbadk2             0.111158
Wasserkraft[MWh]                            0.101527
DE_power_mw_y                               0.080726
Pumpspeicher[MWh]                           0.080279
rebap_eur_mwh_BE                            0.080030
50Hertz_power_mw_y                          0.078830
DK1_power_mw_y                              0.072103
TTG_power_mw_y                              0.063629
Wind Onshore[MWh]                           0.061494
DK_power_mw_y                               0.059533
rebap_eur_mwh_AUS                           0.

In [24]:
#Simplify certain column names 
df_merged.rename(columns={'Biomasse[MWh]' : "Biomasse",
                            "Wasserkraft[MWh]" : "Wasserkraft", 
                            "Braunkohle[MWh]" : "Braunkohle", 
                            "Wind Onshore[MWh]" : "Wind_Onshore",
                            'Wind Offshore[MWh]' : "Wind_Offshore" ,
                            'Photovoltaik[MWh]': "Photovoltaik",
                            'Sonstige Erneuerbare[MWh]' : "Sonstige_Erneuerbare",
                            'Kernenergie[MWh]' : "Kernenergie",
                            'Steinkohle[MWh]' : "Steinkohle" ,
                            'Erdgas[MWh]' : "Erdgas", 
                            'Pumpspeicher[MWh]' : "Pumpspeicher",
                            'Sonstige Konventionelle[MWh]' :  "Sonstige_Konventionelle"
                        },inplace=True)
df_merged.columns

Index(['50Hertz_power_mw_x', 'DE_power_mw_x', 'DK_power_mw_x',
       'DK1_power_mw_x', 'TTG_power_mw_x', 'total_pred_cons', 'Biomasse',
       'Wasserkraft', 'Wind_Offshore', 'Wind_Onshore', 'Photovoltaik',
       'Sonstige_Erneuerbare', 'Kernenergie', 'Braunkohle', 'Steinkohle',
       'Erdgas', 'Pumpspeicher', 'Sonstige_Konventionelle', 'rel_total',
       'diff_prog_real', 'rebap_eur_mwh', 'rz_saldo_mwh', 'rebap_eur_mwh_BE',
       'rz_saldo_mwh_BE', 'rebap_eur_mwh_PL', 'rz_saldo_mwh_PL',
       'rebap_eur_mwh_AUS', 'rz_saldo_mwh_AUS',
       'generationplusimbalancepriceeurmwhmbadk2', 'rebap_eur_mwh_dk2',
       'priceforconsumptioneurmwhmbadk2', 'totalimbalancemwhmbadk2',
       'generationplusimbalancepriceeurmwhmbadk1',
       'generationimbalancepriceeurmwhmbadk1',
       'priceforconsumptioneurmwhmbadk1', 'totalimbalancemwhmbadk1',
       'sechs_h_regelung', 'epex_da_de_eur_mwh', '50Hertz_power_mw_x',
       'DE_power_mw_x', 'DK_power_mw_x', 'DK1_power_mw_x', 'TTG_power_mw_x'

# Additional features

We create new features to achive additionally higher correlation with our target. Rolling average (window) of different time periods, which create a soothing affect and subtract them (In financial market analysis, this intersection has a signal attribute)

In [25]:
df_merged.eval('Braunkohle_roll_diff_32_8 = Braunkohle.rolling(window=16).mean() - Braunkohle.rolling(window=4).mean()', inplace = True)
df_merged.eval('Wasserkraft_roll_diff_16_2 = Wasserkraft.rolling(window=4).mean() - Wasserkraft.rolling(window=2).mean()', inplace = True)
df_merged["rz_saldo_mwh_diff"] = df_merged["rz_saldo_mwh"]-df_merged["rz_saldo_mwh"].shift()

df_merged.eval('rz_saldo_roll_win16 = rz_saldo_mwh.rolling(window=16).mean()', inplace = True)
df_merged.eval('rz_saldo_roll_win4 = rz_saldo_mwh.rolling(window=4).mean()', inplace = True)
df_merged.eval('rz_saldo_roll_win2 = rz_saldo_mwh.rolling(window=2).mean()', inplace = True)
df_merged.eval('rz_saldo_roll_diff_16_2 = rz_saldo_roll_win16 - rz_saldo_roll_win2', inplace = True)

df_merged["epex_diff"] = df_merged["epex_da_de_eur_mwh"]-df_merged["epex_da_de_eur_mwh"].shift()
df_merged.eval('epex_roll_win8 = epex_da_de_eur_mwh.rolling(window=8).mean()', inplace = True)
df_merged.eval('epex_roll_win2 = epex_da_de_eur_mwh.rolling(window=4).mean()', inplace = True)
df_merged.eval('epex_roll_diff_16_2 = epex_da_de_eur_mwh.rolling(window=16).mean() - epex_roll_win2', inplace = True)

df_merged.eval('fac_epex_rz_saldo = epex_da_de_eur_mwh * rz_saldo_mwh', inplace = True)

In [26]:
#df_merged.fillna(0, inplace=True)
df_merged.sort_index();

In [27]:
correlations = df_merged.corr(method='pearson')
print(correlations['rebap_eur_mwh'].sort_values(ascending=False).to_string())

rebap_eur_mwh                               1.000000
rz_saldo_roll_diff_16_2                     0.249470
epex_roll_win2                              0.247754
epex_roll_win8                              0.244680
epex_da_de_eur_mwh                          0.243183
generationplusimbalancepriceeurmwhmbadk1    0.147773
generationimbalancepriceeurmwhmbadk1        0.147303
generationplusimbalancepriceeurmwhmbadk2    0.137709
rebap_eur_mwh_PL                            0.134192
priceforconsumptioneurmwhmbadk1             0.130247
rebap_eur_mwh_dk2                           0.123288
priceforconsumptioneurmwhmbadk2             0.111158
Wasserkraft                                 0.101527
DE_power_mw_y                               0.080726
Pumpspeicher                                0.080279
rebap_eur_mwh_BE                            0.080030
50Hertz_power_mw_y                          0.078830
DK1_power_mw_y                              0.072103
TTG_power_mw_y                              0.

In [28]:
df_merged.shape

(17358, 75)

In [29]:
df_merged.astype(np.float32)

Unnamed: 0_level_0,50Hertz_power_mw_x,DE_power_mw_x,DK_power_mw_x,DK1_power_mw_x,TTG_power_mw_x,total_pred_cons,Biomasse,Wasserkraft,Wind_Offshore,Wind_Onshore,...,rz_saldo_mwh_diff,rz_saldo_roll_win16,rz_saldo_roll_win4,rz_saldo_roll_win2,rz_saldo_roll_diff_16_2,epex_diff,epex_roll_win8,epex_roll_win2,epex_roll_diff_16_2,fac_epex_rz_saldo
dt_start_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 02:30:00,1051.0,2590.0,131.0,54.0,854.0,4680.0,1.138,295.0,95.0,890.000,...,,,,,,,,,,2160.939697
2021-01-01 02:45:00,1027.0,2536.0,131.0,54.0,827.0,4575.0,1.139,295.0,90.0,860.000,...,22.0,,,63.0,,-0.699121,,,,3023.448242
2021-01-01 03:00:00,1004.0,2486.0,105.0,43.0,801.0,4439.0,1.138,294.0,80.0,843.000,...,-18.0,,,65.0,,-0.467409,,,,2261.840088
2021-01-01 03:15:00,989.0,2452.0,105.0,43.0,783.0,4372.0,1.137,290.0,76.0,833.000,...,14.0,,63.00,63.0,,-0.235698,,40.739559,,2810.801270
2021-01-01 03:30:00,978.0,2423.0,105.0,43.0,765.0,4314.0,1.139,292.0,73.0,812.000,...,-151.0,,29.75,-5.5,,-0.003986,,40.388008,,-3252.175537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-30 20:45:00,4194.0,9994.0,756.0,625.0,3352.0,18921.0,1.121,562.0,420.0,1.919,...,136.0,-108.0000,-131.25,-147.0,39.0000,-3.462849,94.427879,91.704185,4.958381,-6859.005371
2021-06-30 21:00:00,4246.0,10153.0,834.0,694.0,3427.0,19354.0,1.123,632.0,431.0,1.874,...,-12.0,-111.6250,-142.25,-85.0,-26.6250,-2.462854,92.845375,88.791687,6.895256,-7676.759766
2021-06-30 21:15:00,4291.0,10299.0,834.0,694.0,3492.0,19610.0,1.121,595.0,450.0,1.904,...,34.0,-116.3750,-110.50,-74.0,-42.3750,-1.462860,91.078316,86.091423,8.542856,-4725.137207
2021-06-30 21:30:00,4319.0,10429.0,834.0,694.0,3555.0,19831.0,1.125,572.0,475.0,1.914,...,61.0,-111.0625,-55.75,-26.5,-84.5625,-0.462865,89.216156,84.128571,9.435544,329.737091


In [30]:
count = np.isinf(df_merged).values.sum()
print("The data frame contains " + str(count) + " infinite values")
print("The data frame has",df_merged.isnull().sum().sum(),"missing values.")

The data frame contains 0 infinite values
The data frame has 79 missing values.


## Data Extension

We want to extract date specific information: the weekday and if its a buisness day /hours

In [31]:
import datetime
df_merged['weekday'] = df_merged.index.weekday
df_merged['businessday'] = np.where((df_merged.index.weekday) < 5,1,0)

Decide between business hours

In [32]:
m1 = df_merged.index.dayofweek >= 0
m2 = df_merged.index.dayofweek < 5
m3 = df_merged.index.hour >= 5 
m4 = df_merged.index.hour <= 18

df_merged["businesshours"] = int(-1)
df_merged['businesshours'] = df_merged['businesshours'].mask(m1 & m2 & m3 & m4, '1')
df_merged["businesshours"] = df_merged["businesshours"].astype('int64')

## Extract mathematical sign 

Here, we extract the mathematical sign of the regulation power saldo. 

In [33]:
df_merged.eval('rz_saldo_mwh_sign = rz_saldo_mwh/abs(rz_saldo_mwh)', inplace=True)

In [34]:
correlations = df_merged.corr(method='pearson')
#print(correlations['rebap_eur_mwh'].sort_values(ascending=False).to_string())
print(correlations['rebap_eur_mwh'].abs().sort_values(ascending=False).to_string())

rebap_eur_mwh                               1.000000
rz_saldo_mwh                                0.366963
rz_saldo_roll_win2                          0.350436
fac_epex_rz_saldo                           0.348191
rz_saldo_roll_win4                          0.320539
rz_saldo_roll_diff_16_2                     0.249470
epex_roll_win2                              0.247754
epex_roll_win8                              0.244680
epex_da_de_eur_mwh                          0.243183
rz_saldo_mwh_sign                           0.241402
rz_saldo_roll_win16                         0.213990
generationplusimbalancepriceeurmwhmbadk1    0.147773
generationimbalancepriceeurmwhmbadk1        0.147303
generationplusimbalancepriceeurmwhmbadk2    0.137709
rebap_eur_mwh_PL                            0.134192
priceforconsumptioneurmwhmbadk1             0.130247
rebap_eur_mwh_dk2                           0.123288
priceforconsumptioneurmwhmbadk2             0.111158
rz_saldo_mwh_diff                           0.

We drop missing values created by our feature engineering

In [35]:
df_merged.dropna(axis=0,inplace=True)

In [36]:
df_merged.shape

(17292, 79)

## Save Data

In [37]:
#Final check if infinit or missing values in our data frame
count = np.isinf(df_merged).values.sum()
print("The data frame contains " + str(count) + " infinite values")
print("The Data Frame has",df_merged.isnull().sum().sum(),"missing values.")

The data frame contains 0 infinite values
The Data Frame has 0 missing values.


In [38]:
df_merged.to_pickle('../data/pickle/df_merged_6.pickle')