In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import math
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

# Importing data

In [2]:
os.getcwd()

'/Users/michael.kuntz/code/HaukeFock/predict_energy_generation/notebooks'

In [3]:
# Locate the target CSVs relative to the project root instead of one user's home
# directory, so the notebook runs from any checkout of the repository.
# The project root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to run from the project's `notebooks/` folder.
# NOTE(review): confirm the working directory if the kernel is started elsewhere.
project_root = os.path.abspath(os.pardir)
target_dir = os.path.join(project_root, 'raw_data', 'Target')
daily_data_path = os.path.join(target_dir, 'Daily', 'energy_generation_data', 'df_deutschland.csv')
hourly_data_path = os.path.join(target_dir, 'Hourly', 'energy_generation_data', 'df_deutschland.csv')

## Importing target

In [4]:
# Load the daily generation table, parse the `Date` column into timestamps and
# use it as the index. Built as one chain so no intermediate frame is mutated.
daily_df = (
    pd.read_csv(daily_data_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

daily_df.head()

Unnamed: 0_level_0,Wind offshore[MWh],Wind onshore[MWh],Photovoltaics[MWh],region
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-01,11.989583,1278.427083,75.53125,50hertz
2015-01-02,9.947917,2555.510417,17.677083,50hertz
2015-01-03,11.28125,2308.958333,27.354167,50hertz
2015-01-04,11.354167,2079.041667,49.75,50hertz
2015-01-05,4.104167,1333.34375,20.1875,50hertz


In [5]:
# Load the hourly generation table, parse the `Date` column into timestamps and
# use it as the index. Built as one chain so no intermediate frame is mutated.
hourly_df = (
    pd.read_csv(hourly_data_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

hourly_df.tail()

Unnamed: 0_level_0,Wind offshore[MWh],Wind onshore[MWh],Photovoltaics[MWh],region
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-01 19:00:00,0.0,13.0,0.0,transnet_bw
2021-01-01 20:00:00,0.0,21.0,0.0,transnet_bw
2021-01-01 21:00:00,0.0,18.0,0.0,transnet_bw
2021-01-01 22:00:00,0.0,25.0,0.0,transnet_bw
2021-01-01 23:00:00,0.0,29.0,0.0,transnet_bw


In [6]:
# Add up all rows that share a timestamp (one row per region) to get a single
# total per hour. `numeric_only=True` states explicitly that the string `region`
# column is left out of the sum, rather than depending on how the installed
# pandas version treats non-numeric columns in a group-by sum.
hourly_de_df = hourly_df.groupby(hourly_df.index).sum(numeric_only=True)
hourly_de_df.head()

Unnamed: 0_level_0,Wind offshore[MWh],Wind onshore[MWh],Photovoltaics[MWh]
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-01 00:00:00,517.0,8129.0,0.0
2015-01-01 01:00:00,516.0,8300.0,0.0
2015-01-01 02:00:00,514.0,8544.0,0.0
2015-01-01 03:00:00,519.0,8552.0,0.0
2015-01-01 04:00:00,522.0,8646.0,0.0


In [7]:
# Total wind generation per hour: element-wise sum of the first two columns,
# wrapped in a one-column frame named 'Wind[MWh]'.
# NOTE(review): the columns are picked by position; this assumes the first two
# columns are the offshore and onshore wind series — confirm against the CSV.
hourly_wind_de_df = (
    hourly_de_df.iloc[:, 0]
    .add(hourly_de_df.iloc[:, 1])
    .to_frame(name='Wind[MWh]')
)
hourly_wind_de_df.head()

Unnamed: 0_level_0,Wind[MWh]
Date,Unnamed: 1_level_1
2015-01-01 00:00:00,8646.0
2015-01-01 01:00:00,8816.0
2015-01-01 02:00:00,9058.0
2015-01-01 03:00:00,9071.0
2015-01-01 04:00:00,9168.0


In [8]:
hourly_wind_de_df.tail()

Unnamed: 0_level_0,Wind[MWh]
Date,Unnamed: 1_level_1
2021-01-01 19:00:00,4845.0
2021-01-01 20:00:00,5028.0
2021-01-01 21:00:00,5204.0
2021-01-01 22:00:00,5681.0
2021-01-01 23:00:00,5333.0


In [9]:
hourly_wind_de_df = hourly_wind_de_df.astype('float32')
hourly_wind_de_df.dtypes

Wind[MWh]    float32
dtype: object

## Importing features

In [10]:
# Directory that holds the hourly weather feature tables.
# Assigned to `feature_path` only: `daily_data_path` must keep pointing at the
# daily target CSV, otherwise re-running the daily import cell would hand this
# directory to `pd.read_csv`.
feature_path = '/Users/michael.kuntz/code/HaukeFock/predict_energy_generation/raw_data/Features/DATAFRAMES'

# One CSV per weather variable; both are read with pandas' default settings.
hourly_windspeed_df = pd.read_csv(os.path.join(feature_path, 'wind_hourly_clean.csv'))
hourly_airp_df = pd.read_csv(os.path.join(feature_path, 'hourly_airp_df.csv'))

In [11]:
hourly_windspeed_df.head()

Unnamed: 0,Date,90,125,161,164,183,197,198,232,282,...,15000,15044,15120,15122,15189,15200,15207,15214,15444,1001
0,2015-01-01 00:00:00,2.1,0.7,0.9,5.1,11.3,3.2,1.6,1.1,0.7,...,3.9,0.8,4.9,3.2,4.4,3.0,0.5,1.2,1.0,2.2
1,2015-01-01 01:00:00,2.7,0.3,0.8,4.9,13.0,3.2,2.6,1.5,0.9,...,3.4,0.7,5.3,2.6,6.0,2.8,0.8,1.1,1.0,2.5
2,2015-01-01 02:00:00,2.5,0.8,1.3,3.9,12.8,3.2,3.2,1.7,0.9,...,3.8,1.3,4.8,2.3,5.7,3.1,0.5,1.8,1.2,2.6
3,2015-01-01 03:00:00,2.0,0.7,1.4,3.7,12.4,3.2,3.5,1.5,0.6,...,4.2,1.1,4.2,2.5,5.4,3.2,0.7,1.1,1.0,2.8
4,2015-01-01 04:00:00,1.1,0.9,1.7,4.2,12.2,3.2,3.4,1.7,1.0,...,4.4,0.9,4.1,1.8,6.1,3.6,0.8,0.9,1.4,2.8


In [12]:
hourly_airp_df.head()

Unnamed: 0,Date,44,71,73,78,91,96,102,125,131,...,13711,13713,13777,13965,15000,15207,15444,15555,19171,19172
0,2015-01-01 00:00:00,2.2,-3.7,-2.1,1.6,0.6,,3.9,-9.6,1.1,...,1.9,1.1,4.2,-2.9,1.2,0.1,-2.2,,,
1,2015-01-01 01:00:00,2.5,-3.8,-1.7,1.9,0.5,,3.3,-8.8,1.3,...,1.7,0.9,3.7,-5.2,1.0,-0.8,-1.8,,,
2,2015-01-01 02:00:00,2.3,-4.4,-1.8,1.6,0.3,,3.7,-12.5,1.2,...,1.6,1.0,3.1,-5.3,1.1,-1.4,-2.3,,,
3,2015-01-01 03:00:00,2.3,-3.2,-2.0,1.5,0.4,,3.8,-11.0,1.4,...,1.5,0.9,1.8,-4.0,0.5,-1.7,-3.5,,,
4,2015-01-01 04:00:00,1.9,-3.4,-1.7,1.5,0.4,,3.8,-10.6,1.5,...,1.2,0.9,2.2,-3.6,0.9,-1.5,-1.6,,,


In [13]:
hourly_windspeed_df.dtypes

Date      object
90       float64
125      float64
161      float64
164      float64
          ...   
15200    float64
15207    float64
15214    float64
15444    float64
1001     float64
Length: 261, dtype: object

In [14]:
hourly_airp_df.dtypes

Date      object
44       float64
71       float64
73       float64
78       float64
          ...   
15207    float64
15444    float64
15555    float64
19171    float64
19172    float64
Length: 516, dtype: object

In [15]:
hourly_windspeed_df['Date'] = pd.to_datetime(hourly_windspeed_df['Date'])

In [16]:
hourly_airp_df['Date'] = pd.to_datetime(hourly_airp_df['Date'])

In [17]:
hourly_windspeed_df.set_index('Date',inplace=True)

In [18]:
hourly_airp_df.set_index('Date',inplace=True)

In [19]:
hourly_windspeed_df = hourly_windspeed_df.astype('float32')
hourly_airp_df = hourly_airp_df.astype('float32')

In [20]:
# Reduce the per-station columns to one value per timestamp: the row-wise mean,
# stored in a single column called 'Wind_speed'.
hourly_windspeed_df = hourly_windspeed_df.mean(axis=1).to_frame(name='Wind_speed')
hourly_windspeed_df.head()

Unnamed: 0_level_0,Wind_speed
Date,Unnamed: 1_level_1
2015-01-01 00:00:00,3.320001
2015-01-01 01:00:00,3.306155
2015-01-01 02:00:00,3.313076
2015-01-01 03:00:00,3.28
2015-01-01 04:00:00,3.30077


In [21]:
# Reduce the per-station columns to one value per timestamp: the row-wise mean,
# stored in a single column called 'Air_pressure'.
# NOTE(review): the station readings displayed by `hourly_airp_df.head()` are
# small signed numbers (roughly -12 to 4) — confirm that this table really holds
# air pressure, and in which unit.
hourly_airp_df = hourly_airp_df.mean(axis=1).to_frame(name='Air_pressure')
hourly_airp_df.head()

Unnamed: 0_level_0,Air_pressure
Date,Unnamed: 1_level_1
2015-01-01 00:00:00,0.902415
2015-01-01 01:00:00,0.757948
2015-01-01 02:00:00,0.600604
2015-01-01 03:00:00,0.48833
2015-01-01 04:00:00,0.356942


In [22]:
hourly_windspeed_df.shape

(55488, 1)

# Define features and target

In [23]:
y = pd.DataFrame(hourly_wind_de_df['Wind[MWh]'])

In [24]:
y.head()

Unnamed: 0_level_0,Wind[MWh]
Date,Unnamed: 1_level_1
2015-01-01 00:00:00,8646.0
2015-01-01 01:00:00,8816.0
2015-01-01 02:00:00,9058.0
2015-01-01 03:00:00,9071.0
2015-01-01 04:00:00,9168.0


In [25]:
X = pd.DataFrame(hourly_windspeed_df['Wind_speed'])
X.head()

Unnamed: 0_level_0,Wind_speed
Date,Unnamed: 1_level_1
2015-01-01 00:00:00,3.320001
2015-01-01 01:00:00,3.306155
2015-01-01 02:00:00,3.313076
2015-01-01 03:00:00,3.28
2015-01-01 04:00:00,3.30077


In [26]:
X.shape

(55488, 1)

In [27]:
y.shape

(52632, 1)

In [28]:
X.tail()

Unnamed: 0_level_0,Wind_speed
Date,Unnamed: 1_level_1
2021-04-30 19:00:00,2.315
2021-04-30 20:00:00,2.123076
2021-04-30 21:00:00,2.006538
2021-04-30 22:00:00,1.985769
2021-04-30 23:00:00,1.918077


In [29]:
y.tail()

Unnamed: 0_level_0,Wind[MWh]
Date,Unnamed: 1_level_1
2021-01-01 19:00:00,4845.0
2021-01-01 20:00:00,5028.0
2021-01-01 21:00:00,5204.0
2021-01-01 22:00:00,5681.0
2021-01-01 23:00:00,5333.0


In [30]:
# Align X and y on their timestamps: keep only the rows whose index value is
# present in both frames, so row i of X and row i of y describe the same hour.
# Joining on the index avoids a hard-coded cutoff date, which would have to be
# edited by hand whenever either CSV is refreshed with a different date range.
X, y = X.align(y, join='inner', axis=0)

In [31]:
len(X) == len(y)

True

# Building train/test split

In [39]:
def train_test_split(tt_split, X, y):
    """Split features and target into a leading train part and a trailing test part.

    The split is positional (no shuffling): the first ``int(len(X) * tt_split)``
    rows go to the training set and the remaining rows to the test set, so the
    original row order is preserved on both sides.

    Parameters
    ----------
    tt_split : float
        Fraction of rows assigned to the training set; must lie in [0, 1].
    X : pandas.DataFrame
        Feature rows.
    y : pandas.DataFrame
        Target rows; must have as many rows as ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``tt_split`` is outside [0, 1] or ``X`` and ``y`` differ in length.
    """
    if not 0 <= tt_split <= 1:
        raise ValueError(f"tt_split must be between 0 and 1, got {tt_split!r}")
    # Both frames are cut at the same row position, so they must be equally long;
    # otherwise features and targets would silently stop lining up.
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    train_size = int(len(X) * tt_split)
    X_train, X_test = X.iloc[:train_size, :], X.iloc[train_size:, :]
    y_train, y_test = y.iloc[:train_size, :], y.iloc[train_size:, :]

    return X_train, X_test, y_train, y_test

In [70]:
tt_split = 0.995

X_train, X_test, y_train, y_test = train_test_split(tt_split, X, y)

In [71]:
X_train.shape

(52368, 1)

In [72]:
X_test.shape

(264, 1)

In [73]:
y_test.shape

(264, 1)

# Scale data

In [74]:
# normalize the dataset

def scale_training_data(X_train, X_test, y_train, y_test, return_scalers=False):
    """Min-max scale features and target using statistics from the training data only.

    One scaler is fitted on ``X_train`` and one on ``y_train``; the matching test
    set is then transformed with the already fitted scaler. Fitting on the test
    sets as well would scale them by their own min/max, which puts train and test
    on different scales and leaks test-set information into the evaluation.

    Because the test sets are scaled with the training min/max, their scaled
    values may fall slightly outside the ``(0, 1)`` range.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature rows for training and testing.
    y_train, y_test : array-like of shape (n_samples, n_targets)
        Target rows for training and testing.
    return_scalers : bool, default False
        When True, the fitted feature and target scalers are returned as well,
        e.g. to map predictions back to original units via ``inverse_transform``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as new DataFrames with a default
        integer index and integer column labels. With ``return_scalers=True`` the
        tuple is extended by ``(X_scaler, y_scaler)``.
    """
    # Separate scalers for features and target: each keeps its own fitted
    # min/max, so neither is overwritten by fitting the other.
    X_scaler = MinMaxScaler(feature_range=(0, 1))
    y_scaler = MinMaxScaler(feature_range=(0, 1))

    X_train_scaled = pd.DataFrame(X_scaler.fit_transform(X_train))
    X_test_scaled = pd.DataFrame(X_scaler.transform(X_test))
    y_train_scaled = pd.DataFrame(y_scaler.fit_transform(y_train))
    y_test_scaled = pd.DataFrame(y_scaler.transform(y_test))

    if return_scalers:
        return (X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled,
                X_scaler, y_scaler)
    return X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled

In [75]:
X_train, X_test, y_train, y_test = scale_training_data(X_train, X_test, y_train, y_test)

# Define loss functions

In [50]:
def mae(y_true, y_pred):
    """Return the sum of absolute differences divided by ``len(y_true)``.

    Both inputs are converted to NumPy arrays first. The divisor is the length
    of the first axis of ``y_true``, while the sum runs over every element.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    total_abs_error = np.abs(actual - predicted).sum()
    return total_abs_error / len(actual)

def mse(y_true, y_pred):
    """Return the mean of the squared element-wise differences.

    Both inputs are converted to NumPy arrays before subtracting; the mean is
    taken over all elements of the result.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    return np.square(residuals).mean()

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, in percent.

    The relative error ``|y_true - y_pred| / |y_true|`` is undefined where
    ``y_true`` is 0, and a single such entry would turn the whole mean into
    ``inf`` or ``nan``. Those entries are therefore left out of the average.
    This matters for min-max-scaled targets, whose minimum is exactly 0.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    float
        Mean of the relative errors over all entries with a non-zero observed
        value, multiplied by 100. ``nan`` if no such entry exists.
    """
    # Broadcast explicitly so that the boolean mask below fits both arrays.
    y_true, y_pred = np.broadcast_arrays(np.array(y_true), np.array(y_pred))
    nonzero = y_true != 0
    if not nonzero.any():
        return np.nan
    relative_error = (y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero]
    return np.mean(np.abs(relative_error)) * 100

# Build RNN

In [76]:
# fix random seed for reproducibility
# This seeds NumPy's global random number generator only.
# NOTE(review): Keras and its backend may use their own generators (e.g. for
# weight initialisation) — confirm whether those need to be seeded as well.
np.random.seed(7)

In [77]:
X_train.shape

(52368, 1)

In [85]:
X_train = np.array(X_train).reshape(len(X_train),1,1)

In [86]:
y_train = np.array(y_train).reshape(len(y_train),1,1)

In [87]:
y_train.shape

(52368, 1, 1)

In [None]:
# create and fit the LSTM network
#model = Sequential()
#model.add(LSTM(4, input_shape=(1, look_back)))
#model.add(Dense(1,activation="linear"))
#model.compile(loss='mse', optimizer='rmsprop')

#model.fit(X_train, y_train, epochs=100, batch_size=16, verbose=0)

#model.predict(X_test)