In [1]:
import pandas as pd
import os
import random
from sklearn import preprocessing 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np
from collections import deque
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [2]:
TIME_STEPS = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "P_avg"
EPOCHS = 10  # how many passes through our data
SEQ_LEN = 60
BATCH_SIZE = 64  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model
pd.__version__

'0.23.4'

In [4]:
main_df = pd.read_csv('Updated_Basic_Data.csv')

In [5]:
main_df_R80711 = main_df[main_df.Wind_turbine_name == 'R80711']
main_df_R80711 = main_df_R80711.reset_index()[['Wa_avg', 'Wa_min', 'Wa_max',
                                               'Ws1_avg', 'Ws1_min', 'Ws1_max', 'Ws2_avg', 'Ws2_min', 'Ws2_max', 'Ws_avg', 'Ws_min', 'Ws_max',
                                               'Rs_avg', 'Rs_min', 'Rs_max', 
                                               'P_avg',  'P_min', 'P_max']]

main_df_R80721 = main_df[main_df.Wind_turbine_name == 'R80721']
main_df_R80721 = main_df_R80721.reset_index()[['Wa_avg', 'Wa_min', 'Wa_max',
                                               'Ws1_avg', 'Ws1_min', 'Ws1_max', 'Ws2_avg', 'Ws2_min', 'Ws2_max', 'Ws_avg', 'Ws_min', 'Ws_max',
                                               'Rs_avg', 'Rs_min', 'Rs_max', 
                                               'P_avg',  'P_min', 'P_max']]

main_df_R80736 = main_df[main_df.Wind_turbine_name == 'R80736']
main_df_R80736 = main_df_R80736.reset_index()[['Wa_avg', 'Wa_min', 'Wa_max',
                                               'Ws1_avg', 'Ws1_min', 'Ws1_max', 'Ws2_avg', 'Ws2_min', 'Ws2_max', 'Ws_avg', 'Ws_min', 'Ws_max',
                                               'Rs_avg', 'Rs_min', 'Rs_max', 
                                               'P_avg',  'P_min', 'P_max']]

main_df_R80790 = main_df[main_df.Wind_turbine_name == 'R80790']
main_df_R80790 = main_df_R80790.reset_index()[['Wa_avg', 'Wa_min', 'Wa_max',
                                               'Ws1_avg', 'Ws1_min', 'Ws1_max', 'Ws2_avg', 'Ws2_min', 'Ws2_max', 'Ws_avg', 'Ws_min', 'Ws_max',
                                               'Rs_avg', 'Rs_min', 'Rs_max', 
                                               'P_avg',  'P_min', 'P_max']]

print("checking if any null values are present\n", main_df_R80711.isna().sum())

checking if any null values are present
 Wa_avg     0
Wa_min     0
Wa_max     0
Ws1_avg    0
Ws1_min    0
Ws1_max    0
Ws2_avg    0
Ws2_min    0
Ws2_max    0
Ws_avg     0
Ws_min     0
Ws_max     0
Rs_avg     0
Rs_min     0
Rs_max     0
P_avg      0
P_min      0
P_max      0
dtype: int64


In [16]:
def train_test_split(data):
    '''
    Function that splits the dataframe into training and test data via an 80:20 split
    '''
    Index = sorted(data.index.values) 
    Last20pct = Index[-int(0.2*len(Index))]

    validation_data = data[(data.index >= Last20pct)]
    data = data[(data.index < Last20pct)]
    return validation_data, data

def scale_data(data,column,scaler):
    '''
    Column should be literal for funciton to work 
    '''
    data_form = data[column].values.reshape((len(data), 1))
    scaled_data = scaler.transform(data_form)
    return scaled_data

In [7]:
# Test train split using 20 80 in hopes of more better validation
Test_R80711, Train_R80711 = train_test_split(main_df_R80711)
#
Test_R80721, Train_R80721 = train_test_split(main_df_R80721)
#
Test_R80736, Train_R80736 = train_test_split(main_df_R80736)
#
Test_R80790, Train_R80790 = train_test_split(main_df_R80790)

In [20]:
# First fit the scalers to the data then transform the dat using said scalers using the scale_data function
Wa_scaler = MinMaxScaler(feature_range=(0,1))
Scaler_fit_Wa_avg = Train_R80711['Wa_avg'].values.reshape(len(Train_R80711['Wa_avg']),1)
Wa_scaler.fit(Scaler_fit_Wa_avg)
Ws_scaler = MinMaxScaler(feature_range=(0,1))
Scaler_fit_Ws_avg = Train_R80711['Ws_avg'].values.reshape(len(Train_R80711['Ws_avg']),1)
Ws_scaler.fit(Scaler_fit_Ws_avg)
Rs_scaler = MinMaxScaler(feature_range=(0,1))
Scaler_fit_Rs_avg = Train_R80711['Rs_avg'].values.reshape(len(Train_R80711['Rs_avg']),1)
Rs_scaler.fit(Scaler_fit_Rs_avg)
P_scaler  = MinMaxScaler(feature_range=(0,1))
Scaler_fit_P_avg = Train_R80711['P_avg'].values.reshape(len(Train_R80711['P_avg']),1)
P_scaler.fit(Scaler_fit_P_avg)

# Training Data scaled

Scaled_Wa_avg_R80711 = scale_data(Train_R80711,'Wa_avg',Wa_scaler)
Scaled_Wa_min_R80711 = scale_data(Train_R80711,'Wa_min',Wa_scaler)
Scaled_Wa_max_R80711 = scale_data(Train_R80711,'Wa_max',Wa_scaler)
Scaled_Ws1_avg_R80711 = scale_data(Train_R80711,'Ws1_avg',Ws_scaler)
Scaled_Ws1_min_R80711 = scale_data(Train_R80711,'Ws1_min',Ws_scaler)
Scaled_Ws1_max_R80711 = scale_data(Train_R80711,'Ws1_max',Ws_scaler)
Scaled_Ws2_avg_R80711 = scale_data(Train_R80711,'Ws2_avg',Ws_scaler)
Scaled_Ws2_min_R80711 = scale_data(Train_R80711,'Ws2_min',Ws_scaler)
Scaled_Ws2_max_R80711 = scale_data(Train_R80711,'Ws2_max',Ws_scaler)
Scaled_Ws_avg_R80711 = scale_data(Train_R80711,'Ws_avg',Ws_scaler)
Scaled_Ws_min_R80711 = scale_data(Train_R80711,'Ws_min',Ws_scaler)
Scaled_Ws_max_R80711 = scale_data(Train_R80711,'Ws_max',Ws_scaler)
Scaled_Rs_avg_R80711 = scale_data(Train_R80711,'Rs_avg',Rs_scaler)
Scaled_Rs_min_R80711 = scale_data(Train_R80711,'Rs_min',Rs_scaler)
Scaled_Rs_max_R80711 = scale_data(Train_R80711,'Rs_max',Rs_scaler)
Scaled_P_avg_R80711 = scale_data(Train_R80711,'P_avg',P_scaler)
Scaled_P_min_R80711 = scale_data(Train_R80711,'P_min',P_scaler)
Scaled_P_max_R80711 = scale_data(Train_R80711,'P_max',P_scaler)

# Testing Data scaled

Scaled_Wa_avg_R80711_Test = scale_data(Test_R80711,'Wa_avg',Wa_scaler)
Scaled_Wa_min_R80711_Test = scale_data(Test_R80711,'Wa_min',Wa_scaler)
Scaled_Wa_max_R80711_Test = scale_data(Test_R80711,'Wa_max',Wa_scaler)
Scaled_Ws1_avg_R80711_Test = scale_data(Test_R80711,'Ws1_avg',Ws_scaler)
Scaled_Ws1_min_R80711_Test = scale_data(Test_R80711,'Ws1_min',Ws_scaler)
Scaled_Ws1_max_R80711_Test = scale_data(Test_R80711,'Ws1_max',Ws_scaler)
Scaled_Ws2_avg_R80711_Test = scale_data(Test_R80711,'Ws2_avg',Ws_scaler)
Scaled_Ws2_min_R80711_Test = scale_data(Test_R80711,'Ws2_min',Ws_scaler)
Scaled_Ws2_max_R80711_Test = scale_data(Test_R80711,'Ws2_max',Ws_scaler)
Scaled_Ws_avg_R80711_Test = scale_data(Test_R80711,'Ws_avg',Ws_scaler)
Scaled_Ws_min_R80711_Test = scale_data(Test_R80711,'Ws_min',Ws_scaler)
Scaled_Ws_max_R80711_Test = scale_data(Test_R80711,'Ws_max',Ws_scaler)
Scaled_Rs_avg_R80711_Test = scale_data(Test_R80711,'Rs_avg',Rs_scaler)
Scaled_Rs_min_R80711_Test = scale_data(Test_R80711,'Rs_min',Rs_scaler)
Scaled_Rs_max_R80711_Test = scale_data(Test_R80711,'Rs_max',Rs_scaler)
Scaled_P_avg_R80711_Test = scale_data(Test_R80711,'P_avg',P_scaler)
Scaled_P_min_R80711_Test = scale_data(Test_R80711,'P_min',P_scaler)
Scaled_P_max_R80711_Test = scale_data(Test_R80711,'P_max',P_scaler)



In [56]:
# multivariate data preparation
from numpy import array
from numpy import hstack
 
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
	X, y = list(), list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_steps
		# check if we are beyond the dataset
		if end_ix > len(sequences):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
		X.append(seq_x)
		y.append(seq_y)
	return array(X), array(y)

# define input sequence
in_seq1 = Scaled_Wa_avg_R80711
in_seq2 = Scaled_Wa_min_R80711
in_seq3 = Scaled_Wa_max_R80711
in_seq4 = Scaled_Ws1_avg_R80711
in_seq5 = Scaled_Ws1_min_R80711
in_seq6 = Scaled_Ws1_max_R80711
in_seq7 = Scaled_Ws2_avg_R80711
in_seq8 = Scaled_Ws2_min_R80711
in_seq9 = Scaled_Ws2_max_R80711
in_seq10 = Scaled_Ws_avg_R80711
in_seq11 = Scaled_Ws_min_R80711
in_seq12 = Scaled_Ws_max_R80711
in_seq13 = Scaled_Rs_avg_R80711
in_seq14 = Scaled_Rs_min_R80711
in_seq15 = Scaled_Rs_max_R80711
out_seq = Scaled_P_avg_R80711

# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
in_seq3 = in_seq3.reshape((len(in_seq3), 1))
in_seq4 = in_seq4.reshape((len(in_seq4), 1))
in_seq5 = in_seq5.reshape((len(in_seq5), 1))
in_seq6 = in_seq6.reshape((len(in_seq6), 1))
in_seq7 = in_seq7.reshape((len(in_seq7), 1))
in_seq8 = in_seq8.reshape((len(in_seq8), 1))
in_seq9 = in_seq9.reshape((len(in_seq9), 1))
in_seq10 = in_seq10.reshape((len(in_seq10), 1))
in_seq11 = in_seq11.reshape((len(in_seq11), 1))
in_seq12 = in_seq12.reshape((len(in_seq12), 1))
in_seq13 = in_seq13.reshape((len(in_seq13), 1))
in_seq14 = in_seq14.reshape((len(in_seq14), 1))
in_seq15 = in_seq15.reshape((len(in_seq15), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset = hstack((in_seq1, in_seq2, in_seq3, in_seq4, in_seq5, in_seq6, in_seq7, in_seq8, in_seq9, in_seq10, 
                  in_seq11, in_seq12, in_seq13, in_seq14, in_seq15, out_seq))
# choose a number of time steps
n_steps_in = 60
# convert into input/output
X, y = split_sequences(dataset, n_steps)
print(X.shape, y.shape)
n_features = X.shape[2]

(132149, 60, 15) (132149,)


In [58]:
# Test data sorted for model

# define input sequence
in_seq1_Test = Scaled_Wa_avg_R80711_Test
in_seq2_Test = Scaled_Wa_min_R80711_Test
in_seq3_Test = Scaled_Wa_max_R80711_Test
in_seq4_Test = Scaled_Ws1_avg_R80711_Test
in_seq5_Test = Scaled_Ws1_min_R80711_Test
in_seq6_Test = Scaled_Ws1_max_R80711_Test
in_seq7_Test = Scaled_Ws2_avg_R80711_Test
in_seq8_Test = Scaled_Ws2_min_R80711_Test
in_seq9_Test = Scaled_Ws2_max_R80711_Test
in_seq10_Test = Scaled_Ws_avg_R80711_Test
in_seq11_Test = Scaled_Ws_min_R80711_Test
in_seq12_Test = Scaled_Ws_max_R80711_Test
in_seq13_Test = Scaled_Rs_avg_R80711_Test
in_seq14_Test = Scaled_Rs_min_R80711_Test
in_seq15_Test = Scaled_Rs_max_R80711_Test
out_seq_Test = Scaled_P_avg_R80711_Test

# convert to [rows, columns] structure
in_seq1_Test = in_seq1_Test.reshape((len(in_seq1_Test), 1))
in_seq2_Test = in_seq2_Test.reshape((len(in_seq2_Test), 1))
in_seq3_Test = in_seq3_Test.reshape((len(in_seq3_Test), 1))
in_seq4_Test = in_seq4_Test.reshape((len(in_seq4_Test), 1))
in_seq5_Test = in_seq5_Test.reshape((len(in_seq5_Test), 1))
in_seq6_Test = in_seq6_Test.reshape((len(in_seq6_Test), 1))
in_seq7_Test = in_seq7_Test.reshape((len(in_seq7_Test), 1))
in_seq8_Test = in_seq8_Test.reshape((len(in_seq8_Test), 1))
in_seq9_Test = in_seq9_Test.reshape((len(in_seq9_Test), 1))
in_seq10_Test = in_seq10_Test.reshape((len(in_seq10_Test), 1))
in_seq11_Test = in_seq11_Test.reshape((len(in_seq11_Test), 1))
in_seq12_Test = in_seq12_Test.reshape((len(in_seq12_Test), 1))
in_seq13_Test = in_seq13_Test.reshape((len(in_seq13_Test), 1))
in_seq14_Test = in_seq14_Test.reshape((len(in_seq14_Test), 1))
in_seq15_Test = in_seq15_Test.reshape((len(in_seq15_Test), 1))
out_seq_Test = out_seq_Test.reshape((len(out_seq_Test), 1))
# horizontally stack columns
dataset_Test = hstack((in_seq1_Test, in_seq2_Test, in_seq3_Test, in_seq4_Test, in_seq5_Test, in_seq6_Test, in_seq7_Test,
                  in_seq8_Test, in_seq9_Test, in_seq10_Test, in_seq11_Test, in_seq12_Test, in_seq13_Test, in_seq14_Test,
                  in_seq15_Test, out_seq_Test))

X_Test, y_Test = split_sequences(dataset_Test, n_steps)
print(X_Test.shape, y_Test.shape)

(32993, 60, 15) (32993,)


In [59]:
# define model
model = Sequential()
model.add(CuDNNLSTM(1024, input_shape=(n_steps_in, n_features), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(512, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(1,activation = 'linear'))

opt = tf.keras.optimizers.Adam(lr=0.001,decay=1e-6)
model.compile(optimizer=opt, loss='mae', metrics = ['accuracy'])

In [60]:
tensorboard = TensorBoard(log_dir="logs/{}".format(NAME))
            # Train model
history = model.fit(
    X, y,
    validation_split=0.2,
    epochs=EPOCHS,
    callbacks=[tensorboard])

Train on 105719 samples, validate on 26430 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [61]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_16 (CuDNNLSTM)    (None, 60, 1024)          4263936   
_________________________________________________________________
dropout_23 (Dropout)         (None, 60, 1024)          0         
_________________________________________________________________
batch_normalization_v1_19 (B (None, 60, 1024)          4096      
_________________________________________________________________
cu_dnnlstm_17 (CuDNNLSTM)    (None, 60, 512)           3149824   
_________________________________________________________________
dropout_24 (Dropout)         (None, 60, 512)           0         
_________________________________________________________________
batch_normalization_v1_20 (B (None, 60, 512)           2048      
_________________________________________________________________
cu_dnnlstm_18 (CuDNNLSTM)    (None, 60, 256)           788480    
__________

In [72]:
    # demonstrate prediction
ypred = [None]*len(X_Test)

for i in range(len(X_Test)):
        x_input = X_Test[i]
        x_input = x_input.reshape((1, n_steps, n_features))
        yhat = model.predict(x_input, verbose=0)
        ypred[i] = yhat

In [98]:
ypred  = np.asarray(ypred)
ypred  = ypred.reshape((len(ypred), 1))
y_Test = np.asarray(y_Test)
y_Test = y_Test.reshape((len(y_Test), 1))

Mean_Abs_Error = sum(abs(ypred - y_Test))/len(y_Test)
Mean_Abs_Error = Mean_Abs_Error.reshape((len(Mean_Abs_Error),1))
P_scaler.inverse_transform(Mean_Abs_Error)

array([[28.44207403]])

In [101]:
28.44207403/(684.15002-142.88750)*100 # Relating the MAE to the IQR for better understanding relative to the data.

5.2547651054796845

[[[10 15]]] => [25]
[[[20 25]]] => [45]
[[[30 35]]] => [65]
[[[40 45]]] => [85]
[[[50 55]]] => [105]
[[[60 65]]] => [125]
[[[70 75]]] => [145]
[[[80 85]]] => [165]
[[[90 95]]] => [185]
[[[100 105]]] => [205]
