Imports
==============================================================================================

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from keras.callbacks import TensorBoard, History
from keras.layers import Input, Dense, Dropout
from keras.utils import plot_model
from keras.models import Model
from pathlib import Path
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
pd.options.mode.chained_assignment = None  # default='warn'
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import math
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from evaluation import *
from machineLearningModel import *

Using TensorFlow backend.


Configuration
==============================================================================================

In [2]:
# fix random seed for reproducibility
np.random.seed(13)

## net params
num_layers = 1#3#6
num_neurons = 300#500
batch_size = 100#500#1000
dropout_rate = 0
#const_features = ['latitude', 'longitude', 'altitude', 'modules_per_string', 'strings_per_inverter', 'tilt',
#                  'azimuth', 'albedo', 'Technology', 'BIPV', 'A_c', 'N_s', 'pdc0', 'gamma_pdc', 'SystemID']#15
#dyn_features = ['Wind Direction_x', 'Wind Direction_y', 'Total Cloud Cover', 'Low Cloud Cover', 'Medium Cloud Cover',
#                'High Cloud Cover', 'Wind Speed', 'Wind Gust', 'Total Precipitation',
#                'Snow Fraction', 'Mean Sea Level Pressure', 'DIF - backwards', 'DNI - backwards', 'Shortwave Radiation',
#                'Temperature', 'Relative Humidity', 'Hour_x', 'Hour_y', 'Month_x', 'Month_y']#20
const_features = ['SystemID']
dyn_features = ['DIF - backwards', 'DNI - backwards', 'Shortwave Radiation', 'Temperature', 'Relative Humidity', 'Hour_x', 'Hour_y', 'Month_x', 'Month_y']
target_features = ['power']
drop_features = ['power_pvlib']
act_fct = 'relu'
out_act = 'linear'#'relu'
loss_fct = 'mae'
optim = 'adam'
metrics = []
history = History()
val_history = History()

## data params
filename = './data/full_data_5_systems.csv'
correlations = ['pearson']#'pearson', 'spearman', 'kendall']
timesteps = 5#24
shape = (len(const_features) + len(dyn_features) + timesteps * (len(dyn_features) + len(target_features)),)# - 1

## training params
tensorboard = False
shuffle = True
epochs = 20
val_split = 1.0/10.0
forecast_horizon = 3#24
sliding_window = 24#168#24
dir = './test_results/'
if not os.path.exists(dir):
    os.makedirs(dir)

Data Preprocessing
==============================================================================================

In [3]:
pfname = dir + 'preprocessed_data_t-'+str(timesteps)+'_f'+str(shape[0])+'.csv'
print(pfname)
prep = Path(pfname)
if prep.exists():
    print('Loading preprocessed dataset ...')
    pvlib = np.array_split(pd.read_csv(filename, skipinitialspace=True).set_index('time'), 5)[-1].power_pvlib
    dataset = pd.read_csv(pfname, skipinitialspace=True).set_index(['time', 'SystemID'])
else:
    print('Data preprocessing ...')
    df = pd.read_csv(filename, skipinitialspace=True).set_index('time')
    df = np.array_split(df, 5)[-1] ##################################
    pvlib = df.power_pvlib
    dataset = df[const_features + dyn_features + target_features].copy()[:'2017-02-09 10:00:00']

    #separate system
    for i in range(1, timesteps + 1):
        for feature in dyn_features + target_features:
            sys.stdout.write("Shifting %i/%i %s                \r" % (i, timesteps, feature))
            sys.stdout.flush()
            dataset[feature + ' t-' + str(i)] = dataset.shift(i)[feature]
    print('Shifting done.                ')

    dataset['forecast_horizon'] = 0
    p = dataset[target_features]
    dataset = dataset.drop(target_features, axis=1)
    for f in target_features:
        dataset[f] = p[f]
    dataset = dataset.dropna().reset_index().set_index(['time', 'SystemID'])

    sys.stdout.write("Writing to file ...\r")
    sys.stdout.flush()
    dataset.to_csv(pfname, encoding='utf-8')
    print('Writing done.                ')

    if correlations:
        sys.stdout.write('Computing correlations ...\r')
        sys.stdout.flush()
        for corr in correlations:
            sys.stdout.write("Computing %s correlation matrix                \r" % (corr))
            sys.stdout.flush()
            dataset.corr(method=corr).to_csv(dir + corr + '_correlations.csv', encoding='utf-8')
        print('Correlations done.                   ')

train, test = dataset[:('2015-10-12 06:00:00', 4.0)], dataset[('2015-10-12 07:00:00', 4.0):]
trainX, trainY = train.iloc[:,:-len(target_features)], train.iloc[:,-len(target_features):]
testX, testY = test.iloc[:,:-len(target_features)], test.iloc[:,-len(target_features):]
idx = testX.index.values

./test_results/preprocessed_data_t-5_f60.csv
Loading preprocessed dataset ...


Build Model
==============================================================================================

In [4]:
if tensorboard:
    print('tensorboard activated')
    callbacks = [TensorBoard(log_dir='./tensorboard', histogram_freq=1, batch_size=batch_size, write_graph=True, write_grads=True, write_images=False), history]
else:
    callbacks = [history]

#model = SARIMAX((0,1,2), (1,0,0,24))
model = MultiLayerPerceptron(shape, len(target_features), num_layers, num_neurons, loss_fct, optim,
                 act_fct, out_act, metrics, dropout_rate, dir + 'model.png', batch_size,
                 epochs, val_split, callbacks, 1, True)

Using MultiLayerPerceptron



Training
==============================================================================================

In [5]:
X = trainX
y = trainY
if shuffle:
    df = pd.DataFrame(np.concatenate((trainX, trainY), axis=1))
    df = df.sample(frac=1).values
    y = df[:, -len(target_features):]
    X = df[:, :-len(target_features)]

model.learn(X, y)

name = './saved_models/pretrained_t-'+str(timesteps)+'_f'+str(shape[0])+'_e'+str(epochs)+'_b'+str(batch_size)+'_sys'+str(4)
# serialize model to JSON
model_json = model.model.to_json()
with open(name + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.model.save_weights(name + ".h5")
print("Saved model to disk")

Train on 61367 samples, validate on 6819 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Saved model to disk


Walk-Forward Validation
==============================================================================================

In [6]:
method = 'pvlib'

model.epochs = 5
model.validation_split = 0.0
model.batch_size = sliding_window
model.verbose = 0
model.callbacks = [val_history]

predictions = []
length = len(testX) - forecast_horizon - 1     #-11000
for i in range(length):# - 10
    sys.stdout.write("Walk-Forward Validation %i/%i\r" % (i+1, length))
    sys.stdout.flush()
    
    # initialize values for lagged power columns
    p = []
    for l in range(1, timesteps + 1):
        p.append(testX.iloc[i:i+1,:]['power t-'+str(l)].values[0])
        
    ps = []
    ts = []
    for f in range(forecast_horizon):
        # build input vector for future timestep
        t = testX.iloc[i+f:i+1+f,:].copy()
        for l in range(timesteps-1, 1, -1):
            t['power t-' + str(l+1)] = p[l]
            p[l] = p[l-1]
        t['power t-1'] = p[0]
        t['forecast_horizon'] = f
        ts.append(t)
        
        # make prediction for input new vector
        p[0] = model.forecast(t).item(0)
        ps.append(p[0])
        
    for f in range(forecast_horizon): # to avoid possible information leakage?
        # train with newly available data
        model.learn(ts[f], testY.iloc[i+f:i+1+f,:])
        
    predictions.append(pd.DataFrame(ps))
    
prediction = pd.concat(predictions)

name = './saved_models/trained_t-'+str(timesteps)+'_f'+str(shape[0])+'_e'+str(epochs)+'_b'+str(batch_size)+'_sys'+str(4)
# serialize model to JSON
model_json = model.model.to_json()
with open(name + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.model.save_weights(name + ".h5")
print("\nSaved model to disk")

Walk-Forward Validation 11664/11664
Saved model to disk


Evaluation
==============================================================================================

In [7]:
data = pd.DataFrame()
for i in range(forecast_horizon):
    a = np.empty(i)
    a.fill(np.nan)
    b = np.empty(forecast_horizon - i - 1)
    b.fill(np.nan)
    data['+'+str(i+1)+'h-prediction'] = np.append(np.append(a, prediction[:][0][i].values), b)
data['measured'] = pd.DataFrame(np.array(testY).reshape([len(testY), len(target_features)])).iloc[:,0]
data = data.set_index(pd.MultiIndex.from_tuples(idx[:-2])).unstack()#[:-4]
data['pvlib'] = pvlib['2015-10-12 07:00:00':'2017-02-09 10:00:00'].reindex(data.index)

tmp = pd.DataFrame()
tmp[method] = data[method]
tmp['measured'] = data[('measured', 4.0)]
for i in range(forecast_horizon):
    tmp['+'+str(i+1)+'h-prediction'] = data['+'+str(i+1)+'h-prediction']
data = tmp
data.index = pd.to_datetime(data.index)
data = data.dropna()

m_col = data['measured']
l_col = data[method].dropna()

for horizon in range(1, forecast_horizon + 1):
    name = '+' + str(horizon) + 'h-prediction'
    p_col = data[name]

    walkForwardDailyLoss(m_col, p_col, l_col, method, name)
    scatter_predictions(m_col, p_col, name)

    print('%s test RMSE: %.3f' % (name, math.sqrt(mean_squared_error(m_col, p_col))))
    print('%s test RMSE: %.3f' % (method + ' forecast', math.sqrt(mean_squared_error(m_col, l_col))))
    draw_boxplot(m_col, p_col, l_col, method, name)
    draw_boxplot_monthly(m_col, p_col, l_col, method, name)

    m1, m2 = '2016-07-17 00:00:00', '2016-07-17 23:00:00'
    print('%s nice day RMSE: %.3f' % (name, math.sqrt(mean_squared_error(m_col[m1:m2], p_col[m1:m2]))))
    print('%s nice day RMSE: %.3f' % (method + ' forecast', math.sqrt(mean_squared_error(m_col[m1:m2], l_col[m1:m2]))))
    draw_boxplot(m_col, p_col, l_col, method, name, m1, m2)

    plot_timeseries(m_col, p_col, l_col, method, name, end='2015-10-19 07:00:00')
    plot_timeseries(m_col, p_col, l_col, method, name, start='2017-02-02 10:00:00')
    plot_timeseries(m_col, p_col, l_col, method, name, start=m1, end=m2)
    plot_timeseries(m_col, p_col, l_col, method, name)
    plot_timeseries(m_col, p_col, None, method, name)

    draw_histogram(p_col, m_col, name)

daily mean +1h-prediction RMSE: 400.94355784576646
daily mean pvlib forecast RMSE: 1319.6280018979849
                 0
count   485.000000
mean    400.943558
std     273.097217
min      13.796312
25%     202.228216
50%     348.250759
75%     562.313674
max    1779.027531
                 0
count   485.000000
mean   1319.628002
std     697.310573
min       0.000000
25%     800.629210
50%    1191.894250
75%    1697.159243
max    3500.072065
+1h-prediction test RMSE: 484.829
pvlib forecast test RMSE: 1492.981
+1h-prediction nice day RMSE: 228.829
pvlib forecast nice day RMSE: 444.823
daily mean +2h-prediction RMSE: 560.1512879890648
daily mean pvlib forecast RMSE: 1319.6280018979849
                 0
count   485.000000
mean    560.151288
std     357.983675
min      10.553615
25%     266.679527
50%     534.560667
75%     796.140613
max    2366.456581
                 0
count   485.000000
mean   1319.628002
std     697.310573
min       0.000000
25%     800.629210
50%    1191.894250
75%   

In [8]:
draw_history(history)
draw_history(val_history, True)

In [9]:
print(data.describe())
print(data.corr(method='pearson'))
print(data.corr(method='spearman'))
print(data.corr(method='kendall'))
data.to_csv(dir + 'predictions.csv', encoding='utf-8')

              pvlib      measured  +1h-prediction  +2h-prediction  \
count  11662.000000  11662.000000    11662.000000    11662.000000   
mean    1549.683537   1065.446782     1029.952963     1034.587587   
std     2226.383739   1925.252500     1887.771854     1853.179978   
min        0.000000      0.000000    -1551.626465    -2449.488281   
25%        0.000000      0.000000        0.000003        0.000092   
50%        0.000000      0.000000        4.464455        7.287639   
75%     2846.756628   1137.207027     1166.288116     1253.534210   
max     9849.600000   8494.140630     9970.578125     9347.826172   

       +3h-prediction  
count    11662.000000  
mean      1011.034874  
std       1831.752629  
min     -20399.146484  
25%          0.000096  
50%         13.068636  
75%       1243.079102  
max      10553.772461  
                   pvlib  measured  +1h-prediction  +2h-prediction  \
pvlib           1.000000  0.777900        0.784259        0.790481   
measured        0.7779