**Autoregressive model as a benchmark**

In [30]:
# Optional Google Colab setup, left disabled: uncomment both lines to mount
# Google Drive before running the cells below on Colab.
# from google.colab import drive
# drive.mount('/content/drive')

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
from scipy.stats import norm

# Import pytorch utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [32]:
# Wind forecasts for wind farm 1 only, indexed by the CSV's 'date' column.
x_train = pd.read_csv('windforecasts_wf1.csv', index_col='date')
# Training targets; the 'date' column is kept as a regular column at this
# point and converted into the index in the next cell.
# NOTE(review): presumably 'wp1' is the measured power of wind farm 1 - confirm.
y_train = pd.read_csv('train.csv')

In [33]:
# Parse the 'date' column (YYYYMMDDHH) into timestamps and promote it to the
# index, removing it from the regular columns.
parsed_dates = pd.to_datetime(y_train['date'], format='%Y%m%d%H')
y_train = y_train.drop(columns='date').set_index(parsed_dates)

In [34]:
# Restrict the analysis to the part of the power record that the author
# identified as gap-free (everything up to the 2011-01-01 00h label).
complete_ts = y_train.loc[:'2011-01-01 00']

# Power of wind farm 1 as a column vector of shape (n_samples, 1).
input_generator = complete_ts['wp1'].to_numpy().reshape(-1, 1)

# Length of the time series, PARAMETER TO TUNE.
# NOTE(review): not referenced by any of the cells visible here.
length = 24

In [35]:
# Validation set: one contiguous sequence taken from the last ~20% of the data.
# NOTE(review): the bounds skip the sample at index int(0.8*n) and also leave
# out the very last sample - confirm this matches the split used elsewhere.
n_samples = len(input_generator)
val_start = int(n_samples * 0.8) + 1
val_stop = n_samples - 1
validation_power = input_generator[val_start:val_stop]

In [36]:
# Build sliding windows over the validation series: every input holds 3
# consecutive values (the three lags of the AR model) and the targets are the
# values 1 to 6 steps (hours) after the end of that window.
n_lags = 3
max_horizon = 6

# The last usable window must leave room for its 6-step-ahead target.
n_windows = len(validation_power) - (n_lags + max_horizon - 1)
window_starts = range(n_windows)

p_inputs = [validation_power[s:s + n_lags] for s in window_starts]
p_targets1h = [validation_power[s + n_lags] for s in window_starts]
p_targets2h = [validation_power[s + n_lags + 1] for s in window_starts]
p_targets3h = [validation_power[s + n_lags + 2] for s in window_starts]
p_targets4h = [validation_power[s + n_lags + 3] for s in window_starts]
p_targets5h = [validation_power[s + n_lags + 4] for s in window_starts]
p_targets6h = [validation_power[s + n_lags + 5] for s in window_starts]

In [37]:
# Coefficients of the autoregressive benchmark. According to the original
# note they were fitted in Matlab on the training set. The model is the
# third-order recursion
#     P_t = a + b*P_{t-1} + c*P_{t-2} + d*P_{t-3}
a, b, c, d = 0.0139, 1.189, -0.282, 0.0361

In [38]:
# Recursive forecasting 1 to 6 hours ahead.
#
# For every validation window the recursion
#     P_t = a + b*P_{t-1} + c*P_{t-2} + d*P_{t-3}
# is applied six times in a row; each forecast is fed back as the most recent
# "past" value when forecasting the next horizon.

N_HORIZONS = 6

# Targets per horizon, ordered 1h .. 6h.
targets_by_horizon = [p_targets1h, p_targets2h, p_targets3h,
                      p_targets4h, p_targets5h, p_targets6h]

# Store predictions and errors: one list per horizon. The per-horizon names
# are kept as aliases of these lists because later cells read them directly.
preds_by_horizon = [[] for _ in range(N_HORIZONS)]
errs_by_horizon = [[] for _ in range(N_HORIZONS)]
pred_1h, pred_2h, pred_3h, pred_4h, pred_5h, pred_6h = preds_by_horizon
err_1h, err_2h, err_3h, err_4h, err_5h, err_6h = errs_by_horizon


def _rmse(errors):
    """Return the root-mean-square of a sequence of scalar errors."""
    return np.sqrt(stat.mean(e**2 for e in errors))


# Loop over the sequences of valid data
for seq, window in enumerate(p_inputs):

    # Working copy of the past values, oldest first. Using a list avoids
    # reallocating an array (np.append) at every forecasting step.
    history = list(window)

    for preds, errs, targets in zip(preds_by_horizon, errs_by_horizon,
                                    targets_by_horizon):
        # One-step forecast from the three most recent values.
        forecast = a + b*history[-1] + c*history[-2] + d*history[-3]
        preds.append(forecast)
        # Error is defined as prediction minus observation.
        errs.append(forecast[0] - targets[seq][0])
        # Feed the forecast back as the newest "past" value.
        history.append(forecast)

    # Progress report: running RMSE per horizon every 100 sequences.
    if seq % 100 == 0:
        rmse_report = ', '.join(
            f'RMSE {h}h: {_rmse(errs)}'
            for h, errs in enumerate(errs_by_horizon, start=1)
        )
        print(f'step {seq+1}, {rmse_report}')

step 1, RMSE 1h: 0.0041101000000000045, RMSE 2h: 0.0348390911, RMSE 3h: 0.0833917275179, RMSE 4h: 0.12339426571858308, RMSE 5h: 0.13062100596805748, RMSE 6h: 0.03688013452677609
step 101, RMSE 1h: 0.07035843134737843, RMSE 2h: 0.11797614363222156, RMSE 3h: 0.14906344886143555, RMSE 4h: 0.1703300348444637, RMSE 5h: 0.18595369415643603, RMSE 6h: 0.1988572157893249
step 201, RMSE 1h: 0.06812978019505032, RMSE 2h: 0.11516970177161238, RMSE 3h: 0.14504199198656959, RMSE 4h: 0.1650519363526176, RMSE 5h: 0.1824356222535656, RMSE 6h: 0.19780761740265784
step 301, RMSE 1h: 0.07677044837502314, RMSE 2h: 0.12156251335590067, RMSE 3h: 0.150656968010809, RMSE 4h: 0.16965070532251125, RMSE 5h: 0.1846873471824445, RMSE 6h: 0.20042151258602983
step 401, RMSE 1h: 0.07224706880106027, RMSE 2h: 0.1147480021350331, RMSE 3h: 0.14248685221821158, RMSE 4h: 0.16072607069066508, RMSE 5h: 0.17527370960172006, RMSE 6h: 0.18910427468458876
step 501, RMSE 1h: 0.06904041416862959, RMSE 2h: 0.10970037994924346, RMSE

In [39]:
# Estimation of 95% intervals for the forecast error at horizons 1h-3h:
# the 2.5% and 97.5% standard-normal quantiles are scaled by the RMSE of each
# horizon, i.e. the errors are treated as zero-mean Gaussian with std = RMSE.
z_lower = norm.ppf(0.025)
z_upper = norm.ppf(0.975)

RMSE_1h = np.sqrt(stat.mean(e**2 for e in err_1h))
RMSE_2h = np.sqrt(stat.mean(e**2 for e in err_2h))
RMSE_3h = np.sqrt(stat.mean(e**2 for e in err_3h))

CI_1h = [z_lower*RMSE_1h, z_upper*RMSE_1h]
CI_2h = [z_lower*RMSE_2h, z_upper*RMSE_2h]
CI_3h = [z_lower*RMSE_3h, z_upper*RMSE_3h]

for horizon_label, interval in (('1h', CI_1h), ('2h', CI_2h), ('3h', CI_3h)):
    print(f'Confidence interval {horizon_label}: {interval}')

Confidence interval 1h: [-0.13632496657749724, 0.13632496657749718]
Confidence interval 2h: [-0.2154230522779109, 0.21542305227791086]
Confidence interval 3h: [-0.26856377327348296, 0.2685637732734829]
