**Autoregressive model as a benchmark**

In [4]:
# Mount Google Drive inside the Colab runtime (only works when run on Colab)
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as stat
from scipy.stats import norm

# Import pytorch utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [7]:
# Only wind farm 1 is considered: its forecast file (indexed by the 'date'
# column) and the training table are both read from the working directory.
y_train = pd.read_csv('train.csv')
x_train = pd.read_csv('windforecasts_wf1.csv', index_col='date')

In [8]:
# Parse the 'date' stamps (format %Y%m%d%H) into datetimes, then promote that
# column to the row index so the frame can be sliced by timestamp.
y_train['date'] = pd.to_datetime(y_train['date'], format='%Y%m%d%H')
y_train = y_train.set_index('date')

In [9]:
# Keep only the stretch of the power record that is continuous

# Rows up to and including this timestamp: all the data without any gaps
complete_ts = y_train.loc[:'2011-01-01 00']
# Power of wind farm 1 as a column vector of shape (n_samples, 1)
input_generator = np.array(complete_ts['wp1']).reshape(-1, 1)
length = 24 # length of the time series, PARAMETER TO TUNE

In [10]:
# Validation set: one contiguous sequence taken from the tail of the series.
# The slice starts one sample past the 80% mark and stops before the last sample.
n_samples = len(input_generator)
validation_power = input_generator[int(n_samples*0.8)+1 : n_samples-1]

In [13]:
# Build sliding windows over the validation sequence.
# Each input is a window of N_LAGS consecutive samples (three, not 24: the
# autoregressive model defined in this notebook uses three lags), and the
# targets are the samples 1, 2 and 3 steps after the end of that window.
N_LAGS = 3        # number of past samples in each input window
MAX_HORIZON = 3   # furthest step ahead for which a target is stored

p_inputs = []
p_targets1h = []
p_targets2h = []
p_targets3h = []
# Stop early enough that the furthest target index (i + N_LAGS + MAX_HORIZON - 1)
# is still inside validation_power.
for i in range(len(validation_power) - (N_LAGS + MAX_HORIZON - 1)):
    p_inputs.append(validation_power[i:i+N_LAGS])
    p_targets1h.append(validation_power[i+N_LAGS])
    p_targets2h.append(validation_power[i+N_LAGS+1])
    p_targets3h.append(validation_power[i+N_LAGS+2])

In [14]:
# Coefficients of the autoregressive benchmark (fitted in Matlab using the
# training set). The model has an intercept and three lags:
#   P_t = a + b*P_{t-1} + c*P_{t-2} + d*P_{t-3}
a, b, c, d = 0.0139, 1.189, -0.282, 0.0361

In [51]:
# Forecasting 1, 2 and 3 hours ahead
#
# For every validation window the model is applied recursively: the 1h forecast
# is appended to the window and used as the newest value for the 2h forecast,
# and likewise the 2h forecast feeds the 3h forecast.


def _ar_step(history):
    """Return the one-step-ahead forecast from the last three rows of `history`.

    Uses the notebook-level coefficients a, b, c, d; the most recent row is
    weighted by b, the one before by c and the oldest of the three by d.
    """
    return a + b*history[-1] + c*history[-2] + d*history[-3]


def _rmse(errors):
    """Return the root-mean-square of a non-empty sequence of scalar errors.

    Computed with numpy, so the last few printed digits may differ from a
    statistics.mean-based computation.
    """
    return np.sqrt(np.mean(np.square(errors)))


# Store predictions and errors
pred_1h, err_1h = [], []
pred_2h, err_2h = [], []
pred_3h, err_3h = [], []

# (predictions, errors, targets) per horizon, in order of increasing lead time
horizons = (
    (pred_1h, err_1h, p_targets1h),
    (pred_2h, err_2h, p_targets2h),
    (pred_3h, err_3h, p_targets3h),
)

# Loop over the sequences of valid data
for seq, window in enumerate(p_inputs):

    # Start from the observed past values of this window
    past = window

    for preds, errs, targets in horizons:
        forecast = _ar_step(past)
        preds.append(forecast)
        # Error = forecast minus the observed value at this horizon
        errs.append(forecast[0] - targets[seq][0])
        # Append the forecast as a new row so the next horizon treats it as
        # the most recent value
        past = np.append(past, [forecast], 0)

    # Report the running RMSE of all errors collected so far every 100 windows
    if seq % 100 == 0:
        print(f'step {seq+1}, RMSE 1h: {_rmse(err_1h)}, RMSE 2h: {_rmse(err_2h)}, RMSE 3h: {_rmse(err_3h)}')

step 1, RMSE 1h: 0.0041101000000000045, RMSE 2h: 0.0348390911, RMSE 3h: 0.0833917275179
step 101, RMSE 1h: 0.07035843134737843, RMSE 2h: 0.11797614363222156, RMSE 3h: 0.14906344886143555
step 201, RMSE 1h: 0.06812978019505032, RMSE 2h: 0.11516970177161238, RMSE 3h: 0.14504199198656959
step 301, RMSE 1h: 0.07677044837502314, RMSE 2h: 0.12156251335590067, RMSE 3h: 0.150656968010809
step 401, RMSE 1h: 0.07224706880106027, RMSE 2h: 0.1147480021350331, RMSE 3h: 0.14248685221821158
step 501, RMSE 1h: 0.06904041416862959, RMSE 2h: 0.10970037994924346, RMSE 3h: 0.13569924537775785
step 601, RMSE 1h: 0.07322334788941078, RMSE 2h: 0.11336856829609673, RMSE 3h: 0.1380206603216205
step 701, RMSE 1h: 0.07049565629067599, RMSE 2h: 0.11010058203977255, RMSE 3h: 0.13474782446194725
step 801, RMSE 1h: 0.070830850496784, RMSE 2h: 0.11254189046318452, RMSE 3h: 0.1384421700778182
step 901, RMSE 1h: 0.07235893127061728, RMSE 2h: 0.11597174494322379, RMSE 3h: 0.1433483596375577
step 1001, RMSE 1h: 0.0738943

In [54]:
# Estimation of confidence intervals:
# each interval is the pair of 2.5% and 97.5% standard-normal quantiles scaled
# by the RMSE of the forecast errors at that horizon.


def _root_mean_square(errors):
    """Return the square root of the mean of the squared entries of `errors`."""
    return np.sqrt(stat.mean(e**2 for e in errors))


RMSE_1h = _root_mean_square(err_1h)
RMSE_2h = _root_mean_square(err_2h)
RMSE_3h = _root_mean_square(err_3h)

# Lower and upper standard-normal quantiles, shared by all three horizons
z_low = norm.ppf(0.025)
z_high = norm.ppf(0.975)

CI_1h = [z_low*RMSE_1h, z_high*RMSE_1h]
CI_2h = [z_low*RMSE_2h, z_high*RMSE_2h]
CI_3h = [z_low*RMSE_3h, z_high*RMSE_3h]

print(f'Confidence interval 1h: {CI_1h}')
print(f'Confidence interval 2h: {CI_2h}')
print(f'Confidence interval 3h: {CI_3h}')

Confidence interval 1h: [-0.13651172320491342, 0.1365117232049134]
Confidence interval 2h: [-0.2157469321465518, 0.21574693214655175]
Confidence interval 3h: [-0.26869945371736936, 0.2686994537173693]
