In [38]:
import pandas as pd
import numpy as np
import datetime as dt

from tensorflow import keras
from keras.utils import timeseries_dataset_from_array

import plotly.express as px
from sklearn.preprocessing import StandardScaler

### Importación de datos

Leemos el archivo y ordenamos por datetime

In [39]:
data_df = pd.read_csv('data_training/esios_dataset_d+7.csv')
data_df = data_df.sort_values(by=['Datetime_hour']).reset_index(drop=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13127 entries, 0 to 13126
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Datetime_hour            13127 non-null  object 
 1   Gen T.Real Hidraulica    13127 non-null  float64
 2   Gen T.Real Nuclear       13127 non-null  float64
 3   Gen T.Real Eólica        13127 non-null  float64
 4   MD                       13127 non-null  float64
 5   IDA1                     12839 non-null  float64
 6   IDA2                     12935 non-null  float64
 7   Demanda Real             13127 non-null  float64
 8   Gen T.Real Fotovoltaica  13127 non-null  float64
 9   Gen P48 Total            13091 non-null  float64
dtypes: float64(9), object(1)
memory usage: 1.0+ MB


### Visualización inicial

Visualizamos la forma de la función target para las últimas 2 semanas

In [40]:
px.line(data_df.iloc[-3*24*7:], x='Datetime_hour', y='MD')

### Preparación y limpieza de datos

Tratamiento de nulos: Utilizamos interpolacion lineal

In [41]:
data_df = data_df.interpolate(method='linear')


DataFrame.interpolate with object dtype is deprecated and will raise in a future version. Call obj.infer_objects(copy=False) before interpolating instead.



Convertir dataframe en array

In [42]:
features = [col for col in data_df.columns if col not in ['Datetime_hour']]

data_array = data_df[features].values
data_array

array([[3.74660e+03, 7.09580e+03, 5.52580e+03, ..., 2.18634e+04,
        2.90000e+01, 2.26406e+04],
       [3.29870e+03, 7.09620e+03, 5.66040e+03, ..., 2.12007e+04,
        2.90000e+01, 2.25613e+04],
       [2.79700e+03, 7.09740e+03, 5.48210e+03, ..., 2.00331e+04,
        2.90000e+01, 2.17131e+04],
       ...,
       [8.59420e+03, 6.20600e+03, 4.96900e+03, ..., 3.56829e+04,
        3.98000e+01, 3.68671e+04],
       [7.66900e+03, 6.12800e+03, 3.97990e+03, ..., 3.32828e+04,
        3.26000e+01, 3.59431e+04],
       [7.45500e+03, 6.08500e+03, 3.96200e+03, ..., 3.16400e+04,
        3.20000e+01, 8.80010e+03]])

Reescalado de datos

In [43]:
sc = StandardScaler()
data_array_scaled = sc.fit_transform(data_array)
data_array_scaled

array([[-0.03063723,  0.95433318, -0.28468606, ..., -1.12790245,
        -0.80001079, -1.34129809],
       [-0.17181884,  0.95464945, -0.24990086, ..., -1.2854529 ,
        -0.80001079, -1.35695664],
       [-0.32995863,  0.95559824, -0.29597963, ..., -1.5630384 ,
        -0.80001079, -1.52444198],
       ...,
       [ 1.49736449,  0.25079927, -0.42858207, ...,  2.15754875,
        -0.79828342,  1.4678627 ],
       [ 1.20573416,  0.18912738, -0.68419906, ...,  1.58694841,
        -0.799435  ,  1.28540992],
       [ 1.13827967,  0.15512878, -0.68882503, ...,  1.19638875,
        -0.79953096, -4.07423942]])

Ventanas temporales

In [44]:
input_steps = 24*7
output_steps = 24

X, y = [], []

for i in range(len(data_array_scaled) - input_steps - output_steps):
    X.append(data_array_scaled[i:i+input_steps])
    y.append(data_array_scaled[i+input_steps:i+input_steps+output_steps, features.index('MD')])


X = np.array(X)
y = np.array(y)

Dividir en Train, Val, Test

In [45]:
num_train_samples = int(0.5 * len(data_array))
num_val_samples = int(0.25 * len(data_array))
num_test_samples = len(data_array) - num_train_samples - num_val_samples

X_train = X[:num_train_samples]
y_train = y[:num_train_samples]

X_val = X[num_train_samples:num_train_samples + num_val_samples]
y_val = y[num_train_samples:num_train_samples + num_val_samples]

X_test = X[num_train_samples + num_val_samples:]
y_test = y[num_train_samples + num_val_samples:]

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

Train: (6563, 168, 9), Val: (3281, 168, 9), Test: (3091, 168, 9)


In [46]:
batch_size = 32

train_ds = timeseries_dataset_from_array(
    data=X_train,
    targets=y_train,
    sequence_length=input_steps,  # cada secuencia tiene 7 días
    batch_size=batch_size,
    shuffle=True
)

val_ds = timeseries_dataset_from_array(
    data=X_val,
    targets=y_val,
    sequence_length=input_steps,
    batch_size=batch_size,
    shuffle=True
)

test_ds = timeseries_dataset_from_array(
    data=X_test,
    targets=y_test,
    sequence_length=input_steps,
    batch_size=batch_size,
    shuffle=True
)

### Calculo del error base

Tenemos que establecer un objetivo de precisión para nuestro modelo. Para ello, suponemos dos pronósticos iniciales:
1. El precio a la misma hora del día anterior
2. El precio a la misma hora y el mismo día de la semana anterior

Consideramos esta segunda posibilidad, debido a la gran diferencia que existe entre los precios en fin de semana y entre semana.

In [56]:
md_true = []
md_delay_week = []
md_delay_day = []

# Calculamos los targets para el dataset sin escalar
for i in range(len(data_array) - input_steps - output_steps):
    md_true.append(data_array[i+input_steps:i+input_steps+output_steps, features.index('MD')])
    # Delay de 7 dias
    md_delay_week.append(data_array[i+input_steps-24*7:i+input_steps+output_steps-24*7, features.index('MD')])
    # Delay de 1 dia
    md_delay_day.append(data_array[i+input_steps-24:i+input_steps+output_steps-24, features.index('MD')])


md_true = np.array(md_true)
md_delay_week = np.array(md_delay_week)
md_delay_day = np.array(md_delay_day)

print(f"MAE week-delay: {np.mean(np.abs(md_true - md_delay_week))}")
print(f"MAE day-delay: {np.mean(np.abs(md_true - md_delay_day))}")

MAE week-delay: 28.454020905811102
MAE day-delay: 18.71842040329854
