# 02 - Preparación de datos y creación de ventanas para LSTM


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Location of the raw dataset, given relative to the current working directory.
ruta = "../data/raw/continuous_dataset.csv"

# Read the CSV with the 'datetime' column parsed as timestamps and used as the
# index, then order the rows by that index so positional slicing further down
# follows time order.
df = pd.read_csv(ruta, index_col='datetime', parse_dates=['datetime'])
df = df.sort_index()


In [4]:
# Calendar features derived from the datetime index (added to `df` in place).
df['hour'] = df.index.hour
df['dayofweek'] = df.index.day_of_week
# day_of_week counts Monday as 0, so values 5 and 6 are Saturday and Sunday.
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)

# Model inputs. The target itself is included so each window also carries the
# past values of the series being predicted.
feature_cols = [
    'nat_demand',
    'T2M_toc',
    'T2M_san',
    'T2M_dav',
    'hour',
    'dayofweek',
    'is_weekend',
    'holiday',
    'school',
]

target_col = 'nat_demand'

# Independent copy restricted to the selected columns, so later steps do not
# write back into `df`.
df_model = df.loc[:, feature_cols].copy()
df_model.head()

Unnamed: 0_level_0,nat_demand,T2M_toc,T2M_san,T2M_dav,hour,dayofweek,is_weekend,holiday,school
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-03 01:00:00,970.345,25.865259,23.482446,22.662134,1,5,1,0,0
2015-01-03 02:00:00,912.1755,25.899255,23.399255,22.578943,2,5,1,0,0
2015-01-03 03:00:00,900.2688,25.93728,23.34353,22.53103,3,5,1,0,0
2015-01-03 04:00:00,889.9538,25.957544,23.238794,22.512231,4,5,1,0,0
2015-01-03 05:00:00,893.6865,25.97384,23.075403,22.481653,5,5,1,0,0


In [5]:
# Positional 80/20 split without shuffling: the first 80% of the rows become
# the training set and the remaining rows the test set. This is a
# chronological split as long as df_model is sorted by its datetime index.
n = len(df_model)
train_size = int(0.8 * n)

df_train, df_test = df_model.iloc[:train_size], df_model.iloc[train_size:]

# Report the time span and row count of each partition.
print(
    "Train:", df_train.index.min(), "→", df_train.index.max(),
    "| filas:", len(df_train),
)
print(
    "Test: ", df_test.index.min(), "→", df_test.index.max(),
    "| filas:", len(df_test),
)

Train: 2015-01-03 01:00:00 → 2019-05-23 14:00:00 | filas: 38438
Test:  2019-05-23 15:00:00 → 2020-06-27 00:00:00 | filas: 9610


In [6]:
# Feature matrices as plain NumPy arrays.
X_train = df_train[feature_cols].values
X_test = df_test[feature_cols].values

# Targets selected with a list of columns so the result stays 2-D (n_rows, 1),
# which is the layout the scaler expects.
y_train = df_train[[target_col]].values
y_test = df_test[[target_col]].values

# Fit both scalers on the training split ONLY, so that no statistics from the
# test period leak into the transformation.
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train)

X_train_scaled = scaler_X.transform(X_train)
y_train_scaled = scaler_y.transform(y_train)

# The test split reuses the parameters learned from train.
X_test_scaled = scaler_X.transform(X_test)
y_test_scaled = scaler_y.transform(y_test)

print(X_train_scaled.shape, y_train_scaled.shape)

(38438, 9) (38438, 1)


In [7]:
def create_sequences(X, y, seq_len):
    """Build fixed-length sliding windows over X paired with next-step targets.

    For every position ``t`` in ``range(seq_len, len(X))`` the window
    ``X[t - seq_len:t]`` (rows ``t - seq_len`` .. ``t - 1``) is paired with
    the target ``y[t]``.

    Parameters
    ----------
    X : array-like
        Input rows ordered along the first axis (length ``n``).
    y : array-like
        Targets aligned row-by-row with ``X`` (same first-axis length).
    seq_len : int
        Number of consecutive rows in each window; must be at least 1.

    Returns
    -------
    tuple of np.ndarray
        ``(Xs, ys)`` where ``Xs`` has shape
        ``(max(n - seq_len, 0), seq_len, *X.shape[1:])`` and ``ys`` has shape
        ``(max(n - seq_len, 0), *y.shape[1:])``. When there are not enough
        rows for a single window, both arrays are empty but keep their
        trailing dimensions.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1 or if ``X`` and ``y`` differ in
        length along the first axis.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # Number of (window, target) pairs; zero when the input is too short.
    n_windows = max(len(X) - seq_len, 0)

    # Row i of window_idx holds the positions i, i + 1, ..., i + seq_len - 1,
    # so a single fancy-indexing step gathers every window at once.
    window_idx = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]
    Xs = X[window_idx]

    # The target of window i is the row right after it, i.e. position
    # i + seq_len. Copy so the result does not alias the caller's array.
    ys = y[seq_len:seq_len + n_windows].copy()
    return Xs, ys


In [8]:
# Each sample looks back over 7 days of history (7 x 24 = 168 hourly steps).
sequence_length = 7 * 24

# Windows are built separately for each split, so a test window never contains
# rows from the training period. The first `sequence_length` rows of each
# split are used only as history and produce no target of their own.
X_train_seq, y_train_seq = create_sequences(
    X_train_scaled, y_train_scaled, seq_len=sequence_length
)
X_test_seq, y_test_seq = create_sequences(
    X_test_scaled, y_test_scaled, seq_len=sequence_length
)

# Sanity check of the resulting array shapes.
print("X_train_seq:", X_train_seq.shape)
print("y_train_seq:", y_train_seq.shape)
print("X_test_seq :", X_test_seq.shape)
print("y_test_seq :", y_test_seq.shape)

X_train_seq: (38270, 168, 9)
y_train_seq: (38270, 1)
X_test_seq : (9442, 168, 9)
y_test_seq : (9442, 1)
