# 02 - Data Preparation and Window Creation for LSTM


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Location of the raw dataset, relative to the notebook's working directory.
path = "../data/raw/continuous_dataset.csv"

# Read the CSV with the 'datetime' column parsed as timestamps and used as the
# index, then order the rows chronologically so positional slicing later on
# corresponds to slicing in time.
df = pd.read_csv(path, index_col='datetime', parse_dates=['datetime'])
df = df.sort_index()


In [5]:
# Calendar features derived from the DatetimeIndex.
df['hour'] = df.index.hour
df['dayofweek'] = df.index.dayofweek                    # Monday=0 ... Sunday=6
df['is_weekend'] = df['dayofweek'].ge(5).astype(int)    # 1 on Saturday/Sunday, else 0

# Column the model predicts. It is also fed back in as an input feature below,
# so each window contains the past values of the series itself.
target_col = 'nat_demand'

# Model inputs, in the column order used for every array built afterwards.
# NOTE(review): T2M_* look like temperature readings and holiday/school like
# calendar flags -- confirm against the dataset description.
feature_cols = [
    'nat_demand',
    'T2M_toc', 'T2M_san', 'T2M_dav',
    'hour', 'dayofweek', 'is_weekend',
    'holiday', 'school',
]

# Independent copy restricted to the model inputs.
df_model = df.loc[:, feature_cols].copy()
df_model.head()

In [6]:
# Positional 80/20 split without shuffling: because the frame is sorted by its
# datetime index, the test rows come after the training rows.
n = len(df_model)
train_size = int(n * 0.8)

df_train, df_test = df_model.iloc[:train_size], df_model.iloc[train_size:]

# Report the time span and size of each split.
print(
    "Train:", df_train.index.min(), "→", df_train.index.max(),
    "| rows:", len(df_train),
)
print(
    "Test: ", df_test.index.min(), "→", df_test.index.max(),
    "| rows:", len(df_test),
)

#### Scaling

In [7]:
# Feature matrices, one row per timestamp and one column per entry of feature_cols.
X_train = df_train[feature_cols].values
X_test  = df_test[feature_cols].values

# Selecting with a one-element list keeps the target 2-D, which is the layout
# the scaler works with.
y_train = df_train[[target_col]].values
y_test  = df_test[[target_col]].values

# Inputs: learn the min/max from the training split only, then apply the same
# parameters to the test split so no test-set statistics leak into training.
scaler_X = MinMaxScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled  = scaler_X.transform(X_test)

# Target: scaled separately so predictions can be inverse-transformed on their own.
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train)
y_test_scaled  = scaler_y.transform(y_test)

print(X_train_scaled.shape, y_train_scaled.shape, X_test_scaled.shape, y_test_scaled.shape)

In [8]:
def create_sequences(X, y, seq_len):
    """Build sliding windows for one-step-ahead forecasting.

    For every position ``t`` with ``seq_len <= t < len(X)`` the window
    ``X[t - seq_len:t]`` (the ``seq_len`` rows preceding ``t``) is paired with
    the target ``y[t]``.

    Args:
        X: array-like of shape ``(n_rows, ...)`` holding the inputs.
        y: array-like with at least ``n_rows`` entries along axis 0.
        seq_len: number of past rows in each window; must be >= 1.

    Returns:
        Tuple ``(Xs, ys)`` of NumPy arrays. ``Xs`` has shape
        ``(n_rows - seq_len, seq_len, ...)`` and ``ys`` has shape
        ``(n_rows - seq_len, ...)``. When there are not enough rows for a
        single window, both arrays are empty but keep their trailing
        dimensions, so downstream shape lookups still work.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1, or if ``y`` is too short
            to provide a target for every window.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be >= 1, got {seq_len}")

    X = np.asarray(X)
    y = np.asarray(y)

    n_rows = len(X)
    if n_rows <= seq_len:
        # Not enough history for even one window: return correctly shaped
        # empty arrays instead of flat (0,) ones.
        empty_X = np.empty((0, seq_len) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    if len(y) < n_rows:
        raise ValueError(
            f"y has {len(y)} rows but X has {n_rows}; every window needs a target"
        )

    # Window i covers rows [i - seq_len, ..., i - 1]; its target is row i.
    Xs = np.array([X[i - seq_len:i] for i in range(seq_len, n_rows)])
    # np.array copies, so the returned targets do not alias the caller's y.
    ys = np.array(y[seq_len:n_rows])
    return Xs, ys


In [9]:
# Look-back window: 7 days of history (7 x 24 = 168 hours).
sequence_length = 7 * 24

# Each split is windowed on its own, so no test window reaches back into
# training rows.
X_train_seq, y_train_seq = create_sequences(
    X_train_scaled, y_train_scaled, seq_len=sequence_length
)
X_test_seq, y_test_seq = create_sequences(
    X_test_scaled, y_test_scaled, seq_len=sequence_length
)

# Shapes of the windowed inputs and their targets.
print("X_train_seq:", X_train_seq.shape)
print("y_train_seq:", y_train_seq.shape)
print("X_test_seq :", X_test_seq.shape)
print("y_test_seq :", y_test_seq.shape)

In [10]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device:", device)

# float32 copies of the windowed arrays, created directly on the chosen device.
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32, device=device)

X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test_seq, dtype=torch.float32, device=device)

(X_train_tensor.shape, y_train_tensor.shape)


In [11]:
batch_size = 64

# Pair every input window with its target so they are batched together.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Training batches are reshuffled; the test loader keeps the original order so
# its predictions stay aligned with y_test_seq.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [12]:
# Sanity check: look at the first training batch only, then report how many
# batches make up one epoch.
for X_batch, y_batch in train_loader:
    batch_shapes = (X_batch.shape, y_batch.shape)
    print(*batch_shapes)
    break  # one batch is enough to confirm the layout

print(len(train_loader))

In [13]:
import torch.nn as nn 

class LSTMForecast(nn.Module):
    """Stacked LSTM followed by a linear head applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values predicted per window (1 = a single
            one-step-ahead value).
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1):
        super().__init__()
        # batch_first=True means inputs/outputs are (batch, seq_len, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        # Linear head; honours output_size instead of a hard-coded width of 1.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return predictions of shape (batch_size, output_size).

        Args:
            x: tensor of shape (batch_size, seq_len, input_size).
        """
        # out holds the top layer's hidden state at every time step:
        # (batch_size, seq_len, hidden_size).
        out, _ = self.lstm(x)

        # Keep only the final time step as the summary of the whole window.
        last_hidden = out[:, -1, :]
        return self.fc(last_hidden)
        

I have a batch of 64 windows.
Each window represents 7 days (168 hours) and, in each hour, I have 9 features.

The LSTM is one single model (not 64 distinct ones) processing the 64 windows in parallel.
If I zoom in on a single window:

- the LSTM iterates through the 168 hours in chronological order;
- at each step it sees a vector of 9 features (that hour) and updates an internal state (hidden) of size 64;
- at the end of hour 168, I keep the hidden state of the top LSTM layer at that last step, which is a summary of what happened in those 168 hours.

That vector of size 64 is passed through a final layer (fc) to obtain a prediction of the demand for the next hour.

This same process happens for all 64 windows in the batch in parallel, so I get 64 predictions.
I compare those 64 predictions with the 64 actual values (y_batch), calculate the loss (here the mean squared error, MSE, averaged over the batch), and use that to adjust the weights of the LSTM and the final layer so that next time it predicts better.

In [14]:
# Seed PyTorch's global RNG before the model is built so that the initial
# weights -- and later draws from the same RNG, such as DataLoader shuffling --
# are repeatable from run to run. (Some GPU kernels can still be
# non-deterministic.)
torch.manual_seed(42)

# Number of features per time step: last axis of (windows, seq_len, features).
input_size = X_train_seq.shape[2]

model = LSTMForecast(input_size).to(device)

# Mean squared error between predicted and actual (scaled) targets.
criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)


In [15]:
n_epochs = 15

for epoch in range(n_epochs):
    # ---- Training pass -------------------------------------------------
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()               # drop gradients from the previous step
        y_pred = model(X_batch)             # forward pass: LSTM + linear head
        loss = criterion(y_pred, y_batch)   # batch-mean MSE
        loss.backward()                     # backpropagate
        optimizer.step()                    # apply the weight update

        # criterion returns a per-batch mean; weighting by the batch size
        # keeps the epoch figure a true per-sample mean even when the last
        # batch is smaller.
        train_loss += X_batch.size(0) * loss.item()

    train_loss /= len(train_loader.dataset)

    # ---- Evaluation pass on the held-out split (no gradient tracking) ---
    model.eval()
    val_loss = 0.0
    all_pred = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            all_pred.append(y_pred)

            loss = criterion(y_pred, y_batch)
            val_loss += X_batch.size(0) * loss.item()

    val_loss /= len(test_loader.dataset)

    # Rebuilt on every epoch, so after the loop all_pred holds the final
    # epoch's predictions as a NumPy array.
    all_pred = torch.cat(all_pred, dim=0).cpu().numpy()

    print(f"Epoch {epoch+1}: train_loss={train_loss:.6f}, val_loss={val_loss:.6f}")

In [16]:
# Undo the target scaling so predictions and ground truth are both expressed
# on the original nat_demand scale.
y_test_pred = scaler_y.inverse_transform(all_pred)
y_test_true = scaler_y.inverse_transform(y_test_seq)

(y_test_pred.shape, y_test_true.shape)

### Metrics Calculation

In [17]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# scikit-learn metrics take (y_true, y_pred) in that order. MSE and MAE are
# symmetric, so the numbers are unaffected, but keeping the documented order
# avoids silent errors if an asymmetric metric is added later.
mse = mean_squared_error(y_test_true, y_test_pred)
mae = mean_absolute_error(y_test_true, y_test_pred)

# RMSE is the square root of the MSE, which puts it back in the target's own
# units (MSE is in squared units).
rmse = mse ** 0.5

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")


#### Plotting

In [18]:
import matplotlib.pyplot as plt

# Visualize a segment: only the first n_points test windows, so individual
# peaks and troughs remain readable.
n_points = 200

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(y_test_true[:n_points], label='Real')
ax.plot(y_test_pred[:n_points], label='Predicted')

# Title and axis labels so the figure can be read on its own.
ax.set_title(f"Real vs. predicted nat_demand (first {n_points} test windows)")
ax.set_xlabel("Test window index")
ax.set_ylabel("nat_demand (original scale)")
ax.legend()
plt.show()