✅ **Advanced Time Series Forecasting with Attention-Based Transformers**

This project focuses on implementing and evaluating deep learning models for multivariate time series forecasting, including:


*   Transformer with Self-Attention (Advanced Model)
*   LSTM (Baseline Model)

📌 **Project Details**


*   Dataset Type: Synthetic Multivariate Time Series
*   Dataset Size: 6500+ observations


*   Frameworks Used: PyTorch, NumPy, Scikit-learn
*   Evaluation Metrics: MAE (Mean Absolute Error), RMSE (Root Mean Squared Error)


This work demonstrates the use of positional encoding and self-attention mechanisms for multi-step forecasting and compares results against a traditional recurrent baseline model.

In [None]:
# Install required libraries
!pip install torch numpy pandas scikit-learn matplotlib

# Core imports
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt




In [None]:
import numpy as np
import torch

# Seed both random number generators so repeated runs produce the same numbers.
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)


Device: cpu


In [None]:
# Multivariate synthetic dataset with interactions
def generate_data(n_steps=6500):
    """Build a synthetic three-channel time series of length ``n_steps``.

    Channel 1 is a linear trend plus a sine, channel 2 is a cosine plus 0.6
    times channel 1, and channel 3 is a sine/cosine blend. Every channel gets
    its own Gaussian noise drawn from NumPy's global random state.

    Returns:
        Array of shape ``(n_steps, 3)`` with one column per channel.
    """
    steps = np.arange(n_steps)

    # Noise is drawn in channel order (1, 2, 3) so that a seeded global RNG
    # always yields the same dataset.
    trend_wave = 0.03 * steps + np.sin(0.02 * steps)
    first = trend_wave + np.random.normal(0, 0.2, n_steps)

    coupled = np.cos(0.015 * steps) + 0.6 * first
    second = coupled + np.random.normal(0, 0.25, n_steps)

    blend = np.sin(0.01 * steps) + np.cos(0.02 * steps)
    third = blend + np.random.normal(0, 0.3, n_steps)

    return np.column_stack((first, second, third))

# Generate the series once (6500 steps, the function's default) and report its shape.
data = generate_data(n_steps=6500)
print("Dataset shape:", data.shape)


Dataset shape: (6500, 3)


In [None]:
# Fit the scaler on the chronologically first 80% of rows only, then apply it
# to the whole series. Fitting on every row would compute min/max from the
# hold-out tail as well and leak test-set information into the model inputs.
# NOTE: 0.8 mirrors the train fraction used for the window-level split further
# down; keep the two in sync. Scaled values in the hold-out tail may fall
# outside [0, 1] because they were not seen during fitting.
train_rows = int(0.8 * len(data))
scaler = MinMaxScaler()
scaler.fit(data[:train_rows])
data_scaled = scaler.transform(data)

In [None]:
def create_sequences(data, input_len, output_len):
    """Slice a series into (input window, forecast window) pairs.

    Window ``i`` pairs ``data[i : i + input_len]`` with the ``output_len``
    rows that immediately follow it.

    Args:
        data: Array-like whose first axis is time.
        input_len: Number of time steps in each input window.
        output_len: Number of time steps in each target window.

    Returns:
        Tuple ``(X, y)`` of arrays shaped ``(n_windows, input_len, ...)`` and
        ``(n_windows, output_len, ...)`` where
        ``n_windows = max(0, len(data) - input_len - output_len + 1)``.
    """
    data = np.asarray(data)
    span = input_len + output_len
    # +1 keeps the final window, whose target ends exactly on the last row.
    n_windows = len(data) - span + 1

    if n_windows <= 0:
        # Too short for a single pair: return empty arrays that still carry
        # the window and feature dimensions so downstream shape logic works.
        tail = data.shape[1:]
        return (
            np.empty((0, input_len) + tail, dtype=data.dtype),
            np.empty((0, output_len) + tail, dtype=data.dtype),
        )

    X = np.stack([data[i:i + input_len] for i in range(n_windows)])
    y = np.stack([data[i + input_len:i + span] for i in range(n_windows)])
    return X, y

# Look-back window length (tuned) and multi-step forecast horizon.
INPUT_LEN, OUTPUT_LEN = 48, 12

# Cut the scaled series into model inputs and their forecast targets.
X, y = create_sequences(data_scaled, input_len=INPUT_LEN, output_len=OUTPUT_LEN)
print(X.shape, y.shape)


(6440, 48, 3) (6440, 12, 3)


In [None]:
# Unshuffled 80/20 hold-out: the first 80% of windows train, the rest test.
split = int(len(X) * 0.8)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]


In [None]:
class TimeSeriesDataset(Dataset):
    """Map-style dataset pairing input windows with their forecast targets.

    Both inputs are copied into ``float32`` tensors once at construction, so
    indexing afterwards is a plain tensor lookup.
    """

    def __init__(self, X, y):
        """Store ``X`` and ``y`` as float32 tensors.

        Args:
            X: Array-like of input windows; the first axis indexes samples.
            y: Array-like of targets; the first axis indexes samples.

        Raises:
            ValueError: If ``X`` and ``y`` hold a different number of samples.
        """
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # Fail fast: a mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently ignore the surplus targets.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors at position ``idx``."""
        return self.X[idx], self.y[idx]


# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    dataset=TimeSeriesDataset(X_train, y_train),
    batch_size=32,
    shuffle=True,
)

# Evaluation batches keep their original order.
test_loader = DataLoader(
    dataset=TimeSeriesDataset(X_test, y_test),
    batch_size=32,
    shuffle=False,
)


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch-first sequence.

    Even feature indices receive ``sin(pos * freq)`` and odd indices receive
    ``cos(pos * freq)``, with frequencies spaced geometrically by the
    ``10000`` base.

    Args:
        d_model: Feature width of the sequences to be encoded (odd or even).
        max_len: Longest sequence length the precomputed table supports.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)
        div = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float32)
            * (-math.log(10000.0) / d_model)
        )
        angles = pos * div  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module through .to()/.cuda(); persistent=False
        # keeps it out of state_dict so existing checkpoints still load.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` given at construction.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        # .to() is a no-op once the module lives on x's device; it is kept so
        # inputs on a different device than the module still work as before.
        return x + self.pe[:, :seq_len].to(x.device)


In [None]:
class TransformerForecast(nn.Module):
    """Encoder-only Transformer that forecasts from the tail of the encoded window.

    Each time step is linearly embedded to ``d_model`` features, given a
    positional encoding, passed through a stack of self-attention encoder
    layers, and the last ``output_len`` encoded positions are projected back
    to ``input_dim`` features.

    Args:
        input_dim: Number of features per time step (also the output width).
        d_model: Embedding width used inside the encoder.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        output_len: Number of trailing positions to emit. ``None`` (default)
            defers to the module-level ``OUTPUT_LEN`` at call time, which is
            the behaviour this class had before the parameter existed.

    Raises:
        ValueError: If ``output_len`` is given but is not positive.
    """

    def __init__(self, input_dim, d_model=64, nhead=4, num_layers=2, output_len=None):
        super().__init__()
        if output_len is not None and output_len <= 0:
            raise ValueError(f"output_len must be positive, got {output_len}")
        self.output_len = output_len

        self.embedding = nn.Linear(input_dim, d_model)
        self.pos_encoding = PositionalEncoding(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.output_layer = nn.Linear(d_model, input_dim)

    def forward(self, x):
        """Map ``(batch, seq_len, input_dim)`` inputs to the forecast tensor.

        Returns the projection of the last ``output_len`` encoded positions,
        i.e. a tensor whose final dimension is ``input_dim``.
        """
        # Resolve the horizon lazily so the global default keeps working.
        horizon = OUTPUT_LEN if self.output_len is None else self.output_len

        x = self.embedding(x)
        x = self.pos_encoding(x)
        x = self.encoder(x)
        return self.output_layer(x[:, -horizon:, :])


In [None]:
class LSTMModel(nn.Module):
    """Single-layer LSTM baseline that forecasts from its last hidden states.

    The LSTM runs over the input window and the hidden states of the final
    ``output_len`` time steps are projected back to ``input_dim`` features.

    Args:
        input_dim: Number of features per time step (also the output width).
        hidden_dim: Size of the LSTM hidden state.
        output_len: Number of trailing time steps to emit. ``None`` (default)
            defers to the module-level ``OUTPUT_LEN`` at call time, which is
            the behaviour this class had before the parameter existed.

    Raises:
        ValueError: If ``output_len`` is given but is not positive.
    """

    def __init__(self, input_dim, hidden_dim=64, output_len=None):
        super().__init__()
        if output_len is not None and output_len <= 0:
            raise ValueError(f"output_len must be positive, got {output_len}")
        self.output_len = output_len

        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, input_dim)

    def forward(self, x):
        """Map ``(batch, seq_len, input_dim)`` inputs to the forecast tensor."""
        # Resolve the horizon lazily so the global default keeps working.
        horizon = OUTPUT_LEN if self.output_len is None else self.output_len

        out, _ = self.lstm(x)
        return self.fc(out[:, -horizon:, :])


In [None]:
def train_model(model, epochs, lr, loader=None, target_device=None):
    """Train ``model`` in place with Adam and MSE loss, printing the mean loss per epoch.

    Args:
        model: Module to optimise; it is moved to ``target_device``.
        epochs: Number of full passes over ``loader``.
        lr: Adam learning rate.
        loader: Sized iterable of ``(inputs, targets)`` batches. ``None``
            (default) uses the module-level ``train_loader``.
        target_device: Device to train on. ``None`` (default) uses the
            module-level ``device``.

    Raises:
        ValueError: If ``loader`` contains no batches (the per-epoch mean loss
            would otherwise divide by zero).
    """
    if loader is None:
        loader = train_loader
    if target_device is None:
        target_device = device

    n_batches = len(loader)
    if n_batches == 0:
        raise ValueError("train_model needs a loader with at least one batch")

    model.to(target_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for xb, yb in loader:
            xb, yb = xb.to(target_device), yb.to(target_device)

            optimizer.zero_grad()
            preds = model(xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss/n_batches:.5f}")


In [None]:
# Candidate training schedules for the Transformer; the model with the lowest
# RMSE is kept as best_model.
configs = [
    {"epochs": 10, "lr": 0.001},
    {"epochs": 20, "lr": 0.001},
]

best_model = None
best_rmse = float("inf")

for cfg in configs:
    print("\nTraining Transformer:", cfg)
    model = TransformerForecast(input_dim=3)
    train_model(model, cfg["epochs"], cfg["lr"])

    # Collect predictions on test_loader without tracking gradients.
    model.eval()
    preds, actuals = [], []

    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            preds.append(model(xb).cpu().numpy())
            actuals.append(yb.numpy())

    preds = np.concatenate(preds)
    actuals = np.concatenate(actuals)

    rmse = np.sqrt(mean_squared_error(actuals.flatten(), preds.flatten()))
    print("RMSE:", rmse)

    # NOTE(review): selection uses test_loader, the same data the final metrics
    # are reported on; a separate validation split would avoid optimistic bias.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model



Training Transformer: {'epochs': 10, 'lr': 0.001}
Epoch 1/10 | Loss: 0.04985
Epoch 2/10 | Loss: 0.00410
Epoch 3/10 | Loss: 0.00276
Epoch 4/10 | Loss: 0.00242
Epoch 5/10 | Loss: 0.00226
Epoch 6/10 | Loss: 0.00218
Epoch 7/10 | Loss: 0.00212
Epoch 8/10 | Loss: 0.00203
Epoch 9/10 | Loss: 0.00196
Epoch 10/10 | Loss: 0.00195
RMSE: 0.04429778892425369

Training Transformer: {'epochs': 20, 'lr': 0.001}
Epoch 1/20 | Loss: 0.05019
Epoch 2/20 | Loss: 0.00468
Epoch 3/20 | Loss: 0.00292
Epoch 4/20 | Loss: 0.00260
Epoch 5/20 | Loss: 0.00236
Epoch 6/20 | Loss: 0.00221
Epoch 7/20 | Loss: 0.00211
Epoch 8/20 | Loss: 0.00210
Epoch 9/20 | Loss: 0.00198
Epoch 10/20 | Loss: 0.00198
Epoch 11/20 | Loss: 0.00191
Epoch 12/20 | Loss: 0.00192
Epoch 13/20 | Loss: 0.00188
Epoch 14/20 | Loss: 0.00181
Epoch 15/20 | Loss: 0.00181
Epoch 16/20 | Loss: 0.00177
Epoch 17/20 | Loss: 0.00174
Epoch 18/20 | Loss: 0.00174
Epoch 19/20 | Loss: 0.00174
Epoch 20/20 | Loss: 0.00168
RMSE: 0.05670946352903893


In [None]:
# Train the LSTM baseline for 20 epochs at a learning rate of 0.001.
lstm = LSTMModel(input_dim=3)
train_model(model=lstm, epochs=20, lr=0.001)


Epoch 1/20 | Loss: 0.04938
Epoch 2/20 | Loss: 0.00298
Epoch 3/20 | Loss: 0.00249
Epoch 4/20 | Loss: 0.00233
Epoch 5/20 | Loss: 0.00222
Epoch 6/20 | Loss: 0.00208
Epoch 7/20 | Loss: 0.00183
Epoch 8/20 | Loss: 0.00177
Epoch 9/20 | Loss: 0.00174
Epoch 10/20 | Loss: 0.00173
Epoch 11/20 | Loss: 0.00173
Epoch 12/20 | Loss: 0.00169
Epoch 13/20 | Loss: 0.00165
Epoch 14/20 | Loss: 0.00165
Epoch 15/20 | Loss: 0.00162
Epoch 16/20 | Loss: 0.00164
Epoch 17/20 | Loss: 0.00164
Epoch 18/20 | Loss: 0.00159
Epoch 19/20 | Loss: 0.00159
Epoch 20/20 | Loss: 0.00157


In [None]:
def evaluate(model):
    """Score ``model`` on the module-level ``test_loader``.

    Predictions and targets from every batch are flattened into one vector
    each before the metrics are computed, so all steps and features are
    weighted equally.

    Returns:
        Tuple ``(mae, rmse)``.
    """
    model.eval()
    batch_preds = []
    batch_targets = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs.to(device))
            batch_preds.append(outputs.cpu().numpy())
            batch_targets.append(targets.numpy())

    flat_preds = np.concatenate(batch_preds).flatten()
    flat_targets = np.concatenate(batch_targets).flatten()

    mae = mean_absolute_error(flat_targets, flat_preds)
    mse = mean_squared_error(flat_targets, flat_preds)
    return mae, np.sqrt(mse)


In [None]:
# Score the selected Transformer and the LSTM baseline with the same evaluate() call.
mae_t, rmse_t = evaluate(best_model)
mae_l, rmse_l = evaluate(lstm)

# The arrow was mis-encoded (UTF-8 bytes read as cp1252); restored to "→".
print("FINAL RESULTS")
print("-----------------------------")
print(f"Transformer → MAE: {mae_t:.4f}, RMSE: {rmse_t:.4f}")
print(f"LSTM        → MAE: {mae_l:.4f}, RMSE: {rmse_l:.4f}")


FINAL RESULTS
-----------------------------
Transformer → MAE: 0.0335, RMSE: 0.0443
LSTM        → MAE: 0.0273, RMSE: 0.0422
