In [2]:
# Clone PatchTST repository and set it as working directory
import os

# Change to the repository directory
os.chdir('/content/PatchTST')
print(f"Current directory: {os.getcwd()}")

Current directory: /content/PatchTST


## 1. Setup and Installation <a id='setup'></a>

First, let's import necessary libraries and set up the environment.

In [3]:
import sys
import os

# Add PatchTST_supervised to path
sys.path.append('/content/PatchTST/PatchTST_supervised')

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.optim import lr_scheduler
import matplotlib.pyplot as plt
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")

PyTorch Version: 2.9.0+cu126
CUDA Available: True
CUDA Device: NVIDIA GeForce RTX 4060 Laptop GPU


## 2. Understanding the PatchTST Architecture <a id='architecture'></a>

### Key Concepts:

**Patching**: Time series is segmented into subseries-level patches which serve as input tokens to the Transformer.

**Channel-independence**: Each channel contains a single univariate time series that shares the same embedding and Transformer weights across all series.

### Architecture Components:
1. **RevIN (Reversible Instance Normalization)**: Normalizes input data
2. **Patching Layer**: Segments time series into patches
3. **Transformer Encoder**: Processes patches
4. **Prediction Head**: Maps encoded patches to predictions

In [4]:
# Change to PatchTST_supervised directory for imports
os.chdir('/content/PatchTST/PatchTST_supervised')
dataset_path='/content/PatchTST/datasets/weather'
model_checkopoints='/content/model/checkpoints_weather'
print(f"Changed to: {os.getcwd()}")

# Import PatchTST components
from models.PatchTST import Model as PatchTST
from layers.PatchTST_backbone import PatchTST_backbone
from layers.PatchTST_layers import *

# Change back to root directory
os.chdir('..')
print(f"Back to: {os.getcwd()}")

# Visualize the architecture
print("\n" + "=" * 50)
print("PatchTST Architecture Overview")
print("=" * 50)
print("\n1. Input Time Series: [Batch, Seq_len, Channels]")
print("   ↓")
print("2. RevIN: Normalization")
print("   ↓")
print("3. Patching: Divide into patches [Batch, Channels, Patch_num, Patch_len]")
print("   ↓")
print("4. Transformer Encoder: Process patches")
print("   ↓")
print("5. Flatten & Linear Head: Generate predictions")
print("   ↓")
print("6. RevIN Denormalization")
print("   ↓")
print("7. Output: [Batch, Pred_len, Channels]")
print("=" * 50)

Changed to: /content/PatchTST/PatchTST_supervised
Back to: /content/PatchTST

PatchTST Architecture Overview

1. Input Time Series: [Batch, Seq_len, Channels]
   ↓
2. RevIN: Normalization
   ↓
3. Patching: Divide into patches [Batch, Channels, Patch_num, Patch_len]
   ↓
4. Transformer Encoder: Process patches
   ↓
5. Flatten & Linear Head: Generate predictions
   ↓
6. RevIN Denormalization
   ↓
7. Output: [Batch, Pred_len, Channels]


## 3. Data Loading and Preparation <a id='data'></a>

Let's explore the data loading process and prepare a sample dataset.

In [5]:
from data_provider.data_loader import Dataset_ETT_hour, Dataset_ETT_minute, Dataset_Custom
from data_provider.data_factory import data_provider
from torch.utils.data import DataLoader
import random

def set_seed(seed):
    """Set random seed for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # For multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random seed set to: {seed}")

# Define a simple configuration class
class Config:
    def __init__(self):
        self.random_seed = 2021  # Original PatchTST default seed

        # Data parameters
        self.data = 'custom'  # Use 'custom' for weather dataset
        self.root_path = dataset_path
        self.data_path = 'weather.csv'
        self.features = 'M'  # M: multivariate, S: univariate, MS: multivariate to univariate
        self.target = 'OT'
        self.freq = '10min'  # 10 min intervals for weather data
        self.embed = 'timeF'
        
        # Forecasting task
        self.seq_len = 336  # Input sequence length (look back window)
        self.label_len = 48  # Decoder start token (not used in PatchTST)
        self.pred_len = 336  # Prediction length (prediction window)
        
        # Model parameters
        self.model = 'PatchTST'
        self.enc_in = 21  # Number of input channels (weather has 21 features)
        self.dec_in = 21
        self.c_out = 21  # Number of output channels
        self.d_model = 128  # Dimension of model
        self.n_heads = 8  # Number of attention heads
        self.e_layers = 3  # Number of encoder layers
        self.d_layers = 1  # Decoder layers (not used in PatchTST)
        self.d_ff = 256  # Dimension of fcn
        self.dropout = 0.2
        self.fc_dropout = 0.2
        self.head_dropout = 0.0
        
        # PatchTST specific
        self.patch_len = 16  # Length of each patch
        self.stride = 8  # Stride for patching
        self.padding_patch = 'end'
        self.revin = 1  # Use RevIN
        self.affine = 0
        self.subtract_last = 0
        self.decomposition = 0
        self.kernel_size = 25
        self.individual = 0  # Individual head for each channel
        
        # Training parameters
        self.batch_size = 16      # Original PatchTST batch size
        self.learning_rate = 0.0001  # Standard learning rate
        self.train_epochs = 100   # Max epochs (early stopping will kick in)
        self.patience = 3        # Early stopping patience
        self.num_workers = 0      # No multiprocessing (Colab compatible)
        self.lradj = 'type3'      # Learning rate adjustment type
        self.use_amp = False      # No automatic mixed precision
        self.pct_start = 0.3      # OneCycleLR warmup percentage
        
        # GPU
        self.use_gpu = True if torch.cuda.is_available() else False
        self.gpu = 0
        self.use_multi_gpu = False
        self.devices = '0'

        # Other
        self.checkpoints = model_checkopoints
        self.output_attention = False  # Don't output attention weights
        self.embed_type = 0       # Default embedding type
        self.activation = 'gelu'  # Activation function
        self.distil = True        # Use distillation (for Informer, not PatchTST)
        self.factor = 1           # Attention factor
        self.moving_avg = 25      # Moving average window
        self.do_predict = False   # Not in prediction mode
        self.itr = 1              # Number of experiment iterations
        self.des = 'Exp'          # Experiment description
        self.loss = 'mse'         # Loss function
        
args = Config()
set_seed(args.random_seed)

Random seed set to: 2021


In [6]:
train_data, train_loader = data_provider(args, flag='train')
val_data, val_loader = data_provider(args, flag='val')
test_data, test_loader = data_provider(args, flag='test')

print(f"\nData Loaders Created:")
print(f"  Training samples: {len(train_data)}")
print(f"  Validation samples: {len(val_data)}")
print(f"  Test samples: {len(test_data)}")

# Inspect a batch
for batch_x, batch_y, batch_x_mark, batch_y_mark in train_loader:
    print(f"\nBatch shapes:")
    print(f"  Input (batch_x): {batch_x.shape}")
    print(f"  Target (batch_y): {batch_y.shape}")
    print(f"  Input time features (batch_x_mark): {batch_x_mark.shape}")
    print(f"  Target time features (batch_y_mark): {batch_y_mark.shape}")
    break

train 36040
val 4935
test 10204

Data Loaders Created:
  Training samples: 36040
  Validation samples: 4935
  Test samples: 10204

Batch shapes:
  Input (batch_x): torch.Size([16, 512, 21])
  Target (batch_y): torch.Size([16, 384, 21])
  Input time features (batch_x_mark): torch.Size([16, 512, 5])
  Target time features (batch_y_mark): torch.Size([16, 384, 5])


## 4. Model Configuration and Creation <a id='model'></a>

Let's create the PatchTST model and explore its structure.

In [7]:
# Create the model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = PatchTST(args).float()
model = model.to(device)

print(f"\nModel created successfully!")
print(f"\nModel Architecture:")
print(model)

Using device: cuda:0

Model created successfully!

Model Architecture:
Model(
  (model): PatchTST_backbone(
    (revin_layer): RevIN()
    (padding_patch_layer): ReplicationPad1d((0, 8))
    (backbone): TSTiEncoder(
      (W_P): Linear(in_features=16, out_features=128, bias=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (encoder): TSTEncoder(
        (layers): ModuleList(
          (0-2): 3 x TSTEncoderLayer(
            (self_attn): _MultiheadAttention(
              (W_Q): Linear(in_features=128, out_features=128, bias=True)
              (W_K): Linear(in_features=128, out_features=128, bias=True)
              (W_V): Linear(in_features=128, out_features=128, bias=True)
              (sdp_attn): _ScaledDotProductAttention(
                (attn_dropout): Dropout(p=0.0, inplace=False)
              )
              (to_out): Sequential(
                (0): Linear(in_features=128, out_features=128, bias=True)
                (1): Dropout(p=0.2, inplace=False)
              

## 5. Training the Model <a id='training'></a>

Now let's set up the training loop with proper optimization and learning rate scheduling.

In [8]:
# Fix NumPy 2.0 compatibility issue
import numpy as np
if not hasattr(np, 'Inf'):
    np.Inf = np.inf
    np.NaN = np.nan
    np.NAN = np.nan
    np.NINF = np.NINF if hasattr(np, 'NINF') else -np.inf
    print("NumPy compatibility patch applied for np.Inf -> np.inf")
else:
    print("NumPy already has np.Inf attribute")

NumPy compatibility patch applied for np.Inf -> np.inf


In [9]:
from utils.tools import EarlyStopping, adjust_learning_rate
from utils.metrics import metric
import time

# Training setup
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

# Learning rate scheduler
if os.path.exists(dataset_path):
    train_steps = len(train_loader)
    scheduler = lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        steps_per_epoch=train_steps,
        pct_start=args.pct_start,
        epochs=args.train_epochs,
        max_lr=args.learning_rate
    )
    
    print(f"Training Setup:")
    print(f"  Criterion: MSE Loss")
    print(f"  Optimizer: Adam (lr={args.learning_rate})")
    print(f"  Scheduler: OneCycleLR")
    print(f"  Training steps per epoch: {train_steps}")
    print(f"  Total epochs: {args.train_epochs}")

Training Setup:
  Criterion: MSE Loss
  Optimizer: Adam (lr=0.0001)
  Scheduler: OneCycleLR
  Training steps per epoch: 2252
  Total epochs: 100


In [10]:
# Simplified evaluation function - only MSE and MAE
def evaluate_model(model, test_loader, device, args):
    model.eval()
    preds = []
    trues = []
    inputs = []
    
    with torch.no_grad():
        for batch_x, batch_y, batch_x_mark, batch_y_mark in test_loader:
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            
            # Forward pass
            outputs = model(batch_x)
            
            # Extract predictions
            f_dim = -1 if args.features == 'MS' else 0
            outputs = outputs[:, -args.pred_len:, f_dim:]
            batch_y = batch_y[:, -args.pred_len:, f_dim:]
            
            # Store results
            preds.append(outputs.detach().cpu().numpy())
            trues.append(batch_y.detach().cpu().numpy())
            inputs.append(batch_x.detach().cpu().numpy())
    
    # Concatenate all batches
    preds = np.concatenate(preds, axis=0)
    trues = np.concatenate(trues, axis=0)
    inputs = np.concatenate(inputs, axis=0)
    
    # Calculate only MSE and MAE
    mse = np.mean((preds - trues) ** 2)
    mae = np.mean(np.abs(preds - trues))
    
    print("\nTest Set Evaluation Metrics:")
    print("=" * 50)
    print(f"  MSE:  {mse:.7f}")
    print(f"  MAE:  {mae:.7f}")
    print("=" * 50)
    
    return preds, trues, inputs, {'mse': mse, 'mae': mae}

# Count model parameters
def count_parameters(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return total_params, trainable_params

# Validation function
def validate(model, val_loader, criterion, device):
    model.eval()
    total_loss = []
    
    with torch.no_grad():
        for batch_x, batch_y, batch_x_mark, batch_y_mark in val_loader:
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            
            # Forward pass
            outputs = model(batch_x)
            
            # Calculate loss
            f_dim = -1 if args.features == 'MS' else 0
            outputs = outputs[:, -args.pred_len:, f_dim:]
            batch_y = batch_y[:, -args.pred_len:, f_dim:]
            
            loss = criterion(outputs.cpu(), batch_y.cpu())
            total_loss.append(loss.item())
    
    model.train()
    return np.mean(total_loss)


In [16]:
# Train multiple model configurations with different prediction lengths
# This cell handles TRAINING ONLY and saves checkpoints

import datetime

total, trainable = count_parameters(model)
print(f"\nModel Parameters:")
print(f"  Total parameters: {total:,}")
print(f"  Trainable parameters: {trainable:,}")
print(f"  Model size: ~{total * 4 / 1024 / 1024:.2f} MB (fp32)")

# Training configurations: (pred_len, d_model, model_name)
training_configs = [
    # (336, 64, 'PatchTST/64'),
    (336, 42, 'PatchTST/42'),
    # (720, 64, 'PatchTST/64'),
    # (720, 42, 'PatchTST/42'),
]

# Dictionary to store checkpoint paths
checkpoint_registry = {
    336: {
        # 'PatchTST/64': None,
        'PatchTST/42': None
    },
    # 720: {'PatchTST/64': None, 'PatchTST/42': None}
}

print("\n" + "="*90)
print("STARTING MULTI-MODEL TRAINING WITH DIFFERENT CONFIGURATIONS")
print("="*90)

for config_idx, (pred_len, d_model, model_name) in enumerate(training_configs, 1):
    print(f"\n{'='*90}")
    print(f"Configuration {config_idx}/{len(training_configs)}: {model_name} (pred_len={pred_len}, d_model={d_model})")
    print(f"{'='*90}")
    
    # Update configuration
    args.pred_len = pred_len
    args.d_model = d_model

    # Adjust related parameters based on d_model
    # For d_model=42: comes from patching (336-16)/8 + 1 = 42 patches
    if d_model == 42:
        args.n_heads = 3  # Divisor of 42
        args.d_ff = 128
    elif d_model == 64:
        args.n_heads = 4
        args.d_ff = 256
    else:
        args.n_heads = 8
        args.d_ff = 256
    
    print(f"  Updated config: pred_len={args.pred_len}, d_model={args.d_model}, n_heads={args.n_heads}, d_ff={args.d_ff}")
    print(f"  Patching: seq_len={args.seq_len}, patch_len={args.patch_len}, stride={args.stride}")
    print(f"  Number of patches: ({args.seq_len} - {args.patch_len})/{args.stride} + 1 = {(args.seq_len - args.patch_len) // args.stride + 1}")
    
    # Recreate data loaders for new pred_len
    print(f"  Loading data with pred_len={args.pred_len}...")
    train_data, train_loader = data_provider(args, flag='train')
    val_data, val_loader = data_provider(args, flag='val')
    test_data, test_loader = data_provider(args, flag='test')
    
    print(f"    Training samples: {len(train_data)}")
    print(f"    Validation samples: {len(val_data)}")
    print(f"    Test samples: {len(test_data)}")
    
    # Create new model
    print(f"  Creating model...")
    model = PatchTST(args).float()
    model = model.to(device)
    
    total_params, trainable_params = count_parameters(model)
    print(f"  Model parameters: {total_params:,} (trainable: {trainable_params:,})")
    
    # Setup training
    train_steps = len(train_loader)
    scheduler = lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        steps_per_epoch=train_steps,
        pct_start=args.pct_start,
        epochs=args.train_epochs,
        max_lr=args.learning_rate
    )
    
    # Reset training history
    train_losses = []
    val_losses = []
    test_losses = []
    
    early_stopping = EarlyStopping(patience=args.patience, verbose=False)
    
    # Training loop
    setting = f"{args.model}_{args.data}_sl{args.seq_len}_pl{args.pred_len}_dm{args.d_model}_patch{args.patch_len}"
    checkpoint_path = os.path.join(args.checkpoints, setting)
    os.makedirs(checkpoint_path, exist_ok=True)
    
    print(f"  Starting training (max {args.train_epochs} epochs)...")
    for epoch in range(args.train_epochs):
        model.train()
        epoch_time = time.time()
        train_loss = []
        
        for batch_x, batch_y, batch_x_mark, batch_y_mark in train_loader:
            optimizer.zero_grad()
            
            batch_x = batch_x.float().to(device)
            batch_y = batch_y.float().to(device)
            
            outputs = model(batch_x)
            
            f_dim = -1 if args.features == 'MS' else 0
            outputs = outputs[:, -args.pred_len:, f_dim:]
            batch_y = batch_y[:, -args.pred_len:, f_dim:]
            
            loss = criterion(outputs, batch_y)
            train_loss.append(loss.item())
            loss.backward()
            optimizer.step()
            
            if args.lradj == 'TST':
                adjust_learning_rate(optimizer, scheduler, epoch + 1, args, printout=False)
                scheduler.step()
        
        # Validation and testing
        train_loss_avg = np.mean(train_loss)
        val_loss = validate(model, val_loader, criterion, device)
        test_loss = validate(model, test_loader, criterion, device)
        
        train_losses.append(train_loss_avg)
        val_losses.append(val_loss)
        test_losses.append(test_loss)
        
        if (epoch + 1) % 5 == 0 or epoch == 0:
            epoch_duration = time.time() - epoch_time
            print(f"    Epoch {epoch+1:3d}/{args.train_epochs} | Time: {epoch_duration:.2f}s | "
                    f"Train Loss: {train_loss_avg:.7f} | Val Loss: {val_loss:.7f}")
        
        # Early stopping
        early_stopping(val_loss, model, checkpoint_path)
        if early_stopping.early_stop:
            print(f"    Early stopping at epoch {epoch+1}")
            break
        
        if args.lradj != 'TST':
            adjust_learning_rate(optimizer, scheduler, epoch + 1, args)
    
    # Save final checkpoint for this configuration
    final_checkpoint_path = os.path.join(checkpoint_path, 'final_checkpoint.pth')
    torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': train_losses[-1] if train_losses else 0,
        'val_loss': val_losses[-1] if val_losses else 0,
        'config': vars(args)
    }, final_checkpoint_path)
    print(f"  Final checkpoint saved to: {final_checkpoint_path}")
    
    # Store checkpoint path in registry
    checkpoint_registry[pred_len][model_name] = checkpoint_path


print("Run the next cell to generate evaluation metrics and visualizations")

print("Checkpoint locations saved in 'checkpoint_registry' variable")print(f"{'='*90}\n")

checkpoint_registry[pred_len][model_name] = checkpoint_path

print(f"\n{'='*90}")



Model Parameters:
  Total parameters: 1,994,427
  Trainable parameters: 1,994,424
  Model size: ~7.61 MB (fp32)

STARTING MULTI-MODEL TRAINING WITH DIFFERENT CONFIGURATIONS

Configuration 1/1: PatchTST/42 (pred_len=336, d_model=42)
  Updated config: pred_len=336, d_model=128, n_heads=16, d_ff=256
  Loading data with pred_len=336...
train 36040
val 4935
test 10204
    Training samples: 36040
    Validation samples: 4935
    Test samples: 10204
  Creating model...
  Model parameters: 3,160,659 (trainable: 3,160,656)
  Starting training (max 100 epochs)...
    Epoch   1/100 | Time: 163.53s | Train Loss: 1.0070179 | Val Loss: 0.8861486
Updating learning rate to 0.0001


KeyboardInterrupt: 

In [15]:
# Load saved checkpoints and generate evaluation metrics and visualizations
# This cell handles EVALUATION and VISUALIZATION ONLY

import datetime

# Dictionary to store all results
all_results = {
    336: {
        'PatchTST/64': {'mse': None, 'mae': None, 'checkpoint_path': None},
        'PatchTST/42': {'mse': None, 'mae': None, 'checkpoint_path': None}
    },
    720: {
        'PatchTST/64': {'mse': None, 'mae': None, 'checkpoint_path': None},
        'PatchTST/42': {'mse': None, 'mae': None, 'checkpoint_path': None}
    }
}

# Training configurations: (pred_len, d_model, model_name)
training_configs = [
    # (336, 64, 'PatchTST/64'),
    (336, 42, 'PatchTST/42'),
    # (720, 64, 'PatchTST/64'),
    # (720, 42, 'PatchTST/42'),
]

print("\n" + "="*90)
print("LOADING CHECKPOINTS AND EVALUATING MODELS")
print("="*90)

for config_idx, (pred_len, d_model, model_name) in enumerate(training_configs, 1):
    print(f"\n{'='*90}")
    print(f"Configuration {config_idx}/{len(training_configs)}: {model_name} (pred_len={pred_len}, d_model={d_model})")
    print(f"{'='*90}")
    
    # Update configuration
    args.pred_len = pred_len
    args.d_model = d_model
    
    # Adjust related parameters proportionally (MUST MATCH TRAINING)
    if d_model == 42:
        args.n_heads = 3  # Adjust heads for smaller model
        args.d_ff = 128
    elif d_model == 64:
        args.n_heads = 4
        args.d_ff = 256
    else:
        args.n_heads = 8  # Original PatchTST
        args.d_ff = 256
    
    # Recreate data loaders for evaluation
    print(f"  Loading data with pred_len={args.pred_len}...")
    train_data, train_loader = data_provider(args, flag='train')
    val_data, val_loader = data_provider(args, flag='val')
    test_data, test_loader = data_provider(args, flag='test')
    
    # Create model for loading checkpoint
    print(f"  Creating model for evaluation...")
    model = PatchTST(args).float()
    model = model.to(device)
    
    # Get checkpoint path
    checkpoint_path = checkpoint_registry[pred_len][model_name]
    best_model_path = os.path.join(checkpoint_path, 'checkpoint.pth')
    final_checkpoint_path = os.path.join(checkpoint_path, 'final_checkpoint.pth')
    
    # Load best model from checkpoint
    print(f"  Loading best model from: {best_model_path}")
    model.load_state_dict(torch.load(best_model_path, weights_only=False))
    
    # Evaluate model
    print(f"  Evaluating model on test set...")
    preds, trues, inputs, metrics = evaluate_model(model, test_loader, device, args)
    
    # Store results
    all_results[pred_len][model_name]['mse'] = metrics['mse']
    all_results[pred_len][model_name]['mae'] = metrics['mae']
    all_results[pred_len][model_name]['checkpoint_path'] = final_checkpoint_path

print(f"\n{'='*90}")
print("EVALUATION COMPLETED")
print(f"{'='*90}\n")

# Save all results to a file
results_log_path = os.path.join(args.checkpoints, 'training_results_summary.txt')
with open(results_log_path, 'w') as f:
    f.write("PatchTST Multi-Configuration Training Results\n")
    f.write(f"Timestamp: {datetime.datetime.now()}\n")
    f.write("="*90 + "\n\n")
    
    for pred_len in [336, 720]:
        f.write(f"Prediction Length: {pred_len}\n")
        f.write("-"*90 + "\n")
        for model_name in ['PatchTST/64', 'PatchTST/42']:
            mse = all_results[pred_len][model_name]['mse']
            mae = all_results[pred_len][model_name]['mae']
            checkpoint = all_results[pred_len][model_name]['checkpoint_path']
            f.write(f"  {model_name}:\n")
            f.write(f"    MSE: {mse:.7f}\n")
            f.write(f"    MAE: {mae:.7f}\n")
            f.write(f"    Checkpoint: {checkpoint}\n")
        f.write("\n")

print(f"Results summary saved to: {results_log_path}")

# Display final comparison table
print("\n" + "="*90)
print("FINAL RESULTS COMPARISON")
print("="*90)
print(f"\n{'Pred Length':<15} {'PatchTST/64 MSE':<18} {'PatchTST/64 MAE':<18} {'PatchTST/42 MSE':<18} {'PatchTST/42 MAE':<18}")
print("-"*90)

for pred_len in [336, 720]:
    mse_64 = all_results[pred_len]['PatchTST/64']['mse']
    mae_64 = all_results[pred_len]['PatchTST/64']['mae']
    mse_42 = all_results[pred_len]['PatchTST/42']['mse']
    mae_42 = all_results[pred_len]['PatchTST/42']['mae']
    
    mse_64_str = f"{mse_64:.7f}" if mse_64 is not None else "N/A"
    mae_64_str = f"{mae_64:.7f}" if mae_64 is not None else "N/A"
    mse_42_str = f"{mse_42:.7f}" if mse_42 is not None else "N/A"
    mae_42_str = f"{mae_42:.7f}" if mae_42 is not None else "N/A"
    
    print(f"{str(pred_len):<15} {mse_64_str:<18} {mae_64_str:<18} {mse_42_str:<18} {mae_42_str:<18}")

print("="*90)


LOADING CHECKPOINTS AND EVALUATING MODELS

Configuration 1/4: PatchTST/64 (pred_len=336, d_model=64)
  Loading data with pred_len=336...
train 36040
val 4935
test 10204
  Creating model for evaluation...
  Loading best model from: /content/model/checkpoints_weather/PatchTST_custom_sl512_pl336_dm64_patch16/checkpoint.pth
  Evaluating model on test set...

Test Set Evaluation Metrics:
  MSE:  0.3707829
  MAE:  0.3848366

Configuration 2/4: PatchTST/42 (pred_len=336, d_model=42)
  Loading data with pred_len=336...
train 36040
val 4935
test 10204
  Creating model for evaluation...
  Loading best model from: /content/model/checkpoints_weather/PatchTST_custom_sl512_pl336_dm42_patch16/checkpoint.pth
  Evaluating model on test set...

Test Set Evaluation Metrics:
  MSE:  0.3694819
  MAE:  0.3843580

Configuration 3/4: PatchTST/64 (pred_len=720, d_model=64)
  Loading data with pred_len=720...
train 35656
val 4551
test 9820
  Creating model for evaluation...
  Loading best model from: /content/