# HuggingFace PatchTST test

1. Pendulum prediction test (transfer learning)

2. Pendulum representation learning test: encoder-decoder separation (transfer learning)

3. Train simple PatchTST ??


In [1]:
# Standard library
import os
from pathlib import Path
from tqdm import tqdm

# Libraries
import numpy as np
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd

# ML libraries
import torch
import torch.nn as nn

In [2]:
from transformers import (
    PatchTSTConfig,
    PatchTSTForPrediction
)

In [3]:
from transformers import set_seed

# Fix the seed up front so the random test inputs created with torch.randn in the
# cells below are repeatable across runs (assumes set_seed also seeds torch's RNG,
# as described in the transformers docs — confirm for the installed version).
set_seed(2023)

In [4]:
# Load the pretrained PatchTST forecasting checkpoint directly from the Hugging Face Hub.
model_name = "namctin/patchtst_etth1_forecast"
# Access token is read from the environment; os.getenv returns None when the
# variable is not set.
token = os.getenv('HUGGING_FACE_HUB_TOKEN')

# `token=` is the current keyword for passing Hub credentials; the former
# `use_auth_token=` spelling is deprecated.
patch_tst = PatchTSTForPrediction.from_pretrained(model_name, token=token)

# Show the whole model, then the two parts inspected separately further down.
print(patch_tst)
print("----------------")
print(patch_tst.model.encoder)
print("----------------")
print(patch_tst.head)



PatchTSTForPrediction(
  (model): PatchTSTModel(
    (scaler): PatchTSTScaler(
      (scaler): PatchTSTStdScaler()
    )
    (patchifier): PatchTSTPatchify()
    (masking): Identity()
    (encoder): PatchTSTEncoder(
      (embedder): PatchTSTEmbedding(
        (input_embedding): Linear(in_features=12, out_features=128, bias=True)
      )
      (positional_encoder): PatchTSTPositionalEncoding(
        (positional_dropout): Identity()
      )
      (layers): ModuleList(
        (0-2): 3 x PatchTSTEncoderLayer(
          (self_attn): PatchTSTAttention(
            (k_proj): Linear(in_features=128, out_features=128, bias=True)
            (v_proj): Linear(in_features=128, out_features=128, bias=True)
            (q_proj): Linear(in_features=128, out_features=128, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
          )
          (dropout_path1): Identity()
          (norm_sublayer1): PatchTSTBatchNorm(
            (batchnorm): BatchNorm1d(128, eps

### Test the different parts (input and output shapes)

#### PatchTST total 

In [5]:
# Re-display the full architecture of the loaded model as a reference for this section.
print(patch_tst)

PatchTSTForPrediction(
  (model): PatchTSTModel(
    (scaler): PatchTSTScaler(
      (scaler): PatchTSTStdScaler()
    )
    (patchifier): PatchTSTPatchify()
    (masking): Identity()
    (encoder): PatchTSTEncoder(
      (embedder): PatchTSTEmbedding(
        (input_embedding): Linear(in_features=12, out_features=128, bias=True)
      )
      (positional_encoder): PatchTSTPositionalEncoding(
        (positional_dropout): Identity()
      )
      (layers): ModuleList(
        (0-2): 3 x PatchTSTEncoderLayer(
          (self_attn): PatchTSTAttention(
            (k_proj): Linear(in_features=128, out_features=128, bias=True)
            (v_proj): Linear(in_features=128, out_features=128, bias=True)
            (q_proj): Linear(in_features=128, out_features=128, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
          )
          (dropout_path1): Identity()
          (norm_sublayer1): PatchTSTBatchNorm(
            (batchnorm): BatchNorm1d(128, eps

In [6]:
# Keep a handle on the top-level config: the per-component cells below compare
# each sub-module's `config` attribute (when it has one) against this object.
configuration = patch_tst.config
print(configuration)

PatchTSTConfig {
  "activation_function": "gelu",
  "architectures": [
    "PatchTSTForPrediction"
  ],
  "attention_dropout": 0.0,
  "bias": true,
  "channel_attention": false,
  "channel_consistent_masking": false,
  "context_length": 512,
  "d_model": 128,
  "distribution_output": "student_t",
  "do_mask_input": null,
  "dropout": 0.2,
  "dtype": "float32",
  "ff_dropout": 0.0,
  "ffn_dim": 512,
  "head_dropout": 0.2,
  "init_std": 0.02,
  "loss": "mse",
  "mask_input": null,
  "mask_type": "random",
  "mask_value": 0,
  "model_type": "patchtst",
  "norm_eps": 1e-05,
  "norm_type": "batchnorm",
  "num_attention_heads": 16,
  "num_forecast_mask_patches": [
    2
  ],
  "num_hidden_layers": 3,
  "num_input_channels": 7,
  "num_parallel_samples": 100,
  "num_targets": 1,
  "output_range": null,
  "patch_length": 12,
  "patch_stride": 12,
  "path_dropout": 0.0,
  "pooling_type": null,
  "positional_dropout": 0.0,
  "positional_encoding_type": "sincos",
  "pre_norm": true,
  "prediction_

In [7]:
# End-to-end forward pass on a random batch to inspect the output fields and shapes.
x = torch.randn(32, 512, 7)     # (batch_size, seq_len, feature_dim)
print("Input shape:", x.shape)
# Shape inspection only: run without autograd so no computation graph is built.
with torch.no_grad():
    y = patch_tst(x)

print(y)
print(y.keys())
print("Loc shape:", y["loc"].shape)                                      # (batch_size, 1, feature_dim)
print("Scale shape:", y["scale"].shape)                                  # (batch_size, 1, feature_dim)
print("Prediction output shape:", y["prediction_outputs"].shape)         # (batch_size, pred_len, feature_dim)

Input shape: torch.Size([32, 512, 7])
PatchTSTForPredictionOutput(loss=None, prediction_outputs=tensor([[[ 8.8111e-02,  1.7028e-01,  3.6903e-01,  ...,  7.0812e-02,
           1.7413e-01,  2.9766e-01],
         [ 8.6662e-02,  1.9881e-01,  3.8791e-01,  ...,  1.1349e-01,
           1.4575e-01,  2.9962e-01],
         [ 6.9755e-02,  2.1825e-01,  3.5104e-01,  ...,  1.5535e-01,
           8.8342e-02,  2.8630e-01],
         ...,
         [ 3.4225e-02,  1.0364e-01,  4.6720e-03,  ..., -6.8436e-02,
           7.2459e-02,  1.1766e-01],
         [ 5.4895e-02,  8.3268e-02,  1.1626e-01,  ..., -3.1151e-02,
           1.1622e-01,  1.5621e-01],
         [ 3.0978e-02,  5.2004e-02,  2.3360e-01,  ..., -2.0433e-02,
           1.1170e-01,  1.4780e-01]],

        [[ 3.1012e-02,  4.8573e-01,  2.5063e-01,  ...,  1.5257e-01,
           2.2662e-01, -4.2260e-01],
         [ 8.7235e-02,  4.8412e-01,  2.7962e-01,  ...,  5.9964e-02,
           2.2064e-01, -4.1747e-01],
         [ 1.2523e-01,  4.3521e-01,  2.8834e-01,

#### Model

In [8]:
# Backbone of the prediction model; the printout shows it chains a scaler,
# a patchifier, a masking module and the encoder.
model = patch_tst.model
print(model)

PatchTSTModel(
  (scaler): PatchTSTScaler(
    (scaler): PatchTSTStdScaler()
  )
  (patchifier): PatchTSTPatchify()
  (masking): Identity()
  (encoder): PatchTSTEncoder(
    (embedder): PatchTSTEmbedding(
      (input_embedding): Linear(in_features=12, out_features=128, bias=True)
    )
    (positional_encoder): PatchTSTPositionalEncoding(
      (positional_dropout): Identity()
    )
    (layers): ModuleList(
      (0-2): 3 x PatchTSTEncoderLayer(
        (self_attn): PatchTSTAttention(
          (k_proj): Linear(in_features=128, out_features=128, bias=True)
          (v_proj): Linear(in_features=128, out_features=128, bias=True)
          (q_proj): Linear(in_features=128, out_features=128, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
        )
        (dropout_path1): Identity()
        (norm_sublayer1): PatchTSTBatchNorm(
          (batchnorm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (

In [9]:
# Report whether the backbone exposes a `config` attribute and, if it does,
# whether that config compares equal to the top-level PatchTST config.
if not hasattr(model, 'config'):
    print("No config attribute for model")
elif model.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("Model config:", model.config)


Config is the same as the patchtst config


In [10]:
# Forward pass through the backbone only (no prediction head) on a random batch.
x = torch.randn(32, 512, 7)  # (batch_size, seq_len, feature_dim)
print(x.shape)
# Shape inspection only: run without autograd so no computation graph is built.
with torch.no_grad():
    y = model(x)

print(y)
print(y.keys())
print(y["loc"].shape)               # (batch_size, 1, feature_dim)
print(y["scale"].shape)             # (batch_size, 1, feature_dim)
print(y["patch_input"].shape)       # (batch_size, feature_dim, n_patches, patch_len)
print(y["last_hidden_state"].shape) # (batch_size, feature_dim, n_patches, d_model)

torch.Size([32, 512, 7])
PatchTSTModelOutput(last_hidden_state=tensor([[[[ 1.1530e-01,  3.8802e-02, -5.7090e-01,  ...,  3.4679e-01,
            2.9975e-01,  2.9606e-01],
          [ 1.7347e-01,  4.6780e-01, -4.3653e-01,  ...,  2.4095e-01,
            2.3122e-01,  3.3059e-01],
          [ 3.4398e-01, -2.4829e-01, -2.1687e-01,  ...,  4.1609e-01,
            2.0628e-01,  1.7155e-01],
          ...,
          [-1.1571e-01,  4.2251e-01, -2.5873e-01,  ...,  1.8190e-01,
           -8.6008e-02,  9.3466e-02],
          [ 1.1427e-02, -3.0515e-01, -1.9051e-01,  ...,  1.1153e-01,
            1.3485e-01,  2.4354e-01],
          [ 1.5352e-01, -8.4608e-01, -9.8022e-01,  ...,  2.3162e-01,
            2.8967e-02,  3.4100e-01]],

         [[-2.1811e-01,  8.4555e-01, -4.9562e-01,  ...,  1.2342e-01,
           -8.0912e-02, -9.9286e-02],
          [-1.0289e-01,  1.3632e+00, -6.9327e-01,  ...,  2.6556e-01,
           -1.7301e-01,  2.6455e-01],
          [-7.7201e-02,  4.2372e-01, -6.7731e-02,  ..., -4.4374e

#### Scaler

In [11]:
# Scaler stage of the backbone; the printout shows it wraps a PatchTSTStdScaler.
scaler = patch_tst.model.scaler
print(scaler)

PatchTSTScaler(
  (scaler): PatchTSTStdScaler()
)


In [12]:
# Report whether the scaler exposes a `config` attribute and, if it does,
# whether that config compares equal to the top-level PatchTST config.
if not hasattr(scaler, 'config'):
    print("No config attribute for scaler")
elif scaler.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("scaler config:", scaler.config)

No config attribute for scaler


In [13]:
# help() writes the documentation to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
help(scaler)

Help on PatchTSTScaler in module transformers.models.patchtst.modeling_patchtst object:

class PatchTSTScaler(torch.nn.modules.module.Module)
 |  PatchTSTScaler(config: transformers.models.patchtst.configuration_patchtst.PatchTSTConfig)
 |  
 |  Method resolution order:
 |      PatchTSTScaler
 |      torch.nn.modules.module.Module
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, config: transformers.models.patchtst.configuration_patchtst.PatchTSTConfig)
 |      Initialize internal Module state, shared by both nn.Module and ScriptModule.
 |  
 |  forward(self, data: torch.Tensor, observed_indicator: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]
 |      Parameters:
 |          data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
 |              Input for scaler calculation
 |          observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
 |              Calculat

In [14]:
# Run the scaler alone on a random batch, marking every time step as observed.
x = torch.randn(32, 512, 7)  # (batch_size, seq_len, feature_dim)
print(x.shape)
observed_indicator = torch.ones_like(x)
y = scaler(x, observed_indicator)

print(y)
# The scaler returns a 3-tuple (presumably scaled data, loc, scale — confirm):
#   y[0]: (batch_size, seq_len, feature_dim)
#   y[1]: (batch_size, 1, feature_dim)
#   y[2]: (batch_size, 1, feature_dim)
print(*(y[i].shape for i in range(3)), sep="\n")

torch.Size([32, 512, 7])
(tensor([[[ 0.3972,  0.2267,  1.1127,  ..., -0.3972, -1.8628, -0.4584],
         [ 1.4118, -0.2611,  0.7645,  ..., -0.2996, -0.0775, -0.9644],
         [-0.7245, -0.1894,  0.6083,  ..., -0.0585,  0.0872, -1.4757],
         ...,
         [-0.0803,  0.3079,  0.3420,  ...,  0.4334, -0.3565, -0.6520],
         [-1.0479,  0.1220, -1.2501,  ...,  0.5923,  0.6201,  1.8374],
         [-0.6825,  2.2455, -0.2523,  ...,  1.1343,  0.3267,  0.2213]],

        [[ 0.5742,  0.5077, -0.7264,  ..., -0.4919,  2.5178,  1.8407],
         [ 0.8922, -0.0727, -0.5757,  ...,  1.5378,  0.7419, -0.7947],
         [-0.2786, -0.6793, -1.6733,  ...,  0.7933,  0.1786, -1.1213],
         ...,
         [-3.8304, -0.5799,  1.2930,  ...,  0.3280,  0.3620,  1.1060],
         [ 0.0528, -0.8538, -0.4834,  ..., -0.1926, -0.4699, -0.7764],
         [-0.1945, -0.0110, -1.1665,  ..., -0.5518, -0.1372, -1.1199]],

        [[ 0.6212, -1.2352, -0.0760,  ...,  0.3068, -1.5586,  1.2645],
         [-0.0947, 

#### Patchifier

In [15]:
# Patchifier stage of the backbone; it prints as a bare PatchTSTPatchify() with
# no child modules listed.
patchifier = patch_tst.model.patchifier
print(patchifier)

PatchTSTPatchify()


In [16]:
# Report whether the patchifier exposes a `config` attribute and, if it does,
# whether that config compares equal to the top-level PatchTST config.
if not hasattr(patchifier, 'config'):
    print("No config attribute for patchifier")
elif patchifier.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("patchifier config:", patchifier.config)

No config attribute for patchifier


In [17]:
# print(help(patchifier))

In [18]:
# Cut a random batch into patches with the patchifier alone.
x = torch.randn(32, 512, 7)  # (batch_size, seq_len, feature_dim)
print(x.shape)
y = patchifier(x)

# Values first, then the shape on its own line. Expected layout is
# (batch_size, feature_dim, n_patches, patch_len), like `patch_input` of the
# backbone output — confirm against the printed shape.
print(y, y.shape, sep="\n")

torch.Size([32, 512, 7])
tensor([[[[ 2.6716e+00, -1.2197e-01,  5.1084e-01,  ...,  6.0517e-01,
           -5.6489e-01,  2.5530e-01],
          [ 1.1997e-01,  1.1466e-01,  2.4277e-01,  ...,  7.8695e-01,
            3.6354e-02,  4.6507e-01],
          [ 9.8573e-01,  1.3754e-01, -4.4183e-02,  ..., -9.5744e-02,
            2.8034e+00,  1.0092e+00],
          ...,
          [ 3.0133e-01,  2.0709e+00, -1.4508e-02,  ..., -5.4333e-01,
            6.0142e-02, -2.0382e-01],
          [ 5.8343e-01, -1.6002e+00, -8.7410e-01,  ...,  2.6471e-01,
            3.1500e-01, -1.6383e+00],
          [ 2.2934e+00,  8.4243e-01, -2.5930e-02,  ..., -1.6417e+00,
            1.3874e+00, -4.1150e-01]],

         [[-4.9003e-01, -4.9462e-01,  1.4232e+00,  ...,  3.5034e-01,
           -7.0691e-01,  1.0614e+00],
          [-6.1696e-01, -7.4324e-01, -9.9000e-01,  ..., -3.8949e-01,
            1.9900e+00,  6.3477e-01],
          [-1.9229e+00,  2.4430e+00,  9.9762e-01,  ..., -3.8964e-02,
            1.2430e+00,  4.5243e-

#### Masking

In [19]:
# Masking stage of the backbone; for this checkpoint it prints as a plain Identity().
masking = patch_tst.model.masking
print(masking)

Identity()


In [20]:
# Report whether the masking module exposes a `config` attribute and, if it
# does, whether that config compares equal to the top-level PatchTST config.
if not hasattr(masking, 'config'):
    print("No config attribute for masking")
elif masking.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("masking config:", masking.config)

No config attribute for masking


In [21]:
# print(help(masking))

In [22]:
# The masking module printed as Identity(), so any tensor should come back
# unchanged; an arbitrary 3-D tensor of ones is enough to check that.
x = torch.ones(32, 512, 12)
print(x.shape)
y = masking(x)

print(y, y.shape, sep="\n")
# Same shape and same values, element for element.
assert torch.equal(x, y)

torch.Size([32, 512, 12])
tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]],

        ...,

        [[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1

#### Encoder

In [23]:
# Encoder of the backbone; the printout shows an embedder, a positional encoder
# and a stack of 3 encoder layers.
encoder = patch_tst.model.encoder
print(encoder)


PatchTSTEncoder(
  (embedder): PatchTSTEmbedding(
    (input_embedding): Linear(in_features=12, out_features=128, bias=True)
  )
  (positional_encoder): PatchTSTPositionalEncoding(
    (positional_dropout): Identity()
  )
  (layers): ModuleList(
    (0-2): 3 x PatchTSTEncoderLayer(
      (self_attn): PatchTSTAttention(
        (k_proj): Linear(in_features=128, out_features=128, bias=True)
        (v_proj): Linear(in_features=128, out_features=128, bias=True)
        (q_proj): Linear(in_features=128, out_features=128, bias=True)
        (out_proj): Linear(in_features=128, out_features=128, bias=True)
      )
      (dropout_path1): Identity()
      (norm_sublayer1): PatchTSTBatchNorm(
        (batchnorm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (ff): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): GELUActivation()
        (2): Identity()
        (3): Linear(in_features=512, out_features=128,

In [24]:
# Report whether the encoder exposes a `config` attribute and, if it does,
# whether that config compares equal to the top-level PatchTST config.
if not hasattr(encoder, 'config'):
    print("No config attribute for encoder")
elif encoder.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("encoder config:", encoder.config)

Config is the same as the patchtst config


In [25]:
# print(help(encoder))

In [26]:
# Run the encoder alone on random, already-patched input:
# 42 patches of length 12 for each of the 7 channels.
x = torch.randn(32, 7, 42, 12)  # (batch_size, feature_dim, n_patches, patch_len)
print(x.shape)
# Shape inspection only: run without autograd so no computation graph is built.
with torch.no_grad():
    y = encoder(x)

print(y)
print(y.keys())
# NOTE(review): the patch axis may be n_patches + 1 here — the positional encoder
# output in the "Encoder components" cell shows 43 positions for 42 input patches.
print(y["last_hidden_state"].shape) # (batch_size, feature_dim, n_patches [+1?], d_model)

torch.Size([32, 7, 42, 12])


BaseModelOutput(last_hidden_state=tensor([[[[-1.7059e-01,  3.5729e-02,  1.3017e-01,  ..., -6.3459e-01,
           -1.4387e-01, -5.5916e-01],
          [-1.7156e-02, -1.5409e-01,  6.1539e-01,  ..., -5.6577e-01,
            2.8412e-02, -5.2964e-01],
          [ 1.3428e-01, -2.2634e-01, -1.5831e-01,  ..., -3.1195e-01,
           -1.8213e-01, -2.2653e-01],
          ...,
          [-1.0776e-01, -2.2477e-02, -4.2349e-01,  ..., -3.8298e-01,
           -5.5220e-01, -4.8367e-01],
          [-4.5110e-02, -9.1618e-03, -1.2861e-01,  ..., -4.7202e-01,
           -1.6093e-01, -3.8218e-01],
          [-1.8988e-01,  4.5202e-01,  1.0803e-01,  ..., -4.7452e-01,
            6.1693e-02,  1.7700e-01]],

         [[-4.5324e-01,  7.9789e-01, -8.6619e-01,  ...,  1.2203e-02,
           -1.5963e-01, -3.3243e-01],
          [-3.6295e-01,  9.5589e-01, -6.2298e-01,  ...,  2.4897e-01,
           -1.8741e-01, -5.3131e-02],
          [-1.2978e-01,  3.6470e-02, -7.4439e-01,  ...,  1.8562e-01,
           -1.5664e-01, 

#### Encoder components

In [27]:
# Step through the encoder's sub-modules one at a time.
x0 = torch.randn(32, 7, 42, 12)  # (batch_size, features_dim, n_patches, len_patch)
# Shape inspection only: run without autograd so no computation graph is built.
with torch.no_grad():
    x1 = encoder.embedder(x0)
    x2 = encoder.positional_encoder(x1)
    # The encoder layers are probed with an unrelated tensor of arbitrary leading
    # dimensions; only the last dimension (128 = d_model) matches the model.
    x3 = torch.randn(11, 22, 33, 128)
    x3 = encoder.layers[0](x3)
    x3 = x3[0]  # a layer returns a tuple; element 0 is the hidden state
    x3 = encoder.layers[1](x3)
# print(x1)
# print(x2)
print(len(x3))  # x3 is the tuple returned by the second layer
print(x1.shape)  # (batch_size, features_dim, n_patches, d_model)
# The positional encoder adds one position along the patch axis (42 -> 43 in the
# output below); presumably a prepended CLS token — confirm against the config.
print(x2.shape)  # (batch_size, features_dim, n_patches + 1, d_model)
print(x3[0].shape)  # same shape as the probe tensor: (11, 22, 33, 128)

1
torch.Size([32, 7, 42, 128])
torch.Size([32, 7, 43, 128])
torch.Size([11, 22, 33, 128])


#### Head

In [28]:
# Prediction head; the printout shows a flatten, a linear projection (128 -> 96)
# and a dropout layer.
head = patch_tst.head
print(head)

PatchTSTPredictionHead(
  (flatten): Flatten(start_dim=2, end_dim=-1)
  (projection): Linear(in_features=128, out_features=96, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [29]:
# Report whether the head exposes a `config` attribute and, if it does,
# whether that config compares equal to the top-level PatchTST config.
if not hasattr(head, 'config'):
    print("No config attribute for head")
elif head.config == configuration:
    print("Config is the same as the patchtst config")
else:
    print("Config is different from the patchtst config")
    print("head config:", head.config)

No config attribute for head


In [30]:
# print(help(head))

In [31]:
# Run the prediction head alone on a random encoder-like embedding.
# NOTE(review): the encoder's positional encoder emits 43 positions along the
# patch axis while 42 are used here; the head's projection takes 128 features, so
# it presumably reads a single pooled/first position — confirm.
x = torch.randn(32, 7, 42, 128)  # (batch_size, feature_dim, n_patches, d_model)
print(x.shape)
# Shape inspection only: run without autograd so no computation graph is built.
with torch.no_grad():
    y = head(x)

print(y)
print(y.shape)

torch.Size([32, 7, 42, 128])
tensor([[[-0.1354,  0.4033, -0.0155,  ..., -0.1611,  0.0125,  0.0577],
         [-0.3234,  0.3424, -0.2550,  ..., -0.1485, -0.4947,  0.1702],
         [ 0.0599,  0.2622, -0.1050,  ..., -0.3702, -0.0433, -0.4791],
         ...,
         [-0.1392,  0.2729, -0.0319,  ..., -0.2653, -0.3301,  0.2183],
         [-0.4272,  0.4116, -0.2587,  ..., -0.4063, -0.2970,  0.1108],
         [ 0.0620,  0.2191, -0.1331,  ..., -0.1038, -0.1996,  0.2124]],

        [[ 0.3768,  0.1791, -0.2440,  ..., -0.0517,  0.1739, -0.0062],
         [ 0.5042,  0.0958,  0.0570,  ..., -0.2111,  0.1739, -0.4097],
         [ 0.7940,  0.0791,  0.3117,  ...,  0.1187,  0.1764, -0.5925],
         ...,
         [-0.2983, -0.2720,  0.0367,  ..., -0.1240,  0.4437, -0.1258],
         [ 0.3337, -0.2774, -0.4875,  ..., -0.0276,  0.3326,  0.0426],
         [-0.0398,  0.1002, -0.0268,  ...,  0.1891,  0.2481,  0.0864]],

        [[ 0.0373,  0.0627, -0.1637,  ...,  0.0972, -0.2946,  0.3979],
         [-0.012