# HuggingFace TimesFM test

In [6]:
# Standard librairy
import os
from pathlib import Path
from tqdm import tqdm

# Librairies
import numpy as np
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd

# ML librairies
import torch
import torch.nn as nn

In [7]:
from transformers import (
    PatchTSTConfig,
    TimesFmModelForPrediction
)

from transformers import set_seed
set_seed(42)

In [8]:
timesfm = TimesFmModelForPrediction.from_pretrained(
    "google/timesfm-2.0-500m-pytorch",
    dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map="auto",
)

In [9]:
print(f"\nModel: \n{timesfm}")
print("\n-----------------------\n\n")
print(f"Number of parameters: \n{timesfm.num_parameters()/1e6:.2f}M")
print("\n-----------------------\n\n")
print(f"Config: \n{timesfm.config}")


Model: 
TimesFmModelForPrediction(
  (decoder): TimesFmModel(
    (input_ff_layer): TimesFmResidualBlock(
      (input_layer): Linear(in_features=64, out_features=1280, bias=True)
      (activation): SiLU()
      (output_layer): Linear(in_features=1280, out_features=1280, bias=True)
      (residual_layer): Linear(in_features=64, out_features=1280, bias=True)
    )
    (freq_emb): Embedding(3, 1280)
    (layers): ModuleList(
      (0-49): 50 x TimesFmDecoderLayer(
        (self_attn): TimesFmAttention(
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (o_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (mlp): TimesFmMLP(
          (gate_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (down_proj): Linear(in_features=1280, out_features=1280, bias=True)

In [62]:
dummy_input = [torch.randn(length).to(torch.bfloat16).to(timesfm.device) for length in [5, 500, 1000, 3000, 10000]]
outputs = timesfm(past_values=dummy_input)
print("-----------------------")
print(f"Input len: {len(dummy_input)}")
print(f"Input (first) size: {dummy_input[0].shape}")
print("-----------------------")
print(f"Output: {outputs.keys()}")
print(f"Output (hidden) size: {outputs['last_hidden_state'].shape}")
print(f"Output (points) size: {outputs['mean_predictions'].shape}")
print(f"Output (quantiles) size: {outputs['full_predictions'].shape}")


-----------------------
Input len: 5
Input (first) size: torch.Size([5])
-----------------------
Output: odict_keys(['last_hidden_state', 'mean_predictions', 'full_predictions'])
Output (hidden) size: torch.Size([5, 64, 1280])
Output (points) size: torch.Size([5, 128])
Output (quantiles) size: torch.Size([5, 128, 10])


### Observe each component of the model: input size, output size, number of parameters

#### Decoder

In [63]:
decoder = timesfm.decoder
print(f"\nDecoder: \n{decoder}")
print("\n-----------------------\n\n")
print(f"\nNumber of trainable parameters: \n{sum(p.numel() for p in decoder.parameters() if p.requires_grad)/1e6:.2f}M")
print(f"Number of parameters: \n{sum(p.numel() for p in decoder.parameters())/1e6:.2f}M")


Decoder: 
TimesFmModel(
  (input_ff_layer): TimesFmResidualBlock(
    (input_layer): Linear(in_features=64, out_features=1280, bias=True)
    (activation): SiLU()
    (output_layer): Linear(in_features=1280, out_features=1280, bias=True)
    (residual_layer): Linear(in_features=64, out_features=1280, bias=True)
  )
  (freq_emb): Embedding(3, 1280)
  (layers): ModuleList(
    (0-49): 50 x TimesFmDecoderLayer(
      (self_attn): TimesFmAttention(
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (o_proj): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (mlp): TimesFmMLP(
        (gate_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (down_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (layer_norm): LayerNorm((1280,), eps=1e-06, elementwise_affine=Tr

In [None]:
def prepare_input_for_decoder(raw_timeseries, model, freq_idx=None):
    """
    raw_timeseries: [batch_size, sequence_length]
    retourne: [batch_size, num_patches, hidden_size] prêt pour le decoder
    """
    batch_size, seq_len = raw_timeseries.shape
    
    # 1. Patching
    patch_length = model.config.patch_length
    num_patches = seq_len // patch_length
    
    # Reshape en patches
    patches = raw_timeseries.reshape(batch_size, num_patches, patch_length)
    
    # 2. Projection via le input_ff_layer
    # Le decoder a son propre input_ff_layer !
    projected_patches = model.decoder.input_ff_layer(patches)
    
    # 3. Ajout du freq_emb si disponible
    if hasattr(model.decoder, 'freq_emb') and freq_idx is not None:
        freq_embedding = model.decoder.freq_emb(freq_idx)
        projected_patches = projected_patches + freq_embedding.unsqueeze(1)
    
    return projected_patches

# Usage :
ready_input = prepare_input_for_decoder(
    raw_timeseries=your_data,  # [batch, seq_len]
    model=model,
    freq_idx=torch.tensor([0, 1, 2])  # selon vos données
)

# Maintenant vous pouvez utiliser le decoder seul
decoder_output = model.decoder(ready_input)

In [68]:
dummy_input = torch.randn(8, 64, 1280).to(torch.bfloat16).to(timesfm.device)
dummy_padding = torch.ones(8, 64, 1280).to(torch.bfloat16).to(timesfm.device)
dummy_frequency = torch.tensor(5).to(torch.int).to(timesfm.device)
output = decoder(past_values=dummy_input, past_values_padding=dummy_padding, freq=dummy_frequency)
print("-----------------------")
print(f"Input size: {dummy_input.shape}")
print(f"Output size: {output.shape}")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
with torch.no_grad():
    # Passer la série through le modèle
    outputs = .decoder(input_patches)
    
    # outputs.shape = [batch_size, num_patches, hidden_size=1280]
    
    # Option 1: Moyenne sur tous les patches
    series_embedding = outputs.mean(dim=1)  # [batch_size, 1280]

NameError: name 'model' is not defined