## Preparation

In [1]:
import os
import time
from datetime import date

import numpy as np
import polars as pl
import torch
import sys


# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Default project data directory.
# NOTE(review): `dir` shadows the Python builtin of the same name and is a
# machine-specific absolute path. The name is kept because the data-loading
# statement further down this cell (and possibly other cells) refers to it.
dir = "C:/Users/USER/PycharmProjects/ts_forecaster_lib/raw_data/"
save_dir = os.path.join(dir, 'fit')
save_root = os.path.join(save_dir, 'Xpatchtst', '20260119')
# Create the folder that is actually used as the save location, not only its
# `fit` parent. `os.makedirs` creates every missing intermediate directory,
# so `save_dir` is still created as before.
os.makedirs(save_root, exist_ok=True)

# ---------------------------------------------------------------------------
# Runtime / windowing configuration
# ---------------------------------------------------------------------------
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
lookback = 52        # length of the input window (time steps)
horizon = 27         # length of the forecast window (time steps)
batch_size = 256
freq = 'weekly'
split_mode = 'multi'  # forwarded to the data module; semantics defined there
shuffle = True

# ---------------------------------------------------------------------------
# Column names of the input dataframe
# ---------------------------------------------------------------------------
id_col = 'unique_id'
date_col = 'date'
y_col = 'y'

# Continuous past-exogenous feature columns that are currently enabled.
# To enable another candidate, move its name out of the commented list below
# and into the tuple.
past_exo_cont_cols = (
    "exo_p_y_lag_2w",
    "exo_p_y_rollmean_4w",
    "exo_p_y_rollmean_12w",
    "exo_p_y_rollstd_4w",
    # --- disabled candidates ---
    # "exo_p_y_lag_1w", "exo_p_y_lag_52w",
    # "exo_p_weeks_since_holiday",
    # "exo_p_temperature", "exo_p_fuel_price", "exo_p_cpi", "exo_p_unemployment",
    # "exo_p_markdown_sum",
    # "exo_p_markdown1", "exo_p_markdown2", "exo_p_markdown3",
    # "exo_p_markdown4", "exo_p_markdown5",
    # "exo_markdown1_isnull", "exo_markdown2_isnull", "exo_markdown3_isnull",
    # "exo_markdown4_isnull", "exo_markdown5_isnull",
)

# Categorical past-exogenous feature columns: none enabled at the moment.
# Disabled candidate: "exo_c_woy_bucket"
past_exo_cat_cols = ()

# No callback for future exogenous features is configured.
future_exo_cb = None

# Load the training dataframe from the project data directory.
# The path is built with os.path.join (as the save paths in this cell are) so
# it no longer depends on `dir` ending with a trailing slash.
df = pl.read_parquet(os.path.join(dir, 'train_data/walmart_best_feature_train.parquet'))

In [2]:
from modeling_module.training.model_trainers.total_train import run_total_train_weekly
from modeling_module.data_loader import MultiPartExoDataModule

def inspect(loader, name):
    """Print a short summary of the first batch produced by ``loader``.

    The batch is expected to unpack into exactly six items; only the first
    (``x``), fourth (``fe``) and fifth (``pe_cont``) are reported, each with
    its shape, device and dtype. The function also prints whether
    ``loader.collate_fn.future_exo_cb`` is ``None`` and, when the last
    dimension of ``fe`` is non-empty, the slice ``fe[0, :3, :]``.

    Args:
        loader: Iterable of batches that exposes ``collate_fn.future_exo_cb``.
        name: Label used as the ``[name]`` prefix of every printed line.
    """
    first_batch = next(iter(loader))
    # Unpacking enforces the six-item batch layout; underscore names are unused.
    x, _y, _uid, fe, pe_cont, _pe_cat = first_batch

    print(f"[{name}] x:", x.shape, x.device, x.dtype)
    print(f"[{name}] fe:", fe.shape, fe.device, fe.dtype)
    print(f"[{name}] pe:", pe_cont.shape, pe_cont.device, pe_cont.dtype)

    callback_missing = loader.collate_fn.future_exo_cb is None
    print(f"[{name}] future_exo_cb is None?", callback_missing)

    # Nothing to sample when the feature dimension of `fe` is empty.
    if fe.shape[-1] <= 0:
        return
    print(f"[{name}] fe sample:", fe[0, :3, :])

# Build the data module from the dataframe and the configuration values
# defined in the first cell.
data_module = MultiPartExoDataModule(
    df=df,
    id_col=id_col,
    date_col=date_col,
    y_col=y_col,
    lookback=lookback,
    horizon=horizon,
    batch_size=batch_size,
    past_exo_cont_cols=past_exo_cont_cols,
    past_exo_cat_cols=past_exo_cat_cols,
    future_exo_cb=future_exo_cb,
    freq=freq,
    shuffle=shuffle,
    split_mode=split_mode,
)

# Train loader first, then validation loader (same call order as before).
train_loader, val_loader = (
    data_module.get_train_loader(),
    data_module.get_val_loader(),
)

# Print the tensor shapes of one training batch as a sanity check before training.
inspect(train_loader, 'train_loader')

# Launch training for the selected model(s); checkpoints/artefacts go to `save_root`.
# Left as a bare final expression so the notebook still displays the return value.
run_total_train_weekly(
    train_loader,
    val_loader,
    device=device,
    lookback=lookback,
    horizon=horizon,
    warmup_epochs=30,
    spike_epochs=0,
    save_dir=save_root,
    use_exogenous_mode=False,
    models_to_run=['patchmixer'],
    use_ssl_pretrain=False,
)

[train_loader] x: torch.Size([256, 52, 1]) cpu torch.float32
[train_loader] fe: torch.Size([256, 27, 0]) cpu torch.float32
[train_loader] pe: torch.Size([256, 52, 4]) cpu torch.float32
[train_loader] future_exo_cb is None? True

[total_train] === RUN: patchmixer (weekly) ===
PatchMixer Base (Weekly)
[EXO-setup] inferred E=0, model.exo_dim=0, has_head=False

[train_patchmixer] ===== Stage 1/2 =====
  - spike: OFF
  - epochs: 30 | lr=0.0003 | horizon_decay=False
[train_patchmixer] Effective TrainingConfig:
{
  "device": "cuda",
  "log_every": 100,
  "use_amp": true,
  "lookback": 52,
  "horizon": 27,
  "epochs": 30,
  "lr": 0.0003,
  "weight_decay": 0.001,
  "t_max": 40,
  "patience": 100,
  "max_grad_norm": 30.0,
  "amp_device": "cuda",
  "loss_mode": "point",
  "point_loss": "huber",
  "huber_delta": 0.8,
  "q_star": 0.5,
  "use_cost_q_star": false,
  "Cu": 1.0,
  "Co": 1.0,
  "quantiles": [
    0.1,
    0.5,
    0.9
  ],
  "use_intermittent": true,
  "alpha_zero": 3.0,
  "alpha_pos": 

{'PatchMixer Base': {'model': BaseModel(
    (backbone): PatchMixerBackbone(
      (padding_patch_layer): ReplicationPad1d((0, 8))
      (blocks): ModuleList(
        (0-2): 3 x PatchMixerLayer(
          (token_mixer): Sequential(
            (0): Conv1d(64, 64, kernel_size=(5,), stride=(1,), groups=64)
            (1): GELU(approximate='none')
            (2): GroupNorm(32, 64, eps=1e-05, affine=True)
          )
          (channel_mixer): Sequential(
            (0): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
            (1): GELU(approximate='none')
            (2): GroupNorm(32, 64, eps=1e-05, affine=True)
          )
          (dropout): Dropout(p=0.05, inplace=False)
        )
      )
      (W_P): Linear(in_features=12, out_features=64, bias=True)
      (flatten): Flatten(start_dim=-2, end_dim=-1)
    )
    (z_proj): Identity()
    (expander): TemporalExpander(
      (time_bias): Sequential(
        (0): Linear(in_features=480, out_features=448, bias=True)
        (1): GELU(a