# TabPFN Rolling-Origin Workflow (with FE Grid, corr-spec variants, and target-only on/off)

This notebook runs Stage A (calibration) and Stage B (final) for TabPFN
over a **feature-engineering grid** in `tuning.py`, across **correlation specs**
(expanding vs EWMA) and **target-only blocks** (TSFresh + Chronos) on/off.

**Prereqs:** Your repo layout with `src/` (config, tuning, io_timesplits, models) is available.
TSFresh/Chronos parquet files exist when `use_target_blocks=True` is selected.

> If `tabpfn` is missing in your env: `pip install tabpfn torch`.


In [9]:
# --- Setup & Imports ---
import os, sys
from pathlib import Path

# Try to locate repo root that contains `src/`
def _locate_repo_root(start: Path) -> Path:
    cur = start.resolve()
    for _ in range(5):  # walk up to 5 levels
        if (cur / 'src').exists():
            return cur
        if cur.parent == cur:
            break
        cur = cur.parent
    return start.resolve()

NOTEBOOK_DIR = Path.cwd()
PROJECT_ROOT = _locate_repo_root(NOTEBOOK_DIR)
os.environ['PROJECT_ROOT'] = str(PROJECT_ROOT)
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.config import GlobalConfig, DEFAULT_CORR_SPEC, outputs_for_model
from src.tuning import run_stageA, run_stageB
from src.io_timesplits import load_target, load_ifo_features
from src.models.tabpfn import ForecastModel

import numpy as np, pandas as pd
from itertools import product

# Ensure base output structure exists
outputs_for_model('tabpfn')
print('PROJECT_ROOT =', PROJECT_ROOT)
print('Imports ok. If `tabpfn` is missing: `pip install tabpfn torch`.')


PROJECT_ROOT = /Users/jonasschernich/Documents/Masterarbeit/Code
Imports ok. If `tabpfn` is missing: `pip install tabpfn torch`.


In [10]:
# --- Load data ---
y = load_target()           # ΔIP with DatetimeIndex
X = load_ifo_features()     # ifo panel

idx = y.index.intersection(X.index)
y, X = y.loc[idx], X.loc[idx]
print('Shapes:', X.shape, y.shape)
display(y.describe())


Shapes: (407, 20) (407,)


count    407.000000
mean       0.042741
std        1.905304
min      -18.219895
25%       -0.822431
50%        0.105485
75%        1.063048
max       10.000000
Name: IP_change, dtype: float64

In [11]:
# --- Correlation spec helpers: expanding / EWMA ---
def make_corr_spec(kind: str, window: int = 60, lam: float = 0.98) -> dict:
    spec = dict(DEFAULT_CORR_SPEC)
    spec['mode'] = kind
    if kind == 'expanding':
        spec.pop('window', None)
        spec.pop('lambda', None)
    elif kind == 'ewm':
        spec['window'] = int(window)
        spec['lambda'] = float(lam)
    else:
        raise ValueError("kind must be 'expanding' or 'ewm'")
    return spec

corr_specs = {
    #'expanding':  make_corr_spec('expanding'),
    'ewm_098':    make_corr_spec('ewm', window=60, lam=0.98),
    #'ewm_096':    make_corr_spec('ewm', window=60, lam=0.96),
}
corr_specs


{'ewm_098': {'mode': 'ewm', 'window': 60, 'lam': None, 'lambda': 0.98}}

In [12]:
# --- Base config (splits & policy as in thesis) ---
def base_cfg() -> GlobalConfig:
    cfg = GlobalConfig()
    # Stage A/B rolling-origin splits
    cfg.W0_A     = 180
    cfg.BLOCKS_A = [(181,200), (201,220), (221,240)]
    cfg.W0_B     = 240
    # FE refresh cadence (months)
    cfg.refresh_cadence = 12
    # Online policy
    cfg.policy_window   = 24
    cfg.policy_decay    = 0.95
    cfg.policy_gain_min = 0.03
    cfg.policy_cooldown = 3
    # Target-only blocks (TSFresh + Chronos Parquet) on/off per variant
    cfg.use_target_blocks = False
    return cfg

cfg0 = base_cfg()
cfg0


GlobalConfig(seed=123, refresh_cadence_months=12, corr_spec={'mode': 'expanding', 'window': None, 'lam': None}, lag_candidates=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), top_k_lags_per_feature=1, use_rm3=True, k1_topk=50, screen_threshold=None, redundancy_method='greedy', redundancy_param=0.9, dr_method='none', pca_var_target=0.95, pca_kmax=25, pls_components=2, W0_A=180, BLOCKS_A=[(181, 200), (201, 220), (221, 240)], W0_B=240, policy_window=24, policy_gain_min=0.03, policy_cooldown=3)

In [13]:
# --- FE/DR/Screening grid (in model_grid) ---
lag_sets = [
    (1, 2, 3, 4, 6, 12),
]
topk_lags     = [1]
use_rm3_flags = [True]
k1_topk_vals  = [20]
redund_params = [1]

dr_options = [
    {"dr_method": "none"},
    {"dr_method": "pca", "pca_var_target": 0.95, "pca_kmax": 50},
    #{"dr_method": "pca", "pca_var_target": 0.99, "pca_kmax": 50},
    {"dr_method": "pls", "pls_components": 4},
    #{"dr_method": "pls", "pls_components": 8},
]

def build_model_grid():
    hp_grid = []
    for L in lag_sets:
        for k_top in topk_lags:
            for rm3 in use_rm3_flags:
                for k1 in k1_topk_vals:
                    for red in redund_params:
                        for dr in dr_options:
                            hp = {
                                'lag_candidates': L,
                                'top_k_lags_per_feature': k_top,
                                'use_rm3': rm3,
                                'k1_topk': k1,
                                'screen_threshold': None,
                                'redundancy_method': 'greedy',
                                'redundancy_param': red,
                                # TabPFN specifics
                                'use_gpu': False,          # set True if CUDA is stable
                                'posterior_samples': 8,
                            }
                            hp.update(dr)
                            hp_grid.append(hp)
    return hp_grid

model_grid = build_model_grid()
len(model_grid)


3

In [14]:
# --- Build all variants: (corr_spec x target_blocks on/off) ---
variants = []
for corr_name, spec in corr_specs.items():
    for tb in [False, True]:
        cfg = base_cfg()
        cfg.corr_spec = dict(spec)
        cfg.use_target_blocks = bool(tb)
        variants.append((f"{corr_name}_{'tbON' if tb else 'tbOFF'}", cfg))

for name, cfg in variants:
    print(name, '| use_target_blocks:', cfg.use_target_blocks, '| corr:', cfg.corr_spec.get('mode'))


ewm_098_tbOFF | use_target_blocks: False | corr: ewm
ewm_098_tbON | use_target_blocks: True | corr: ewm


In [15]:
# --- Stage A/B for all variants ---
results = {}

for name, cfg in variants:
    model_name = f"tabpfn__{name}"
    outputs_for_model(model_name)

    print(f"\n=== Stage A: {model_name} ===")
    shortlist = run_stageA(
        model_name=model_name,
        model_ctor=lambda hp: ForecastModel(hp),
        model_grid=model_grid,
        X=X, y=y, cfg=cfg,
        keep_top_k_final=5,
        min_survivors_per_block=3,
    )
    results[(name, 'shortlist')] = shortlist

    print(f"\n=== Stage B: {model_name} ===")
    run_stageB(
        model_name=model_name,
        model_ctor=lambda hp: ForecastModel(hp),
        shortlist=shortlist,
        X=X, y=y, cfg=cfg,
        # max_months=24,   # optionally throttle for a first run
    )

print("\nDone. Check outputs/stageA|stageB for results.")



=== Stage A: tabpfn__ewm_098_tbOFF ===
[Stage A][Block 1] train_end=180, OOS=181-200 | configs=3
  - Config 1/3: {'lag_candidates': (1, 2, 3, 4, 6, 12), 'top_k_lags_per_feature': 1, 'use_rm3': True, 'k1_topk': 20, 'screen_threshold': None, 'redundancy_method': 'greedy', 'redundancy_param': 1, 'use_gpu': False, 'posterior_samples': 8, 'dr_method': 'none'}
    · Month 1/20 processed | running...MSE=1.5504
    · Month 2/20 processed | running...MSE=2.2540
    · Month 3/20 processed | running...MSE=1.9245
    · Month 4/20 processed | running...MSE=1.6744
    · Month 5/20 processed | running...MSE=1.6210
    · Month 6/20 processed | running...MSE=1.4859
    · Month 7/20 processed | running...MSE=1.3778
    · Month 8/20 processed | running...MSE=1.2965
    · Month 9/20 processed | running...MSE=1.3370
    · Month 10/20 processed | running...MSE=1.2966
    · Month 11/20 processed | running...MSE=1.2420
    · Month 12/20 processed | running...MSE=1.2077
    · Month 13/20 processed | running..

analytics-python queue is full


[Stage B] Month origin t=299 | evaluating 3 configs | active=1
[Stage B] Month origin t=300 | evaluating 3 configs | active=1
[Stage B] Month origin t=301 | evaluating 3 configs | active=1
[Stage B] Month origin t=302 | evaluating 3 configs | active=1
[Stage B] Month origin t=303 | evaluating 3 configs | active=1
[Stage B] Month origin t=304 | evaluating 3 configs | active=1
[Stage B] Month origin t=305 | evaluating 3 configs | active=1
[Stage B] Month origin t=306 | evaluating 3 configs | active=1
[Stage B] Month origin t=307 | evaluating 3 configs | active=1
[Stage B] Month origin t=308 | evaluating 3 configs | active=1
[Stage B] Month origin t=309 | evaluating 3 configs | active=1
[Stage B] Month origin t=310 | evaluating 3 configs | active=1
[Stage B] Month origin t=311 | evaluating 3 configs | active=1
[Stage B] Month origin t=312 | evaluating 3 configs | active=1
[Stage B] Month origin t=313 | evaluating 3 configs | active=1
[Stage B] Month origin t=314 | evaluating 3 configs | a