In [4]:
# %% [markdown]
# # TTM on M4 Daily
#
# **Dataset**: M4 Daily (4,227 series, horizon=14 days)  
# **Model**: IBM Granite TTM R2  

# %% [markdown]
# ## Imports

# %%
import math
import os
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Subset
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments, set_seed

from tsfm_public import (
    ForecastDFDataset,
    TimeSeriesForecastingPipeline,
    TimeSeriesPreprocessor,
    TinyTimeMixerForPrediction,
    TrackingCallback,
    count_parameters,
)
from tsfm_public.toolkit.time_series_preprocessor import prepare_data_splits

# Keep notebook output readable and make runs repeatable.
warnings.filterwarnings('ignore')
set_seed(42)

# Accelerator preference order: CUDA, then Apple MPS, then plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f" Libraries imported!")
print(f" Device: {device}\n")

# %% [markdown]
# ## Configuration

# %%
# --- Model & data ---
FORECAST_LENGTH = 14   # forecast horizon, in days
CONTEXT_LENGTH = 90    # history window given to the model, in days
TTM_MODEL_PATH = "ibm-granite/granite-timeseries-ttm-r2"
REVISION = "90-30-ft-l1-r2.1"

# --- Input files ---
data_path = Path('../data/M4')
train_file = data_path.joinpath('Daily-train.csv')
test_file = data_path.joinpath('Daily-test.csv')

# --- Training ---
N_SERIES_FINETUNE = 500    # rows of the train file used for fine-tuning
N_SERIES_TEST = 1000       # rows evaluated after fine-tuning
FEWSHOT_FRACTION = 0.20    # share of the data kept for training (few-shot)
NUM_EPOCHS = 20
BATCH_SIZE = 64
LEARNING_RATE = 0.001

# Echo the configuration so the run is self-describing.
print(
    "="*80,
    "CONFIGURATION",
    "="*80,
    f"Context: {CONTEXT_LENGTH} days | Horizon: {FORECAST_LENGTH} days",
    f"Model: {TTM_MODEL_PATH} ({REVISION})",
    f"Fine-tune: {N_SERIES_FINETUNE} series | Test: {N_SERIES_TEST} series",
    f"Epochs: {NUM_EPOCHS} | Batch: {BATCH_SIZE} | LR: {LEARNING_RATE}",
    sep="\n",
)

# %% [markdown]
# ## Load M4 Daily Data

# %%
print("\n" + "="*80, "LOADING DATA", "="*80, sep="\n")

# Read the train file first, then the test file; both are kept in the
# layout they have on disk (one row per series).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

print(f" Train: {train_df.shape} | Test: {test_df.shape}")
print(f" Total series: {len(train_df)}")

# %% [markdown]
# ## Prepare Data for TTM

# %%
def prepare_m4_for_ttm_clean(train_df, test_df, series_indices, include_test=False):
    """
    Convert M4 wide format to long format.

    Each selected row is turned into a (timestamp, series_id, value) frame.
    Column 0 of a row is taken as the series id and the remaining non-NaN
    cells as the observations. Every series is given a synthetic daily
    calendar starting on 2015-01-01.

    Series with fewer than CONTEXT_LENGTH + FORECAST_LENGTH observations
    (module-level constants) are dropped.

    Args:
        train_df: M4 train data (wide: one row per series)
        test_df: M4 test data (only read if include_test=True)
        series_indices: positional row indices of the series to include
        include_test: if True, append the test values after the train values
            (for evaluation only!)

    Returns:
        DataFrame in long format with columns 'timestamp', 'series_id' and
        'value' (float64). If no series is long enough, an empty DataFrame
        with those columns is returned instead of raising.
    """
    min_points = CONTEXT_LENGTH + FORECAST_LENGTH
    frames = []

    for idx in tqdm(series_indices, desc="Converting"):
        series_id = train_df.iloc[idx, 0]
        # A row sliced out of a frame that also holds the id column may come
        # back as object dtype; cast so the 'value' column is truly numeric.
        all_values = train_df.iloc[idx, 1:].dropna().to_numpy(dtype=np.float64)

        if include_test:
            test_values = test_df.iloc[idx, 1:].dropna().to_numpy(dtype=np.float64)
            all_values = np.concatenate([all_values, test_values])

        if len(all_values) < min_points:
            continue

        timestamps = pd.date_range(start='2015-01-01', periods=len(all_values), freq='D')

        frames.append(pd.DataFrame({
            'timestamp': timestamps,
            'series_id': series_id,
            'value': all_values,
        }))

    if not frames:
        # pd.concat([]) raises ValueError; hand back a typed empty frame.
        return pd.DataFrame({
            'timestamp': pd.Series(dtype='datetime64[ns]'),
            'series_id': pd.Series(dtype=object),
            'value': pd.Series(dtype=np.float64),
        })

    return pd.concat(frames, ignore_index=True)

print(f"\n Preparing {N_SERIES_FINETUNE} series for fine-tuning...")

# Fine-tune on the first N_SERIES_FINETUNE rows of the train file.
finetune_indices = [*range(N_SERIES_FINETUNE)]

# include_test=False: only values from the train file go into the
# fine-tuning frame.
data_finetune = prepare_m4_for_ttm_clean(
    train_df=train_df,
    test_df=test_df,
    series_indices=finetune_indices,
    include_test=False,
)


# %% [markdown]
# ## Train Preprocessor & Create Datasets

# %%
print("\n" + "="*80)
print("PREPROCESSING")
print("="*80)

# Column roles of the long-format frame; reused for the preprocessor,
# the split helper and the dataset constructors.
column_specifiers = {
    "timestamp_column": "timestamp",
    "id_columns": ["series_id"],
    "target_columns": ["value"],
}

# Preprocessor configured for standard scaling of the target column.
# Presumably scaling is done per `series_id` because id_columns is set;
# confirm against the tsfm_public documentation.
tsp = TimeSeriesPreprocessor(
    **column_specifiers,
    context_length=CONTEXT_LENGTH,
    prediction_length=FORECAST_LENGTH,
    scaling=True,
    encode_categorical=True,
    scaler_type="standard",
)

# Rows used to fit the preprocessor: everything up to the 0.8 quantile of
# the pooled timestamp column.
# NOTE(review): every series starts on the same synthetic date, so this
# cut-off is one calendar date shared by all series, not "the first 80% of
# each series". Short series can therefore contribute their entire history
# (including rows that later land in the valid/test splits). Confirm this
# is intended.
train_end_date = data_finetune['timestamp'].quantile(0.8)
df_train = data_finetune[data_finetune['timestamp'] <= train_end_date]

# Fit the preprocessor on the subset above. `trained_tsp` is what the final
# evaluation cell passes on; `tsp` is what the dataset cell uses.
trained_tsp = tsp.train(df_train)

# Split the fine-tuning frame into train/valid/test.
# Presumably 60% train, 20% test and the remaining 20% validation, applied
# per series; verify against prepare_data_splits' documentation.
split_params = {"train": 0.6, "test": 0.2}
train_data, valid_data, test_data = prepare_data_splits(
    data_finetune,  
    id_columns=column_specifiers["id_columns"],
    split_config=split_params,
    context_length=CONTEXT_LENGTH
)

# Filter out series that are too short after splitting
# A split must hold at least one full (context + horizon) window.
min_length = CONTEXT_LENGTH + FORECAST_LENGTH + 2  # +2 for safety

def filter_short_series(df, id_cols, min_len):
    """Remove series with insufficient data.

    Args:
        df: long-format DataFrame containing the id column(s).
        id_cols: name of the id column, or a list of id column names that
            together identify one series.
        min_len: minimum number of rows a series must have to be kept.

    Returns:
        The rows of ``df`` (original order and index preserved) that belong
        to series with at least ``min_len`` rows.
    """
    keys = [id_cols] if isinstance(id_cols, str) else list(id_cols)

    series_counts = df.groupby(keys).size()
    valid_series = series_counts[series_counts >= min_len].index

    # With one key the group index is flat and can be matched against the
    # column directly; with several keys it is a MultiIndex, so every row's
    # full key tuple has to be matched, not just its first id column.
    if len(keys) == 1:
        row_keys = df[keys[0]]
    else:
        row_keys = pd.MultiIndex.from_frame(df[keys])

    return df[row_keys.isin(valid_series)]

# Row and series counts per split before dropping short series.
print(
    f"Before filtering:",
    f"  Train: {len(train_data)} rows, {train_data['series_id'].nunique()} series",
    f"  Valid: {len(valid_data)} rows, {valid_data['series_id'].nunique()} series",
    f"  Test: {len(test_data)} rows, {test_data['series_id'].nunique()} series",
    sep="\n",
)

# Apply the same minimum-length filter to each split, in train/valid/test order.
train_data, valid_data, test_data = (
    filter_short_series(split_df, column_specifiers["id_columns"], min_length)
    for split_df in (train_data, valid_data, test_data)
)

print(
    f"\nAfter filtering (min_length={min_length}):",
    f"  Train: {len(train_data)} rows, {train_data['series_id'].nunique()} series",
    f"  Valid: {len(valid_data)} rows, {valid_data['series_id'].nunique()} series",
    f"  Test: {len(test_data)} rows, {test_data['series_id'].nunique()} series",
    sep="\n",
)

# Windowed datasets built from the preprocessed splits.
frequency_token = tsp.get_frequency_token(tsp.freq)
dataset_params = {
    "timestamp_column": column_specifiers["timestamp_column"],
    "id_columns": column_specifiers["id_columns"],
    "target_columns": column_specifiers["target_columns"],
    "frequency_token": frequency_token,
    "context_length": CONTEXT_LENGTH,
    "prediction_length": FORECAST_LENGTH,
}

train_dataset, valid_dataset, test_dataset = (
    ForecastDFDataset(tsp.preprocess(split_df), **dataset_params)
    for split_df in (train_data, valid_data, test_data)
)

# Few-shot sampling: keep a random FEWSHOT_FRACTION of the train windows,
# then of the validation windows (train is drawn first, so the sequence of
# random draws is unchanged). The test dataset is left complete.
n_train_all = len(train_dataset)
n_train_keep = int(FEWSHOT_FRACTION * n_train_all)
train_index = np.random.permutation(n_train_all)[:n_train_keep]
train_dataset = Subset(train_dataset, train_index)

n_valid_all = len(valid_dataset)
n_valid_keep = int(FEWSHOT_FRACTION * n_valid_all)
valid_index = np.random.permutation(n_valid_all)[:n_valid_keep]
valid_dataset = Subset(valid_dataset, valid_index)

print(f"Train: {len(train_dataset)} | Valid: {len(valid_dataset)} | Test: {len(test_dataset)}")

# %% [markdown]
# ## Load & Fine-tune TTM Model

# %%
print("\n" + "="*80)
print("LOADING MODEL")
print("="*80)

# Load the pretrained checkpoint identified by TTM_MODEL_PATH / REVISION.
# NOTE(review): prediction_filter_length presumably restricts the
# checkpoint's native horizon to the first FORECAST_LENGTH steps; confirm
# against the TinyTimeMixer documentation.
finetune_forecast_model = TinyTimeMixerForPrediction.from_pretrained(
    TTM_MODEL_PATH,
    revision=REVISION,
    context_length=CONTEXT_LENGTH,
    prediction_filter_length=FORECAST_LENGTH,
    num_input_channels=tsp.num_input_channels,
    prediction_channel_indices=tsp.prediction_channel_indices,
)

print(f"Model loaded! Parameters: {count_parameters(finetune_forecast_model):,}")

# Configure training
# Checkpoints go to OUT_DIR/output and logs to OUT_DIR/logs.
OUT_DIR = "../outputs/ttm_finetuned_clean/"
os.makedirs(OUT_DIR, exist_ok=True)

# Evaluate, log and checkpoint once per epoch; keep a single checkpoint and
# reload the one with the lowest eval_loss when training ends.
finetune_forecast_args = TrainingArguments(
    output_dir=os.path.join(OUT_DIR, "output"),
    overwrite_output_dir=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    do_eval=True,
    eval_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=2 * BATCH_SIZE,
    dataloader_num_workers=1,
    report_to="none",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    logging_dir=os.path.join(OUT_DIR, "logs"),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    use_cpu=(device == "cpu"),
)

# Stop once eval_loss has not improved for 10 consecutive evaluations
# (evaluation runs once per epoch, see eval_strategy above).
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,
    early_stopping_threshold=0.0,
)
tracking_callback = TrackingCallback()

# The optimizer and scheduler are built explicitly and handed to the Trainer.
# The one-cycle schedule is sized for NUM_EPOCHS * ceil(len(train_dataset) /
# BATCH_SIZE) steps, with LEARNING_RATE as its peak.
# NOTE(review): this assumes one optimizer step per batch on a single device
# (no gradient accumulation); confirm if the setup changes.
optimizer = AdamW(finetune_forecast_model.parameters(), lr=LEARNING_RATE)
scheduler = OneCycleLR(
    optimizer,
    LEARNING_RATE,
    epochs=NUM_EPOCHS,
    steps_per_epoch=math.ceil(len(train_dataset) / BATCH_SIZE),
)

finetune_forecast_trainer = Trainer(
    model=finetune_forecast_model,
    args=finetune_forecast_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[early_stopping_callback, tracking_callback],
    optimizers=(optimizer, scheduler),
)

print("\n" + "="*80)
print("FINE-TUNING")
print("="*80)

finetune_forecast_trainer.train()

# Loss on `test_dataset`, i.e. the test split of the fine-tuning series.
# This is not the score on the separate evaluation series handled by
# evaluate_ttm_m4.
test_results = finetune_forecast_trainer.evaluate(test_dataset)
print(f"\n Test Loss: {test_results['eval_loss']:.4f}")

# %% [markdown]
# ## Evaluation Functions

# %%
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Computes 100 * mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)).
    Points where both the actual and the forecast are zero contribute 0.

    Args:
        y_true: actual values (scalar or array-like of numbers).
        y_pred: forecast values, broadcastable against ``y_true``.

    Returns:
        sMAPE as a float, or NaN for empty input.
    """
    # Coerce to float64 so lists, scalars and object-dtype arrays all work.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the pre-filled zeros
    # cover the 0/0 case without producing NaN or a RuntimeWarning.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )

    if ratio.size == 0:
        return np.nan
    return 100 * np.mean(ratio)

def mase(y_true, y_pred, y_train, seasonality=1):
    """Mean Absolute Scaled Error.

    The forecast MAE is divided by the in-sample MAE of the seasonal naive
    forecast (lag ``seasonality``) computed on ``y_train``.

    Args:
        y_true: actual values over the forecast horizon (array-like).
        y_pred: forecast values, same length as ``y_true``.
        y_train: 1-D in-sample history used for the scaling term.
        seasonality: lag of the naive forecast; must be >= 1.

    Returns:
        MASE as a float, or NaN when the scaling term is undefined
        (history not longer than ``seasonality``) or zero (constant history).

    Raises:
        ValueError: if ``seasonality`` is smaller than 1.
    """
    if seasonality < 1:
        raise ValueError("seasonality must be >= 1")

    # Coerce like smape() does, so lists and object-dtype arrays are accepted.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    y_train = np.asarray(y_train, dtype=np.float64)

    # Without at least one lagged pair the naive error is undefined.
    if len(y_train) <= seasonality:
        return np.nan

    mae = np.mean(np.abs(y_true - y_pred))
    naive_mae = np.mean(np.abs(y_train[seasonality:] - y_train[:-seasonality]))
    if naive_mae == 0:
        return np.nan
    return mae / naive_mae

def evaluate_ttm_m4(model, tsp, train_df, test_df, start_idx, n_series,
                    context_length, forecast_length, device):
    """Forecast a block of M4 series with ``model`` and score them.

    For each row in ``[start_idx, start_idx + n_series)`` (clipped to the
    length of ``train_df``), the last ``context_length`` train values are fed
    to the model and the first ``forecast_length`` predicted steps are
    compared with the first ``forecast_length`` test values using sMAPE and
    MASE (lag 1, scaled on the full train history of that row).

    Args:
        model: forecasting model called as
            ``model(past_values=..., freq_token=...)``.
        tsp: preprocessor providing ``get_frequency_token`` / ``freq`` and,
            optionally, a ``scaler`` with ``mean_`` / ``scale_``.
        train_df: wide train frame (column 0 = id, rest = values, NaN padded).
        test_df: wide test frame with the same row order as ``train_df``.
        start_idx: first row to evaluate.
        n_series: number of rows to evaluate.
        context_length: number of history points fed to the model.
        forecast_length: number of forecast steps to score.
        device: torch device string the model and inputs are moved to.

    Returns:
        dict with mean/median/std of sMAPE and MASE, ``n_series`` (number of
        series with a finite sMAPE), the raw per-series score lists, plus
        ``n_skipped`` (rows too short to score or with an unusable forecast
        length) and ``n_failed`` (rows whose forecast raised an exception).
    """
    model.eval()
    model.to(device)

    smape_scores = []
    mase_scores = []
    n_skipped = 0
    failures = []  # (row index, repr of the exception)

    end_idx = min(start_idx + n_series, len(train_df))

    print(f"\n Evaluating {end_idx - start_idx} series...")

    # Get scaler stats
    # NOTE(review): it is not visible here whether the preprocessor exposes a
    # `scaler` attribute. If it does not, inputs are passed to the model
    # unscaled; confirm this matches how the model was fine-tuned.
    if hasattr(tsp, 'scaler') and tsp.scaler is not None:
        scaler_mean = float(tsp.scaler.mean_[0])
        scaler_scale = float(tsp.scaler.scale_[0])
        use_scaler = True
    else:
        scaler_mean = 0.0
        scaler_scale = 1.0
        use_scaler = False

    # Get frequency token
    freq_token = tsp.get_frequency_token(tsp.freq)
    freq_token = torch.tensor([freq_token], dtype=torch.long, device=device)

    with torch.no_grad():
        for idx in tqdm(range(start_idx, end_idx), desc="Forecasting"):
            try:
                # Cast up front: a row sliced from a frame that also holds the
                # id column may be object dtype.
                train_values = train_df.iloc[idx, 1:].dropna().to_numpy(dtype=np.float64)
                test_values = test_df.iloc[idx, 1:].dropna().to_numpy(dtype=np.float64)

                if len(train_values) < context_length or len(test_values) < forecast_length:
                    n_skipped += 1
                    continue

                context = train_values[-context_length:]

                # Normalize
                context_normalized = (context - scaler_mean) / scaler_scale if use_scaler else context

                # Create tensor: (context,) -> (1, context, 1)
                past_values = torch.from_numpy(
                    context_normalized.astype(np.float32)
                ).unsqueeze(0).unsqueeze(-1).to(device)

                # Forward
                outputs = model(past_values=past_values, freq_token=freq_token)

                # Extract prediction
                if hasattr(outputs, 'prediction_outputs'):
                    forecast = outputs.prediction_outputs.squeeze().cpu().numpy()
                elif hasattr(outputs, 'logits'):
                    forecast = outputs.logits.squeeze().cpu().numpy()
                else:
                    forecast = outputs.squeeze().cpu().numpy()

                # squeeze() collapses a one-step horizon to 0-d; restore 1-D
                # so the slice below is always valid.
                forecast = np.atleast_1d(forecast)[:forecast_length]

                # Denormalize
                forecast_denorm = forecast * scaler_scale + scaler_mean if use_scaler else forecast

                # Compare with M4 test data
                y_true = test_values[:forecast_length]
                y_pred = forecast_denorm[:forecast_length]

                if len(y_pred) == forecast_length and len(y_true) == forecast_length:
                    smape_score = smape(y_true, y_pred)
                    mase_score = mase(y_true, y_pred, train_values, seasonality=1)

                    if not np.isnan(smape_score) and not np.isinf(smape_score):
                        smape_scores.append(smape_score)

                    if not np.isnan(mase_score) and not np.isinf(mase_score):
                        mase_scores.append(mase_score)
                else:
                    n_skipped += 1

            except Exception as e:
                # Best effort: keep evaluating the remaining series, but
                # remember the failure so it can be reported below instead of
                # vanishing silently.
                failures.append((idx, repr(e)))
                continue

    if failures:
        first_idx, first_err = failures[0]
        print(
            f" WARNING: {len(failures)} series failed during forecasting and "
            f"were excluded (first failure at row {first_idx}: {first_err})"
        )

    return {
        'smape_mean': np.mean(smape_scores) if smape_scores else np.nan,
        'smape_median': np.median(smape_scores) if smape_scores else np.nan,
        'smape_std': np.std(smape_scores) if smape_scores else np.nan,
        'mase_mean': np.mean(mase_scores) if mase_scores else np.nan,
        'mase_median': np.median(mase_scores) if mase_scores else np.nan,
        'mase_std': np.std(mase_scores) if mase_scores else np.nan,
        'n_series': len(smape_scores),
        'smape_scores': smape_scores,
        'mase_scores': mase_scores,
        'n_skipped': n_skipped,
        'n_failed': len(failures),
    }

# %% [markdown]
# ## Final Evaluation on Test Set

# %%
print("\n" + "="*80, "FINAL EVALUATION", "="*80, sep="\n")

# Score the N_SERIES_TEST rows that come right after the rows used for
# fine-tuning (start index = N_SERIES_FINETUNE).
results = evaluate_ttm_m4(
    finetune_forecast_model,
    trained_tsp,
    train_df,
    test_df,
    N_SERIES_FINETUNE,
    N_SERIES_TEST,
    CONTEXT_LENGTH,
    FORECAST_LENGTH,
    device,
)

# Summary of the per-series metrics.
print(
    "\n" + "="*80,
    "RESULTS",
    "="*80,
    f"\n Series evaluated: {results['n_series']}",
    f"\n   sMAPE (mean):   {results['smape_mean']:.4f} ± {results['smape_std']:.4f}",
    f"   sMAPE (median): {results['smape_median']:.4f}",
    f"\n   MASE (mean):    {results['mase_mean']:.4f} ± {results['mase_std']:.4f}",
    f"   MASE (median):  {results['mase_median']:.4f}",
    sep="\n",
)

 Libraries imported!
 Device: mps

CONFIGURATION
Context: 90 days | Horizon: 14 days
Model: ibm-granite/granite-timeseries-ttm-r2 (90-30-ft-l1-r2.1)
Fine-tune: 500 series | Test: 1000 series
Epochs: 20 | Batch: 64 | LR: 0.001

LOADING DATA
 Train: (4227, 9920) | Test: (4227, 15)
 Total series: 4227

 Preparing 500 series for fine-tuning...


Converting:   0%|          | 0/500 [00:00<?, ?it/s]


PREPROCESSING
Before filtering:
  Train: 315209 rows, 494 series
  Valid: 145834 rows, 470 series
  Test: 148799 rows, 494 series

After filtering (min_length=106):
  Train: 309703 rows, 429 series
  Valid: 145806 rows, 458 series
  Test: 148781 rows, 488 series
Train: 53103 | Valid: 19726 | Test: 98517

LOADING MODEL
Model loaded! Parameters: 440,145

FINE-TUNING


Epoch,Training Loss,Validation Loss
1,0.1327,0.146887
2,0.1319,0.144521
3,0.1318,0.145905
4,0.1318,0.145095
5,0.1321,0.143331
6,0.1322,0.146785
7,0.1312,0.144018
8,0.1306,0.143132
9,0.1297,0.146301
10,0.1287,0.143252


[TrackingCallback] Mean Epoch Time = 104.02872111797333 seconds, Total Train Time = 2293.7512550354004



 Test Loss: 0.1498

FINAL EVALUATION

 Evaluating 1000 series...


Forecasting:   0%|          | 0/1000 [00:00<?, ?it/s]


RESULTS

 Series evaluated: 1000

   sMAPE (mean):   2.5115 ± 1.7323
   sMAPE (median): 2.0262

   MASE (mean):    2.7201 ± 1.9670
   MASE (median):  2.2055
