In [1]:
from pathlib import Path

from datasetsforecast.m3 import M3

# Fetch the M3 "Monthly" group, order it by series id and timestamp, and
# persist it as the parquet input consumed by the later scaling step.
# NOTE(review): the loader is pointed at "../data" while the parquet is written
# under "data/input" relative to the working directory — confirm that the two
# different roots are intentional.
m3_monthly_path = Path("data/input/m3-monthly.parquet")

# `load` returns a tuple; only its first element is used here
# (presumably the long-format series frame — TODO confirm against the loader docs).
df = M3().load("../data", group="Monthly")[0]

# Reassign instead of `inplace=True` so that re-running the cell is side-effect free
# with respect to any other reference to the loaded frame.
df = df.sort_values(["unique_id", "ds"])

# The target folder is not guaranteed to exist on a fresh checkout; create it so the
# cell survives "Restart Kernel -> Run All".
m3_monthly_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(m3_monthly_path)

  freq = pd.tseries.frequencies.to_offset(class_group.freq)


In [2]:
import json
import os

%%time
import pandas as pd
import torch
from tqdm import tqdm

# === Config ===
input_path = "data/input/m3-monthly.parquet"
output_path = "data/intermediate/m3-monthly_scaled.parquet"
scaler_save_path = "artifacts/m3-monthly_scalers.json"
scaler = "minmax"  # or "standard"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

# === Load Data ===
df = pd.read_parquet(input_path)
scaled_parts = []
scaler_stats = {}

# === Process per unique_id with tqdm ===
for uid, group in tqdm(df.groupby("unique_id"), desc="Scaling series"):
    y = torch.tensor(group["y"].values, dtype=torch.float32, device=device)

    if scaler == "minmax":
        min_val, max_val = y.min(), y.max()
        scaled_y = (y - min_val) / (max_val - min_val + 1e-8)

        scaler_stats[uid] = {
            "type": "minmax",
            "min": float(min_val.cpu()),
            "max": float(max_val.cpu()),
        }

    elif scaler == "standard":
        mean, std = y.mean(), y.std(unbiased=False)
        scaled_y = (y - mean) / (std + 1e-8)

        scaler_stats[uid] = {
            "type": "standard",
            "mean": float(mean.cpu()),
            "std": float(std.cpu()),
        }

    else:
        raise ValueError(f"Unsupported scaler type: {scaler}")

    group = group.copy()
    group["y_scaled"] = scaled_y.cpu().numpy()
    scaled_parts.append(group)

# === Save Scaled Data and Scaler Stats ===
scaled_df = pd.concat(scaled_parts).sort_values(["unique_id", "ds"])
scaled_df.to_parquet(output_path, index=False)

with open(scaler_save_path, "w") as f:
    json.dump(scaler_stats, f, indent=2)

print(f"✅ Scaled data saved to: {output_path}")
print(f"✅ Scaler stats saved to: {scaler_save_path}")

Using device: cuda


Scaling series:   0%|          | 0/1428 [00:00<?, ?it/s]

Scaling series:  12%|█▏        | 176/1428 [00:00<00:00, 1759.18it/s]

Scaling series:  46%|████▌     | 655/1428 [00:00<00:00, 3539.67it/s]

Scaling series:  71%|███████   | 1013/1428 [00:00<00:00, 3313.25it/s]

Scaling series: 100%|██████████| 1428/1428 [00:00<00:00, 3534.50it/s]

✅ Scaled data saved to: data/intermediate/m3-monthly_scaled.parquet
✅ Scaler stats saved to: artifacts/m3-monthly_scalers.json
CPU times: user 1.07 s, sys: 128 ms, total: 1.2 s
Wall time: 1.2 s



