In [1]:
import os

from datasetsforecast.m5 import M5

# Where the raw, sorted M5 frame is written.
m5_raw_path = "data/input/m5.parquet"

# load() returns several objects; only the first is kept here. It is the frame
# that carries the "unique_id" and "ds" columns sorted on below.
# NOTE(review): presumably the remaining elements are exogenous/static
# tables -- confirm against the datasetsforecast documentation.
# sort_values returns a new frame, so re-running the cell never depends on a
# previously mutated `df`.
df = M5().load("../data")[0].sort_values(["unique_id", "ds"])

# to_parquet is not expected to create missing parent directories, so make
# sure the target folder exists; otherwise the cell fails on a fresh checkout.
os.makedirs(os.path.dirname(m5_raw_path), exist_ok=True)
df.to_parquet(m5_raw_path)

In [2]:
import json
import os

import pandas as pd
import torch
from tqdm import tqdm

# === Configuration ===
# File locations, all relative paths.
input_path = "data/input/m5.parquet"
output_path = "data/intermediate/m5_scaled.parquet"
scaler_save_path = "artifacts/m5_scalers.json"

# Scaling strategy applied to each series: "minmax" or "standard".
scaler = "minmax"

# Run the tensor math on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

# === Load input and prepare accumulators ===
df = pd.read_parquet(input_path)
scaler_stats = {}  # per-series statistics, keyed by unique_id
scaled_parts = []  # one scaled frame per series, concatenated when saving

# === Process per unique_id with tqdm ===
# Validate the configuration once, up front: a typo in `scaler` now fails
# immediately instead of on the first iteration (or never, for an empty frame).
if scaler not in ("minmax", "standard"):
    raise ValueError(f"Unsupported scaler type: {scaler}")

# Added to every denominator so a constant series (range or std of 0)
# scales to zeros instead of dividing by zero.
eps = 1e-8

# observed=True restricts the iteration to ids that actually occur in the
# frame. For a categorical "unique_id" this avoids visiting unused categories
# (an empty group would make min()/max() fail) and the pandas warning about
# the changing default; for non-categorical keys it has no effect.
for uid, group in tqdm(df.groupby("unique_id", observed=True), desc="Scaling series"):
    # Statistics and scaling are computed in float32 on the configured device.
    y = torch.tensor(group["y"].values, dtype=torch.float32, device=device)

    if scaler == "minmax":
        min_val, max_val = y.min(), y.max()
        scaled_y = (y - min_val) / (max_val - min_val + eps)

        # Store plain Python floats so the stats are JSON-serialisable.
        scaler_stats[uid] = {
            "type": "minmax",
            "min": min_val.item(),
            "max": max_val.item(),
        }

    else:  # "standard" -- guaranteed by the validation above
        # unbiased=False -> population standard deviation (divide by N).
        mean, std = y.mean(), y.std(unbiased=False)
        scaled_y = (y - mean) / (std + eps)

        scaler_stats[uid] = {
            "type": "standard",
            "mean": mean.item(),
            "std": std.item(),
        }

    # assign() returns a new frame, leaving the slice of `df` untouched.
    group = group.assign(y_scaled=scaled_y.cpu().numpy())
    scaled_parts.append(group)

# === Save Scaled Data and Scaler Stats ===
# Neither to_parquet nor open() is expected to create missing parent
# directories, so make sure both targets exist before writing anything.
for target in (output_path, scaler_save_path):
    parent = os.path.dirname(target)
    if parent:  # a bare filename has no parent directory to create
        os.makedirs(parent, exist_ok=True)

# Stitch the per-series frames back together in a deterministic order.
scaled_df = pd.concat(scaled_parts).sort_values(["unique_id", "ds"])
scaled_df.to_parquet(output_path, index=False)

# The per-series statistics are what allows the scaling to be inverted later.
with open(scaler_save_path, "w", encoding="utf-8") as f:
    json.dump(scaler_stats, f, indent=2)

print(f"✅ Scaled data saved to: {output_path}")
print(f"✅ Scaler stats saved to: {scaler_save_path}")

Using device: cuda


  for uid, group in tqdm(df.groupby("unique_id"), desc="Scaling series"):
Scaling series: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30490/30490 [00:08<00:00, 3584.14it/s]


✅ Scaled data saved to: data/intermediate/m5_scaled.parquet
✅ Scaler stats saved to: artifacts/m5_scalers.json
