In [1]:
import os
import time
from datetime import date

import numpy as np
import polars as pl
import torch
import sys



from modeling_module.data_loader.MultiPartDataModule import MultiPartDataModule
from modeling_module.data_loader.MultiPartExoDataModule import MultiPartExoDataModule

'''
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
https://developer.nvidia.com/cuda-12-8-0-download-archive
'''

# Raw-data roots for the two development machines; the platform check below
# selects one of them as DIR (both end with a trailing slash because later
# cells build paths with plain string concatenation).
MAC_DIR = '/Users/igwanhyeong/PycharmProjects/ts_forecaster_lib/raw_data/'
WINDOW_DIR = 'C:/Users/USER/PycharmProjects/ts_forecaster_lib/raw_data/'

if sys.platform == 'win32':
    DIR = WINDOW_DIR
    # Environment diagnostics for the Windows workstation.
    print(torch.cuda.is_available())
    print(torch.cuda.device_count())
    print(torch.version.cuda)
    print(torch.__version__)
    # get_device_name(0) raises when no CUDA device is visible, so it is only
    # queried when one is available.
    if torch.cuda.is_available():
        print(torch.cuda.get_device_name(0))
else:
    DIR = MAC_DIR

# Use the GPU when present, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

True
1
12.8
2.11.0.dev20260112+cu128
NVIDIA GeForce RTX 5080
2.11.0.dev20260112+cu128


# Base Train Data

In [None]:
# -----------------------------
# YYYYWW -> day_idx (Monday of ISO week)
# day_idx: days since 1970-01-01 (int)
# -----------------------------

# Weekly Walmart training frame. The cells below read its unique_id, dt (YYYYWW),
# y and is_holiday columns. (A bare `walmart_df.head()` used to follow here; as a
# mid-cell expression its result was discarded, so it has been removed.)
walmart_df = pl.read_parquet(DIR + 'train_data/walmart_train.parquet')
EPOCH = date(1970, 1, 1)

def _yyyyww_to_monday_dayidx(yyyyww: int) -> int:
    y = int(yyyyww) // 100
    w = int(yyyyww) % 100
    monday = date.fromisocalendar(y, w, 1)
    return (monday - EPOCH).days

from datetime import date, timedelta


def _monday_dayidx_to_yyyyww(dayidx: int) -> int:
    """Map a day index (days since EPOCH) to the ISO year-week code of that date.

    The pipeline is expected to pass week-anchor Mondays, but the input is not
    validated: any day index is accepted and mapped to the ISO week containing it.

    Args:
        dayidx: days since 1970-01-01 (EPOCH).

    Returns:
        ISO year * 100 + ISO week number (YYYYWW) as an int.
    """
    iso = (EPOCH + timedelta(days=int(dayidx))).isocalendar()
    # iso[2] is the ISO weekday (Monday=1 ... Sunday=7). A non-Monday value would
    # suggest the week anchor was broken upstream; that case is deliberately
    # tolerated here. To fail fast instead, raise ValueError when iso[2] != 1.
    return int(iso[0]) * 100 + int(iso[1])

# Normalise the raw frame to (unique_id, date, date_idx, y, exo_is_holiday).
df = (
    walmart_df
    .with_columns(
        pl.col("unique_id").cast(pl.Utf8),
        pl.col("y").cast(pl.Float32),
        yyyyww=pl.col("dt").cast(pl.Int32),
        exo_is_holiday=pl.col("is_holiday").cast(pl.Float32),
    )
    .with_columns(
        # Day index of the Monday that opens each ISO week.
        date_idx=pl.col("yyyyww").map_elements(_yyyyww_to_monday_dayidx, return_dtype=pl.Int32),
        # `date` keeps the YYYYWW code itself.
        date=pl.col("yyyyww"),
    )
    .select("unique_id", "date", "date_idx", "y", "exo_is_holiday")
)

# Optional: complete missing weeks per store (step=7 days), fill y=0, holiday=0
def complete_weekly(g: pl.DataFrame) -> pl.DataFrame:
    """Insert rows for the weeks missing between a series' first and last date_idx.

    Args:
        g: rows of a single ``unique_id``; the function reads the columns
            unique_id, date, date_idx, y and exo_is_holiday.

    Returns:
        One row per 7-day step from min(date_idx) to max(date_idx). Inserted rows
        carry the series id, a ``date`` rebuilt from date_idx, and 0.0 for both
        ``y`` and ``exo_is_holiday``.
    """
    ordered = g.sort("date_idx")
    first_idx = int(ordered["date_idx"].min())
    last_idx = int(ordered["date_idx"].max())
    series_id = ordered["unique_id"][0]

    # Full weekly grid for this series (inclusive of the last observed week).
    weekly_grid = pl.DataFrame(
        {"date_idx": pl.int_range(first_idx, last_idx + 1, step=7, eager=True).cast(pl.Int32)}
    )
    rebuilt_date = pl.col("date_idx").map_elements(_monday_dayidx_to_yyyyww, return_dtype=pl.Int32)

    return (
        weekly_grid
        .join(ordered, on="date_idx", how="left")
        .with_columns(
            pl.col("unique_id").fill_null(series_id),
            pl.col("date").fill_null(rebuilt_date),
            pl.col("y").fill_null(0.0),
            pl.col("exo_is_holiday").fill_null(0.0),
        )
    )

# Gap-fill every series separately, then restack in (unique_id, date_idx) order.
df = pl.concat(
    [complete_weekly(part) for part in df.partition_by("unique_id")],
    how="vertical",
).sort("unique_id", "date_idx")

# -----------------------------
# Past Exo Feature Engineering
# -----------------------------
df = (
    df
    .sort("unique_id", "date_idx")  # rows must be in time order within each series
    .with_columns(
        # 1. Rolling features of y per series (recent level and volatility).
        #    Nulls produced at the start of each series are replaced by 0.
        # 4-week (about one month) moving average
        exo_rolling_mean_4w=pl.col("y").rolling_mean(window_size=4).over("unique_id").fill_null(0),
        # 12-week (about one quarter) moving average
        exo_rolling_mean_12w=pl.col("y").rolling_mean(window_size=12).over("unique_id").fill_null(0),
        # 4-week volatility (standard deviation)
        exo_rolling_std_4w=pl.col("y").rolling_std(window_size=4).over("unique_id").fill_null(0),
        # 2. Lag feature (year over year): y from 52 rows earlier in the same series.
        #    Rows without that history fall back to the current y (0 would be the alternative).
        exo_lag_52w=pl.col("y").shift(52).over("unique_id").fill_null(pl.col("y")),
    )
)

# 3. Days since the last holiday.
# Holiday rows are tagged with their own date_idx, the tag is forward-filled
# within each series, and the gap to the current date_idx is the feature.
df = (
    df
    .with_columns(
        # null on non-holiday rows (a `when/then` without `otherwise` yields null)
        last_holiday_idx=pl.when(pl.col("exo_is_holiday") == 1).then(pl.col("date_idx")),
    )
    .with_columns(pl.col("last_holiday_idx").forward_fill().over("unique_id"))
    .with_columns(
        exo_days_since_holiday=(pl.col("date_idx") - pl.col("last_holiday_idx"))
        .fill_null(999)  # rows before a series' first holiday get a large sentinel
        .cast(pl.Float32),
    )
    .drop("last_holiday_idx")  # temporary helper column
)

# Preview the engineered columns.
print(df.select(["unique_id", "date", "y", "exo_rolling_mean_4w", "exo_lag_52w", "exo_days_since_holiday"]).tail(10))

# Raw Train + Feature Data

In [None]:
import polars as pl
import numpy as np
from datetime import date, timedelta

# -----------------------------
# 0) Load & basic clean
# -----------------------------
# Sales rows joined with the feature row of the same (Store, Date), reduced to
# the columns used below.
# NOTE(review): nothing is aggregated here, so if train.csv holds several rows
# per (Store, Date) they all survive the join — confirm that is intended for the
# per-store series built further down.
raw = (
    pl.read_csv(DIR + 'csv/walmart/train.csv', infer_schema_length=100_000)
    .join(
        pl.read_csv(DIR + 'csv/walmart/features.csv', infer_schema_length=100_000),
        how='inner',
        on=['Store', 'Date'],
    )
    .select(
        'Store', 'Date', 'Weekly_Sales', 'IsHoliday',
        'Temperature', 'Fuel_Price',
        'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
        'CPI', 'Unemployment',
    )
)

# Clean numeric columns that arrived as strings ("" -> null -> float).
def to_float(col):
    """Build an expression that parses column ``col`` as Float32.

    The column is cast to string, surrounding whitespace is stripped, empty
    strings are replaced by null, and the result is cast to Float32 with
    ``strict=False``.

    Args:
        col: name of the column to convert.

    Returns:
        A polars expression; the caller is responsible for aliasing it.
    """
    as_text = pl.col(col).cast(pl.Utf8, strict=False)
    cleaned = as_text.str.strip_chars().replace("", None)
    return cleaned.cast(pl.Float32, strict=False)

# Typed working frame: parsed date, Float32 target and cleaned numeric covariates.
df = raw.with_columns(
    pl.col("Store").cast(pl.Int32),
    pl.col("Temperature").cast(pl.Float32),
    pl.col("Fuel_Price").cast(pl.Float32),
    # Columns that may have been read as text go through to_float.
    *[
        to_float(name).alias(name)
        for name in (
            "MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5",
            "CPI", "Unemployment",
        )
    ],
    # New columns (appended in this order).
    date_dt=pl.col("Date").str.strptime(pl.Date, "%Y-%m-%d", strict=False),
    y=pl.col("Weekly_Sales").cast(pl.Float32),
    is_holiday=pl.col("IsHoliday").cast(pl.Int8),
)

# MarkDown values are frequently missing. Each column gets a missing-indicator
# and its nulls are replaced by 0.
md_cols = ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"]

df = df.with_columns(
    [
        # 1 where the original value was missing, 0 otherwise
        pl.col(c).is_null().cast(pl.Int8).alias(f"exo_{c}_isnull") for c in md_cols
    ] + [
        pl.col(c).fill_null(0.0).alias(c) for c in md_cols
    ] + [
        # Total markdown intensity (sum of the five columns).
        # All expressions in one with_columns call read the *input* frame, so the
        # zero-fill above is not visible here: without its own fill_null the sum
        # would be null whenever any single MarkDown is missing. Fill inside the
        # sum so missing components count as 0.
        sum([pl.col(c).fill_null(0.0) for c in md_cols]).alias("MarkDown_sum")
    ]
)

# -----------------------------
# 1) weekly index 만들기 (ISO week Monday anchor)
# -----------------------------
EPOCH = date(1970, 1, 1)

def _date_to_monday_dayidx(d: date) -> int:
    # d -> 해당 ISO week Monday -> dayidx
    iso_y, iso_w, _ = d.isocalendar()
    monday = date.fromisocalendar(iso_y, iso_w, 1)
    return (monday - EPOCH).days

def _dayidx_to_yyyyww(dayidx: int) -> int:
    d = EPOCH + timedelta(days=int(dayidx))
    iso_y, iso_w, _ = d.isocalendar()
    return int(iso_y) * 100 + int(iso_w)

# Attach the series id and the weekly keys, then keep only the modelling columns.
df = (
    df
    .with_columns(
        unique_id=pl.col("Store").cast(pl.Utf8),
        date_idx=pl.col("date_dt").map_elements(_date_to_monday_dayidx, return_dtype=pl.Int32),
    )
    .with_columns(
        # YYYYWW code derived from the Monday-anchored day index.
        date=pl.col("date_idx").map_elements(_dayidx_to_yyyyww, return_dtype=pl.Int32),
    )
    .select(
        "unique_id", "date", "date_idx",
        "y", "is_holiday",
        "Temperature", "Fuel_Price", "CPI", "Unemployment",
        *md_cols, "MarkDown_sum",
        *(f"exo_{c}_isnull" for c in md_cols),
    )
    .sort("unique_id", "date_idx")
)

# (Optional) fill in missing weeks: y becomes 0; covariates are either zero-filled
# or forward-filled, as chosen below.
def complete_weekly(g: pl.DataFrame) -> pl.DataFrame:
    """Insert rows for the weeks missing between a store's first and last date_idx.

    Args:
        g: rows of a single ``unique_id`` holding the keys, ``y``, ``is_holiday``,
            the observed covariates, the ``md_cols`` columns, ``MarkDown_sum``
            and the ``exo_<col>_isnull`` indicators.

    Returns:
        The frame re-indexed on a 7-day grid from min(date_idx) to max(date_idx),
        with nulls filled according to the per-column policies in the body.
    """
    ordered = g.sort("date_idx")
    first_idx = int(ordered["date_idx"].min())
    last_idx = int(ordered["date_idx"].max())
    series_id = ordered["unique_id"][0]

    weekly_grid = pl.DataFrame(
        {"date_idx": pl.int_range(first_idx, last_idx + 1, step=7, eager=True).cast(pl.Int32)}
    )
    rebuilt_date = pl.col("date_idx").map_elements(_dayidx_to_yyyyww, return_dtype=pl.Int32)
    observed = ("Temperature", "Fuel_Price", "CPI", "Unemployment")

    return (
        weekly_grid
        .join(ordered, on="date_idx", how="left")
        .with_columns(
            pl.col("unique_id").fill_null(series_id),
            pl.col("date").fill_null(rebuilt_date),
            pl.col("y").fill_null(0.0),
            pl.col("is_holiday").fill_null(0).cast(pl.Int8),
            # Observed covariates: carry the last value forward; 0.0 when nothing precedes.
            *[pl.col(name).fill_null(strategy="forward").fill_null(0.0) for name in observed],
            # MarkDown values and their total: 0.0; missing-indicators: 1.
            *[pl.col(name).fill_null(0.0) for name in (*md_cols, "MarkDown_sum")],
            *[pl.col(f"exo_{name}_isnull").fill_null(1).cast(pl.Int8) for name in md_cols],
        )
    )

# Apply the gap fill per store and restack in (unique_id, date_idx) order.
df = pl.concat(
    [complete_weekly(part) for part in df.partition_by("unique_id")],
    how="vertical",
).sort("unique_id", "date_idx")

# -----------------------------
# 2) Future exo (calendar + holiday + (option) markdown plan)
# -----------------------------
# Calendar: 52-week sin/cos, long-period sin/cos, holiday flag, linear trend.
# date_idx is measured in days, so week_idx = date_idx / 7 (cast to Int32).
df = df.with_columns(
    week_idx=(pl.col("date_idx") / 7).cast(pl.Int32),
)

# The week of year is not parsed out of `date` (YYYYWW), and date_dt is no longer
# in the frame, so a plain 52-week sin/cos cycle is built from week_idx instead.
# (Not perfectly aligned with ISO weeks, but still a strong seasonality feature.)
two_pi = 2.0 * np.pi

df = df.with_columns(
    exo_f_woy_sin=(pl.col("week_idx") * (two_pi / 52.0)).sin(),
    exo_f_woy_cos=(pl.col("week_idx") * (two_pi / 52.0)).cos(),
    # auxiliary long cycle: period of 365.25 in week_idx units
    exo_f_long_sin=(pl.col("week_idx") * (two_pi / 365.25)).sin(),
    exo_f_long_cos=(pl.col("week_idx") * (two_pi / 365.25)).cos(),
    exo_f_is_holiday=pl.col("is_holiday").cast(pl.Float32),
    exo_f_trend=pl.col("week_idx").cast(pl.Float32),
)

# (Optional) MarkDown may be promoted to a future covariate only under the
# assumption that it is supplied ahead of time as a plan. Otherwise the columns
# below should be used as past_exo only, not as future_exo.
df = df.with_columns(
    exo_f_markdown_sum=pl.col("MarkDown_sum").cast(pl.Float32),
    **{f"exo_f_{c.lower()}": pl.col(c).cast(pl.Float32) for c in md_cols},
)

# -----------------------------
# 3) Past exo cont (history-only derived)
# -----------------------------
# Target-derived features (rolling / lag), computed per store; nulls at the
# start of each series are replaced by 0.0.
df = df.with_columns(
    exo_p_y_rollmean_4w=pl.col("y").rolling_mean(4).over("unique_id").fill_null(0.0),
    exo_p_y_rollmean_12w=pl.col("y").rolling_mean(12).over("unique_id").fill_null(0.0),
    exo_p_y_rollstd_4w=pl.col("y").rolling_std(4).over("unique_id").fill_null(0.0),
    exo_p_y_lag_1w=pl.col("y").shift(1).over("unique_id").fill_null(0.0),
    exo_p_y_lag_2w=pl.col("y").shift(2).over("unique_id").fill_null(0.0),
    exo_p_y_lag_52w=pl.col("y").shift(52).over("unique_id").fill_null(0.0),
)

# Weeks elapsed since the most recent holiday week.
df = (
    df
    .with_columns(
        # date_idx on holiday rows, null elsewhere
        _last_holiday_idx=pl.when(pl.col("is_holiday") == 1).then(pl.col("date_idx")),
    )
    .with_columns(pl.col("_last_holiday_idx").forward_fill().over("unique_id"))
    .with_columns(
        exo_p_weeks_since_holiday=((pl.col("date_idx") - pl.col("_last_holiday_idx")) / 7.0)
        .fill_null(999.0)  # sentinel for rows before a store's first holiday
        .cast(pl.Float32),
    )
    .drop("_last_holiday_idx")
)

# Observed covariates are exposed as past_exo too (assuming no future values exist).
df = df.with_columns(
    exo_p_temperature=pl.col("Temperature").cast(pl.Float32),
    exo_p_fuel_price=pl.col("Fuel_Price").cast(pl.Float32),
    exo_p_cpi=pl.col("CPI").cast(pl.Float32),
    exo_p_unemployment=pl.col("Unemployment").cast(pl.Float32),
    exo_p_markdown_sum=pl.col("MarkDown_sum").cast(pl.Float32),
    **{f"exo_p_{c.lower()}": pl.col(c).cast(pl.Float32) for c in md_cols},
    **{f"exo_p_{c.lower()}_isnull": pl.col(f"exo_{c}_isnull").cast(pl.Float32) for c in md_cols},
)

# -----------------------------
# 4) Past exo cat
# -----------------------------
# The store itself is the strongest categorical feature.
# Further categories such as a week-of-year bucket are possible, but Store alone
# is the recommended default.
df = df.with_columns(
    exo_c_store=pl.col("unique_id"),
    # optional: 52-week bucket (0-51)
    exo_c_woy_bucket=(pl.col("week_idx") % 52).cast(pl.Int16),
)

# Persist the engineered frame.
df.write_parquet(DIR + 'train_data/walmart_feature_train_raw.parquet')

# Best Train + Feature Data

In [2]:
import polars as pl
from datetime import date, timedelta

# ============================================================
# 0) Utils: date <-> yyyyww/dayidx (ISO week Monday anchor)
# ============================================================
EPOCH = date(1970, 1, 1)

def _dt_to_yyyyww(d: date) -> int:
    iso_y, iso_w, _ = d.isocalendar()
    return int(iso_y) * 100 + int(iso_w)

def _dt_to_monday_dayidx(d: date) -> int:
    """dt가 속한 ISO week의 Monday를 anchor로 dayidx 생성"""
    iso_y, iso_w, _ = d.isocalendar()
    monday = date.fromisocalendar(int(iso_y), int(iso_w), 1)
    return (monday - EPOCH).days

def _monday_dayidx_to_yyyyww(dayidx: int) -> int:
    d = EPOCH + timedelta(days=int(dayidx))
    iso_y, iso_w, _ = d.isocalendar()
    return int(iso_y) * 100 + int(iso_w)

# ============================================================
# 1) Load + Deduplicate (Store, Date)
#    - train: Weekly_Sales sum, IsHoliday OR(max)
#    - features: numeric mean, MarkDown mean (or sum 선택 가능)
# ============================================================
# DIR comes from the setup cell at the top of the notebook (the former
# `DIR = DIR` self-assignment did nothing and has been dropped); this cell raises
# NameError if that cell has not been run.
md_cols = ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"]

# Raw sales and feature tables.
train = pl.read_csv(DIR + "csv/walmart/train.csv", infer_schema_length=100_000)
feat  = pl.read_csv(DIR + "csv/walmart/features.csv", infer_schema_length=100_000)

# One row per (Store, Date): sales are summed; the holiday flag is combined with
# max, which is meant to act as a logical OR.
train_s = train.group_by("Store", "Date").agg(
    Weekly_Sales=pl.col("Weekly_Sales").sum(),
    IsHoliday=pl.col("IsHoliday").max(),
)

# One row per (Store, Date) for the feature table: every covariate is averaged.
feat_s = feat.group_by("Store", "Date").agg(
    pl.col("Temperature").mean().alias("Temperature"),
    pl.col("Fuel_Price").mean().alias("Fuel_Price"),
    # CPI, Unemployment and the MarkDown columns: string -> float -> mean
    # (for MarkDown the mean could be swapped for a sum if preferred).
    *[
        pl.col(name).cast(pl.Utf8).str.strip_chars().replace("", None)
          .cast(pl.Float32, strict=False).mean().alias(name)
        for name in ("CPI", "Unemployment", *md_cols)
    ],
)

# Keep only (Store, Date) pairs present in both aggregated tables.
df0 = train_s.join(feat_s, how="inner", on=["Store", "Date"])

# ============================================================
# 2) Basic schema + dt/date/date_idx + observed exo
# ============================================================
# Build the base modelling frame from the joined table: series id, target,
# observed covariates (exo_*), MarkDown values with missing-indicators, and the
# weekly keys `date` (YYYYWW) and `date_idx` (Monday day index).
df_base = (
    df0
    # Stage 1: casts and renames. Both Store expressions read the original
    # Store column, so unique_id is the string form of the raw store value.
    .with_columns([
        pl.col("Store").cast(pl.Int32),
        pl.col("Store").cast(pl.Utf8).alias("unique_id"),

        pl.col("Weekly_Sales").cast(pl.Float32).alias("y"),
        pl.col("IsHoliday").cast(pl.Float32).alias("exo_is_holiday"),

        pl.col("Temperature").cast(pl.Float32).alias("exo_temperature"),
        pl.col("Fuel_Price").cast(pl.Float32).alias("exo_fuel_price"),

        pl.col("CPI").cast(pl.Float32).alias("exo_cpi"),
        pl.col("Unemployment").cast(pl.Float32).alias("exo_unemployment"),

        # Non-strict parse of the Date string into a Date column named dt.
        pl.col("Date").str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias("dt"),
    ])
    # MarkDown float + isnull flags
    # (values keep their nulls at this point; the flags record where they are)
    .with_columns([
        *[
            pl.col(c).cast(pl.Float32).alias(f"exo_{c.lower()}")
            for c in md_cols
        ],
        *[
            pl.col(c).is_null().cast(pl.Int8).cast(pl.Float32).alias(f"exo_{c.lower()}_isnull")
            for c in md_cols
        ],
    ])
    .with_columns([
        # Total markdown: each component is coalesced to 0.0 first, so a missing
        # component does not turn the whole sum into null.
        (
            pl.coalesce([pl.col("exo_markdown1"), pl.lit(0.0)]) +
            pl.coalesce([pl.col("exo_markdown2"), pl.lit(0.0)]) +
            pl.coalesce([pl.col("exo_markdown3"), pl.lit(0.0)]) +
            pl.coalesce([pl.col("exo_markdown4"), pl.lit(0.0)]) +
            pl.coalesce([pl.col("exo_markdown5"), pl.lit(0.0)])
        ).alias("exo_markdown_sum"),

        # Safe ISO-week anchor: both keys are derived from the parsed date
        pl.col("dt").map_elements(_dt_to_yyyyww, return_dtype=pl.Int32).alias("date"),
        pl.col("dt").map_elements(_dt_to_monday_dayidx, return_dtype=pl.Int32).alias("date_idx"),
    ])
    # Keep only the modelling columns, in a fixed order.
    .select([
        "unique_id","date","date_idx","y",
        "exo_is_holiday",
        "exo_temperature","exo_fuel_price","exo_cpi","exo_unemployment",
        "exo_markdown_sum",
        "exo_markdown1","exo_markdown2","exo_markdown3","exo_markdown4","exo_markdown5",
        "exo_markdown1_isnull","exo_markdown2_isnull","exo_markdown3_isnull","exo_markdown4_isnull","exo_markdown5_isnull",
    ])
    .sort(["unique_id","date_idx"])
)

# ============================================================
# 3) Weekly completion (vectorised): build a [min..max] step=7 grid per store,
#    then left-join the observed rows onto it
#    - y: missing=0
#    - holiday: missing=0
#    - temp/fuel/cpi/unemp: ff/bf
#    - markdown: missing=0, isnull=1
# ============================================================
# Per-store min/max of date_idx, expanded into one row per week.
# NOTE(review): the recorded schema output lists date_idx as Int64 although
# df_base builds it as Int32 — the grid appears to set the final dtype; confirm
# that is acceptable downstream.
rng = (
    df_base
    .group_by("unique_id")
    .agg(
        mn=pl.col("date_idx").min(),
        mx=pl.col("date_idx").max(),
    )
    .with_columns(
        # list of date_idx values from mn to mx inclusive, 7 days apart
        date_idx=pl.int_ranges(pl.col("mn"), pl.col("mx") + 1, step=7),
    )
    .explode("date_idx")
    .select("unique_id", "date_idx")
)

# Left-join the observations onto the weekly grid; rows that exist only in the
# grid are the gap weeks.
df_complete = (
    rng
    .join(df_base, how="left", on=["unique_id", "date_idx"])
    .with_columns(
        # rebuild the missing YYYYWW code from date_idx
        pl.col("date").fill_null(
            pl.col("date_idx").map_elements(_monday_dayidx_to_yyyyww, return_dtype=pl.Int32)
        ),
        # gap weeks: zero sales, not a holiday
        pl.col("y").fill_null(0.0),
        pl.col("exo_is_holiday").fill_null(0.0),
    )
)

# Continuous observed covariates: forward-fill, then backward-fill, both within
# each store; anything still null afterwards becomes 0.0.
df_complete = df_complete.with_columns([
    pl.col(name).forward_fill().backward_fill().over("unique_id").fill_null(0.0)
    for name in ("exo_temperature", "exo_fuel_price", "exo_cpi", "exo_unemployment")
])

# markdown fill: values and their total -> 0.0, missing-indicators -> 1.0
md_fill = [f"exo_markdown{i}" for i in range(1, 6)] + ["exo_markdown_sum"]
md_isnull = [f"exo_markdown{i}_isnull" for i in range(1, 6)]

df_complete = (
    df_complete
    .with_columns([pl.col(name).fill_null(0.0) for name in md_fill])
    .with_columns([pl.col(name).fill_null(1.0) for name in md_isnull])
    .sort("unique_id", "date_idx")
)

# ============================================================
# 4) Past exo (the key point: recompute these AFTER weekly completion)
# ============================================================
df_feat = df_complete.with_columns(
    # y lags (rows without enough history get 0.0)
    exo_p_y_lag_1w=pl.col("y").shift(1).over("unique_id").fill_null(0.0),
    exo_p_y_lag_2w=pl.col("y").shift(2).over("unique_id").fill_null(0.0),
    exo_p_y_lag_52w=pl.col("y").shift(52).over("unique_id").fill_null(0.0),
    # rolling statistics of y
    exo_p_y_rollmean_4w=pl.col("y").rolling_mean(4).over("unique_id").fill_null(0.0),
    exo_p_y_rollmean_12w=pl.col("y").rolling_mean(12).over("unique_id").fill_null(0.0),
    exo_p_y_rollstd_4w=pl.col("y").rolling_std(4).over("unique_id").fill_null(0.0),
)

# weeks_since_holiday (computed after weekly completion)
df_feat = (
    df_feat
    .with_columns(
        # date_idx on holiday rows, null elsewhere
        _last_hol_idx=pl.when(pl.col("exo_is_holiday") == 1.0).then(pl.col("date_idx")),
    )
    .with_columns(pl.col("_last_hol_idx").forward_fill().over("unique_id"))
    .with_columns(
        exo_p_weeks_since_holiday=((pl.col("date_idx") - pl.col("_last_hol_idx")) / 7.0)
        .fill_null(999.0)  # sentinel for rows before a store's first holiday
        .cast(pl.Float32),
    )
    .drop("_last_hol_idx")
)

# observed covariates -> duplicated under the unified past-covariate prefix exo_p_
df_feat = df_feat.with_columns(
    **{
        f"exo_p_{suffix}": pl.col(f"exo_{suffix}")
        for suffix in (
            "temperature", "fuel_price", "cpi", "unemployment",
            "markdown_sum",
            "markdown1", "markdown2", "markdown3", "markdown4", "markdown5",
        )
    }
)

# categorical: week-of-year bucket
df_feat = df_feat.with_columns(
    # `date` is YYYYWW, so date % 100 is the week number; integer division by 4
    # groups the weeks into buckets 0-13.
    exo_c_woy_bucket=((pl.col("date") % 100).cast(pl.Int32) // 4).cast(pl.Int32),
)

df = df_feat
print(df.schema)
print(df.shape)

# ============================================================
# 5) Sanity check: is the 7-day step guaranteed within every series?
# ============================================================
# Rows whose distance to the previous row of the same series is not exactly 7 days.
chk = (
    df
    .with_columns((pl.col("date_idx") - pl.col("date_idx").shift(1).over("unique_id")).alias("diff"))
    .filter(pl.col("diff").is_not_null() & (pl.col("diff") != 7))
)
print("n_breaks:", chk.height)
if chk.height > 0:
    # value_counts is a Series method (a polars DataFrame has none), so take the
    # column as a Series; the previous DataFrame call failed exactly when breaks
    # were present.
    print(chk.get_column("diff").value_counts().sort("diff"))


Schema({'unique_id': String, 'date_idx': Int64, 'date': Int32, 'y': Float32, 'exo_is_holiday': Float32, 'exo_temperature': Float32, 'exo_fuel_price': Float32, 'exo_cpi': Float32, 'exo_unemployment': Float32, 'exo_markdown_sum': Float32, 'exo_markdown1': Float32, 'exo_markdown2': Float32, 'exo_markdown3': Float32, 'exo_markdown4': Float32, 'exo_markdown5': Float32, 'exo_markdown1_isnull': Float32, 'exo_markdown2_isnull': Float32, 'exo_markdown3_isnull': Float32, 'exo_markdown4_isnull': Float32, 'exo_markdown5_isnull': Float32, 'exo_p_y_lag_1w': Float32, 'exo_p_y_lag_2w': Float32, 'exo_p_y_lag_52w': Float32, 'exo_p_y_rollmean_4w': Float32, 'exo_p_y_rollmean_12w': Float32, 'exo_p_y_rollstd_4w': Float32, 'exo_p_weeks_since_holiday': Float32, 'exo_p_temperature': Float32, 'exo_p_fuel_price': Float32, 'exo_p_cpi': Float32, 'exo_p_unemployment': Float32, 'exo_p_markdown_sum': Float32, 'exo_p_markdown1': Float32, 'exo_p_markdown2': Float32, 'exo_p_markdown3': Float32, 'exo_p_markdown4': Float3

In [7]:
# Latest YYYYWW code present in the engineered frame.
df.select(pl.col('date').max())

date
i32
201243
