# Import

In [1]:
!pip install catboost pytorch-lightning pytorch-forecasting

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.3-py3-none-any.whl.metadata (20 kB)
Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.4.0-py3-none-any.whl.metadata (14 kB)
Collecting torchmetrics>0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.8.1-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.5.3-py3-none-any.whl.metadata (39 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu

In [2]:
import os, gc, math, random
import numpy as np
import pandas as pd
from datetime import timedelta

import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

from catboost import CatBoostRegressor, Pool

from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer
from pytorch_forecasting.metrics import RMSE

import lightning as L
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import CSVLogger

import glob, re, os
import numpy as np
import pandas as pd
from datetime import timedelta
from catboost import Pool
from tqdm import tqdm
import logging

# Lightning / PL 로그 억제
logging.getLogger("lightning").setLevel(logging.ERROR)
logging.getLogger("lightning.pytorch").setLevel(logging.ERROR)
logging.getLogger("pytorch_lightning").setLevel(logging.ERROR)

from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/LGAI'


Mounted at /content/drive


# Fixed RandomSeed & Setting Hyperparameter

In [3]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

Device: cuda


In [4]:
# 공통 설정
ENC_LEN  = 28
PRED_LEN = 7
ROLL_WINS = [7, 14, 28]

# CatBoost 하이퍼파라미터
CAT_PARAMS = dict(
    depth=8,
    learning_rate=0.05,
    iterations=2000,
    loss_function="RMSE",
    l2_leaf_reg=5,
    random_seed=42,
    verbose=200,
    od_type="Iter",
    od_wait=200,
)

# TFT-mini 하이퍼파라미터
TFT_PARAMS = dict(
    learning_rate=2e-3,
    hidden_size=64,
    attention_head_size=2,
    dropout=0.1,
    hidden_continuous_size=32,
    lstm_layers=1,
)

# 앙상블 가중치
W_CAT = 0.4
W_TFT = 0.6

# Data Load

In [5]:
BASE_DIR = file_path
train_path  = os.path.join(BASE_DIR,'train', "train.csv")
sample_path = os.path.join(BASE_DIR, "sample_submission.csv")
test_paths  = [os.path.join(BASE_DIR,'test', f"TEST_0{i}.csv") for i in range(10)]

train  = pd.read_csv(train_path,  parse_dates=["영업일자"])
sample = pd.read_csv(sample_path)
tests  = {f"TEST_0{i}": pd.read_csv(p, parse_dates=["영업일자"]) for i, p in enumerate(test_paths)}

# 라벨 고정 및 키 분해
train["매출수량"] = pd.to_numeric(train["매출수량"], errors="coerce").fillna(0).clip(lower=0)
train["영업장명_메뉴명"] = train["영업장명_메뉴명"].astype(str)
train["업장명"] = train["영업장명_메뉴명"].str.split("_", n=1).str[0]
train["메뉴명"] = train["영업장명_메뉴명"].str.split("_", n=1).str[1].fillna("NA")
train = train.sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)

# ID 매핑
def make_id_map(s: pd.Series):
    uniq = s.astype(str).unique().tolist()
    return {k: i for i, k in enumerate(uniq)}

store2id = make_id_map(train["업장명"])
item2id  = make_id_map(train["메뉴명"])
pair2id  = make_id_map(train["업장명"] + "###" + train["메뉴명"])

train["store_id"] = train["업장명"].map(store2id).astype(int)
train["item_id"]  = train["메뉴명"].map(item2id).astype(int)
train["pair_id"]  = (train["업장명"] + "###" + train["메뉴명"]).map(pair2id).astype(int)

# 공휴일/이벤트 정의
KOREA_HOLIDAYS = set([
    # ----- 2023 -----
    '2023-01-01',  # 신정
    '2023-01-21',  # 설날 연휴 시작
    '2023-01-22',  # 설날
    '2023-01-23',  # 설날 연휴
    '2023-01-24',  # 대체공휴일
    '2023-03-01',  # 삼일절
    '2023-05-05',  # 어린이날
    '2023-05-27',  # 부처님오신날
    '2023-05-29',  # 부처님오신날 대체휴일
    '2023-06-06',  # 현충일
    '2023-08-15',  # 광복절
    '2023-09-28',  # 추석 연휴 시작
    '2023-09-29',  # 추석
    '2023-09-30',  # 추석 연휴
    '2023-10-02',  # 임시공휴일
    '2023-10-03',  # 개천절
    '2023-10-09',  # 한글날
    '2023-12-25',  # 크리스마스
    # ----- 2024 -----
    '2024-01-01',  # 신정
    '2024-02-09',  # 설날 연휴 시작
    '2024-02-10',  # 설날
    '2024-02-11',  # 설날 연휴
    '2024-02-12',  # 대체공휴일
    '2024-03-01',  # 삼일절
    '2024-04-10',  # 제22대 국회의원 선거
    '2024-05-05',  # 어린이날
    '2024-05-06',  # 어린이날 대체휴일
    '2024-05-15',  # 석가탄신일
    '2024-06-06',  # 현충일
    '2024-08-15',  # 광복절
    '2024-09-16',  # 추석 연휴 시작
    '2024-09-17',  # 추석
    '2024-09-18',  # 추석 연휴
    '2024-10-03',  # 개천절
    '2024-10-09',  # 한글날
    '2024-12-25',  # 크리스마스
])

EVENTS_GLOBAL = set([

])

EVENTS_TARGETED = {

}

def expand_dates(dates, days=3):
    if not dates:
        return pd.DataFrame(columns=["영업일자","near_flag"])
    base = pd.to_datetime(sorted(pd.to_datetime(list(dates)))).normalize()
    offs = pd.to_timedelta(np.arange(-days, days+1), unit="D")
    expanded = (base.values[:, None] + offs.values[None, :]).ravel()
    out = pd.DataFrame({"영업일자": pd.to_datetime(expanded).normalize(), "near_flag": 1})
    return out.drop_duplicates()

df_near_glob = expand_dates(EVENTS_GLOBAL, days=3)
near_glob_map = df_near_glob.set_index("영업일자")["near_flag"] if len(df_near_glob) else pd.Series(dtype="int")

def add_domain_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    dt = pd.to_datetime(df["영업일자"])
    df["dow"] = dt.dt.weekday
    df["month"] = dt.dt.month
    df["is_weekend"] = (df["dow"] >= 5).astype(int)

    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7.0).astype(np.float32)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7.0).astype(np.float32)
    df["month_sin"] = np.sin(2*np.pi*df["month"]/12.0).astype(np.float32)
    df["month_cos"] = np.cos(2*np.pi*df["month"]/12.0).astype(np.float32)

    df["is_spring"] = df["month"].isin([3,4,5]).astype(int)
    df["is_summer"] = df["month"].isin([6,7,8]).astype(int)
    df["is_fall"]   = df["month"].isin([9,10,11]).astype(int)
    df["is_winter"] = df["month"].isin([12,1,2]).astype(int)

    df["is_peak_summer"] = df["month"].isin([7,8]).astype(int)
    df["is_peak_winter"] = df["month"].isin([12,1,2]).astype(int)

    date_str = dt.dt.strftime("%Y-%m-%d")
    date_key = dt.dt.normalize()
    df["is_holiday"] = date_str.isin(KOREA_HOLIDAYS).astype(int)
    prev_date = (dt - pd.Timedelta(days=1)).dt.strftime("%Y-%m-%d")
    next_date = (dt + pd.Timedelta(days=1)).dt.strftime("%Y-%m-%d")
    df["before_holiday"] = prev_date.isin(KOREA_HOLIDAYS).astype(int)
    df["after_holiday"]  = next_date.isin(KOREA_HOLIDAYS).astype(int)

    if len(KOREA_HOLIDAYS) > 0:
        holi_sorted = pd.to_datetime(sorted(list(KOREA_HOLIDAYS))).normalize()
        df_h = pd.DataFrame({"d": holi_sorted})
        grp = (df_h["d"].diff().dt.days.ne(1)).cumsum()
        runlen = df_h.groupby(grp)["d"].transform("size")
        run_map = pd.Series(runlen.values, index=df_h["d"].values)
        df["is_holiday_run"] = date_key.map(run_map).fillna(0).astype(np.int16)
    else:
        df["is_holiday_run"] = 0

    start_su = pd.to_datetime(dt.dt.year.astype(str) + "-07-15")
    end_su   = pd.to_datetime(dt.dt.year.astype(str) + "-08-31")
    df["is_summer_vac"] = ((dt >= start_su) & (dt <= end_su)).astype(int)

    start_w1 = pd.to_datetime(dt.dt.year.astype(str) + "-12-20")
    end_w1   = pd.to_datetime(dt.dt.year.astype(str) + "-12-31")
    start_w2 = pd.to_datetime((dt.dt.year-1).astype(str) + "-12-20")
    end_w2   = pd.to_datetime(dt.dt.year.astype(str) + "-02-28")
    df["is_winter_vac"] = (((dt >= start_w1) & (dt <= end_w1)) | ((dt >= start_w2) & (dt <= end_w2))).astype(int)

    df["EVENT_SF_SZN"]      = df["month"].isin([3,4,5,9,10,11]).astype(int)
    df["EVENT_SUMMER_SZN"]  = df["month"].isin([6,7,8]).astype(int)
    df["EVENT_WINTER_SZN"]  = df["month"].isin([12,1,2]).astype(int)

    df["is_event_global"] = date_str.isin(EVENTS_GLOBAL).astype(int)
    df["near_event_global"] = 0 if near_glob_map.empty else date_key.map(near_glob_map).fillna(0).astype("int8")

    df["is_event_target"] = 0
    df["near_event_target"] = 0
    if "업장명" in df.columns and "메뉴명" in df.columns:
        for (s_name, i_name), dates in EVENTS_TARGETED.items():
            if not dates:
                continue
            m = (df["업장명"].eq(s_name)) & (df["메뉴명"].eq(i_name))
            df.loc[m, "is_event_target"] = df.loc[m, "영업일자"].dt.strftime("%Y-%m-%d").isin(set(dates)).astype(int)
            df_near = expand_dates(dates, days=3)
            if len(df_near):
                mapper = df_near.set_index("영업일자")["near_flag"]
                df.loc[m, "near_event_target"] = df.loc[m, "영업일자"].dt.normalize().map(mapper).fillna(0).astype("int8")

    return df

train = add_domain_features(train)

# 롤링 통계(과거 기반)
for w in ROLL_WINS:
    train[f"roll_mean_{w}"] = train.groupby("영업장명_메뉴명")["매출수량"].transform(lambda s: s.rolling(w, min_periods=1).mean())

# CatBoost용 피처 구성
cat_features_cols = ["업장명","메뉴명"]
base_num_features = [
    "is_weekend","dow_sin","dow_cos","month_sin","month_cos",
    "is_spring","is_summer","is_fall","is_winter",
    "is_peak_summer","is_peak_winter",
    "is_holiday","before_holiday","after_holiday","is_holiday_run",
    "is_summer_vac","is_winter_vac",
    "EVENT_SF_SZN","EVENT_SUMMER_SZN","EVENT_WINTER_SZN",
    "is_event_global","near_event_global","is_event_target","near_event_target",
] + [f"roll_mean_{w}" for w in ROLL_WINS]

# Define Model

In [6]:
# --------------------------------------------------------------
# (1) 미래에 '알고 있는' 달력/이벤트 피처 목록
# --------------------------------------------------------------
known_future_cols = [
    "dow","month","is_weekend","dow_sin","dow_cos","month_sin","month_cos",
    "is_spring","is_summer","is_fall","is_winter",
    "is_peak_summer","is_peak_winter",
    "is_holiday","before_holiday","after_holiday","is_holiday_run",
    "is_summer_vac","is_winter_vac",
    "EVENT_SF_SZN","EVENT_SUMMER_SZN","EVENT_WINTER_SZN",
    "is_event_global","near_event_global","is_event_target","near_event_target",
]

# --------------------------------------------------------------
# (2) CatBoost 학습용: shift 벡터화 + fragmentation 방지
# --------------------------------------------------------------
# 정렬 및 최소 28일(ENC_LEN) 히스토리 보유 플래그
train_sorted = train.sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)
g = train_sorted.groupby("영업장명_메뉴명", sort=False)
train_sorted["hist_ok"] = g.cumcount() >= (ENC_LEN - 1)

# 각 horizon의 y와 미래 피처를 dict로 모아 '한 번에' concat
base_for_shift = ["매출수량"] + known_future_cols
shift_blocks = []
for h in range(1, PRED_LEN + 1):
    block = g[base_for_shift].shift(-h)
    rename_map = {"매출수량": f"y_H{h}"}
    rename_map.update({c: f"{c}_H{h}" for c in known_future_cols})
    block = block.rename(columns=rename_map)
    shift_blocks.append(block)

train_shift = pd.concat(shift_blocks, axis=1)
# 새 프레임으로 copy() 하여 조각화 해소
train_sorted = pd.concat([train_sorted, train_shift], axis=1).copy()

# CatBoost 학습 행렬 생성 함수 (벡터화)
def build_catboost_Xy_by_shift(df: pd.DataFrame, stride: int = 1):
    """
    df: shift가 적용된 train_sorted
    stride: 표본 간격 샘플링(2~3으로 올리면 추가 가속)
    반환: {h: (Xh, yh)}
    """
    Xy = {}
    base_cols = cat_features_cols + [f"roll_mean_{w}" for w in ROLL_WINS]

    # 기본 컬럼 다운캐스팅
    for c in base_cols:
        if c in cat_features_cols:
            df[c] = df[c].astype("category")
        else:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float32")

    for h in range(1, PRED_LEN + 1):
        fut_cols_h = [f"{c}_H{h}" for c in known_future_cols]
        target_col = f"y_H{h}"

        # 유효 표본: 28일 히스토리 확보 & 라벨 존재
        mask = df["hist_ok"] & df[target_col].notna()

        if stride > 1:
            idx = (
                df[mask]
                .groupby("영업장명_메뉴명", sort=False)
                .apply(lambda x: x.iloc[::stride])
                .reset_index(level=0, drop=True)
                .index
            )
            use = df.loc[idx, base_cols + fut_cols_h + [target_col]].copy()
        else:
            use = df.loc[mask, base_cols + fut_cols_h + [target_col]].copy()

        # 미래 피처/라벨 다운캐스팅
        for c in fut_cols_h:
            use[c] = pd.to_numeric(use[c], errors="coerce").astype("float32")
        use[target_col] = pd.to_numeric(use[target_col], errors="coerce").astype("float32")

        X = use[base_cols + fut_cols_h].reset_index(drop=True)
        y = use[target_col].reset_index(drop=True)
        Xy[h] = (X, y)
    return Xy

# 필요 시 stride=2~3으로 더 가볍게
Xy_h = build_catboost_Xy_by_shift(train_sorted, stride=1)

# --------------------------------------------------------------
# (3) TFT용 데이터셋/로더/모델
# --------------------------------------------------------------
train_tft = train.copy()
train_tft["time_idx"] = (train_tft["영업일자"] - train_tft["영업일자"].min()).dt.days.astype(int)

# ★ static_categoricals는 문자열/카테고리여야 함
for c in ["store_id", "item_id", "pair_id"]:
    train_tft[c] = train_tft[c].astype(str)  # 또는 .astype("category")

static_categoricals = ["store_id", "item_id", "pair_id"]
time_varying_known_reals = [
    "time_idx",
    "dow_sin","dow_cos","month_sin","month_cos",
    "is_weekend","is_spring","is_summer","is_fall","is_winter",
    "is_peak_summer","is_peak_winter",
    "is_holiday","before_holiday","after_holiday","is_holiday_run",
    "is_summer_vac","is_winter_vac",
    "EVENT_SF_SZN","EVENT_SUMMER_SZN","EVENT_WINTER_SZN",
    "is_event_global","near_event_global","is_event_target","near_event_target",
]
time_varying_unknown_reals = ["매출수량"] + [f"roll_mean_{w}" for w in ROLL_WINS]

# dtype 정리(안전)
for c in time_varying_known_reals:
    if c != "time_idx":
        train_tft[c] = pd.to_numeric(train_tft[c], errors="coerce").astype("float32")
train_tft["time_idx"] = pd.to_numeric(train_tft["time_idx"], errors="coerce").astype("int64")
for c in [f"roll_mean_{w}" for w in ROLL_WINS] + ["매출수량"]:
    train_tft[c] = pd.to_numeric(train_tft[c], errors="coerce").astype("float32")

# encoder 28 / decoder 7 범위
training_cutoff = train_tft["time_idx"].max() - PRED_LEN
tft_dataset = TimeSeriesDataSet(
    train_tft[train_tft.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="매출수량",
    group_ids=["pair_id"],
    min_encoder_length=ENC_LEN,
    max_encoder_length=ENC_LEN,
    min_prediction_length=PRED_LEN,
    max_prediction_length=PRED_LEN,
    static_categoricals=static_categoricals,
    time_varying_known_reals=time_varying_known_reals,
    time_varying_unknown_reals=time_varying_unknown_reals,
    target_normalizer=None,      # 필요 시 Normalizer 사용 가능
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

validation = TimeSeriesDataSet.from_dataset(tft_dataset, train_tft, predict=True, stop_randomization=True)
train_loader = tft_dataset.to_dataloader(train=True,  batch_size=256, num_workers=2)
val_loader   = validation.to_dataloader(train=False, batch_size=256, num_workers=2)

# TFT-mini 모델 인스턴스 (변수명 충돌 방지: tft_model)
tft_model = TemporalFusionTransformer.from_dataset(
    tft_dataset,
    loss=RMSE(),
    log_interval=200,
    reduce_on_plateau_patience=4,
    **TFT_PARAMS
)

print("[Define Model] dataset & tft_model initialized.")
print("isinstance(tft_model, L.LightningModule):", isinstance(tft_model, L.LightningModule))

/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:210: Attribute 'loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['loss'])`.
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/parsing.py:210: Attribute 'logging_metrics' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['logging_metrics'])`.


[Define Model] dataset & tft_model initialized.
isinstance(tft_model, L.LightningModule): True


# Train

In [None]:
# ==============================================================
# CatBoost 학습 (h=1..7)
# ==============================================================
if "split_train_valid" not in globals():
    def split_train_valid(X: pd.DataFrame, y: pd.Series, valid_ratio=0.1):
        n = len(X); k = int(n * (1 - valid_ratio))
        return (X.iloc[:k].reset_index(drop=True), y.iloc[:k].reset_index(drop=True),
                X.iloc[k:].reset_index(drop=True), y.iloc[k:].reset_index(drop=True))

if "cat_col_indices" not in globals():
    def cat_col_indices(cols):
        return [i for i, c in enumerate(cols) if c in cat_features_cols]

cat_models = {}

for h in range(1, PRED_LEN + 1):
    Xh, yh = Xy_h[h]
    if len(Xh) == 0:
        print(f"[CatBoost][H{h}] 학습 데이터가 없습니다. 건너뜁니다.")
        continue

    # 범주형 피처 인덱스
    cat_idx = cat_col_indices(Xh.columns)

    # 단순 시계열 분할(뒤쪽 10% 검증)
    X_tr, y_tr, X_va, y_va = split_train_valid(Xh, yh, valid_ratio=0.1)
    print(f"[CatBoost][H{h}] train={len(X_tr):,}  valid={len(X_va):,}")

    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

    # 모델 생성 및 학습
    cb = CatBoostRegressor(**CAT_PARAMS)
    cb.fit(
        train_pool,
        eval_set=valid_pool,
        use_best_model=True,
        verbose=200
    )
    cat_models[h] = cb

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [7]:
# ==============================================================
# TFT‑mini 학습
# ==============================================================

early_stop = EarlyStopping(monitor="val_loss", patience=6, mode="min")
lr_logger  = LearningRateMonitor(logging_interval="epoch")
checkpoint = ModelCheckpoint(monitor="val_loss", save_top_k=1, mode="min")
logger     = CSVLogger("tft_logs", name="catboost_tft")

precision = "bf16-mixed" if torch.cuda.is_bf16_supported() else 32
print(f"[Precision] Using {precision}")

trainer = L.Trainer(
    max_epochs=30,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1,
    precision="bf16-mixed" if torch.cuda.is_bf16_supported() else 32,
    gradient_clip_val=0.1,
    callbacks=[early_stop, lr_logger, checkpoint],
    logger=logger,
    enable_progress_bar=True,
    limit_val_batches=1.0,
    num_sanity_val_steps=0,   # ← 핵심: 사전 검증 스킵
)

# 학습 직전, 명시적으로 train 모드 전환
tft_model.train()             # ← 핵심: 강제 train
print("model.training =", tft_model.training)  # True 이어야 정상

# 학습
trainer.fit(tft_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

# 체크포인트 로드 (val을 스킵했다면 best가 없을 수 있음)
if checkpoint.best_model_path:
    tft_model = TemporalFusionTransformer.load_from_checkpoint(checkpoint.best_model_path)
    print("[TFT] best checkpoint:", checkpoint.best_model_path)
else:
    print("[TFT] 경고: validation을 스킵하여 best checkpoint가 없습니다.")

[Precision] Using bf16-mixed
model.training = True


/usr/local/lib/python3.11/dist-packages/lightning/pytorch/utilities/model_summary/model_summary.py:231: Precision bf16-mixed is not supported by the model summary.  Estimated model size in MB will not be accurate. Using 32 bits instead.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

[TFT] best checkpoint: tft_logs/catboost_tft/version_0/checkpoints/epoch=5-step=2220.ckpt
[CatBoost][H1] train=87,544  valid=9,728
0:	learn: 34.6760845	test: 67.1064850	best: 67.1064850 (0)	total: 119ms	remaining: 3m 58s
200:	learn: 16.0263498	test: 46.4850974	best: 46.4758917 (193)	total: 12.7s	remaining: 1m 53s
400:	learn: 15.0272317	test: 45.9739890	best: 45.9008939 (387)	total: 23.1s	remaining: 1m 32s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 45.90089388
bestIteration = 387

Shrink model to first 388 iterations.
[CatBoost][H2] train=87,371  valid=9,708
0:	learn: 34.6269869	test: 67.2384195	best: 67.2384195 (0)	total: 62.4ms	remaining: 2m 4s
200:	learn: 15.9907891	test: 50.6875693	best: 50.4769793 (136)	total: 13.2s	remaining: 1m 58s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 50.4769793
bestIteration = 136

Shrink model to first 137 iterations.
[CatBoost][H3] train=87,197  valid=9,689
0:	learn: 34.6147246	test: 67.3807641	best: 67.380

# Prediction

In [8]:
# ==============================================================
# Prediction — CatBoost + TFT-mini Ensemble (제출 형식 호환)
# ==============================================================

# 앙상블 가중치 기본값
W_CAT = globals().get("W_CAT", 0.4)
W_TFT = globals().get("W_TFT", 0.6)

if "convert_to_submission_format" not in globals():
    def convert_to_submission_format(pred_df: pd.DataFrame, sample_df: pd.DataFrame) -> pd.DataFrame:
        """
        pred_df: ['영업일자','영업장명_메뉴명','매출수량'] (long)
        sample_df: 대회에서 제공한 sample_submission 스키마(첫 컬럼은 '영업일자', 나머지는 메뉴명 컬럼)
        """
        out = sample_df.copy()
        # long -> wide 피벗
        wide = pred_df.pivot(index="영업일자", columns="영업장명_메뉴명", values="매출수량")
        # sample_submission의 순서/컬럼에 맞게 채우기
        for r in range(len(out)):
            date_key = out.at[r, "영업일자"]
            if date_key in wide.index:
                for c in out.columns:
                    if c == "영업일자":
                        continue
                    if c in wide.columns:
                        val = wide.at[date_key, c]
                        out.at[r, c] = 0 if pd.isna(val) else max(float(val), 0.0)
        # 숫자 보장 / 음수 클립
        for c in out.columns:
            if c == "영업일자":
                continue
            out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).clip(lower=0)
        return out

def prepare_test(df_test_raw: pd.DataFrame) -> pd.DataFrame:
    df = df_test_raw.copy()
    df["영업장명_메뉴명"] = df["영업장명_메뉴명"].astype(str)
    df["업장명"] = df["영업장명_메뉴명"].str.split("_", n=1).str[0]
    df["메뉴명"] = df["영업장명_메뉴명"].str.split("_", n=1).str[1].fillna("NA")

    df["store_id"] = df["업장명"].map(store2id).fillna(-1).astype(int).astype(str)
    df["item_id"]  = df["메뉴명"].map(item2id).fillna(-1).astype(int).astype(str)
    df["pair_id"]  = (df["업장명"] + "###" + df["메뉴명"]).map(pair2id).fillna(-1).astype(int).astype(str)

    df = df.sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)
    df = add_domain_features(df)

    for w in ROLL_WINS:
        df[f"roll_mean_{w}"] = (
            df.groupby("영업장명_메뉴명")["매출수량"]
              .transform(lambda s: s.rolling(w, min_periods=1).mean())
        )
    return df


def predict_catboost(df_28: pd.DataFrame) -> pd.DataFrame:
    preds = []
    for pair, sub in df_28.groupby("영업장명_메뉴명"):
        sub = sub.sort_values("영업일자").reset_index(drop=True)

        # 최근 28일 복사
        history = sub.iloc[-ENC_LEN:].copy()

        for h in range(1, PRED_LEN+1):
            fut_date = history["영업일자"].iloc[-1] + timedelta(days=1)

            # 미래 row 생성 (매출수량은 placeholder=0)
            fut_row = pd.DataFrame([{
                "영업일자": fut_date,
                "영업장명_메뉴명": pair,
                "업장명": history["업장명"].iloc[-1],
                "메뉴명": history["메뉴명"].iloc[-1],
                "store_id": history["store_id"].iloc[-1],
                "item_id": history["item_id"].iloc[-1],
                "pair_id": history["pair_id"].iloc[-1],
                "매출수량": 0
            }])

            fut_row = add_domain_features(fut_row)

            # roll_mean_* 업데이트: 기존 history + dummy fut_row 기준
            tmp_hist = pd.concat([history, fut_row], ignore_index=True)
            for w in ROLL_WINS:
                tmp_hist[f"roll_mean_{w}"] = (
                    tmp_hist.groupby("영업장명_메뉴명")["매출수량"]
                            .transform(lambda s: s.rolling(w, min_periods=1).mean())
                )

            fut_row = tmp_hist.iloc[[-1]]

            # feature 구성
            X = fut_row[cat_features_cols + [f"roll_mean_{w}" for w in ROLL_WINS]].copy()
            fut_feats = fut_row[known_future_cols].copy()
            fut_feats.columns = [c + f"_H{h}" for c in fut_feats.columns]
            X = pd.concat([X.reset_index(drop=True), fut_feats.reset_index(drop=True)], axis=1)

            # 예측
            model = cat_models[h]
            cat_cols_idx = [i for i, c in enumerate(X.columns) if c in cat_features_cols]
            yhat = float(model.predict(Pool(X, cat_features=cat_cols_idx))[0])
            yhat = max(0.0, yhat)

            preds.append({"영업장명_메뉴명": pair, "h": h, "pred_cat": yhat})

            # 예측값을 history에 append → 다음 horizon에서 roll_mean 반영
            fut_row.loc[:, "매출수량"] = yhat
            history = pd.concat([history, fut_row], ignore_index=True)

    return pd.DataFrame(preds)

def predict_tft(df_28: pd.DataFrame) -> pd.DataFrame:
    tmp = df_28.copy()
    tmp["time_idx"] = (tmp["영업일자"] - train["영업일자"].min()).dt.days.astype(int)

    # 미래 7일 행 생성
    rows = []
    for pair, sub in tmp.groupby("영업장명_메뉴명"):
        sub = sub.sort_values("영업일자")
        last_date = sub["영업일자"].iloc[-1]
        pid = sub["pair_id"].iloc[-1]
        sid = sub["store_id"].iloc[-1]
        iid = sub["item_id"].iloc[-1]
        for h in range(1, PRED_LEN+1):
            d = last_date + timedelta(days=h)
            rows.append({"영업장명_메뉴명": pair, "영업일자": d, "pair_id": pid, "store_id": sid, "item_id": iid})
    fut = pd.DataFrame(rows)
    fut = add_domain_features(fut.assign(매출수량=0))

    # roll_mean_* NaN 채우기
    for w in ROLL_WINS:
        fut[f"roll_mean_{w}"] = (
            fut.groupby("영업장명_메뉴명")["매출수량"]
               .transform(lambda s: s.rolling(w, min_periods=1).mean())
               .fillna(0)
        )

    fut["time_idx"] = (fut["영업일자"] - train["영업일자"].min()).dt.days.astype(int)

    enc_dec = pd.concat([tmp, fut], axis=0, ignore_index=True)

    # 혹시라도 남은 NaN 전체 제거
    enc_dec = enc_dec.fillna(0)

    predict_ds = TimeSeriesDataSet.from_dataset(
        tft_dataset, enc_dec, predict=True, stop_randomization=True
    )
    predict_loader = predict_ds.to_dataloader(train=False, batch_size=256, num_workers=2)

    yhat = tft_model.predict(predict_loader, mode="prediction")
    if isinstance(yhat, tuple):
        yhat = yhat[0]

    fut_sorted = fut.sort_values(["pair_id","영업일자"]).reset_index(drop=True)
    series_ids = fut_sorted["pair_id"].unique().tolist()

    out = []
    for i, pid in enumerate(series_ids):
        pair_name = df_28.loc[df_28["pair_id"]==pid, "영업장명_메뉴명"].iloc[0]
        for h in range(1, PRED_LEN+1):
            out.append({
                "영업장명_메뉴명": pair_name,
                "h": h,
                "pred_tft": float(max(0.0, yhat[i, h-1].item()))
            })
    return pd.DataFrame(out)

def predict_ensemble_for_test_file(test_file_path: str) -> pd.DataFrame:
    df_raw = pd.read_csv(test_file_path, parse_dates=["영업일자"])
    df_28 = prepare_test(df_raw)
    cat_pred = predict_catboost(df_28)
    tft_pred = predict_tft(df_28)
    pred = pd.merge(cat_pred, tft_pred, on=["영업장명_메뉴명","h"], how="outer").fillna(0.0)
    pred["pred_ens"] = W_CAT * pred["pred_cat"] + W_TFT * pred["pred_tft"]
    pred["file"] = os.path.basename(test_file_path)
    return pred

In [10]:
# -------------------------------
# Prediction 실행
# -------------------------------
all_preds = []
test_files = sorted(glob.glob(os.path.join(file_path, 'test', 'TEST_*.csv')))
print(f"[Predict] test files: {len(test_files)}")

for path in tqdm(test_files, desc="Test files 예측 진행률", unit="file"):
    df_pred = predict_ensemble_for_test_file(path)
    all_preds.append(df_pred)

full_pred_df = pd.concat(all_preds, ignore_index=True)

[Predict] test files: 10


Test files 예측 진행률: 100%|██████████| 10/10 [05:22<00:00, 32.28s/file]


# Submission

In [11]:
# -------------------------------
# Submission 변환 함수
# -------------------------------
def convert_preds_to_submission(all_preds, sample_submission):
    full_pred_df = pd.concat(all_preds, ignore_index=True)
    full_pred_df["test_id"] = full_pred_df["file"].str.replace(".csv", "", regex=False)

    # float으로 변환해서 경고 방지
    submission = sample_submission.copy().astype(float)

    for idx in submission.index:
        test_day = submission.loc[idx, "영업일자"]
        test_id, plus_day = test_day.split("+")
        h_val = int(plus_day.replace("일", ""))

        day_preds = full_pred_df[
            (full_pred_df["test_id"] == test_id) &
            (full_pred_df["h"] == h_val)
        ]

        for col in submission.columns[1:]:
            val = day_preds.loc[day_preds["영업장명_메뉴명"] == col, "pred_ens"]
            if not val.empty:
                submission.loc[idx, col] = float(val.values[0])  # 명시적 float 변환
            else:
                submission.loc[idx, col] = 0.0  # float 0

    return submission

  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submission.loc[idx, col] = val.values[0]
  submissio

Saved: /content/drive/MyDrive/LGAI/baseline_submission(c).csv


In [None]:
# -------------------------------
# Submission 실행
# -------------------------------
sample_submission = pd.read_csv(sample_path)
submission_df = convert_preds_to_submission(all_preds, sample_submission)
save_path = os.path.join(file_path, 'baseline_submission(c).csv')
submission_df.to_csv(save_path, index=False, encoding='utf-8-sig')
print('Saved:', save_path)

In [None]:
#all_preds

In [None]:
#df_pred.columns

In [None]:
#full_pred_df.columns