## **피처 삭제 및 변경 후 catboost + xgboost + lgbm**

### **valid weighted sMAPE=44.3632**

In [4]:
# -*- coding: utf-8 -*-
"""
리조트 식음업장 — CatBoost + XGBoost + LightGBM 3중 앙상블
+ 시간누수 방지 타깃인코딩/시계열 피처
+ 업장가중 sMAPE(담하·미라시아 가중)
+ 전역/세그먼트 가중 블렌딩(희소/연회장 아이템 보수적 혼합)
+ 7일 재귀예측 + 업장별 캘리브레이션(합계 ratio clip) + 퍼지매칭 제출
+ (옵션) 간단 weight grid로 앙상블 가중치 탐색

사용법
1) BASE_DIR 경로 수정
2) re_train_06.csv, TEST_*processed (1).csv, 곤지암sample_submission.csv 준비
3) python resort_triple_ensemble_full.py 실행 → submission_triple.csv 생성
"""
from __future__ import annotations
import os, glob, re, unicodedata, difflib, warnings
from pathlib import Path
from datetime import timedelta
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

np.set_printoptions(suppress=True)
warnings.filterwarnings("ignore")

# ====================== Config ======================
BASE_DIR   = Path(r"C:\Users\LG\Downloads")  # ← 네 환경에 맞게 수정
TRAIN_FILE = BASE_DIR / "re_train_06.csv"
TEST_GLOB  = str(BASE_DIR / "TEST_*processed (1).csv")   # 예: TEST_00_processed (1).csv ~
SAMPLE_SUB = BASE_DIR / "곤지암sample_submission.csv"
OUT_FILE   = BASE_DIR / "submission_triple.csv"

RANDOM_STATE     = 42
VALID_LAST_DAYS  = 28
N_FOLDS          = 4  # rolling origin folds

# 업장 가중(평가식 w_s)
SHOP_WEIGHTS: Dict[str, float] = {"담하": 2.0, "미라시아": 2.0}

# 학습 가중: 양성 부스트(희소 대응)
POS_SAMPLE_BOOST = 4.0

# 캘리브레이션 clip
CAL_CLIP = (0.85, 1.15)

# 희소/정체 기준
SPARSE_NZ_THRESHOLD  = 0.10
STALE_DAYS_THRESHOLD = 21

# 재귀예측 후처리
WEEKEND_BUMP = 1.05
HOLIDAY_BUMP = 1.10
SPIKE_BUMP   = 1.06
SPIKE_RATIO  = 1.50

# 세그먼트 알파(희소/연회장일 때 앵커 혼합 비율↑)
ANCHOR_MIX_FOR_SPARSE = 0.30  # yhat = (1-0.30)*model + 0.30*anchor

# ====================== Weighted sMAPE ======================
def _smape_official_vector(y_true, y_pred):
    y_true = np.asarray(y_true, float)
    y_pred = np.asarray(y_pred, float)
    denom = np.abs(y_true) + np.abs(y_pred)
    m = denom > 0
    if not np.any(m):
        return np.nan
    return float(np.mean(2.0 * np.abs(y_true[m] - y_pred[m]) / denom[m]) * 100.0)

def weighted_smape_by_shop_item(df, shop_weights):
    """
    df: columns=['영업장명','영업장명_메뉴명','y_true','y_pred']
    - y_true==0 제외 → 아이템 sMAPE → 업장 평균 → 업장 가중
    """
    if df is None or len(df) == 0:
        return float("nan")
    work = df[df["y_true"].astype(float) != 0].copy()
    if len(work) == 0:
        return float("nan")

    item_scores = []
    for (shop, item), sub in work.groupby(["영업장명", "영업장명_메뉴명"], sort=False):
        s = _smape_official_vector(sub["y_true"].values, sub["y_pred"].values)
        if not np.isnan(s):
            item_scores.append((shop, s))
    if not item_scores:
        return float("nan")

    shop_df = (pd.DataFrame(item_scores, columns=["영업장명", "s_item"])
               .groupby("영업장명", as_index=False)["s_item"].mean()
               .rename(columns={"s_item": "s_shop"}))
    shop_df["w"] = shop_df["영업장명"].map(lambda s: float(shop_weights.get(s, 1.0)))
    wsum = float(shop_df["w"].sum())
    if wsum <= 0:
        return float(shop_df["s_shop"].mean())
    return float(np.sum(shop_df["s_shop"] * shop_df["w"]) / wsum)

# ====================== IO & Utils ======================
def safe_read_csv(path: Path):
    for enc in ["utf-8-sig", "utf-8", "cp949", "euc-kr"]:
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception:
            pass
    raise RuntimeError(f"Failed to read {path}")

def ensure_upjang(df):
    if "영업장명" not in df.columns:
        if "영업장명_메뉴명" in df.columns:
            df = df.copy()
            df["영업장명"] = df["영업장명_메뉴명"].astype(str).str.split("_", n=1).str[0]
        else:
            df["영업장명"] = ""
    return df

# ====================== Feature Engineering ======================
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["영업일자"] = pd.to_datetime(out["영업일자"], errors="coerce")
    out = ensure_upjang(out)

    if "매출수량" in out.columns:
        out["매출수량"] = pd.to_numeric(out["매출수량"], errors="coerce").fillna(0.0).clip(lower=0.0)

    out = out.sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)

    # 기본 캘린더
    out["dow"]   = out["영업일자"].dt.weekday
    out["week"]  = out["영업일자"].dt.isocalendar().week.astype(int)
    out["month"] = out["영업일자"].dt.month
    out["year"]  = out["영업일자"].dt.year
    out["day"]   = out["영업일자"].dt.day
    out["woy"]   = out["영업일자"].dt.isocalendar().week.astype(int)

    # Lags & rollings (누수 방지: shift)
    for lag in [1,7,14,28]:
        out[f"lag_{lag}"] = out.groupby("영업장명_메뉴명")["매출수량"].shift(lag)
    for win in [7,14,28]:
        grp = out.groupby("영업장명_메뉴명")["매출수량"]
        out[f"roll_mean_{win}"] = grp.shift(1).rolling(win, min_periods=3).mean()
        out[f"roll_std_{win}"]  = grp.shift(1).rolling(win, min_periods=3).std()
        out[f"roll_max_{win}"]  = grp.shift(1).rolling(win, min_periods=3).max()
        out[f"roll_min_{win}"]  = grp.shift(1).rolling(win, min_periods=3).min()

    # 시간-안전 타깃 인코딩(shift→expanding)
    out["exp_item_dow_mean"] = (
        out.groupby(["영업장명_메뉴명","dow"])["매출수량"].apply(lambda s: s.shift(1).expanding(min_periods=3).mean())
          .reset_index(level=[0,1], drop=True)
    )
    out["exp_item_mean"] = (
        out.groupby("영업장명_메뉴명")["매출수량"].apply(lambda s: s.shift(1).expanding(min_periods=3).mean())
          .reset_index(level=0, drop=True)
    )
    out["exp_shop_dow_mean"] = (
        out.groupby(["영업장명","dow"])["매출수량"].apply(lambda s: s.shift(1).expanding(min_periods=3).mean())
          .reset_index(level=[0,1], drop=True)
    )
    out["item_dow_idx"] = out["exp_item_dow_mean"] / (out["exp_item_mean"] + 1e-6)
    out["shop_dow_idx"] = out["exp_shop_dow_mean"] / (
        out.groupby("영업장명")["exp_shop_dow_mean"].transform(lambda s: s.shift(1).expanding(min_periods=3).mean()) + 1e-6
    )

    # Trend & sparsity
    out["trend_7_1"]  = out["roll_mean_7"]  / (out["lag_1"] + 1e-6)
    out["trend_14_7"] = out["roll_mean_14"] / (out["roll_mean_7"] + 1e-6)
    out["delta_1_7"]  = out["lag_1"] - out["roll_mean_7"]
    out["nonzero_rate_28"] = (
        out.groupby("영업장명_메뉴명")["매출수량"].transform(lambda s: s.shift(1).rolling(28, min_periods=3).apply(lambda x: (x>0).mean(), raw=True))
    )

    # 마지막 판매 이후 경과일
    def days_since_last_sale(g):
        last = None; res = []
        for d, y in zip(g["영업일자"], g["매출수량"]):
            res.append(365 if last is None else (d - last).days)
            if y > 0: last = d
        return pd.Series(res, index=g.index)
    try:
        out["days_since_last_sale"] = (
            out.groupby("영업장명_메뉴명", group_keys=False).apply(days_since_last_sale, include_groups=False).astype(float)
        )
    except TypeError:
        out["days_since_last_sale"] = (
            out.groupby("영업장명_메뉴명", group_keys=False).apply(days_since_last_sale).astype(float)
        )

    # Fourier & flags
    out["dow_sin"], out["dow_cos"] = np.sin(2*np.pi*out["dow"]/7),  np.cos(2*np.pi*out["dow"]/7)
    out["month_sin"], out["month_cos"] = np.sin(2*np.pi*out["month"]/12), np.cos(2*np.pi*out["month"]/12)
    out["woy_sin"], out["woy_cos"] = np.sin(2*np.pi*out["woy"]/53), np.cos(2*np.pi*out["woy"]/53)

    for c in ["is_spike","is_drop","is_weekday_price","is_weekend_price","is_holiday","banquet_type"]:
        if c not in out.columns: out[c] = 0
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).astype(int)

    # ID 인코딩
    out["item_id"] = out["영업장명_메뉴명"].astype("category").cat.codes
    out["업장_id"]  = out["영업장명"].astype("category").cat.codes

    # NA 처리
    num_cols = out.select_dtypes(include=[np.number]).columns.tolist()
    if "매출수량" in num_cols: num_cols.remove("매출수량")
    out[num_cols] = out[num_cols].fillna(0.0)
    return out

def feature_columns(df: pd.DataFrame) -> List[str]:
    base = [
        "item_id","업장_id",
        "dow","week","month","year","day","woy","woy_sin","woy_cos",
        "lag_1","lag_7","lag_14","lag_28",
        "roll_mean_7","roll_std_7","roll_max_7","roll_min_7",
        "roll_mean_14","roll_std_14","roll_max_14","roll_min_14",
        "roll_mean_28","roll_std_28","roll_max_28","roll_min_28",
        "exp_item_dow_mean","exp_item_mean","exp_shop_dow_mean",
        "item_dow_idx","shop_dow_idx",
        "trend_7_1","trend_14_7","delta_1_7",
        "nonzero_rate_28","days_since_last_sale",
        "dow_sin","dow_cos","month_sin","month_cos",
        "is_spike","is_drop","is_weekday_price","is_weekend_price","is_holiday",
        "banquet_type"
    ]
    return [c for c in base if c in df.columns]

# ====================== CV Split ======================
def build_rolling_folds(dates: pd.Series, n_folds: int = N_FOLDS, valid_days: int = VALID_LAST_DAYS) -> List[Tuple[pd.Timestamp, pd.Timestamp]]:
    dates = pd.to_datetime(dates)
    dmin, dmax = dates.min(), dates.max()
    ends = [dmax - pd.Timedelta(days=valid_days*(n_folds - i - 1)) for i in range(n_folds)]
    folds = []
    for end in ends:
        vstart = end - pd.Timedelta(days=valid_days-1)
        folds.append((vstart.normalize(), end.normalize()))
    uniq = []
    for s,e in folds:
        if e<=dmax and s>=dmin and (s,e) not in uniq:
            uniq.append((s,e))
    return uniq

def warmup_valid_mask(feat_df, vstart, item_col="영업장명_메뉴명", date_col="영업일자", need_days=28):
    last_hist = (feat_df[feat_df[date_col] < vstart]
                 .groupby(item_col)[date_col].max())
    cutoff = (last_hist + pd.Timedelta(days=need_days)).rename("warm_ok_date").to_frame()
    merged = feat_df.merge(cutoff, left_on=item_col, right_index=True, how="left")
    return (merged[date_col] >= merged["warm_ok_date"]).fillna(False).values

# ====================== 모델 파라미터 ======================
XGB_PARAMS_TW = dict(
    objective="reg:tweedie", eval_metric="rmse", tree_method="hist",
    eta=0.05, max_depth=7, subsample=0.85, colsample_bytree=0.85,
    min_child_weight=8, reg_lambda=2.5, reg_alpha=0.0,
    max_delta_step=1.0, seed=RANDOM_STATE, tweedie_variance_power=1.2
)
LGB_PARAMS_TW = dict(
    objective="tweedie", metric="rmse", tweedie_variance_power=1.2,
    learning_rate=0.05, num_leaves=127, max_depth=-1,
    feature_fraction=0.85, bagging_fraction=0.85, bagging_freq=1,
    min_data_in_leaf=60, lambda_l1=0.0, lambda_l2=2.0,
    verbosity=-1, random_state=RANDOM_STATE
)
CAT_PARAMS_LOG = dict(
    loss_function="RMSE",  # log1p target로 학습
    learning_rate=0.05, depth=8, l2_leaf_reg=3.0,
    random_seed=RANDOM_STATE, bootstrap_type="Bayesian", bagging_temperature=1.0,
    verbose=False
)

# ====================== 학습/예측 유틸 ======================
def train_xgb_tweedie(X_tr, y_tr, w_tr, X_va):
    dtr = xgb.DMatrix(X_tr, label=y_tr, weight=w_tr)
    model = xgb.train(XGB_PARAMS_TW, dtr, num_boost_round=2500, evals=[], verbose_eval=False)
    pred = model.predict(xgb.DMatrix(X_va))
    return model, np.clip(pred, 0, None)

def train_lgb_tweedie(X_tr, y_tr, w_tr, X_va):
    ltr = lgb.Dataset(X_tr, label=y_tr, weight=w_tr, free_raw_data=False)
    model = lgb.train(LGB_PARAMS_TW, ltr, num_boost_round=2500, valid_sets=[])
    pred = model.predict(X_va, num_iteration=model.best_iteration)
    return model, np.clip(pred, 0, None)

def train_cat_log1p(X_tr, y_tr, w_tr, X_va):
    m = y_tr > 0
    X_pos, y_pos, w_pos = X_tr.loc[m], np.log1p(y_tr[m]), w_tr[m]
    cat = CatBoostRegressor(**CAT_PARAMS_LOG)
    pool_tr = Pool(X_pos, label=y_pos, weight=w_pos)
    cat.fit(pool_tr)
    pred = cat.predict(X_va)
    pred = np.expm1(np.clip(pred, 0, None))
    return cat, pred

# ====================== 가중 최적화(간단 grid) ======================
def search_best_weights(valid_df: pd.DataFrame, preds: List[np.ndarray], step=0.05):
    """
    preds: [p1, p2, p3] (XGB, LGB, CAT) 순서
    w 탐색: w1,w2 in [0,1], w3=1-w1-w2, 단 w3>=0
    """
    p1, p2, p3 = preds
    best_w, best_sc = (0.34, 0.33, 0.33), 1e18
    for w1 in np.arange(0.0, 1.0+1e-9, step):
        for w2 in np.arange(0.0, 1.0-w1+1e-9, step):
            w3 = 1.0 - w1 - w2
            pv = w1*p1 + w2*p2 + w3*p3
            pack = valid_df.copy()
            pack["y_pred"] = pv
            sc = weighted_smape_by_shop_item(pack, SHOP_WEIGHTS)
            if sc < best_sc:
                best_sc, best_w = sc, (float(w1), float(w2), float(w3))
    return best_w, float(best_sc)

# ====================== 캘리브레이션 ======================
def ratio_clip(a: np.ndarray, b: np.ndarray, clip=CAL_CLIP) -> float:
    a = np.asarray(a, float); b = np.asarray(b, float)
    s_true, s_pred = float(np.sum(a)), float(np.sum(b))
    if s_pred <= 1e-6: return 1.0
    return float(np.clip(s_true/s_pred, clip[0], clip[1]))

class Calibrator:
    def __init__(self):
        self.global_ratio: float = 1.0
        self.shop_ratio: Dict[str, float] = {}
    def fit(self, df_valid: pd.DataFrame):
        df = df_valid[df_valid["y_true"]>0].copy()
        self.global_ratio = ratio_clip(df["y_true"].values, df["y_pred"].values)
        self.shop_ratio = df.groupby("영업장명").apply(lambda g: ratio_clip(g["y_true"].values, g["y_pred"].values)).to_dict()
        return self
    def apply(self, yhat: float, shop: str) -> float:
        r = self.shop_ratio.get(str(shop), self.global_ratio)
        return float(max(0.0, yhat * r))

# ====================== 7일 재귀예측 준비 ======================
def build_single_row_features(dt, cur_hist, item_id, upjang_id, dow, banquet_code):
    def pull(date):
        v = cur_hist.get(date, 0.0)
        if isinstance(v, pd.Series): v = float(v.sum())
        return float(v)
    def window_vals(win): return [pull(dt - timedelta(days=n)) for n in range(1, win+1)]
    def lag(n): return pull(dt - timedelta(days=n))

    vals7, vals14, vals28 = window_vals(7), window_vals(14), window_vals(28)
    def roll_stats(vals):
        s = pd.Series(vals)
        if s.dropna().size >= 3:
            return float(s.mean()), float(s.std(ddof=0)), float(s.max()), float(s.min())
        return 0.0, 0.0, 0.0, 0.0

    rm7, rs7, rmax7, rmin7 = roll_stats(vals7)
    rm14, rs14, rmax14, rmin14 = roll_stats(vals14)
    rm28, rs28, rmax28, rmin28 = roll_stats(vals28)
    nz28 = float(np.mean(np.array(vals28) > 0)) if len(vals28) >= 3 else 0.0

    dlast = 365.0
    for n, v in enumerate(vals28, start=1):
        if v > 0: dlast = float(n); break

    woy = int(dt.isocalendar().week)
    month = dt.month

    lags = {"lag_1": lag(1), "lag_7": lag(7), "lag_14": lag(14), "lag_28": lag(28)}

    return {
        "item_id": item_id, "업장_id": upjang_id,
        "dow": dow, "week": woy, "month": month, "year": dt.year, "day": dt.day,
        "woy": woy, "woy_sin": np.sin(2*np.pi*woy/53), "woy_cos": np.cos(2*np.pi*woy/53),
        "roll_mean_7": rm7, "roll_std_7": rs7, "roll_max_7": rmax7, "roll_min_7": rmin7,
        "roll_mean_14": rm14, "roll_std_14": rs14, "roll_max_14": rmax14, "roll_min_14": rmin14,
        "roll_mean_28": rm28, "roll_std_28": rs28, "roll_max_28": rmax28, "roll_min_28": rmin28,
        **lags,
        "exp_item_dow_mean": rm7,
        "exp_item_mean": rm28,
        "exp_shop_dow_mean": rm7,
        "item_dow_idx": 1.0, "shop_dow_idx": 1.0,
        "trend_7_1": rm7 / (lags["lag_1"] + 1e-6),
        "trend_14_7": rm14 / (rm7 + 1e-6),
        "delta_1_7": lags["lag_1"] - rm7,
        "nonzero_rate_28": nz28,
        "days_since_last_sale": dlast,
        "dow_sin": np.sin(2*np.pi*dow/7), "dow_cos": np.cos(2*np.pi*dow/7),
        "month_sin": np.sin(2*np.pi*month/12), "month_cos": np.cos(2*np.pi*month/12),
        "is_spike": 0, "is_drop": 0,
        "is_weekday_price": 1 if dow<5 else 0, "is_weekend_price": 1 if dow>=5 else 0,
        "is_holiday": 0,
        "banquet_type": banquet_code,
    }

def apply_postprocess(yhat: float, dt, feat_row: dict) -> float:
    if dt.weekday() >= 5: yhat *= WEEKEND_BUMP
    if feat_row.get("is_holiday",0)==1: yhat *= HOLIDAY_BUMP
    try:
        if feat_row["lag_1"] > SPIKE_RATIO * (feat_row["roll_mean_7"] + 1e-6): yhat *= SPIKE_BUMP
    except Exception: pass
    return float(max(0.0, yhat))

# ====================== 퍼지매칭 제출 ======================
def parse_tag_day(label: str):
    if not isinstance(label, str): label = str(label)
    m_tag = re.search(r"(TEST_\d{2})", label, flags=re.IGNORECASE)
    m_day = re.findall(r"(\d+)", label)
    tag = m_tag.group(1).upper() if m_tag else None
    k   = int(m_day[-1]) if m_day else None
    return tag, k

def norm_name(s: str):
    if not isinstance(s, str): s = str(s)
    s = unicodedata.normalize("NFKC", s).strip().lower()
    repl = {"·":" ","•":" ","ㆍ":" ","‧":" ","–":"-","—":"-","’":"'", "“":"\"", "”":"\"",
            "（":"(","）":")","【":"[","】":"]"}
    for k,v in repl.items(): s = s.replace(k, v)
    s = re.sub(r"[()\[\]{}]", " ", s)
    s = re.sub(r"[\/_\-]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_row_index_map(pred_full: pd.DataFrame):
    pred_index_map = {}
    if pred_full is not None and not pred_full.empty:
        for ridx, row in pred_full.iterrows():
            t, k = parse_tag_day(ridx)
            if (t is not None) and (k is not None):
                pred_index_map[(t, k)] = row
    return pred_index_map

def build_column_mapping(sample_cols, pred_cols, cutoff=0.90):
    pred_norm2orig = {}
    for c in pred_cols:
        nc = norm_name(c)
        if nc not in pred_norm2orig: pred_norm2orig[nc] = c
    mapping, exact, fuzzy, unmatched = {}, 0, 0, []
    pred_norm_keys = list(pred_norm2orig.keys())
    for sc in sample_cols:
        ns = norm_name(sc)
        if ns in pred_norm2orig:
            mapping[sc] = pred_norm2orig[ns]; exact += 1
        else:
            cand = difflib.get_close_matches(ns, pred_norm_keys, n=1, cutoff=cutoff)
            if cand: mapping[sc] = pred_norm2orig[cand[0]]; fuzzy += 1
            else: mapping[sc] = None; unmatched.append(sc)
    return mapping, exact, fuzzy, unmatched

def save_submission(sample: pd.DataFrame, pred_full: pd.DataFrame, out_path: Path):
    pred_index_map = build_row_index_map(pred_full)
    submission = sample.copy()
    idx_labels = submission["영업일자"].tolist()
    item_cols  = submission.columns.tolist()[1:]

    pred_cols = [] if pred_full is None or pred_full.empty else pred_full.columns.tolist()
    col_map, exact, fuzzy, unmatch = build_column_mapping(item_cols, pred_cols, cutoff=0.90)

    out_vals, matched_rows = [], 0
    for lbl in idx_labels:
        t, k = parse_tag_day(lbl)
        if (t, k) in pred_index_map:
            sr = pred_index_map[(t, k)]
            row_vals = []
            for sc in item_cols:
                pc = col_map.get(sc)
                v = float(sr.get(pc, 0.0)) if pc is not None else 0.0
                row_vals.append(float(max(0.0, v)))
            matched_rows += 1
        else:
            row_vals = [0.0]*len(item_cols)
        out_vals.append(row_vals)

    final_df = pd.DataFrame(out_vals, columns=item_cols)
    final_df[item_cols] = np.round(final_df[item_cols].values, 0)
    final_df.insert(0, "영업일자", idx_labels)
    final_df.to_csv(out_path, index=False, encoding="utf-8-sig")

    total_pred_sum = float(np.nansum(final_df[item_cols].to_numpy()))
    print(f"[DBG] matched_rows={matched_rows}/{len(idx_labels)} | exact={exact} fuzzy={fuzzy} unmatch={len(unmatch)} | nonzero_sum={total_pred_sum:.2f}")
    if unmatch[:5]: print("[DBG] sample columns not matched (first 5):", unmatch[:5])
    print(f"[OK] Saved submission: {out_path}  exists={os.path.exists(out_path)}")

# ====================== 메인 ======================
def main():
    # 1) 데이터 로드
    train  = safe_read_csv(TRAIN_FILE)
    sample = safe_read_csv(SAMPLE_SUB)

    train["영업일자"] = pd.to_datetime(train["영업일자"], errors="coerce")
    train = ensure_upjang(train).sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)

    feat_df = build_features(train)
    feats   = feature_columns(feat_df)
    y_all   = feat_df["매출수량"].astype(float).values

    # 2) CV 마지막 폴드 valid (블렌딩/캘리브레이션용)
    folds = build_rolling_folds(feat_df["영업일자"], n_folds=N_FOLDS, valid_days=VALID_LAST_DAYS)
    print(f"[CV] folds: {folds}")
    vs, ve = folds[-1]
    vmask = (feat_df["영업일자"]>=vs) & (feat_df["영업일자"]<=ve)
    vmask = vmask & warmup_valid_mask(feat_df, vs)
    tmask = ~vmask

    X_tr, y_tr = feat_df.loc[tmask, feats], y_all[tmask]
    X_va, y_va = feat_df.loc[vmask, feats], y_all[vmask]
    shops_tr   = feat_df.loc[tmask, "영업장명"].astype(str).values
    shops_va   = feat_df.loc[vmask, "영업장명"].astype(str).values
    items_va   = feat_df.loc[vmask, "영업장명_메뉴명"].astype(str).values

    w_tr = pd.Series(shops_tr).map(lambda s: SHOP_WEIGHTS.get(s,1.0)).values * np.where(y_tr>0, POS_SAMPLE_BOOST, 1.0)

    # 3) 세 모델 학습 & 검증 예측
    m_xgb, p_xgb = train_xgb_tweedie(X_tr, y_tr, w_tr, X_va)
    m_lgb, p_lgb = train_lgb_tweedie(X_tr, y_tr, w_tr, X_va)
    m_cat, p_cat = train_cat_log1p(X_tr, y_tr, w_tr, X_va)

    valid_pack = pd.DataFrame({"영업장명": shops_va, "영업장명_메뉴명": items_va, "y_true": y_va})
    (wx, wl, wc), sc_val = search_best_weights(valid_pack, [p_xgb, p_lgb, p_cat], step=0.05)
    print(f"[BLEND] best weights (XGB,LGB,CAT)=({wx:.2f},{wl:.2f},{wc:.2f}) | valid weighted sMAPE={sc_val:.4f}")

    # 4) 캘리브레이션(전역 가중 사용)
    y_pred_blend_va = wx*p_xgb + wl*p_lgb + wc*p_cat
    calib_pack = valid_pack.copy(); calib_pack["y_pred"] = y_pred_blend_va
    calibrator = Calibrator().fit(calib_pack)

    # 5) 전체 데이터로 재학습
    shops_all = feat_df["영업장명"].astype(str).values
    w_all = pd.Series(shops_all).map(lambda s: SHOP_WEIGHTS.get(s,1.0)).values * np.where(y_all>0, POS_SAMPLE_BOOST, 1.0)

    m_xgb_full, _ = train_xgb_tweedie(feat_df[feats], y_all, w_all, feat_df[feats].iloc[:1])
    m_lgb_full, _ = train_lgb_tweedie(feat_df[feats], y_all, w_all, feat_df[feats].iloc[:1])
    m_cat_full, _ = train_cat_log1p(feat_df[feats], y_all, w_all, feat_df[feats].iloc[:1])
    

    # 6) 테스트 7일 재귀예측
    test_files = sorted(glob.glob(TEST_GLOB))
    print(f"[Info] Found {len(test_files)} test files: {test_files[:3]}{' ...' if len(test_files)>3 else ''}")

    def forecast_7days_for_testfile(models, feats, train_df, test_df, test_tag, weights):
        model_xgb, model_lgb, model_cat = models
        wx, wl, wc = weights

        test_df = ensure_upjang(test_df.copy())
        test_df["영업일자"] = pd.to_datetime(test_df["영업일자"], errors="coerce")
        test_df = test_df.sort_values(["영업장명_메뉴명","영업일자"]).reset_index(drop=True)
        last_date = test_df["영업일자"].max()
        items = sorted(test_df["영업장명_메뉴명"].astype(str).unique().tolist())

        cols = ["영업일자","영업장명_메뉴명","매출수량","banquet_type","영업장명","dow"]
        a = ensure_upjang(train_df.copy()); b = ensure_upjang(test_df.copy())
        take = lambda df: df[[c for c in cols if c in df.columns]]
        hist_src = pd.concat([take(a), take(b)], ignore_index=True)
        hist_src["영업일자"] = pd.to_datetime(hist_src["영업일자"], errors="coerce")

        results = {k: {} for k in range(1,8)}

        # 카테고리 ID 맵
        all_items = pd.Categorical(hist_src["영업장명_메뉴명"].astype(str))
        item_to_id = {name:i for i,name in enumerate(all_items.categories)}
        all_shops = pd.Categorical(hist_src["영업장명"].astype(str))
        shop_to_id = {name:i for i,name in enumerate(all_shops.categories)}

        for item in items:
            g = hist_src[hist_src["영업장명_메뉴명"].astype(str)==item].copy().sort_values("영업일자")
            y_series = (g.groupby("영업일자", as_index=True)["매출수량"].sum().astype(float).clip(lower=0.0))
            shop = g["영업장명"].dropna().astype(str).iloc[-1] if "영업장명" in g.columns and len(g["영업장명"].dropna()) else (item.split("_",1)[0] if "_" in item else "")
            last_bt = int(g["banquet_type"].dropna().astype(int).iloc[-1]) if "banquet_type" in g.columns and g["banquet_type"].notna().any() else -1

            item_id = item_to_id.get(item, -1)
            shop_id = shop_to_id.get(shop, -1)

            cur_hist = y_series.copy()
            for k in range(1,8):
                dt = last_date + timedelta(days=k)
                dow = dt.weekday()
                feat_row = build_single_row_features(dt, cur_hist, item_id, shop_id, dow, last_bt)
                X = pd.DataFrame([feat_row])[feats].fillna(0.0)

                y_xgb = float(np.clip(m_xgb_full.predict(xgb.DMatrix(X)), 0, None))
                y_lgb = float(np.clip(m_lgb_full.predict(X, num_iteration=m_lgb_full.best_iteration), 0, None))
                y_cat = float(np.expm1(np.clip(m_cat_full.predict(X), 0, None)))

                # 기본 앙상블
                yhat = wx*y_xgb + wl*y_lgb + wc*y_cat

                # 세그먼트(희소/연회장/장기무판매) → 앵커 혼합
                nz  = float(feat_row.get("nonzero_rate_28", 0.0))
                dsl = float(feat_row.get("days_since_last_sale", 999.0))
                is_sparse = (nz < SPARSE_NZ_THRESHOLD) or (dsl > STALE_DAYS_THRESHOLD) or (last_bt not in [-1, 0])
                if is_sparse:
                    anchor = max(feat_row.get("lag_7", 0.0), feat_row.get("roll_mean_7", 0.0))
                    yhat = (1.0-ANCHOR_MIX_FOR_SPARSE)*yhat + ANCHOR_MIX_FOR_SPARSE*anchor

                # 업장 캘리브레이션
                yhat = calibrator.apply(yhat, shop)
                # 후처리
                yhat = apply_postprocess(yhat, dt, feat_row)

                results[k][item] = yhat
                cur_hist.loc[dt] = yhat

        out_rows = []
        for k in range(1,8):
            row = {"영업일자": f"{test_tag}+{k}일"}; row.update(results[k]); out_rows.append(row)
        return pd.DataFrame(out_rows).set_index("영업일자")

    all_pred_wide = []
    for tf in test_files:
        test_df = safe_read_csv(Path(tf))
        assert {"영업일자","영업장명_메뉴명","매출수량"}.issubset(test_df.columns), f"필수 컬럼 누락: {tf}"
        tag = Path(tf).stem.split("_")[1]  # 예: TEST_00_processed (1).csv → "00"
        if not tag.isdigit():
            tag = re.findall(r"(\d+)", Path(tf).stem)
            tag = tag[0] if tag else "00"
        test_tag = f"TEST_{tag.zfill(2)}"
        wide = forecast_7days_for_testfile(
            models=(m_xgb_full, m_lgb_full, m_cat_full), feats=feats,
            train_df=train, test_df=test_df, test_tag=test_tag,
            weights=(wx, wl, wc)
        )
        all_pred_wide.append(wide)

    pred_full = pd.concat(all_pred_wide, axis=0) if all_pred_wide else pd.DataFrame()

    # 7) 제출 저장
    save_submission(sample, pred_full, OUT_FILE)
    print("[OK] Done. Output:", OUT_FILE)

if __name__ == "__main__":
    main()


IndentationError: unexpected indent (4121527312.py, line 615)