In [2]:
# -*- coding: utf-8 -*-
"""
리조트 식음업장 — LightGBM 듀얼 회귀 앙상블 (Tweedie + log1p)
+ 기존 성능 기반 파라미터 사용
+ 시간누수 방지 타깃인코딩
+ 업장가중 SMAPE (미라시아·달하 우선순위)
+ 블렌딩 최적화 + 7일 재귀예측 + 퍼지매칭 제출
"""

import os, glob, re, unicodedata, difflib
import numpy as np
import pandas as pd
import lightgbm as lgb
from pathlib import Path
from datetime import timedelta

# ====================== Config ======================
# Directory that holds every input and output file (machine-specific absolute path).
BASE_DIR = Path("C:/Users/LG/Downloads")
# Training table loaded by main().
TRAIN_FILE = BASE_DIR / "re_train.csv"
# Glob pattern for the per-period test files; main() processes the matches in sorted order.
TEST_GLOB = str(BASE_DIR / "TEST_*processed.csv")
# Submission template: first column is the row label, remaining columns are items.
SAMPLE_SUB = BASE_DIR / "곤지암submission.csv"
# Destination of the generated submission CSV.
OUT_FILE = BASE_DIR / "submission_blended_final.csv"

# Passed to LightGBM as `random_state`.
RANDOM_STATE = 42
# Number of trailing calendar days of the training data held out for validation.
VALID_LAST_DAYS = 28
# Sample weight given to rows whose target is > 0 (all other rows get weight 1.0).
POS_SAMPLE_BOOST = 4.0
# Columns converted to pandas categoricals and declared to LightGBM as categorical features.
CAT_COLS = ["메뉴명", "요일", "menu_category", "quarter"]

# ====================== 평가 함수 ======================
def smape_ignore_zero(y_true, y_pred, eps=1e-8):
    """Return SMAPE in percent, computed only where the actual value is non-zero.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, same length as ``y_true``.
        eps: Small constant added to the denominator to avoid division by zero.

    Returns:
        ``100 * mean(2 * |pred - true| / (|true| + |pred| + eps))`` over the
        entries whose actual value is non-zero, or ``0.0`` when there are no
        such entries.
    """
    actual = np.asarray(y_true, float)
    forecast = np.asarray(y_pred, float)

    nonzero = actual != 0
    if not nonzero.any():
        # Nothing to score: every actual value is zero (or the input is empty).
        return 0.0

    actual = actual[nonzero]
    forecast = forecast[nonzero]
    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast) + eps
    return 100.0 * np.mean(2.0 * abs_error / scale)

def weighted_smape(df):
    """Return the store-weighted sum of per-store mean item SMAPE.

    Args:
        df: DataFrame with columns ``영업장명`` (store), ``영업장명_메뉴명``
            (store + item key), ``y_true`` and ``y_pred``.

    Returns:
        Sum over stores of ``weight * mean(item SMAPE)``. Items whose actual
        values are all zero are skipped, and a store with no scorable item
        contributes nothing. The result is a weighted sum, not an average:
        the weights are not normalised.
    """
    # NOTE(review): a store name that does not match a key exactly silently
    # falls back to weight 1.0 — confirm both keys against the data.
    store_weights = {"미라시아": 2.0, "달하": 2.0}

    total = 0.0
    for store, store_rows in df.groupby("영업장명"):
        item_scores = []
        for _, item_rows in store_rows.groupby("영업장명_메뉴명"):
            if not (item_rows["y_true"] != 0).any():
                continue
            item_scores.append(
                smape_ignore_zero(item_rows["y_true"], item_rows["y_pred"])
            )
        if not item_scores:
            continue
        total += store_weights.get(store, 1.0) * np.mean(item_scores)
    return total

# ====================== 학습 함수 ======================
def train_models_and_blend(train_df, alpha_grid=None):
    """Train the two LightGBM regressors and choose their blend weight.

    Model A is a Tweedie regressor fitted on all training rows. Model B is an
    L2 regressor fitted on ``log1p`` of the target using only rows with a
    positive target. The last ``VALID_LAST_DAYS`` days are held out, and the
    blend weight ``alpha`` (share given to model B) is the grid value that
    minimises ``weighted_smape`` on that hold-out.

    Args:
        train_df: Training table. Must contain ``영업일자``, ``매출수량``,
            ``영업장명``, ``영업장명_메뉴명`` and every column in ``CAT_COLS``;
            all remaining columns are used as features.
        alpha_grid: Optional iterable of candidate blend weights. Defaults to
            17 evenly spaced values in [0.1, 0.9].

    Returns:
        ``((modelA, modelB, best_alpha), feats)`` where ``feats`` is the
        ordered list of feature column names used for training.

    Raises:
        ValueError: If no usable rows remain, or if the training split (rows
            before the validation window) is empty.
    """
    df = train_df.copy()
    df["영업일자"] = pd.to_datetime(df["영업일자"])
    df = df[df["매출수량"] >= 0]
    df = df.sort_values(["영업장명_메뉴명", "영업일자"])
    if df.empty:
        raise ValueError("train_df has no rows with a non-negative target")

    for col in CAT_COLS:
        df[col] = pd.Categorical(df[col])

    non_features = {"매출수량", "영업일자", "영업장명_메뉴명", "영업장명"}
    feats = [c for c in df.columns if c not in non_features]
    y = df["매출수량"].to_numpy()

    # Time-based split: the last VALID_LAST_DAYS calendar days are validation.
    max_date = df["영업일자"].max()
    valid_start = max_date - pd.Timedelta(days=VALID_LAST_DAYS - 1)
    train_mask = (df["영업일자"] < valid_start).to_numpy()
    valid_mask = (df["영업일자"] >= valid_start).to_numpy()
    if not train_mask.any():
        raise ValueError(
            f"no training rows before the validation window starting {valid_start}"
        )

    X_tr, X_val = df.loc[train_mask, feats], df.loc[valid_mask, feats]
    y_tr, y_val = y[train_mask], y[valid_mask]
    # Up-weight rows with actual sales; the evaluation metric ignores zeros.
    w_tr = np.where(y_tr > 0, POS_SAMPLE_BOOST, 1.0)
    w_val = np.where(y_val > 0, POS_SAMPLE_BOOST, 1.0)

    # NOTE(review): LightGBM documents that `subsample` (bagging_fraction) only
    # takes effect when a bagging frequency is also set, so it is presumably a
    # no-op here — confirm before re-tuning.
    best_params = {
        "objective": "tweedie",
        "tweedie_variance_power": 1.11,
        "learning_rate": 0.06,
        "max_depth": 9,
        "subsample": 0.60,
        "colsample_bytree": 0.71,
        "min_child_weight": 2.9,
        "lambda_l2": 2.83,
        "metric": "rmse",
        "random_state": RANDOM_STATE,
        "verbosity": -1
    }
    max_rounds = 2000

    dtrainA = lgb.Dataset(X_tr, label=y_tr, weight=w_tr, categorical_feature=CAT_COLS)
    dvalA = lgb.Dataset(X_val, label=y_val, weight=w_val, categorical_feature=CAT_COLS)

    # An explicit round budget is required: with the library default of 100
    # rounds, a 100-round patience can never trigger early stopping.
    modelA = lgb.train(
        best_params,
        dtrainA,
        num_boost_round=max_rounds,
        valid_sets=[dvalA],
        callbacks=[lgb.early_stopping(100)],
    )
    predA = np.clip(modelA.predict(X_val), 0, None)

    # Model B: log-space regressor trained on positive-target rows only.
    pos_mask = y_tr > 0
    dtrainB = lgb.Dataset(
        X_tr[pos_mask],
        label=np.log1p(y_tr[pos_mask]),
        weight=w_tr[pos_mask],
        categorical_feature=CAT_COLS,
    )
    # NOTE(review): model B runs the full round budget with no validation set.
    modelB = lgb.train(
        {**best_params, "objective": "regression"}, dtrainB, num_boost_round=max_rounds
    )
    predB = np.expm1(np.clip(modelB.predict(X_val), 0, None))

    # Build the evaluation frame once; only the prediction column changes.
    eval_frame = pd.DataFrame({
        "영업장명": df.loc[valid_mask, "영업장명"].values,
        "영업장명_메뉴명": df.loc[valid_mask, "영업장명_메뉴명"].values,
        "y_true": y_val,
    })
    if alpha_grid is None:
        alphas = np.linspace(0.1, 0.9, 17)
    else:
        alphas = np.asarray(list(alpha_grid), float)

    best_score, best_alpha = float("inf"), 0.5
    for alpha in alphas:
        eval_frame["y_pred"] = (1 - alpha) * predA + alpha * predB
        score = weighted_smape(eval_frame)
        if score < best_score:
            best_score, best_alpha = score, alpha

    print(f"[BLEND] best_alpha={best_alpha:.3f} | Weighted SMAPE={best_score:.4f}")
    return (modelA, modelB, best_alpha), feats

# ====================== 퍼지 제출 저장 ======================
def parse_tag_day(label: str):
    """Extract the test-file tag and the day offset from a row label.

    Args:
        label: Row label such as ``"TEST_00+1일"``. Non-string values are
            converted with ``str()`` first.

    Returns:
        ``(tag, day)`` where ``tag`` is the upper-cased ``TEST_NN`` token (or
        ``None`` if absent) and ``day`` is the last integer found outside the
        tag (or ``None`` if there is none).
    """
    if not isinstance(label, str):
        label = str(label)

    tag_match = re.search(r"(TEST_\d{2})", label, flags=re.IGNORECASE)
    if tag_match:
        tag = tag_match.group(1).upper()
        # Drop the tag before looking for the day, so the tag's own digits are
        # never mistaken for a day number when the label has no day part.
        remainder = f"{label[:tag_match.start()]} {label[tag_match.end():]}"
    else:
        tag, remainder = None, label

    numbers = re.findall(r"(\d+)", remainder)
    day = int(numbers[-1]) if numbers else None
    return tag, day

def norm_name(s: str):
    """Normalise an item/column name for exact and fuzzy matching.

    The name is NFKC-normalised, lower-cased, has dot-like separators, dashes,
    curly quotes and bracket variants unified, has brackets, slashes, hyphens
    and underscores turned into spaces, and has whitespace collapsed.

    Args:
        s: Name to normalise. Non-string values are converted with ``str()``.

    Returns:
        The normalised name.
    """
    if not isinstance(s, str):
        s = str(s)

    repl = {"·":" ","•":" ","ㆍ":" ","‧":" ","–":"-","—":"-","’":"'", "“":"\"", "”":"\"",
            "（":"(","）":")","【":"[","】":"]",
            # NFKC rewrites U+318D (ㆍ) to U+119E, so both forms are listed.
            "\u318d": " ", "\u119e": " "}
    table = str.maketrans(repl)

    # Translate before NFKC (to catch characters NFKC would rewrite) and again
    # afterwards (to catch characters NFKC produces).
    s = unicodedata.normalize("NFKC", s.translate(table)).translate(table)
    s = s.strip().lower()
    s = re.sub(r"[()\[\]{}]", " ", s)
    s = re.sub(r"[\/\-_]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def build_row_index_map(pred_full):
    """Index prediction rows by the ``(tag, day)`` parsed from their labels.

    Args:
        pred_full: DataFrame of predictions whose index holds row labels, or
            ``None`` when no predictions are available.

    Returns:
        Dict mapping ``(tag, day)`` to the corresponding row (a Series). Rows
        whose label yields no tag or no day are skipped; if two rows share a
        key, the later one wins. ``None`` input yields an empty dict.
    """
    if pred_full is None:
        return {}

    index_map = {}
    for label, row in pred_full.iterrows():
        tag, day = parse_tag_day(label)
        if tag is None or day is None:
            continue
        index_map[(tag, day)] = row
    return index_map

def build_column_mapping(sample_cols, pred_cols, cutoff=0.90):
    """Map each sample column to the prediction column with the closest name.

    Names are compared after ``norm_name``. An exact normalised match is
    preferred; otherwise the single best ``difflib`` match at or above
    ``cutoff`` is used.

    Args:
        sample_cols: Column names from the submission template.
        pred_cols: Column names available in the predictions.
        cutoff: Minimum similarity ratio for a fuzzy match.

    Returns:
        ``(mapping, exact, fuzzy, unmatched)``: ``mapping`` maps every sample
        column to a prediction column or ``None``; ``exact`` and ``fuzzy`` are
        match counts; ``unmatched`` lists the sample columns with no match.
    """
    # When several prediction columns normalise to the same key, the last wins.
    by_norm = {}
    for pred_col in pred_cols:
        by_norm[norm_name(pred_col)] = pred_col
    candidates = list(by_norm)

    mapping, unmatched = {}, []
    n_exact = 0
    n_fuzzy = 0
    for sample_col in sample_cols:
        key = norm_name(sample_col)
        if key in by_norm:
            mapping[sample_col] = by_norm[key]
            n_exact += 1
            continue

        close = difflib.get_close_matches(key, candidates, n=1, cutoff=cutoff)
        if close:
            mapping[sample_col] = by_norm[close[0]]
            n_fuzzy += 1
            continue

        mapping[sample_col] = None
        unmatched.append(sample_col)
    return mapping, n_exact, n_fuzzy, unmatched

def save_submission(sample, pred_full, out_path):
    """Fill the submission template with predictions and write it as CSV.

    Rows are matched on the ``(tag, day)`` parsed from the labels; columns are
    matched through ``build_column_mapping``. Unmatched rows, unmatched
    columns and missing values are written as ``0.0``; negative values are
    clamped to ``0.0``.

    Args:
        sample: Submission template. Its ``영업일자`` column holds the row
            labels and every column after the first is an item column.
        pred_full: DataFrame of predictions indexed by row label, or ``None``
            / empty when there are no predictions.
        out_path: Destination CSV path (written as UTF-8 with BOM).
    """
    # Check availability first so a missing prediction frame yields an
    # all-zero submission instead of failing while indexing it.
    has_preds = pred_full is not None and not pred_full.empty
    pred_index_map = build_row_index_map(pred_full) if has_preds else {}
    pred_cols = pred_full.columns.tolist() if has_preds else []

    idx_labels = sample["영업일자"].tolist()
    item_cols = sample.columns.tolist()[1:]
    col_map, _, _, unmatch = build_column_mapping(item_cols, pred_cols)

    def cell_value(pred_row, sample_col):
        # One output cell: 0.0 for unmapped columns and missing values.
        pred_col = col_map.get(sample_col)
        if pred_col is None:
            return 0.0
        value = pred_row.get(pred_col, 0.0)
        if pd.isna(value):
            return 0.0
        return max(0.0, float(value))

    out_vals, matched_rows = [], 0
    for lbl in idx_labels:
        pred_row = pred_index_map.get(parse_tag_day(lbl))
        if pred_row is None:
            out_vals.append([0.0] * len(item_cols))
            continue
        out_vals.append([cell_value(pred_row, sc) for sc in item_cols])
        matched_rows += 1

    final_df = pd.DataFrame(out_vals, columns=item_cols)
    final_df.insert(0, "영업일자", idx_labels)
    final_df.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"[✅] 저장 완료: {out_path} | 매칭행={matched_rows}/{len(idx_labels)} | 누락컬럼={len(unmatch)}")

# ====================== main ======================
def _coerce_to_train_categories(values, train_categories):
    """Return ``values`` as a Categorical restricted to the training categories.

    Values are matched on their string form and mapped back to the original
    category value, so non-string categories (for example integers) still
    match. Anything unseen in training becomes "기타".
    """
    by_text = {str(cat): cat for cat in train_categories}
    categories = list(train_categories)
    if "기타" not in by_text:
        categories.append("기타")
    fallback = by_text.get("기타", "기타")
    mapped = [by_text.get(str(value), fallback) for value in values]
    return pd.Categorical(mapped, categories=categories)


def main():
    """Train the blended model, predict every test file and write the submission.

    For each file matching ``TEST_GLOB`` the latest row of every item is
    scored with both models, blended with the selected weight, floored at
    1.0, and emitted as seven rows labelled ``"<file stem>+<k>일"`` for
    ``k = 1..7``.
    """
    train = pd.read_csv(TRAIN_FILE,encoding='cp949')
    sample = pd.read_csv(SAMPLE_SUB)

    # Remember the training categories so test columns can be aligned to them.
    category_map = {}
    for col in CAT_COLS:
        train[col] = train[col].astype("category")
        category_map[col] = train[col].cat.categories

    models, feats = train_models_and_blend(train)
    modelA, modelB, alpha = models

    all_preds = []
    for tf in sorted(glob.glob(TEST_GLOB)):
        test_df = pd.read_csv(tf)
        test_df["영업일자"] = pd.to_datetime(test_df["영업일자"])
        test_df = test_df.sort_values(["영업장명_메뉴명", "영업일자"])

        for col in CAT_COLS:
            test_df[col] = _coerce_to_train_categories(test_df[col], category_map[col])

        # Only the most recent row of each item ends up in the output, so
        # score just those rows instead of the whole history.
        latest = test_df.drop_duplicates("영업장명_메뉴명", keep="last")
        X = latest[feats].copy()
        num_cols = X.select_dtypes(include=["number"]).columns
        X[num_cols] = X[num_cols].fillna(0.0)

        yhatA = np.clip(modelA.predict(X), 0, None)
        yhatB = np.expm1(np.clip(modelB.predict(X), 0, None))
        # Floor at 1.0: zero actuals are ignored by the evaluation metric.
        yhat = np.maximum((1 - alpha) * yhatA + alpha * yhatB, 1.0)
        by_item = dict(zip(latest["영업장명_메뉴명"], yhat))

        # NOTE(review): no feature depends on the forecast horizon (the date
        # column is not a feature and nothing is recomputed per day), so all
        # seven horizons receive the same values. Confirm this is intended.
        stem = Path(tf).stem
        preds = [{"영업일자": f"{stem}+{k}일", **by_item} for k in range(1, 8)]
        all_preds.append(pd.DataFrame(preds).set_index("영업일자"))

    if all_preds:
        pred_full = pd.concat(all_preds)
        save_submission(sample, pred_full, OUT_FILE)
    else:
        print("⚠️ 예측 실패 또는 테스트 파일 누락")

# Run the full train / predict / save pipeline only when executed as a script.
if __name__ == "__main__":
    main()


Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[34]	valid_0's rmse: 12.9234
[BLEND] best_alpha=0.900 | Weighted SMAPE=373.0926
[✅] 저장 완료: C:\Users\LG\Downloads\submission_blended_final.csv | 매칭행=70/70 | 누락컬럼=0
