In [1]:
# ===============================================
# 2모델(KFold XGB+LGBM) vs 3모델(XGB+LGBM+DCN) 앙상블 비교
# ===============================================
import numpy as np, pandas as pd
from sklearn.metrics import average_precision_score, log_loss



In [2]:
# 1️⃣ Shared validation rows and labels.
# NOTE(review): val_idx is used below to index the OOF arrays directly, so it is
# presumably a set of positional row indices into the training frame — confirm
# against the notebook that produced dcn_val_index.npy.
val_idx = np.load("dcn_val_index.npy")
y_val = np.load("dcn_val_true.npy").astype(int)


def _read_oof(parquet_path, column):
    """Read one out-of-fold prediction column from a parquet file as a float array."""
    return pd.read_parquet(parquet_path)[column].values.astype(float)


# 2️⃣ XGB / LGBM out-of-fold predictions, restricted to the validation rows.
oof_xgb = _read_oof("train_input_2_oof_xgb.parquet", "oof_xgb")
oof_lgb = _read_oof("train_input_2_oof_lgbm.parquet", "oof_lgbm")
p_xgb_val, p_lgb_val = oof_xgb[val_idx], oof_lgb[val_idx]

# 3️⃣ DCN predictions for the same validation rows.
p_dcn_val = np.load("dcn_val_pred.npy").astype(float)

# All prediction vectors must have one entry per validation label.
n_val = len(y_val)
assert len(p_dcn_val) == n_val and n_val == len(p_xgb_val)


In [3]:
# ==========================================================
# 📌 Candidate A: XGB + LGBM (KFold)
# ==========================================================
# Sweep the XGB weight over 41 evenly spaced values in [0, 1]; LGBM receives the
# remainder. The weight with the highest validation average precision wins.
# NOTE(review): the figure printed as "val_WLL" comes from sklearn's log_loss
# without sample weights — confirm this is the metric that is meant.
alphas = np.linspace(0.0, 1.0, 41)
best_a = {"ap": -1, "a": None, "wll": None}


def _eval_two_model(w_xgb):
    """Return (average precision, log loss) of the XGB/LGBM blend for one XGB weight."""
    # Clip so log_loss never receives a probability of exactly 0 or 1.
    mixed = np.clip(w_xgb * p_xgb_val + (1 - w_xgb) * p_lgb_val, 1e-6, 1 - 1e-6)
    return average_precision_score(y_val, mixed), log_loss(y_val, mixed)


for w_xgb in alphas:
    ap, wll = _eval_two_model(w_xgb)
    # Strict ">" means the earliest weight is kept when scores tie.
    if ap > best_a["ap"]:
        best_a = {"ap": ap, "a": w_xgb, "wll": wll}

print(f"[2-Model] best α(XGB)={best_a['a']:.3f} | val_AP={best_a['ap']:.6f} | val_WLL={best_a['wll']:.6f}")


[2-Model] best α(XGB)=0.550 | val_AP=0.069153 | val_WLL=0.590187


In [4]:
# ==========================================================
# 📌 Candidate B: XGB + LGBM + DCN
# ==========================================================
# Grid-search the weights (a, b, g) = (XGB, LGBM, DCN) with a + b + g = 1 and
# keep the combination with the highest validation average precision.
# NOTE(review): the figure printed as "val_WLL" comes from sklearn's log_loss
# without sample weights — confirm this is the metric that is meant.
grid = np.linspace(0.0, 1.0, 41)
best_b = {"ap": -1, "a": None, "b": None, "g": None, "wll": None}

for i, a in enumerate(grid):
    # Walk the simplex by grid index (i + j <= len(grid) - 1) rather than testing
    # `a + b > 1.0`, so membership never depends on float rounding of a + b.
    for b in grid[: len(grid) - i]:
        # On the a + b == 1 edge, `1 - a - b` can round to a tiny negative value
        # (order 1e-16); clamp so the DCN weight is never negative.
        g = max(0.0, 1.0 - a - b)
        blend = a*p_xgb_val + b*p_lgb_val + g*p_dcn_val
        # Clip so log_loss never receives a probability of exactly 0 or 1.
        blend = np.clip(blend, 1e-6, 1-1e-6)
        ap  = average_precision_score(y_val, blend)
        wll = log_loss(y_val, blend)
        # Strict ">" means the earliest (a, b) pair is kept when scores tie.
        if ap > best_b["ap"]:
            best_b.update({"ap": ap, "a": a, "b": b, "g": g, "wll": wll})

print(f"[3-Model] best α(XGB)={best_b['a']:.3f} | β(LGBM)={best_b['b']:.3f} | γ(DCN)={best_b['g']:.3f} "
      f"| val_AP={best_b['ap']:.6f} | val_WLL={best_b['wll']:.6f}")

[3-Model] best α(XGB)=0.550 | β(LGBM)=0.400 | γ(DCN)=0.050 | val_AP=0.069158 | val_WLL=0.592502


In [5]:
# ==========================================================
# ⚖️ Compare the two candidates and pick one
# ==========================================================
# The 3-model blend is chosen only when its best validation AP is strictly higher.
# NOTE(review): the 3-model grid also contains the zero-DCN-weight blends, so it
# can rarely score below the 2-model search; a very small AP gain (and the
# log-loss, which is not consulted here) may deserve a look before trusting
# this choice.
use_three = best_a["ap"] < best_b["ap"]

if use_three:
    chosen_label = "3-Model (XGB+LGBM+DCN)"
else:
    chosen_label = "2-Model (XGB+LGBM)"
print(f"\n[SELECT] {chosen_label} 선택!")


[SELECT] 3-Model (XGB+LGBM+DCN) 선택!


In [7]:


# ==========================================================
# 🧾 Blend the test-set predictions and write the submission file
# ==========================================================
# Load the three per-model submission files, in XGB / LGBM / DCN order.
sub_xgb, sub_lgb, sub_dcn = (
    pd.read_csv(csv_path)
    for csv_path in (
        "toss_xgb_kfold_v7_submit.csv",
        "toss_lgbm_kfold_v9_submit.csv",
        "toss_dcn_v8_submit.csv",
    )
)

def pick_id(df):
    """Return the name of the identifier column of ``df``.

    The candidates "ID", "id" and "row_id" are checked in that order and the
    first one present in ``df.columns`` is returned.

    Raises:
        KeyError: if none of the candidate names is a column of ``df``.
    """
    candidates = ("ID", "id", "row_id")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        raise KeyError("ID 컬럼 없음")
    return found

# Find each file's ID column, then sort every frame by it so that the rows of
# the three files line up position by position.
kx, kl, kd = map(pick_id, [sub_xgb, sub_lgb, sub_dcn])
sub_xgb = sub_xgb.sort_values(kx).reset_index(drop=True)
sub_lgb = sub_lgb.sort_values(kl).reset_index(drop=True)
sub_dcn = sub_dcn.sort_values(kd).reset_index(drop=True)

# The blend is positional, so the sorted ID columns must match exactly.
# np.array_equal returns False when the lengths differ, whereas an elementwise
# `==` would fail with an unrelated NumPy error (or broadcast a one-row file).
assert np.array_equal(sub_xgb[kx].values, sub_lgb[kl].values), (
    f"XGB and LGBM submission IDs do not match ({len(sub_xgb)} vs {len(sub_lgb)} rows)"
)
assert np.array_equal(sub_xgb[kx].values, sub_dcn[kd].values), (
    f"XGB and DCN submission IDs do not match ({len(sub_xgb)} vs {len(sub_dcn)} rows)"
)

# Apply the weights chosen on the validation set to the test-set predictions.
if not use_three:
    # Two-model blend: XGB weight from the 2-model search, LGBM takes the rest.
    a = best_a["a"]
    clicked = a*sub_xgb["clicked"] + (1-a)*sub_lgb["clicked"]
    out_name = "toss_ensemble_2model_v11_submit.csv"
else:
    # Three-model blend: the DCN weight is whatever XGB and LGBM leave over.
    a, b = best_b["a"], best_b["b"]
    g = 1 - a - b
    clicked = a*sub_xgb["clicked"] + b*sub_lgb["clicked"] + g*sub_dcn["clicked"]
    out_name = "toss_ensemble_3model_v11_submit.csv"

# Keep the written probabilities strictly inside (0, 1).
# NOTE(review): the bound here is 1e-4 while the validation cells clip at 1e-6 —
# confirm the difference is intentional.
clicked = np.clip(clicked, 1e-4, 1-1e-4)
submission = pd.DataFrame({"ID": sub_xgb[kx], "clicked": clicked})
submission.to_csv(out_name, index=False)
print(f"[Saved] {out_name}")


[Saved] toss_ensemble_3model_v11_submit.csv
