In [1]:
# ===============================================
# DCN + XGB + LGBM 앙상블 (검증 α, β 탐색 → 테스트 혼합)
# ===============================================
import numpy as np, pandas as pd
from sklearn.metrics import average_precision_score, log_loss
from itertools import product

In [2]:
# 1) Load the DCN validation artifacts saved as .npy files.
#    - y_val:     validation labels, cast to int
#    - p_dcn_val: DCN predictions on the validation rows, cast to float
#    - val_idx:   index array later used to select the matching rows from the OOF arrays
y_val = np.load("dcn_val_true.npy").astype(int)
p_dcn_val = np.load("dcn_val_pred.npy").astype(float)
val_idx = np.load("dcn_val_index.npy")
# Predictions and labels must describe the same rows.
assert len(y_val) == len(p_dcn_val), "DCN val 길이 불일치"

In [3]:
# 2) Load the XGB / LGBM out-of-fold (OOF) predictions and select the rows at val_idx
#    so that all three models are scored on the same validation rows.
oof_xgb = pd.read_parquet("train_input_2_oof_xgb.parquet")
oof_lgb = pd.read_parquet("train_input_2_oof_lgbm.parquet")
assert "oof_xgb" in oof_xgb.columns, "missing 'oof_xgb' column in the XGB OOF file"
assert "oof_lgbm" in oof_lgb.columns, "missing 'oof_lgbm' column in the LGBM OOF file"

# val_idx is applied positionally to the raw value arrays below, so a negative entry
# would silently wrap around to the end of the array instead of failing.
# NOTE(review): this assumes the OOF rows are stored in the same order the DCN split
# indexed into — confirm against the code that wrote dcn_val_index.npy.
assert val_idx.size == 0 or val_idx.min() >= 0, "val_idx contains negative positions"

p_xgb_val = oof_xgb["oof_xgb"].values[val_idx].astype(float)
p_lgb_val = oof_lgb["oof_lgbm"].values[val_idx].astype(float)
assert len(p_xgb_val) == len(p_lgb_val) == len(p_dcn_val), "OOF 길이 불일치"

# Fail here, with a clear message, rather than inside the metric calls of the weight search.
assert np.isfinite(p_xgb_val).all() and np.isfinite(p_lgb_val).all(), \
    "non-finite OOF predictions at the validation rows"

In [4]:
# 3) Grid-search the blend weights on the validation set.
#    blend = a * XGB + b * LGBM + g * DCN, where g = 1 - a - b is the DCN weight.
GRID_POINTS = 41        # grid resolution per weight (step = 1 / (GRID_POINTS - 1))
MIN_DCN_WEIGHT = 0.05   # DCN always keeps at least this share of the blend
WEIGHT_TOL = 1e-9       # slack for float round-off when checking the DCN floor

grid = np.linspace(0.0, 1.0, GRID_POINTS)
best = {"score": -1e9, "a": None, "b": None, "ap": None, "wll": None}

# product(grid, grid) visits (a, b) in the same order as two nested loops over grid.
for a, b in product(grid, grid):
    g = 1.0 - a - b
    # The grid values are binary floats, so 1 - a - b can land a hair below the floor
    # for pairs that sit exactly on the boundary. Comparing with a tolerance keeps
    # those boundary pairs in the search. This single check also rejects a + b > 1.
    if g < MIN_DCN_WEIGHT - WEIGHT_TOL:
        continue
    blend = a*p_xgb_val + b*p_lgb_val + g*p_dcn_val
    blend = np.clip(blend, 1e-6, 1-1e-6)   # keep log_loss finite
    ap  = average_precision_score(y_val, blend)
    # NOTE(review): stored and printed as "WLL", but this is sklearn's unweighted
    # log_loss (no sample_weight) — confirm this matches the intended metric.
    wll = log_loss(y_val, blend)
    score = ap*0.7 - wll*0.3   # selection score: rewards AP, penalizes log loss
    if score > best["score"]:
        best.update({"score": score, "a": a, "b": b, "ap": ap, "wll": wll})

# Without a feasible pair the format specs below would fail on None.
assert best["a"] is not None, "no feasible (a, b) pair found on the grid"

print(f"[Blend-Search 41] best α(XGB)={best['a']:.3f} | β(LGBM)={best['b']:.3f} | "
      f"γ(DCN)={1-best['a']-best['b']:.3f} | val_AP={best['ap']:.6f} | val_WLL={best['wll']:.6f}")


[Blend-Search 41] best α(XGB)=0.075 | β(LGBM)=0.875 | γ(DCN)=0.050 | val_AP=0.068418 | val_WLL=0.588290


In [5]:
# 4) Load the test-set submission files of the three models (XGB, LGBM, DCN).
sub_xgb, sub_lgb, sub_dcn = (
    pd.read_csv(csv_path)
    for csv_path in (
        "toss_xgb_kfold_v7_submit.csv",
        "toss_lgbm_kfold_v9_submit.csv",
        "toss_dcn_v8_submit.csv",
    )
)

def pick_id_col(df: pd.DataFrame) -> str:
    """Return the name of the ID column of a submission frame.

    The candidates "ID", "id" and "row_id" are tried in that order and the
    first one present in ``df.columns`` is returned.

    Raises:
        KeyError: if none of the candidate names is a column of ``df``.
    """
    found = next((name for name in ("ID", "id", "row_id") if name in df.columns), None)
    if found is None:
        raise KeyError("제출 파일에 ID/id/row_id 컬럼이 없습니다.")
    return found

# Bring the three submissions into the same row order (sorted by their ID column)
# so the prediction columns can be combined position by position.
kx, kl, kd = map(pick_id_col, (sub_xgb, sub_lgb, sub_dcn))
sub_xgb = sub_xgb.sort_values(kx).reset_index(drop=True)
sub_lgb = sub_lgb.sort_values(kl).reset_index(drop=True)
sub_dcn = sub_dcn.sort_values(kd).reset_index(drop=True)

# Duplicate IDs would make the sorted order ambiguous and the row alignment unreliable.
assert sub_xgb[kx].is_unique, "duplicate IDs in the XGB submission"

# np.array_equal returns False when the lengths differ, so a row-count mismatch
# fails the assertion cleanly instead of erroring inside an element-wise `==`.
assert np.array_equal(sub_xgb[kx].values, sub_lgb[kl].values), \
    "XGB and LGBM submissions do not contain the same IDs"
assert np.array_equal(sub_xgb[kx].values, sub_dcn[kd].values), \
    "XGB and DCN submissions do not contain the same IDs"


In [6]:
# 5) Blend the test predictions with the weights selected on the validation set.
a = best["a"]          # XGB weight
b = best["b"]          # LGBM weight
g = 1 - a - b          # DCN weight (remainder)

# Weighted contribution of each model; rows are combined by position.
xgb_part = a*sub_xgb["clicked"].values
lgb_part = b*sub_lgb["clicked"].values
dcn_part = g*sub_dcn["clicked"].values

# Keep the blended values strictly inside (0, 1).
clicked = np.clip(xgb_part + lgb_part + dcn_part, 1e-4, 1-1e-4)


In [8]:
# Assemble the final submission (IDs taken from the sorted DCN frame, blended
# predictions in the "clicked" column) and write it to CSV without the index.
submission_ids = sub_dcn[kd].values
out = pd.DataFrame({"ID": submission_ids, "clicked": clicked})
out.to_csv("toss_ensemble_v10_submit.csv", index=False)
print("[Saved] toss_ensemble_v10_submit.csv")

[Saved] toss_ensemble_v10_submit.csv
