In [1]:
# ========== LGBM 3-Fold (fast & stable) ==========
import pandas as pd, numpy as np, lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, log_loss

In [2]:
# Read the pre-built train / test feature tables from the working directory.
train, test = (
    pd.read_parquet(path)
    for path in ("train_input_2.parquet", "test_input_2.parquet")
)

In [3]:
# ============================================
# 🔧 ID & target handling (final safe version)
# ============================================

# 1. Candidate ID columns: any of these names present in either frame.
id_cols = [c for c in ["row_id", "id"] if c in train.columns or c in test.columns]

# 2. Target column, cast to int.
target_col = "clicked"
y = train[target_col].astype(int)

# 3. Columns that actually exist in train and must not be used as features.
drop_cols = [c for c in id_cols + [target_col] if c in train.columns]
print("[check] dropping columns from train:", drop_cols)

X = train.drop(columns=drop_cols)

# 4. Only the ID columns that actually exist in test are dropped there.
drop_test = [c for c in id_cols if c in test.columns]
print("[check] dropping columns from test:", drop_test)

X_test = test.drop(columns=drop_test)

# 5. Make sure test exposes exactly the training features, in the same order.
#    LightGBM's Booster.predict does not validate feature names by default,
#    so a reordered test frame would otherwise be scored against the wrong
#    columns without any error. Fail fast on a real mismatch.
missing_in_test = X.columns.difference(X_test.columns)
extra_in_test = X_test.columns.difference(X.columns)
if len(missing_in_test) or len(extra_in_test):
    raise ValueError(
        "train/test feature mismatch: "
        f"missing in test={missing_in_test.tolist()}, "
        f"extra in test={extra_in_test.tolist()}"
    )
if not X_test.columns.equals(X.columns):
    # Same column set, different order: reorder to the training layout.
    X_test = X_test[X.columns]

print(X.dtypes.value_counts())
print(X_test.shape, y.shape)



[check] dropping columns from train: ['row_id', 'clicked']
[check] dropping columns from test: ['id']
float64    26
Name: count, dtype: int64
(1527298, 26) (10704168,)


In [4]:
# Sanity check: list every feature column, in the training OR the test matrix,
# whose dtype is neither numeric nor boolean. The dtype predicates accept all
# numeric widths (int8, uint8, float16, ...), whereas the "int"/"float"
# aliases of select_dtypes do not cover every width and would flag
# down-cast columns as non-numeric.
bad_cols = pd.Index(
    [
        col
        for frame in (X, X_test)
        for col, dtype in frame.dtypes.items()
        if not (
            pd.api.types.is_numeric_dtype(dtype)
            or pd.api.types.is_bool_dtype(dtype)
        )
    ]
).unique()
if len(bad_cols):
    print("[WARN] Non-numeric columns detected:", bad_cols.tolist())

In [5]:
# Class-imbalance weight: negatives per positive over the full training label,
# restricted to the 25~35 band.
n_pos = y.sum()
raw_ratio = (len(y) - n_pos) / n_pos
if raw_ratio > 35:
    pos_weight = 35
elif raw_ratio > 25:
    pos_weight = raw_ratio
else:
    pos_weight = 25

# LightGBM settings shared by every fold.
params = dict(
    objective="binary",
    metric="average_precision",  # average precision is the only eval metric
    boosting_type="gbdt",
    learning_rate=0.07,
    num_leaves=128,
    min_data_in_leaf=80,
    feature_fraction=0.85,
    bagging_fraction=0.85,
    bagging_freq=5,
    scale_pos_weight=pos_weight,
    n_jobs=4,
    seed=42,
    verbose=-1,
)

In [6]:
# Stratified K-fold training: collects out-of-fold predictions for the training
# rows and the fold-averaged prediction for the test rows.
NFOLDS = 3
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=7)
oof = np.zeros(len(X))              # one out-of-fold prediction per training row
test_pred = np.zeros(len(X_test))   # running sum of per-fold test predictions
used = 0                            # number of folds accumulated into test_pred

for f, (tr, va) in enumerate(skf.split(X, y), 1):
    print(f"\n🟦 LGBM Fold {f}/{NFOLDS}")
    dtr = lgb.Dataset(X.iloc[tr], y.iloc[tr])
    dva = lgb.Dataset(X.iloc[va], y.iloc[va], reference=dtr)

    # Per-fold negatives-per-positive ratio (max(1, ...) guards a fold without
    # positives). This value is what the model actually trains with: it
    # replaces the clamped `scale_pos_weight` stored in `params`.
    # It is applied on a per-fold copy so the shared `params` dict is not
    # mutated as a side effect of running this cell.
    spw = (len(tr) - y.iloc[tr].sum()) / max(1, y.iloc[tr].sum())
    fold_params = {**params, "scale_pos_weight": float(spw)}

    model = lgb.train(
        fold_params,
        dtr,
        num_boost_round=2000,
        valid_sets=[dtr, dva],
        valid_names=["train", "valid"],
        callbacks=[
            # Early stopping considers the first metric only (AP).
            lgb.early_stopping(600, first_metric_only=True),
            lgb.log_evaluation(200),
        ],
    )

    # Predict with the best iteration found by early stopping.
    vp = model.predict(X.iloc[va], num_iteration=model.best_iteration)
    tp = model.predict(X_test, num_iteration=model.best_iteration)
    oof[va] = vp
    test_pred += tp
    used += 1

    ap = average_precision_score(y.iloc[va], vp)
    # NOTE(review): the value printed as "WLL" is sklearn's plain (unweighted)
    # log_loss on clipped probabilities — confirm this is the intended metric.
    wll = log_loss(y.iloc[va], np.clip(vp, 1e-6, 1 - 1e-6))
    print(f"[Fold {f}] AP={ap:.6f} | WLL={wll:.6f} | Score={ap-wll:.6f}")

# Average over the folds, then keep probabilities strictly inside (0, 1).
test_pred /= used
test_pred = np.clip(test_pred, 1e-4, 1 - 1e-4)


🟦 LGBM Fold 1/3
Training until validation scores don't improve for 600 rounds
[200]	train's average_precision: 0.0759129	valid's average_precision: 0.0666753
[400]	train's average_precision: 0.0814807	valid's average_precision: 0.0664485
[600]	train's average_precision: 0.0876602	valid's average_precision: 0.0664003
[800]	train's average_precision: 0.0940579	valid's average_precision: 0.0662488
Early stopping, best iteration is:
[204]	train's average_precision: 0.0760739	valid's average_precision: 0.0666992
Evaluated only: average_precision
[Fold 1] AP=0.066699 | WLL=0.590783 | Score=-0.524084

🟦 LGBM Fold 2/3
Training until validation scores don't improve for 600 rounds
[200]	train's average_precision: 0.0744916	valid's average_precision: 0.068235
[400]	train's average_precision: 0.0801818	valid's average_precision: 0.0683029
[600]	train's average_precision: 0.0860635	valid's average_precision: 0.0681972
[800]	train's average_precision: 0.0933256	valid's average_precision: 0.0679812


In [7]:
# Save artifacts.

# Out-of-fold predictions: a single `oof_lgbm` column in training-row order.
# Built directly on train's index instead of deep-copying the whole training
# frame just to write one column; the written file is the same.
train_out = pd.DataFrame({"oof_lgbm": oof}, index=train.index)
train_out.to_parquet("train_input_2_oof_lgbm.parquet", index=False)

# Submission: test identifier (prefer "id", fall back to "row_id") paired with
# the fold-averaged, clipped probability.
key = "id" if "id" in test.columns else "row_id"
sub_lgb = pd.DataFrame({"ID": test[key], "clicked": test_pred})
sub_lgb.to_csv("toss_lgbm_kfold_v9_submit.csv", index=False)
print("[Saved] toss_lgbm_kfold_v9_submit.csv & train_input_2_oof_lgbm.parquet")

# Raw test predictions as a NumPy array as well.
np.save("lgbm_test_pred.npy", test_pred)

[Saved] toss_lgbm_kfold_v9_submit.csv & train_input_2_oof_lgbm.parquet
