In [1]:
# ===============================================
# Toss CTR - XGBoost 3-Fold (CTR-safe)
# ===============================================
import pandas as pd, numpy as np, xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, log_loss

print("📦 Loading data...")
train = pd.read_parquet("train_input_2.parquet")
test  = pd.read_parquet("test_input_2.parquet")

id_cols = [c for c in ["row_id","id"] if c in train.columns or c in test.columns]
target_col = "clicked"

X = train.drop(columns=[c for c in id_cols if c in train.columns] + [target_col])
y = train[target_col]
X_test = test.drop(columns=[c for c in id_cols if c in test.columns])

print(f"Train: {X.shape}, Test: {X_test.shape}, pos_ratio={y.mean():.4f}")



📦 Loading data...
Train: (10704168, 26), Test: (1527298, 26), pos_ratio=0.0191


In [2]:
params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "aucpr"],
    "learning_rate": 0.1,
    "max_depth": 8,
    "min_child_weight": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 0.0,
    "alpha": 0.0,
    "tree_method": "hist",
    "max_bin": 256,
    "nthread": -1,
    "seed": 42,
}



In [3]:
NFOLDS = 3
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
used = 0

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n🟦 Fold {fold}/{NFOLDS}")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test)

    pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()
    params["scale_pos_weight"] = float(pos_weight)

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=2000,
        evals=[(dtrain, "train"), (dval, "valid")],
        early_stopping_rounds=300,
        verbose_eval=200
    )

    val_pred = model.predict(dval, iteration_range=(0, model.best_iteration))
    test_pred = model.predict(dtest, iteration_range=(0, model.best_iteration))

    oof_preds[val_idx] = val_pred
    test_preds += test_pred
    used += 1

    ap = average_precision_score(y_val, val_pred)
    wll = log_loss(y_val, val_pred)
    score = 0.5 * ap - 0.5 * wll
    print(f"[Fold {fold}] AP={ap:.6f} | WLL={wll:.6f} | Score={score:.6f}")




🟦 Fold 1/3
[0]	train-logloss:0.68045	train-aucpr:0.05673	valid-logloss:0.68050	valid-aucpr:0.05532
[200]	train-logloss:0.57802	train-aucpr:0.08497	valid-logloss:0.58117	valid-aucpr:0.06825
[356]	train-logloss:0.55856	train-aucpr:0.09465	valid-logloss:0.56353	valid-aucpr:0.06766
[Fold 1] AP=0.068565 | WLL=0.598794 | Score=-0.265115

🟦 Fold 2/3
[0]	train-logloss:0.68062	train-aucpr:0.05672	valid-logloss:0.68065	valid-aucpr:0.05445
[200]	train-logloss:0.57843	train-aucpr:0.08556	valid-logloss:0.58126	valid-aucpr:0.06781
[395]	train-logloss:0.55430	train-aucpr:0.09820	valid-logloss:0.55930	valid-aucpr:0.06692
[Fold 2] AP=0.068119 | WLL=0.593799 | Score=-0.262840

🟦 Fold 3/3
[0]	train-logloss:0.68062	train-aucpr:0.05613	valid-logloss:0.68062	valid-aucpr:0.05464
[200]	train-logloss:0.57730	train-aucpr:0.08518	valid-logloss:0.57965	valid-aucpr:0.06876
[378]	train-logloss:0.55496	train-aucpr:0.09633	valid-logloss:0.55937	valid-aucpr:0.06785
[Fold 3] AP=0.069046 | WLL=0.595058 | Score=-0.26300

In [4]:
test_preds /= used
test_preds = np.clip(test_preds, 1e-4, 1-1e-4)

ap_total = average_precision_score(y, oof_preds)
wll_total = log_loss(y, oof_preds)
score_total = 0.5 * ap_total - 0.5 * wll_total
print(f"\n✅ K-Fold done | AP={ap_total:.6f} | WLL={wll_total:.6f} | Score={score_total:.6f}")




✅ K-Fold done | AP=0.068509 | WLL=0.595884 | Score=-0.263688


In [5]:
train["oof_xgb"] = oof_preds
test["clicked"] = test_preds
train.to_parquet("train_input_2_oof_xgb.parquet", index=False)

id_col = "id" if "id" in test.columns else "row_id"
test[[id_col, "clicked"]].to_csv("toss_xgb_kfold_v7_submit.csv", index=False)
print("[Saved] toss_xgb_kfold_v7_submit.csv")


[Saved] toss_xgb_kfold_v7_submit.csv
