In [1]:
# ===============================================
# Toss CTR — XGBoost v3 (Feature-Selection → CV → Submit)
# Author: 람쥐
# Notes:
# - Input files: train_input_2.parquet, test_input_2.parquet (already EDA & Preprocessing done)
# - Train-only fit for feature selection; then apply same selected columns to test
# - CV metric matches competition: 0.5*AP + 0.5*(1/(1+WeightedLogLoss))
# - Final submission: toss_xgb_v3_submit.csv with columns [ID, clicked]
# ===============================================

import os
import gc
import json
import time
import math
import numpy as np
import pandas as pd
from datetime import datetime


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
from sklearn.feature_selection import SelectFromModel

import xgboost as xgb
from xgboost import XGBClassifier

# -----------------------------------------------


In [2]:
# Config
# -----------------------------------------------
TRAIN_PATH = './train_input_2.parquet'
TEST_PATH  = './test_input_2.parquet'
SUBMIT_PATH = './toss_xgb_v3_submit.csv'
FEATURES_PATH = './xgb_v3_selected_features.txt'
MODEL_PATH = './xgb_v3_model.json'
LOG_PATH = './xgb_v3_log.txt'

N_FOLDS = 5
RANDOM_STATE = 42
EARLY_STOPPING_ROUNDS = 30
MAX_BOOST_ROUNDS = 500


In [3]:
# -----------------------------------------------
# Utils
# -----------------------------------------------
def weighted_logloss(y_true, y_pred, eps=1e-15):
    y_pred = np.clip(y_pred, eps, 1 - eps)
    mask0 = (y_true == 0)
    mask1 = (y_true == 1)
    ll0 = -np.mean(np.log(1 - y_pred[mask0])) if mask0.sum() else 0.0
    ll1 = -np.mean(np.log(y_pred[mask1])) if mask1.sum() else 0.0
    return 0.5 * ll0 + 0.5 * ll1

def competition_score(y_true, y_pred):
    ap = average_precision_score(y_true, y_pred)
    wll = weighted_logloss(y_true, y_pred)
    score = 0.5 * ap + 0.5 * (1.0 / (1.0 + wll))
    return score, ap, wll


def detect_tree_method():
    try:
        # Try a tiny GPU booster to check availability
        _ = xgb.Booster(params={'tree_method': 'gpu_hist'})
        return 'gpu_hist'
    except Exception:
        return 'hist'



In [4]:
# -----------------------------------------------
# Load
# -----------------------------------------------
print("[info] Loading parquet files...")
train = pd.read_parquet(TRAIN_PATH)
_test = pd.read_parquet(TEST_PATH)
print(f"[info] Train shape: {train.shape}, Test shape: {_test.shape}")

assert 'clicked' in train.columns, "Target column 'clicked' not found in train_input_2.parquet"

# Keep original test ID for submission
if 'ID' in _test.columns:
    test_id = _test['ID'].astype(str).copy()
elif 'id' in _test.columns:
    test_id = _test['id'].astype(str).copy()
else:
    # Fallback synthetic ID
    test_id = pd.Series([f"TEST_{i:07d}" for i in range(len(_test))], name='ID')


[info] Loading parquet files...
[info] Train shape: (10704168, 28), Test shape: (1527298, 27)


In [5]:

# -----------------------------------------------
# Split X, y
# -----------------------------------------------
TARGET_COL = 'clicked'
y = train[TARGET_COL].astype(np.int8).values
X = train.drop(columns=[TARGET_COL])
X_test = _test.copy()

# Downcast to float32 where possible
for df in (X, X_test):
    for c in df.columns:
        if df[c].dtype == 'float64':
            df[c] = df[c].astype('float32')
        elif str(df[c].dtype).startswith('int'):
            df[c] = df[c].astype('int32')

# Align train/test columns now (pre-selection)
missing_in_test = [c for c in X.columns if c not in X_test.columns]
extra_in_test = [c for c in X_test.columns if c not in X.columns]

if missing_in_test:
    for c in missing_in_test:
        X_test[c] = 0
if extra_in_test:
    X_test = X_test.drop(columns=extra_in_test)

X_test = X_test[X.columns]
print(f"[check] Aligned columns. Train features: {X.shape[1]}, Test features: {X_test.shape[1]}")


[check] Aligned columns. Train features: 27, Test features: 27


In [6]:
# Scale pos weight (class imbalance)
# -----------------------------------------------
pos_ratio = float(np.mean(y))
scale_pos_weight = (1.0 - pos_ratio) / pos_ratio if pos_ratio > 0 else 1.0
print(f"[info] Positive ratio: {pos_ratio:.6f}, scale_pos_weight: {scale_pos_weight:.3f}")

[info] Positive ratio: 0.019075, scale_pos_weight: 51.425


In [7]:
# Feature Selection (train only)
# -----------------------------------------------
print("[step] Feature selection fit on train only")

# detect_tree_method() 함수가 'gpu_hist'를 반환해도
# XGBoost 2.x에서는 tree_method='hist', device='cuda'로 써야 함
tree_method = "hist"
device_type = "cuda"  # GPU 사용 환경
# device_type = "cpu"  # GPU 없을 경우엔 이렇게 바꿔주면 됨

print(f"[info] device: {device_type}, tree_method: {tree_method}")

selector_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method=tree_method,
    device=device_type,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

selector_model.fit(X, y)

selector = SelectFromModel(selector_model, prefit=True, threshold='median')
mask = selector.get_support()
selected_features = X.columns[mask].tolist()

with open(FEATURES_PATH, 'w', encoding='utf-8') as f:
    for ftr in selected_features:
        f.write(ftr + "\n")
print(f"[info] Selected {len(selected_features)} / {X.shape[1]} features → saved to {FEATURES_PATH}")

# Reduce to selected features
X_sel = X[selected_features].copy()
X_test_sel = X_test[selected_features].copy()


[step] Feature selection fit on train only
[info] device: cuda, tree_method: hist




[info] Selected 14 / 27 features → saved to ./xgb_v3_selected_features.txt


In [8]:
# -----------------------------------------------
# Cross-Validation
# -----------------------------------------------
print("[step] Stratified KFold CV")
params = {
    'objective': 'binary:logistic',
    'tree_method': tree_method,
    'max_depth': 10,
    'learning_rate': 0.025,
    'subsample': 0.75,
    'colsample_bytree': 0.65,
    'reg_lambda': 2.0,
    'min_child_weight': 8,
    'scale_pos_weight': scale_pos_weight,
    'gpu_id': 0,
    'verbosity': 0,
    'seed': RANDOM_STATE,
}

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
cv_scores, cv_ap, cv_wll, best_iters = [], [], [], []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_sel, y), 1):
    print(f"[fold {fold}] train={len(tr_idx):,}, valid={len(va_idx):,}")
    X_tr, X_va = X_sel.iloc[tr_idx], X_sel.iloc[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dvalid = xgb.DMatrix(X_va, label=y_va)

    booster = xgb.train(
        params,
        dtrain,
        num_boost_round=MAX_BOOST_ROUNDS,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=False,
    )

    y_hat = booster.predict(dvalid)
    score, ap, wll = competition_score(y_va, y_hat)
    cv_scores.append(score)
    cv_ap.append(ap)
    cv_wll.append(wll)
    best_iters.append(booster.best_iteration)

    print(f"[fold {fold}] score={score:.6f} | AP={ap:.6f} | WLL={wll:.6f} | best_iter={booster.best_iteration}")

    del dtrain, dvalid, booster, X_tr, X_va, y_tr, y_va, y_hat
    gc.collect()

print("[cv] score=%.6f ± %.6f | AP=%.6f | WLL=%.6f" % (
    np.mean(cv_scores), np.std(cv_scores), np.mean(cv_ap), np.mean(cv_wll)))



[step] Stratified KFold CV
[fold 1] train=8,563,334, valid=2,140,834




[fold 1] score=0.344284 | AP=0.069632 | WLL=0.615677 | best_iter=499
[fold 2] train=8,563,334, valid=2,140,834
[fold 2] score=0.344149 | AP=0.068936 | WLL=0.614563 | best_iter=499
[fold 3] train=8,563,334, valid=2,140,834
[fold 3] score=0.343590 | AP=0.068132 | WLL=0.615386 | best_iter=499
[fold 4] train=8,563,335, valid=2,140,833
[fold 4] score=0.344392 | AP=0.069420 | WLL=0.614559 | best_iter=499
[fold 5] train=8,563,335, valid=2,140,833
[fold 5] score=0.344777 | AP=0.070538 | WLL=0.615468 | best_iter=499
[cv] score=0.344238 ± 0.000386 | AP=0.069332 | WLL=0.615131


In [9]:
# -----------------------------------------------
# Train final model on full data
# -----------------------------------------------
print("[step] Train final model on full train with selected features")
final_rounds = int(np.mean(best_iters)) if best_iters else MAX_BOOST_ROUNDS
final_rounds = max(final_rounds, 50)
print(f"[info] Using num_boost_round={final_rounds}")

D_full = xgb.DMatrix(X_sel, label=y)
final_booster = xgb.train(
    params,
    D_full,
    num_boost_round=final_rounds,
    verbose_eval=False,
)

final_booster.save_model(MODEL_PATH)
print(f"[info] Saved model to {MODEL_PATH}")


[step] Train final model on full train with selected features
[info] Using num_boost_round=499
[info] Saved model to ./xgb_v3_model.json


In [10]:
# Inference on test
# -----------------------------------------------
print("[step] Inference on test_input_2")
D_test = xgb.DMatrix(X_test_sel)
proba = final_booster.predict(D_test)


# Clipping for safety
proba = np.clip(proba, 0.0, 1.0)


# Build submission
submit = pd.DataFrame({
'ID': test_id.values,
'clicked': proba
})


# Ensure correct dtypes and ordering
submit['ID'] = submit['ID'].astype(str)
submit = submit[['ID', 'clicked']]


submit.to_csv(SUBMIT_PATH, index=False)
print(f"[save] Submission saved → {SUBMIT_PATH}")


[step] Inference on test_input_2
[save] Submission saved → ./toss_xgb_v3_submit.csv


In [12]:
# Sanity checks and logging
# -----------------------------------------------
print("[check] Submission preview:")
print(submit.head())
print("[check] Range of clicked: %.4f ~ %.4f" % (submit['clicked'].min(), submit['clicked'].max()))


with open(LOG_PATH, 'a', encoding='utf-8') as f:
 f.write(
 f"{datetime.now()} | score={np.mean(cv_scores):.6f}±{np.std(cv_scores):.6f} | "
 f"AP={np.mean(cv_ap):.6f} | WLL={np.mean(cv_wll):.6f} | "
 f"features={len(selected_features)} | rounds={final_rounds} | tree_method={tree_method}\n"
)


print("[done] XGBoost v3 pipeline complete")

[check] Submission preview:
             ID   clicked
0  TEST_0000000  0.344309
1  TEST_0000001  0.383575
2  TEST_0000002  0.453257
3  TEST_0000003  0.380793
4  TEST_0000004  0.279646
[check] Range of clicked: 0.0091 ~ 0.9934
[done] XGBoost v3 pipeline complete
