## Import Lib

In [1]:
# Import
import pandas as pd
import numpy as np
import os
import random
import pyarrow.dataset as ds
import xgboost as xgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import optuna
from tqdm import tqdm
import gc
import warnings
warnings.filterwarnings('ignore')

## Config

In [2]:
# Run-wide settings read by the cells below.
CFG = dict(
    BATCH_SIZE=800_000,     # batch_size passed to the parquet dataset's to_batches()
    NUM_BOOST_ROUND=2000,   # num_boost_round for the cross-validated models
    LEARNING_RATE=0.05,     # learning rate of the default parameter set
    SEED=42,                # seed for random / numpy / sampling / CV / XGBoost
    TEST_SIZE=0.2,          # not read by the visible code (the tuning split hard-codes 0.2)
    EARLY_STOPPING=100,     # early_stopping_rounds for the cross-validated models
    N_FOLDS=5,              # StratifiedKFold splits per batch
    OPTUNA_TRIALS=20,       # number of Optuna trials
)

def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    hash_seed = str(seed)
    random.seed(seed)
    os.environ.update(PYTHONHASHSEED=hash_seed)
    np.random.seed(seed)

# Seed the RNGs once, when this cell runs, with the configured seed.
seed_everything(CFG['SEED'])

## Feature Engineering

In [3]:
def create_additional_features(df):
    """Add a handful of derived columns to ``df`` in place and return it.

    Columns are only created when their inputs exist:

    * ``age_is_young``   - 1 where ``age_group`` is 10 or 20, else 0.
    * ``inventory_freq`` - share of rows (within ``df``) carrying the same
      ``inventory_id``.
    * ``history_mean``   - row-wise mean over all ``history_a_*`` columns.
    * ``history_trend``  - difference between the two ``history_a_*`` columns
      with the largest numeric suffix; 0 when fewer than two such columns exist.

    Args:
        df: DataFrame to extend (mutated).

    Returns:
        The same DataFrame object.
    """
    # 1. age_is_young: flag the 10s and 20s age groups.
    if "age_group" in df.columns:
        is_young = df["age_group"].isin([10, 20])
        df["age_is_young"] = is_young.astype(int)

    # 2. inventory_freq: relative frequency of each inventory id.
    if "inventory_id" in df.columns:
        id_share = df["inventory_id"].value_counts(normalize=True)
        df["inventory_freq"] = df["inventory_id"].map(id_share)

    history_cols = [name for name in df.columns if name.startswith("history_a_")]
    if not history_cols:
        # No history columns: neither history feature is created.
        return df

    # 3. history_mean: average of the history columns.
    df["history_mean"] = df[history_cols].mean(axis=1)

    # 4. history_trend: change between the two most recent history columns,
    #    where "recent" means the largest numeric suffix (history_a_1, history_a_2, ...).
    def _suffix_rank(name):
        tail = name.split("_")[-1]
        if tail.isdigit():
            return int(tail)
        return 0

    ordered = sorted(history_cols, key=_suffix_rank)
    if len(ordered) < 2:
        df["history_trend"] = 0  # not enough columns: default value
    else:
        newest, previous = ordered[-1], ordered[-2]
        df["history_trend"] = df[newest] - df[previous]
    return df

In [4]:
def create_interaction_features(df, cat_cols):
    """Add label-encoded pairwise interaction columns for ``cat_cols``.

    For every unordered pair of columns from ``cat_cols`` that both exist in
    ``df``, a ``"<a>_<b>_interaction"`` column is created from the string
    concatenation of the two values and then replaced by integer codes from a
    ``LabelEncoder``.

    The encoder is re-fit on each column of the frame passed in, so the codes
    are only comparable within that frame.

    Args:
        df: DataFrame to extend (mutated).
        cat_cols: Sequence of categorical column names to pair up.

    Returns:
        Tuple ``(df, interaction_features)`` where the second item lists the
        names of the created columns in creation order.
    """
    created = []
    for pos in range(len(cat_cols)):
        first = cat_cols[pos]
        for second in cat_cols[pos + 1:]:
            if first not in df.columns or second not in df.columns:
                continue
            pair_name = f"{first}_{second}_interaction"
            first_text = df[first].astype(str)
            second_text = df[second].astype(str)
            df[pair_name] = first_text + "_" + second_text
            created.append(pair_name)

    encoder = LabelEncoder()
    for pair_name in created:
        df[pair_name] = encoder.fit_transform(df[pair_name].astype(str))
    return df, created

In [5]:
def create_time_features(df):
    """Add hour-of-day and day-of-week features to ``df`` in place.

    ``hour`` and ``day_of_week`` (when present) are first coerced to integers;
    values that cannot be parsed or are missing become 0, so they are treated
    as hour 0 / day 0 by the flags below.

    From ``hour``: ``is_morning`` (6-11), ``is_afternoon`` (12-17),
    ``is_evening`` (18-23), ``is_night`` (0-5), ``is_peak_hour`` and the cyclic
    pair ``hour_sin`` / ``hour_cos`` (period 24).

    From ``day_of_week``: ``is_weekend`` (values 5 and 6), ``is_monday``
    (value 0), ``is_friday`` (value 4) and ``dow_sin`` / ``dow_cos`` (period 7).

    Feature creation is best-effort: if a group fails, the error is reported
    and the remaining processing continues. Only ordinary exceptions are
    caught, so interrupts still propagate.

    Args:
        df: DataFrame to extend (mutated).

    Returns:
        The same DataFrame object.
    """
    if 'hour' in df.columns:
        try:
            hour = pd.to_numeric(df['hour'], errors='coerce').fillna(0).astype(int)
            df['hour'] = hour
            df['is_morning'] = ((hour >= 6) & (hour < 12)).astype(int)
            df['is_afternoon'] = ((hour >= 12) & (hour < 18)).astype(int)
            df['is_evening'] = ((hour >= 18) & (hour < 24)).astype(int)
            df['is_night'] = ((hour >= 0) & (hour < 6)).astype(int)
            df['is_peak_hour'] = hour.isin([9, 10, 11, 14, 15, 16, 19, 20, 21]).astype(int)
            # Cyclic encoding so that hour 23 and hour 0 end up close together.
            df['hour_sin'] = np.sin(2 * np.pi * hour / 24)
            df['hour_cos'] = np.cos(2 * np.pi * hour / 24)
        except Exception as exc:
            # Keep the pipeline running, but do not hide the failure.
            print(f"[create_time_features] hour features skipped: {exc!r}")
    if 'day_of_week' in df.columns:
        try:
            dow = pd.to_numeric(df['day_of_week'], errors='coerce').fillna(0).astype(int)
            df['day_of_week'] = dow
            df['is_weekend'] = dow.isin([5, 6]).astype(int)
            df['is_monday'] = (dow == 0).astype(int)
            df['is_friday'] = (dow == 4).astype(int)
            df['dow_sin'] = np.sin(2 * np.pi * dow / 7)
            df['dow_cos'] = np.cos(2 * np.pi * dow / 7)
        except Exception as exc:
            print(f"[create_time_features] day_of_week features skipped: {exc!r}")
    return df

In [6]:
def preprocess_features(df, feature_cols, seq_col, is_train=True, target_encoders=None):
    """Run the feature pipeline on ``df`` and return it with its feature list.

    Steps, in order: pairwise interaction features over the categorical
    columns, time features, additional features, removal of the sequence
    column, then target-mean encoding of the categorical columns.

    Target-mean encoding:
      * training (``is_train`` and a ``clicked`` column present): each
        category is replaced by the mean of ``clicked`` computed on ``df``
        itself (i.e. on the same rows it is applied to); categories without a
        mean fall back to the overall mean of ``clicked``.
      * otherwise, when ``target_encoders`` is given: each column is mapped
        through its ``{category: mean}`` dict; unknown categories get 0.5.

    Args:
        df: Input DataFrame (mutated by the feature helpers).
        feature_cols: Accepted for call compatibility; not read. The returned
            feature list is rebuilt from ``df.columns``.
        seq_col: Name of the sequence column to drop. Nothing happens when the
            column is not present.
        is_train: Whether to compute target means from ``df['clicked']``.
        target_encoders: Optional ``{column: {category: mean}}`` mapping used
            when not training.

    Returns:
        Tuple ``(df, new_feature_cols)`` where ``new_feature_cols`` is every
        column of ``df`` except ``'clicked'`` and ``'ID'``.
    """
    cat_cols = ["gender","age_group","inventory_id","day_of_week","hour"]
    df, _ = create_interaction_features(df, cat_cols)
    df = create_time_features(df)
    df = create_additional_features(df)

    # Drop the column the caller named (previously 'seq' was hard-coded and a
    # missing column raised KeyError).
    if seq_col in df.columns:
        del df[seq_col]

    if is_train and 'clicked' in df.columns:
        for col in cat_cols:
            if col not in df.columns:
                continue
            try:
                target_mean = df.groupby(col)['clicked'].mean()
                df[f'{col}_target_mean'] = df[col].map(target_mean).fillna(df['clicked'].mean())
            except Exception as exc:
                # Best-effort: skip this column's encoding but report why.
                print(f"[preprocess_features] target encoding skipped for {col!r}: {exc!r}")
    elif target_encoders is not None:
        for col, encoder_dict in target_encoders.items():
            if col in df.columns and encoder_dict:
                df[f'{col}_target_mean'] = df[col].map(encoder_dict).fillna(0.5)

    new_feature_cols = [col for col in df.columns if col not in ['clicked','ID']]
    return df, new_feature_cols

def encode_categorical_features(df, cat_cols):
    """Frequency- and code-encode the given categorical columns in place.

    For each column of ``cat_cols`` present in ``df``:
      * ``<col>_freq`` receives the number of rows in ``df`` sharing the value
        (0 for values without a count, e.g. missing values);
      * the column itself is replaced by its pandas category codes
        (missing values become -1).

    Both encodings are derived from the frame passed in, so they are only
    comparable within that frame.

    If encoding a column fails, the failure is reported and both the column
    and its ``_freq`` companion are set to 0 so downstream code still finds
    them. Only ordinary exceptions are caught, so interrupts propagate.

    Args:
        df: DataFrame to encode (mutated).
        cat_cols: Iterable of column names to encode.

    Returns:
        The same DataFrame object.
    """
    for col in cat_cols:
        if col not in df.columns:
            continue
        try:
            value_counts = df[col].value_counts().to_dict()
            df[f'{col}_freq'] = df[col].map(value_counts).fillna(0)
            df[col] = df[col].astype('category').cat.codes
        except Exception as exc:
            print(f"[encode_categorical_features] {col!r} could not be encoded, using 0: {exc!r}")
            df[col] = 0
            df[f'{col}_freq'] = 0
    return df

## Optuna

In [7]:
def optimize_xgb_params(X_train, y_train, X_val, y_val):
    """Search XGBoost hyperparameters with Optuna, maximising validation AUC.

    Each trial trains a ``binary:logistic`` booster (``tree_method="hist"``,
    at most 1000 rounds, early stopping after 50 rounds without improvement on
    the validation set) and is scored by ROC AUC on ``(X_val, y_val)``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        ``study.best_params``: the values suggested in the best trial
        (``learning_rate``, ``max_depth``, ``min_child_weight``, ``subsample``,
        ``colsample_bytree``, ``gamma``, ``lambda``, ``alpha``). The fixed
        settings used during the search (objective, eval_metric, booster,
        tree_method, seed) are NOT part of this dict; callers that pass it to
        ``xgb.train`` have to add them.
    """
    # The matrices do not depend on the trial, so build them once instead of
    # once per trial.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    def objective(trial):
        params = {
            # Fixed settings.
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "booster": "gbtree",
            "tree_method": "hist",
            "seed": CFG['SEED'],
            # Search space.
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "subsample": trial.suggest_float("subsample", 0.4, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "lambda": trial.suggest_float("lambda", 0.0, 5.0),
            "alpha": trial.suggest_float("alpha", 0.0, 5.0),
        }
        bst = xgb.train(
            params, dtrain, num_boost_round=1000,
            evals=[(dval, "val")],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        preds = bst.predict(dval)
        return roc_auc_score(y_val, preds)

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=CFG['SEED'])
    )
    study.optimize(objective, n_trials=CFG['OPTUNA_TRIALS'], show_progress_bar=True)
    return study.best_params

## Train

In [None]:
def downsample_data(df, neg_ratio=1, seed=None):
    """Downsample the non-clicked rows of ``df`` and shuffle the result.

    All rows with ``clicked == 1`` are kept. Rows with ``clicked == 0`` are
    randomly reduced to ``neg_ratio`` times the number of clicked rows when
    there are more than that; otherwise they are all kept. Rows whose
    ``clicked`` is neither 0 nor 1 are dropped. The combined rows are shuffled
    and re-indexed from 0.

    Args:
        df: DataFrame with a ``clicked`` column.
        neg_ratio: Maximum number of non-clicked rows per clicked row
            (default 1, i.e. a balanced sample).
        seed: Random state for sampling and shuffling; defaults to
            ``CFG['SEED']``.

    Returns:
        A new, shuffled DataFrame with a fresh ``RangeIndex``.

    Raises:
        ValueError: If ``neg_ratio`` is negative.
    """
    if neg_ratio < 0:
        raise ValueError(f"neg_ratio must be non-negative, got {neg_ratio!r}")
    if seed is None:
        seed = CFG['SEED']

    positives = df[df['clicked'] == 1]
    negatives = df[df['clicked'] == 0]

    max_negatives = int(len(positives) * neg_ratio)
    if len(negatives) > max_negatives:
        negatives = negatives.sample(n=max_negatives, random_state=seed)

    combined = pd.concat([positives, negatives], axis=0)
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)

: 

In [None]:
import gc
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

def _build_train_params(best_params):
    """Return the complete XGBoost parameter dict for the final CV models.

    ``best_params`` (from ``optimize_xgb_params``) contains only the tuned
    keys, so the fixed settings are supplied here and the tuned values are
    layered on top. Passing ``best_params`` alone would train without an
    explicit objective, eval metric or seed.
    """
    params = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        # Same tree method the hyperparameter search evaluates its trials
        # with; change this (and the device settings) here to train on GPU.
        "tree_method": "hist",
        "learning_rate": CFG['LEARNING_RATE'],
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": CFG['SEED'],
    }
    params.update(best_params or {})
    return params


def train_xgb_with_cv(data_path):
    """Train stratified K-fold XGBoost models batch by batch from a parquet dataset.

    For every batch read from ``data_path`` the rows are downsampled,
    preprocessed and categorical-encoded, then ``CFG['N_FOLDS']`` models are
    trained with early stopping on the held-out fold. On the first usable
    batch the target-encoding tables are recorded and the hyperparameters are
    tuned with Optuna on a sample of at most 50,000 rows; the tuned values are
    reused for all later batches.

    Batches that, after downsampling, do not contain both classes with at
    least ``CFG['N_FOLDS']`` rows each cannot be stratified and are skipped
    with a message.

    Args:
        data_path: Path of the parquet dataset to train on.

    Returns:
        Tuple ``(models, feature_names, target_encoders)``:
          * ``models`` - every trained booster (folds x usable batches);
          * ``feature_names`` - feature columns of the first usable batch
            (``None`` if no batch was usable);
          * ``target_encoders`` - ``{column: {category: mean clicked}}`` from
            the first usable batch.
    """
    cat_cols = ["gender", "age_group", "inventory_id", "day_of_week", "hour"]

    dataset = ds.dataset(data_path, format="parquet")
    models = []
    feature_names = None
    target_encoders = {}
    batch_num = 0
    best_params = None

    for batch in dataset.to_batches(batch_size=CFG['BATCH_SIZE']):
        batch_num += 1
        df = batch.to_pandas()

        # Downsampling
        df = downsample_data(df)

        # StratifiedKFold needs both classes with at least N_FOLDS rows each;
        # a small trailing batch or one without clicks would otherwise abort
        # the whole run.
        class_counts = df['clicked'].value_counts()
        if len(class_counts) < 2 or class_counts.min() < CFG['N_FOLDS']:
            print(f"Batch {batch_num} skipped: not enough rows per class for "
                  f"{CFG['N_FOLDS']}-fold CV ({class_counts.to_dict()})")
            del df
            gc.collect()
            continue

        # feature_names is still unset only on the first batch that gets this far.
        is_first_batch = feature_names is None

        # Feature extraction
        feature_cols = [c for c in df.columns if c not in {'clicked', 'seq', 'ID'}]
        df, feature_cols = preprocess_features(df, feature_cols, 'seq', is_train=True)

        # Prepare the target-encoding tables (reused at prediction time)
        if is_first_batch:
            for col in cat_cols:
                if col in df.columns and 'clicked' in df.columns:
                    try:
                        target_encoders[col] = df.groupby(col)['clicked'].mean().to_dict()
                    except Exception:
                        target_encoders[col] = {}

        # Categorical encoding
        df = encode_categorical_features(df, cat_cols)

        available_features = [col for col in feature_cols if col in df.columns]
        if feature_names is None:
            feature_names = available_features.copy()

        X = df[available_features].fillna(0)
        y = df['clicked']

        # Hyperparameter optimisation on the first usable batch
        if is_first_batch:
            sample_size = min(50000, len(X))
            sample_indices = np.random.choice(len(X), sample_size, replace=False)
            X_sample = X.iloc[sample_indices]
            y_sample = y.iloc[sample_indices]

            X_temp, X_val_temp, y_temp, y_val_temp = train_test_split(
                X_sample, y_sample, test_size=0.2,
                random_state=CFG['SEED'], stratify=y_sample
            )
            best_params = optimize_xgb_params(X_temp, y_temp, X_val_temp, y_val_temp)
            del X_temp, X_val_temp, y_temp, y_val_temp, X_sample, y_sample
            gc.collect()

        # Identical for every fold of this batch, so build it once.
        params = _build_train_params(best_params)

        # Stratified K-Fold
        skf = StratifiedKFold(n_splits=CFG['N_FOLDS'], shuffle=True, random_state=CFG['SEED'])
        fold_aucs = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            dtrain = xgb.DMatrix(X_train, label=y_train, nthread=-1, feature_names=available_features)
            dval = xgb.DMatrix(X_val, label=y_val, nthread=-1, feature_names=available_features)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=CFG['NUM_BOOST_ROUND'],
                # The validation set is listed last on purpose.
                evals=[(dtrain, "train"), (dval, "valid")],
                early_stopping_rounds=CFG['EARLY_STOPPING'],
                verbose_eval=200
            )

            val_pred = model.predict(dval)
            fold_auc = roc_auc_score(y_val, val_pred)
            fold_aucs.append(fold_auc)

            models.append(model)

            # Free memory
            del X_train, X_val, y_train, y_val, dtrain, dval
            gc.collect()

        print(f"Batch {batch_num} Average CV AUC: {np.mean(fold_aucs):.4f} (+/- {np.std(fold_aucs):.4f})")

        del df, X, y
        gc.collect()

    return models, feature_names, target_encoders
 

# Train on the training parquet. The returned boosters, feature list and
# target-encoding tables are the inputs of the prediction step.
models, feature_names, target_encoders = train_xgb_with_cv("./data/train.parquet")
# NOTE(review): this banner is printed only after training has finished.
print("=== High Performance XGBoost CTR Prediction ===")

## Predict

In [None]:
def predict_test_data(models,test_path,feature_names,target_encoders):
    """Predict click probabilities for the test parquet by averaging all models.

    The test frame goes through the same preprocessing and categorical
    encoding functions as training (with ``is_train=False`` and the stored
    ``target_encoders``). Any training feature missing from the test frame is
    added as a column of zeros, and remaining missing values are filled with 0.

    Args:
        models: Trained boosters whose predictions are averaged.
        test_path: Path of the test parquet file.
        feature_names: Feature columns, in order, to feed to the models.
        target_encoders: ``{column: {category: mean}}`` tables from training.

    Returns:
        1-D NumPy array with one averaged prediction per test row, in file order.

    Raises:
        ValueError: If ``models`` is empty (the average would be undefined).
    """
    if len(models) == 0:
        raise ValueError("predict_test_data needs at least one trained model")

    test_df = pd.read_parquet(test_path, engine="pyarrow")
    if 'ID' in test_df.columns:
        test_df = test_df.drop(columns=['ID'])

    feature_cols = [c for c in test_df.columns if c!='seq']
    test_df, feature_cols = preprocess_features(test_df, feature_cols,'seq',is_train=False,target_encoders=target_encoders)
    test_df = encode_categorical_features(test_df, ["gender","age_group","inventory_id","day_of_week","hour"])

    # Features seen in training but absent here are filled with zeros.
    for feature in feature_names:
        if feature not in test_df.columns:
            test_df[feature]=0
    test_features = test_df[feature_names].fillna(0)

    # The matrix is the same for every model, so build it once.
    dtest = xgb.DMatrix(test_features)
    predictions = np.zeros(len(test_df))
    for model in models:
        predictions += model.predict(dtest)
    predictions /= len(models)
    return predictions

# Score the test parquet with the average of all trained models.
test_predictions = predict_test_data(models,"./data/test.parquet",feature_names,target_encoders)

## Submission

In [None]:
# Fill the sample submission's `clicked` column with the predictions and save it.
# The values are assigned by row position, not matched on an ID column, so this
# assumes the sample submission lists rows in the same order as the test file
# (TODO confirm).
submission = pd.read_csv('./sample_submission.csv').assign(clicked=test_predictions)
submission.to_csv('./high_performance_xgb_submit.csv', index=False)