In [None]:
import os
import numpy as np
import polars as pl
import pandas as pd
import joblib

from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import sklearn
import gc
import lightgbm as lgb

import optuna
from optuna.integration import LightGBMPruningCallback

In [None]:
# Competition score helpers
def calculate_weighted_logloss(y_true, y_pred, eps=1e-15):
    """Class-balanced binary log loss.

    The mean negative log-likelihood is computed separately for the negative
    (label 0) and positive (label 1) samples and the two means are averaged
    with equal weight, so each class contributes 50% regardless of its size.

    Args:
        y_true: Array-like of binary labels (0 / 1). Lists, NumPy arrays and
            pandas Series are accepted; values are matched positionally.
        y_pred: Array-like of predicted probabilities for the positive class,
            same length as ``y_true``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking logs.

    Returns:
        ``0.5 * logloss(negatives) + 0.5 * logloss(positives)``. A class with
        no samples contributes 0.
    """
    # Coerce to ndarrays so plain Python lists work: `list == 0` would be a
    # bare bool and the mask logic below would fail.
    y_true = np.asarray(y_true)
    # Force float64: in float32, `1 - 1e-15` rounds to exactly 1.0, the clip
    # becomes a no-op and a prediction of 1.0 would produce log(0) = -inf.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1 - eps)

    mask_0 = (y_true == 0)
    mask_1 = (y_true == 1)

    ll_0 = -np.mean(np.log(1 - y_pred[mask_0])) if mask_0.sum() > 0 else 0
    ll_1 = -np.mean(np.log(y_pred[mask_1])) if mask_1.sum() > 0 else 0

    return 0.5 * ll_0 + 0.5 * ll_1

def calculate_competition_score(y_true, y_pred):
    """Blend average precision and class-balanced log loss into one score.

    Args:
        y_true: Binary ground-truth labels.
        y_pred: Predicted probabilities for the positive class.

    Returns:
        Tuple ``(score, ap, wll)`` where ``ap`` is the average precision,
        ``wll`` is the value of ``calculate_weighted_logloss`` and
        ``score = 0.5 * ap + 0.5 * (1 / (1 + wll))``.
    """
    avg_precision = average_precision_score(y_true, y_pred)
    weighted_ll = calculate_weighted_logloss(y_true, y_pred)
    # Map the loss (lower is better) to a "higher is better" term.
    inv_loss_term = 1 / (1 + weighted_ll)
    blended = 0.5 * avg_precision + 0.5 * inv_loss_term
    return blended, avg_precision, weighted_ll

In [None]:
# Central run configuration, read by the cells below.
CFG = dict(
    SEED=42,  # global random seed
    VALIDATION_SIZE=0.2,  # hold-out fraction passed to train_test_split
    LGB_EARLY_STOPPING=300,  # LightGBM early-stopping rounds
    OPTUNA_TRIALS=50,  # number of Optuna trials
    N_SPLITS=3,  # number of StratifiedKFold folds
    LGB_SEEDS=[42, 2024, 1004],  # one full CV run is trained per seed
)

In [None]:
# Load the full training set.
all_train = pl.read_parquet('./train.parquet')

# Keep every positive (clicked == 1) row.
clicked_1 = all_train.filter(pl.col('clicked') == 1)

# Down-sample the negatives (clicked == 0) to a clicked : non-clicked ratio of 1 : 3.
# The sample size is capped at the number of available negatives: sampling
# without replacement fails when more rows are requested than exist, so if
# there are fewer than 3 negatives per positive all negatives are kept.
clicked_0 = all_train.filter(pl.col('clicked') == 0)
clicked_0 = clicked_0.sample(
    n=min(clicked_0.height, clicked_1.height * 3),
    seed=CFG['SEED'],
)

# Stack positives and sampled negatives.
train = pl.concat([clicked_1, clicked_0])

# Shuffle the combined frame so the classes are interleaved.
train = train.sample(fraction=1, shuffle=True, seed=CFG['SEED'])

In [None]:
# Split the columns into target, categorical features and numeric features.
target_col = 'clicked'
cols_to_drop = ['seq', 'clicked']  # excluded from the feature list
cat_cols = ['gender', 'age_group', 'inventory_id', 'day_of_week', 'hour']
# Every remaining column is a model feature, in the original column order.
features = list(filter(lambda name: name not in cols_to_drop, train.columns))
# Any feature not listed as categorical is treated as numeric.
numeric_cols = [name for name in features if name not in cat_cols]

In [None]:
# Convert the polars frame to pandas and separate features from the target.
train_pd = train.to_pandas()
X = train_pd.loc[:, features].copy()
y = train_pd[target_col]

# Cast in a single pass: categorical columns -> 'category', the rest -> float32.
X = X.astype({
    **{name: 'category' for name in cat_cols},
    **{name: 'float32' for name in numeric_cols},
})

In [None]:
# Stratified hold-out split; the Optuna objective trains on X_train / y_train
# and evaluates on X_val / y_val.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=CFG['SEED'],
    test_size=CFG['VALIDATION_SIZE'],
)

In [None]:
# Positive-class weight = (#negatives / #positives) over the whole target,
# falling back to 1 when there are no positives. Used as LightGBM's
# `scale_pos_weight`.
pos_count = y.eq(1).sum()
neg_count = y.eq(0).sum()
if pos_count > 0:
    pos_weight_value = neg_count / pos_count
else:
    pos_weight_value = 1

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: train LightGBM on the hold-out split and return validation AUC.

    Reads the module-level ``X_train`` / ``y_train`` / ``X_val`` / ``y_val``
    split, ``features``, ``cat_cols``, ``pos_weight_value`` and ``CFG``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate AUC values for pruning.

    Returns:
        The best validation AUC reached by the booster. The corresponding
        iteration is stored on the trial as the ``best_iteration`` user attribute.
    """
    # Fixed settings plus the hyperparameter search space.
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'device': 'cpu',
        'seed': CFG['SEED'],
        'num_threads': -1,
        'verbosity': -1,
        'scale_pos_weight': pos_weight_value,
        'boosting_type': 'gbdt',
        'feature_pre_filter': False,

        # Tuned hyperparameters
        'num_leaves': trial.suggest_int('num_leaves', 128, 320),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.05, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM ignores `subsample` (bagging_fraction) unless bagging is
        # enabled through a non-zero `subsample_freq` (bagging_freq). Without
        # this the `subsample` search dimension would not affect the model.
        'subsample_freq': 1,
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-2, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-2, 10.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 200, 1000),
    }
    print(f"--- Trial {trial.number} Parameters ---")
    print(params)
    print("---------------------------------")

    # Pruning callback: lets Optuna stop unpromising trials early, based on
    # the AUC reported for 'valid_1' (the second entry of `valid_sets` below).
    pruning_callback = LightGBMPruningCallback(trial, 'auc', valid_name='valid_1')
    lgb_train = lgb.Dataset(X_train, y_train, feature_name=features, categorical_feature=cat_cols)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, feature_name=features, categorical_feature=cat_cols)

    # Train with a very high round cap; early stopping decides the actual length.
    bst = lgb.train(
        params,
        lgb_train,
        num_boost_round=100000,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.log_evaluation(period=1000),
            lgb.early_stopping(CFG['LGB_EARLY_STOPPING'], verbose=True),
            pruning_callback,
        ],
    )
    auc = bst.best_score['valid_1']['auc']
    trial.set_user_attr('best_iteration', bst.best_iteration)

    # Drop the datasets and booster before the next trial starts.
    del lgb_train, lgb_eval, bst

    # Validation AUC is the value Optuna maximises.
    return auc

In [None]:
# Seed the sampler so the hyperparameter search is reproducible; TPESampler is
# Optuna's default sampler, so only the seeding changes. The median pruner does
# not prune a trial before 10000 boosting steps have been reported.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=CFG['SEED']),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10000),
)
study.optimize(objective, n_trials=CFG['OPTUNA_TRIALS'], show_progress_bar=True)

In [None]:
# Report the best completed Optuna trial: score, hyperparameters, trial number.
print(
    "\n==================================",
    "Optuna 최종 최적 결과 (완료된 Trial 기준)",
    "==================================",
    sep="\n",
)
print(
    f"최고 점수 (AUC): {study.best_value:.5f}",
    "최적 하이퍼파라미터:",
    sep="\n",
)
for key, value in study.best_params.items():
    print(f"  '{key}': {value}")
print(
    f"해당 Trial 번호: {study.best_trial.number}",
    "==================================",
    sep="\n",
)

In [None]:
# Best hyperparameters found by the Optuna search (50 trials); these values are hard-coded into LGB_params below.
#   'num_leaves': 190, 
#   'learning_rate': 0.0016691651006236685, 
#   'colsample_bytree': 0.686387931453528, 
#   'subsample': 0.6314368795644892, 
#   'reg_alpha': 3.113738376402282, 
#   'reg_lambda': 0.0654507771673197, 
#   'min_child_samples': 468,

In [None]:
# Out-of-fold predictions: one row per training sample, one column per seed.
oof_preds = np.zeros((len(X), len(CFG['LGB_SEEDS'])))
models = []

# Final LightGBM parameters; the tuned values are hard-coded here.
# The per-run 'seed' is added inside the training loop.
# NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when
# `subsample_freq` (bagging_freq) is non-zero, and none is set here, so
# `subsample` currently has no effect. Left unchanged to keep the trained
# models identical - TODO confirm whether bagging was intended.
LGB_params = {
    'objective': 'binary',
    'metric': 'auc',
    'num_threads': -1,
    'scale_pos_weight': pos_weight_value,
    'num_leaves': 190,
    'learning_rate': 0.0016691651006236685,
    'colsample_bytree': 0.686387931453528,
    'subsample': 0.6314368795644892,
    'reg_alpha': 3.113738376402282,
    'reg_lambda': 0.0654507771673197,
    'min_child_samples': 468,
    'verbosity': -1
}
os.makedirs('lgbm_models', exist_ok=True)  # directory for the saved models

In [None]:
# Train one stratified K-fold run per seed, store the out-of-fold predictions
# and save every fold model to disk.
for seed_idx, seed in enumerate(CFG['LGB_SEEDS']):
    print(f"===== SEED: {seed} =====")

    # The fold assignment is re-shuffled for every seed.
    skf = StratifiedKFold(n_splits=CFG['N_SPLITS'], shuffle=True, random_state=seed)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"--- Fold: {fold+1}/{CFG['N_SPLITS']} ---")

        # Fold-specific names on purpose: X_train / X_val / y_train / y_val
        # hold the hold-out split that the Optuna `objective` reads. Rebinding
        # and deleting those globals here would make the Optuna cell fail with
        # NameError if it is re-run after this loop.
        X_fold_train, y_fold_train = X.iloc[train_idx], y.iloc[train_idx]
        X_fold_val, y_fold_val = X.iloc[val_idx], y.iloc[val_idx]

        # Apply the current seed to a copy of the shared parameters.
        params = LGB_params.copy()
        params['seed'] = seed

        lgb_train = lgb.Dataset(X_fold_train, y_fold_train, categorical_feature=cat_cols)
        lgb_eval = lgb.Dataset(X_fold_val, y_fold_val, reference=lgb_train, categorical_feature=cat_cols)

        # NOTE(review): the round cap here is 10000 while the Optuna objective
        # trains with num_boost_round=100000; with the small learning rate in
        # LGB_params the cap may be reached before early stopping triggers -
        # confirm this is intended.
        model = lgb.train(
            params=params,
            train_set=lgb_train,
            valid_sets=[lgb_train, lgb_eval],
            num_boost_round=10000,
            callbacks=[lgb.early_stopping(CFG['LGB_EARLY_STOPPING'], verbose=False)]
        )

        # Store the out-of-fold predictions for this seed's column.
        oof_preds[val_idx, seed_idx] = model.predict(X_fold_val, num_iteration=model.best_iteration)

        # Persist one model per (seed, fold).
        model_path = f'lgbm_models/lgbm_seed_{seed}_fold_{fold}.joblib'
        joblib.dump(model, model_path)

        # Release the fold data and the booster before the next fold.
        del X_fold_train, y_fold_train, X_fold_val, y_fold_val, lgb_train, lgb_eval, model
        gc.collect()

    print(f"All models for SEED {seed} saved.")

# Average the per-seed OOF predictions into a single prediction per sample.
final_oof = oof_preds.mean(axis=1)

In [None]:
# Evaluate the seed-averaged out-of-fold predictions with the competition metric.
score, ap, wll = calculate_competition_score(y, final_oof)
print(
    "===== 최종 OOF 성능 =====",
    f"Competition Score: {score:.6f}",
    f"Average Precision: {ap:.6f}",
    f"Weighted LogLoss: {wll:.6f}",
    sep="\n",
)