# EXP-007: XGBoost — 41 отдельная модель

- 41 бинарная модель XGBoost (binary:logistic) вместо одной MultiLogloss
- GPU-ускорение (tree_method=hist, device=cuda)
- Валидация → полное обучение на 750k → сабмит
- **LB: 0.8412** (лучший результат)

In [None]:
import polars as pl
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pyarrow as pa
import pyarrow.parquet as pq
import time

# Load the raw parquet tables: main and extra feature sets for train and test,
# plus the training targets.
train_main = pl.read_parquet('../data/raw/train_main_features.parquet')
test_main = pl.read_parquet('../data/raw/test_main_features.parquet')
train_extra = pl.read_parquet('../data/raw/train_extra_features.parquet')
test_extra = pl.read_parquet('../data/raw/test_extra_features.parquet')
target = pl.read_parquet('../data/raw/train_target.parquet')


def _columns_with_prefix(frame, prefix):
    """Return the column names of ``frame`` that start with ``prefix``, in frame order."""
    return [name for name in frame.columns if name.startswith(prefix)]


cat_features = _columns_with_prefix(train_main, 'cat_feature')
target_columns = _columns_with_prefix(target, 'target')

# Cast the categorical feature columns to Int32 in both train and test.
cat_as_int32 = pl.col(cat_features).cast(pl.Int32)
train_main = train_main.with_columns(cat_as_int32)
test_main = test_main.with_columns(cat_as_int32)

# Attach the extra features to the main ones by customer; the left join keeps
# every row of the main table.
train_full = train_main.join(train_extra, on='customer_id', how='left')
test_full = test_main.join(test_extra, on='customer_id', how='left')

# Every joined column except the key is used as a model feature.
feature_columns = [name for name in train_full.columns if name != 'customer_id']

print(f'Train: {train_full.shape}, Test: {test_full.shape}')
print(f'Признаков: {len(feature_columns)}, Таргетов: {len(target_columns)}')

In [None]:
# Train/validation split (random_state=42; the original note says the same seed
# is used everywhere).
#
# Labels are looked up in `target` purely by row position, using indices derived
# from `train_full`. `train_full` is the result of a left join, so verify that
# the two tables really are row-aligned before trusting positional indexing --
# a mismatch would silently pair features with the wrong labels.
if len(target) != len(train_full):
    raise ValueError(
        f'target has {len(target)} rows but train_full has {len(train_full)}; '
        'positional label lookup would be misaligned'
    )
if 'customer_id' in target.columns and not np.array_equal(
    train_full['customer_id'].to_numpy(), target['customer_id'].to_numpy()
):
    raise ValueError(
        'customer_id order differs between train_full and target; '
        'positional label lookup would be misaligned'
    )

train_idx, val_idx = train_test_split(np.arange(len(train_full)), test_size=0.2, random_state=42)

# Feature matrices as pandas frames (the form passed to xgb.DMatrix later on).
X_train = train_full[train_idx].select(feature_columns).to_pandas()
X_val = train_full[val_idx].select(feature_columns).to_pandas()
X_test = test_full.select(feature_columns).to_pandas()

# Label frames, one column per target, selected with the same row indices.
y_train_all = target[train_idx].select(target_columns).to_pandas()
y_val_all = target[val_idx].select(target_columns).to_pandas()

print(f'X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}')

In [None]:
# Step 1: train on the training split (600k rows per the original note) with
# validation, collecting per-target validation scores and early-stopping
# iterations (`best_iters` is consumed by the full-data training step).
xgb_val_predictions = np.zeros((len(X_val), len(target_columns)))
xgb_val_scores = {}
best_iters = []
n_targets = len(target_columns)

# The hyper-parameters are the same for every target, so build the dict once.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'device': 'cuda',
    'seed': 42,
    'verbosity': 0,
}

start = time.time()

# The feature matrices do not depend on the target: build each DMatrix once and
# only swap the label vector inside the loop instead of re-ingesting the
# features for every target.
dtrain = xgb.DMatrix(X_train, enable_categorical=False)
dval = xgb.DMatrix(X_val, enable_categorical=False)

for i, target_col in enumerate(target_columns):
    y_tr = y_train_all[target_col].values
    y_vl = y_val_all[target_col].values

    dtrain.set_label(y_tr)
    dval.set_label(y_vl)

    try:
        model = xgb.train(
            params, dtrain,
            num_boost_round=3000,
            evals=[(dval, 'val')],
            early_stopping_rounds=100,
            verbose_eval=False
        )
        best_iter = model.best_iteration
        # xgb.train returns the model from the LAST round, and the native
        # Booster.predict uses every tree by default. Restrict prediction to
        # the best iteration so the reported AUC matches `best_iter`.
        val_pred = model.predict(dval, iteration_range=(0, best_iter + 1))
    except Exception as e:
        # Deliberate best-effort: if training fails for a target, report it and
        # fall back to a constant prediction (the training-set positive rate).
        print(f'  FALLBACK: {e}')
        val_pred = np.full(len(X_val), y_tr.mean())
        best_iter = 0

    xgb_val_predictions[:, i] = val_pred
    best_iters.append(best_iter)

    score = roc_auc_score(y_vl, val_pred)
    xgb_val_scores[target_col] = score

    elapsed = time.time() - start
    print(f'[{i+1:2d}/{n_targets}] {target_col:15s} | ROC-AUC: {score:.6f} | best_iter: {best_iter:4d} | {elapsed/60:.1f} мин')

# Unweighted mean of the per-target ROC-AUC values.
macro_score = np.mean(list(xgb_val_scores.values()))
print(f'\n>>> Macro ROC-AUC (val): {macro_score:.6f}')
print(f'>>> Время: {(time.time()-start)/60:.1f} мин')
print(f'>>> best_iters: {best_iters}')

In [None]:
# Step 2: retrain on the full training set (750k rows per the original note),
# using 2.0x the early-stopping iteration found in step 1, with a floor of 500
# boosting rounds.
X_full = train_full.select(feature_columns).to_pandas()
y_full_all = target.select(target_columns).to_pandas()

xgb_full_test_predictions = np.zeros((len(X_test), len(target_columns)))
n_targets = len(target_columns)

# The hyper-parameters are the same for every target, so build the dict once.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'device': 'cuda',
    'seed': 42,
    'verbosity': 0,
}

start = time.time()

# Neither feature matrix depends on the target: build both once. Only the label
# vector of the training matrix changes from target to target.
dfull = xgb.DMatrix(X_full, enable_categorical=False)
dtest = xgb.DMatrix(X_test, enable_categorical=False)

for i, target_col in enumerate(target_columns):
    y_full = y_full_all[target_col].values
    n_rounds = max(int(best_iters[i] * 2.0), 500)

    dfull.set_label(y_full)

    # No validation set here: train for a fixed number of rounds and predict
    # with the whole model.
    model = xgb.train(params, dfull, num_boost_round=n_rounds)
    xgb_full_test_predictions[:, i] = model.predict(dtest)

    elapsed = time.time() - start
    print(f'[{i+1:2d}/{n_targets}] {target_col:15s} | iters: {n_rounds:4d} | {elapsed/60:.1f} мин')

print(f'\n>>> Время: {(time.time()-start)/60:.1f} мин')

In [None]:
# Step 3: build the submission file -- one `predict_*` column per `target_*`
# column, keyed by customer_id.
from pathlib import Path

predict_columns = [col.replace('target_', 'predict_') for col in target_columns]
submit = test_full.select('customer_id').to_pandas().copy()

for j, col in enumerate(predict_columns):
    submit[col] = xgb_full_test_predictions[:, j]

# Saved through pyarrow directly: the original note says pandas may raise a
# utf-8 error on the platform.
submission_path = '../submissions/exp007_xgb_41models.parquet'
# Create the output directory up front so the write cannot fail on a missing
# folder after the long training run.
Path(submission_path).parent.mkdir(parents=True, exist_ok=True)
table = pa.Table.from_pandas(submit, preserve_index=False)
pq.write_table(table, submission_path)

print(f'Сабмит: {submit.shape}')
print(submit.head())