In [1]:
import json
import time
from pathlib import Path

import numpy as np
import polars as pl
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Load the raw feature tables and the training targets.
train_main = pl.read_parquet('../data/raw/train_main_features.parquet')
test_main = pl.read_parquet('../data/raw/test_main_features.parquet')
train_extra = pl.read_parquet('../data/raw/train_extra_features.parquet')
test_extra = pl.read_parquet('../data/raw/test_extra_features.parquet')
target = pl.read_parquet('../data/raw/train_target.parquet')

# Column lists, selected by name prefix.
cat_features = [col for col in train_main.columns if col.startswith('cat_feature')]
target_columns = [col for col in target.columns if col.startswith('target')]

# Cast the categorical columns to Int32 in both splits.
train_main = train_main.with_columns(pl.col(cat_features).cast(pl.Int32))
test_main = test_main.with_columns(pl.col(cat_features).cast(pl.Int32))

# Attach the extra features to the main ones by customer_id.
train_full = train_main.join(train_extra, on='customer_id', how='left')
test_full = test_main.join(test_extra, on='customer_id', how='left')
feature_columns = [col for col in train_full.columns if col != 'customer_id']

# A left join emits one output row per match, so duplicate customer_id values
# in the extra tables would silently add rows. Fail fast instead.
assert train_full.height == train_main.height, 'train join changed the row count'
assert test_full.height == test_main.height, 'test join changed the row count'

# Features and labels are matched purely by row position further down, so the
# two tables must line up row for row.
assert target.height == train_full.height, 'target and train_full differ in length'
# The target schema is not visible here; only compare ids when the column exists.
if 'customer_id' in target.columns:
    assert (target['customer_id'] == train_full['customer_id']).all(), \
        'target rows are not in the same customer_id order as train_full'

print(f'Train: {train_full.shape}, Test: {test_full.shape}')
print(f'Признаков: {len(feature_columns)}, Таргетов: {len(target_columns)}')

Train: (750000, 2441), Test: (250000, 2441)
Признаков: 2440, Таргетов: 41


In [2]:
# Hold out 20% of the row positions for validation (fixed seed).
row_positions = np.arange(len(train_full))
train_idx, val_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# Feature matrices for each side of the split, as pandas frames.
X_train, X_val = (
    train_full[rows].select(feature_columns).to_pandas()
    for rows in (train_idx, val_idx)
)

# Targets: keep every target column here; they are consumed one at a time later.
y_train_all, y_val_all = (
    target[rows].select(target_columns).to_pandas()
    for rows in (train_idx, val_idx)
)

print(f'X_train: {X_train.shape}, X_val: {X_val.shape}')
print(f'y_train: {y_train_all.shape}, y_val: {y_val_all.shape}')

X_train: (600000, 2440), X_val: (150000, 2440)
y_train: (600000, 41), y_val: (150000, 41)


In [3]:
# Train one binary CatBoost model per target column, score each on the
# validation split, and report the unweighted mean ROC-AUC across targets.

# One column of validation scores per target, in target_columns order.
val_predictions = np.zeros((len(X_val), len(target_columns)))
val_scores = {}

# Used in the progress line so it stays correct if the target list changes.
n_targets = len(target_columns)

# Hyper-parameters shared by every per-target model (loop-invariant).
catboost_params = dict(
    iterations=3000,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    nan_mode='Min',
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=0,
    early_stopping_rounds=100,
)

start = time.time()

for i, target_col in enumerate(target_columns):
    y_tr = y_train_all[target_col].values
    y_vl = y_val_all[target_col].values

    train_pool = Pool(X_train, label=y_tr, cat_features=cat_features)
    val_pool = Pool(X_val, label=y_vl, cat_features=cat_features)

    model = CatBoostClassifier(**catboost_params)

    # NOTE(review): the validation pool is both the early-stopping set and the
    # scoring set, so the ROC-AUC below is measured on the data that selected
    # the stopping point. Confirm this is acceptable for model comparison.
    model.fit(train_pool, eval_set=val_pool)

    # Raw model outputs (RawFormulaVal), not probabilities. ROC-AUC only
    # depends on the ranking of the scores, so this does not affect the metric.
    pred = model.predict(val_pool, prediction_type='RawFormulaVal')
    val_predictions[:, i] = pred

    score = roc_auc_score(y_vl, pred)
    val_scores[target_col] = score

    # Cumulative wall-clock time since the loop started, not per-target time.
    elapsed = time.time() - start
    print(f'[{i+1:2d}/{n_targets}] {target_col:15s} | ROC-AUC: {score:.6f} | best_iter: {model.get_best_iteration():4d} | {elapsed/60:.1f} мин')

# Macro ROC-AUC: plain mean of the per-target scores.
macro_score = np.mean(list(val_scores.values()))
print(f'\n>>> Macro ROC-AUC: {macro_score:.6f}')
print(f'>>> Время: {(time.time()-start)/60:.1f} мин')



[ 1/41] target_1_1      | ROC-AUC: 0.907565 | best_iter: 2989 | 11.4 мин




[ 2/41] target_1_2      | ROC-AUC: 0.822491 | best_iter: 1791 | 18.8 мин




[ 3/41] target_1_3      | ROC-AUC: 0.870120 | best_iter: 2978 | 30.3 мин




[ 4/41] target_1_4      | ROC-AUC: 0.829304 | best_iter: 2990 | 41.8 мин




[ 5/41] target_1_5      | ROC-AUC: 0.876469 | best_iter: 2019 | 49.9 мин




[ 6/41] target_2_1      | ROC-AUC: 0.829245 | best_iter: 2560 | 60.0 мин




[ 7/41] target_2_2      | ROC-AUC: 0.934715 | best_iter: 2997 | 71.6 мин




[ 8/41] target_2_3      | ROC-AUC: 0.767239 | best_iter:  746 | 75.2 мин




[ 9/41] target_2_4      | ROC-AUC: 0.757441 | best_iter: 2424 | 85.5 мин




[10/41] target_2_5      | ROC-AUC: 0.734061 | best_iter: 2031 | 93.8 мин




[11/41] target_2_6      | ROC-AUC: 0.732549 | best_iter: 1349 | 99.5 мин




[12/41] target_2_7      | ROC-AUC: 0.855675 | best_iter:  717 | 102.7 мин




[13/41] target_2_8      | ROC-AUC: 0.998055 | best_iter:  939 | 106.8 мин




[14/41] target_3_1      | ROC-AUC: 0.700595 | best_iter: 2989 | 118.7 мин




[15/41] target_3_2      | ROC-AUC: 0.913975 | best_iter: 2999 | 130.6 мин




[16/41] target_3_3      | ROC-AUC: 0.760569 | best_iter:  935 | 134.7 мин




[17/41] target_3_4      | ROC-AUC: 0.933780 | best_iter: 1384 | 140.4 мин




[18/41] target_3_5      | ROC-AUC: 0.969683 | best_iter: 1393 | 146.1 мин




[19/41] target_4_1      | ROC-AUC: 0.849600 | best_iter: 2069 | 154.4 мин




[20/41] target_5_1      | ROC-AUC: 0.749048 | best_iter: 2969 | 165.8 мин




[21/41] target_5_2      | ROC-AUC: 0.726043 | best_iter: 1094 | 170.6 мин




[22/41] target_6_1      | ROC-AUC: 0.720590 | best_iter: 1977 | 178.5 мин




[23/41] target_6_2      | ROC-AUC: 0.722325 | best_iter: 2207 | 187.3 мин




[24/41] target_6_3      | ROC-AUC: 0.757562 | best_iter: 2763 | 198.1 мин




[25/41] target_6_4      | ROC-AUC: 0.846674 | best_iter: 2009 | 206.2 мин




[26/41] target_6_5      | ROC-AUC: 0.895109 | best_iter:  887 | 210.0 мин




[27/41] target_7_1      | ROC-AUC: 0.806269 | best_iter: 2977 | 221.9 мин




[28/41] target_7_2      | ROC-AUC: 0.844616 | best_iter: 2998 | 233.5 мин




[29/41] target_7_3      | ROC-AUC: 0.798989 | best_iter: 2106 | 241.9 мин




[30/41] target_8_1      | ROC-AUC: 0.981611 | best_iter: 2999 | 253.5 мин




[31/41] target_8_2      | ROC-AUC: 0.854198 | best_iter: 2997 | 265.3 мин




[32/41] target_8_3      | ROC-AUC: 0.896196 | best_iter: 2995 | 276.8 мин




[33/41] target_9_1      | ROC-AUC: 0.778449 | best_iter: 1427 | 282.8 мин




[34/41] target_9_2      | ROC-AUC: 0.839472 | best_iter: 2396 | 292.6 мин




[35/41] target_9_3      | ROC-AUC: 0.686632 | best_iter: 1867 | 300.2 мин




[36/41] target_9_4      | ROC-AUC: 0.917571 | best_iter: 1152 | 305.0 мин




[37/41] target_9_5      | ROC-AUC: 0.841170 | best_iter: 2356 | 314.4 мин




[38/41] target_9_6      | ROC-AUC: 0.693967 | best_iter: 2999 | 326.6 мин




[39/41] target_9_7      | ROC-AUC: 0.765073 | best_iter: 2999 | 338.4 мин




[40/41] target_9_8      | ROC-AUC: 0.922422 | best_iter: 2932 | 349.9 мин




[41/41] target_10_1     | ROC-AUC: 0.759191 | best_iter: 2999 | 362.3 мин

>>> Macro ROC-AUC: 0.825520
>>> Время: 362.3 мин


In [None]:
# Persist the validation predictions and per-target scores.
processed_dir = Path('../data/processed')
# Neither np.save nor open() creates missing directories; make sure the output
# folder exists so the results of the long training run are not lost.
processed_dir.mkdir(parents=True, exist_ok=True)

# Save the validation predictions (one column per target).
np.save(processed_dir / 'cb_val_predictions.npy', val_predictions)

# Save the per-target scores, keyed by target column name.
with open(processed_dir / 'cb_val_scores.json', 'w') as f:
    json.dump(val_scores, f)

# Recomputed from val_scores so this cell only needs the two saved objects.
macro = np.mean(list(val_scores.values()))
print(f'Сохранено! Macro: {macro:.6f}')
print(f'val_predictions shape: {val_predictions.shape}')

Сохранено! Macro: 0.825520
val_predictions shape: (150000, 41)


: 