In [1]:
import polars as pl                                                                                                                             
import numpy as np                                                                                                                              
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import time

# Read the raw parquet tables: main and extra feature sets for train/test, plus the train targets.
train_main, test_main = (
    pl.read_parquet('../data/raw/train_main_features.parquet'),
    pl.read_parquet('../data/raw/test_main_features.parquet'),
)
train_extra, test_extra = (
    pl.read_parquet('../data/raw/train_extra_features.parquet'),
    pl.read_parquet('../data/raw/test_extra_features.parquet'),
)
target = pl.read_parquet('../data/raw/train_target.parquet')

# Column-name lists, picked by name prefix.
cat_features = [name for name in train_main.columns if name.startswith('cat_feature')]
target_columns = [name for name in target.columns if name.startswith('target')]

# Cast the categorical columns to Int32 in both main frames.
train_main, test_main = (
    frame.with_columns(pl.col(cat_features).cast(pl.Int32))
    for frame in (train_main, test_main)
)

# Attach the extra features to the main ones with a left join on customer_id.
train_full = train_main.join(train_extra, how='left', on='customer_id')
test_full = test_main.join(test_extra, how='left', on='customer_id')

# Every joined column except the join key is used as a model feature.
feature_columns = [name for name in train_full.columns if name != 'customer_id']

print(f'Train: {train_full.shape}, Test: {test_full.shape}')
print(f'Признаков: {len(feature_columns)}, Таргетов: {len(target_columns)}')

Train: (750000, 2441), Test: (250000, 2441)
Признаков: 2440, Таргетов: 41


In [2]:
# Guard against silent label misalignment: rows of `target` are selected below
# with the same positional indices as rows of `train_full`, so both frames must
# have the same length and, when `target` carries the key, the same
# customer_id order.
assert len(target) == len(train_full), (
    f'target has {len(target)} rows but train_full has {len(train_full)}'
)
if 'customer_id' in target.columns:
    assert np.array_equal(
        train_full['customer_id'].to_numpy(),
        target['customer_id'].to_numpy(),
    ), 'customer_id order differs between train_full and target'

# Hold-out split over positional row indices: 20% goes to validation, fixed seed.
train_idx, val_idx = train_test_split(np.arange(len(train_full)), test_size=0.2, random_state=42)

# Feature matrices as pandas frames, restricted to the feature columns.
X_train = train_full[train_idx].select(feature_columns).to_pandas()
X_val = train_full[val_idx].select(feature_columns).to_pandas()

# Targets: keep all target columns here; one column is taken at a time later on.
y_train_all = target[train_idx].select(target_columns).to_pandas()
y_val_all = target[val_idx].select(target_columns).to_pandas()

print(f'X_train: {X_train.shape}, X_val: {X_val.shape}')
print(f'y_train: {y_train_all.shape}, y_val: {y_val_all.shape}')

X_train: (600000, 2440), X_val: (150000, 2440)
y_train: (600000, 41), y_val: (150000, 41)


In [None]:
# Train one binary CatBoost classifier per target column, score each on the
# validation split, and report the unweighted mean ROC-AUC over all targets.
#
# NOTE(review): the validation pool is also passed as `eval_set` for early
# stopping, so the ROC-AUC reported below is measured on the same rows that
# chose the stopping iteration and is likely optimistic. Use a separate
# hold-out set if an unbiased estimate is needed.
n_targets = len(target_columns)

# Column i holds the raw (pre-sigmoid) validation scores for target_columns[i].
val_predictions = np.zeros((len(X_val), n_targets))
val_scores = {}

# Hyperparameters are identical for every target, so build them once.
model_params = dict(
    iterations=3000,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    nan_mode='Min',
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=0,
    early_stopping_rounds=100,
)

start = time.time()

for i, target_col in enumerate(target_columns):
    y_tr = y_train_all[target_col].values
    y_vl = y_val_all[target_col].values

    train_pool = Pool(X_train, label=y_tr, cat_features=cat_features)
    val_pool = Pool(X_val, label=y_vl, cat_features=cat_features)

    model = CatBoostClassifier(**model_params)
    model.fit(train_pool, eval_set=val_pool)

    # Raw scores are enough for ROC-AUC, which depends only on the ranking.
    pred = model.predict(val_pool, prediction_type='RawFormulaVal')
    val_predictions[:, i] = pred

    score = roc_auc_score(y_vl, pred)
    val_scores[target_col] = score

    # Elapsed time is cumulative since the start of the whole loop.
    elapsed = time.time() - start
    # The total is derived from target_columns instead of a hard-coded 41.
    print(f'[{i+1:2d}/{n_targets}] {target_col:15s} | ROC-AUC: {score:.6f} | best_iter: {model.get_best_iteration():4d} | {elapsed/60:.1f} мин')

# Macro ROC-AUC: plain average of the per-target scores.
macro_score = np.mean(list(val_scores.values()))
print(f'\n>>> Macro ROC-AUC: {macro_score:.6f}')
print(f'>>> Время: {(time.time()-start)/60:.1f} мин')



[ 1/41] target_1_1      | ROC-AUC: 0.907565 | best_iter: 2989 | 11.4 мин




[ 2/41] target_1_2      | ROC-AUC: 0.822491 | best_iter: 1791 | 18.8 мин




[ 3/41] target_1_3      | ROC-AUC: 0.870120 | best_iter: 2978 | 30.3 мин




[ 4/41] target_1_4      | ROC-AUC: 0.829304 | best_iter: 2990 | 41.8 мин




[ 5/41] target_1_5      | ROC-AUC: 0.876469 | best_iter: 2019 | 49.9 мин




[ 6/41] target_2_1      | ROC-AUC: 0.829245 | best_iter: 2560 | 60.0 мин




[ 7/41] target_2_2      | ROC-AUC: 0.934715 | best_iter: 2997 | 71.6 мин
