In [2]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [5]:
# Load the main feature tables for the train and test sets,
# followed by the train target table.
train_main, test_main, target = map(
    pl.read_parquet,
    (
        '../data/raw/train_main_features.parquet',
        '../data/raw/test_main_features.parquet',
        '../data/raw/train_target.parquet',
    ),
)

# Report the shape of every table that was just loaded.
for caption, frame in (
    ('Train main:', train_main),
    ('Test main:', test_main),
    ('Target:', target),
):
    print(caption, frame.shape)

Train main: (750000, 200)
Test main: (250000, 200)
Target: (750000, 42)


In [6]:
# Categorical feature columns are recognised by their name prefix
cat_features = [name for name in train_main.columns if name.startswith('cat_feature')]

# Target columns are recognised the same way
target_columns = [name for name in target.columns if name.startswith('target')]

# Cast the categorical columns to Int32 in both frames before they are handed to CatBoost.
# The same expression is applied to train and test so their dtypes stay in sync.
int_cat_expr = pl.col(cat_features).cast(pl.Int32)
train_main = train_main.with_columns(int_cat_expr)
test_main = test_main.with_columns(int_cat_expr)

print(f'Категориальных признаков: {len(cat_features)}')
print(f'Целевых переменных: {len(target_columns)}')

Категориальных признаков: 67
Целевых переменных: 41


In [9]:
# The split below pairs row i of `train_main` with row i of `target` purely by
# position, so both tables must list the same customers in the same order.
# Fail fast here instead of silently training on mismatched labels.
assert train_main.height == target.height, 'train_main and target differ in row count'
assert (train_main['customer_id'] == target['customer_id']).all(), (
    'train_main and target are not aligned on customer_id'
)

# Model features: every column except the customer_id key
feature_columns = [col for col in train_main.columns if col != 'customer_id']

# Split row positions rather than the frames themselves, so the same split can
# be applied to any table that shares this row order
train_idx, val_idx = train_test_split(
    np.arange(len(train_main)),
    test_size=0.2,
    random_state=42
)

# Materialise the feature frames for each side of the split
X_train = train_main[train_idx].select(feature_columns).to_pandas()
X_val = train_main[val_idx].select(feature_columns).to_pandas()

# Labels are taken at the same row positions as the features
y_train = target[train_idx].select(target_columns).to_pandas()
y_val = target[val_idx].select(target_columns).to_pandas()

print(f'Train: {X_train.shape[0]} клиентов')
print(f'Val: {X_val.shape[0]} клиентов')

Train: 600000 клиентов
Val: 150000 клиентов


In [5]:
# Wrap the train / validation splits in CatBoost pools
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

# Classifier with the MultiLogloss objective, up to 1000 iterations,
# with early stopping after 100 rounds; runs on GPU device '0'
model = CatBoostClassifier(
    loss_function='MultiLogloss',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    nan_mode='Min',
    early_stopping_rounds=100,
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=100,
)

# The validation pool is passed as eval_set so its loss is reported during training
model.fit(train_pool, eval_set=val_pool)

0:	learn: 0.4835518	test: 0.4835598	best: 0.4835598 (0)	total: 2.55s	remaining: 42m 29s
100:	learn: 0.0860745	test: 0.0867703	best: 0.0867703 (100)	total: 1m 48s	remaining: 16m 2s
200:	learn: 0.0841261	test: 0.0855081	best: 0.0855081 (200)	total: 3m 34s	remaining: 14m 14s
300:	learn: 0.0830283	test: 0.0849741	best: 0.0849741 (300)	total: 5m 21s	remaining: 12m 26s
400:	learn: 0.0823365	test: 0.0847189	best: 0.0847189 (400)	total: 7m 7s	remaining: 10m 38s
500:	learn: 0.0818089	test: 0.0845607	best: 0.0845607 (500)	total: 8m 53s	remaining: 8m 51s
600:	learn: 0.0813445	test: 0.0844315	best: 0.0844315 (600)	total: 10m 39s	remaining: 7m 4s
700:	learn: 0.0808729	test: 0.0843286	best: 0.0843286 (700)	total: 12m 25s	remaining: 5m 17s
800:	learn: 0.0804464	test: 0.0842606	best: 0.0842606 (800)	total: 14m 11s	remaining: 3m 31s
900:	learn: 0.0800362	test: 0.0842024	best: 0.0842024 (900)	total: 15m 57s	remaining: 1m 45s
999:	learn: 0.0796449	test: 0.0841510	best: 0.0841510 (999)	total: 17m 42s	rema

CatBoostClassifier(depth=6, devices='0', early_stopping_rounds=100, iterations=1000, learning_rate=0.1, loss_function='MultiLogloss', nan_mode='Min', random_seed=42, task_type='GPU', verbose=100)

In [6]:
# Score the validation pool, requesting raw formula values (prediction_type='RawFormulaVal')
val_predict = model.predict(val_pool, prediction_type='RawFormulaVal')

# Macro-averaged ROC-AUC of those scores against the validation labels
val_score = roc_auc_score(y_true=y_val, y_score=val_predict, average='macro')

print(f'Local Val macro ROC-AUC: {val_score:.6f}')

Local Val macro ROC-AUC: 0.818632


### Обучение на полном train + сабмит

In [7]:
# Pool over the complete training table.
# Features and labels are picked through `feature_columns` / `target_columns`,
# the same lists the train/val split uses, so the label column order is exactly
# the order of `target_columns` (which the submission's predict_* names are derived from).
full_train_pool = Pool(
    train_main.select(feature_columns).to_pandas(),
    label=target.select(target_columns).to_pandas(),
    cat_features=cat_features
)

# No eval set here, hence no early stopping: train all 1000 iterations.
# On the validation run the best iteration was the last one (999).
full_model = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiLogloss',
    nan_mode='Min',
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=100
)

full_model.fit(full_train_pool)

0:	learn: 0.4835486	total: 1.31s	remaining: 21m 47s
100:	learn: 0.0861666	total: 2m 11s	remaining: 19m 31s
200:	learn: 0.0843553	total: 4m 24s	remaining: 17m 30s
300:	learn: 0.0833107	total: 6m 36s	remaining: 15m 20s
400:	learn: 0.0826093	total: 8m 48s	remaining: 13m 8s
500:	learn: 0.0820619	total: 10m 59s	remaining: 10m 56s
600:	learn: 0.0816123	total: 13m 10s	remaining: 8m 44s
700:	learn: 0.0812439	total: 15m 21s	remaining: 6m 33s
800:	learn: 0.0808803	total: 17m 32s	remaining: 4m 21s
900:	learn: 0.0805243	total: 19m 43s	remaining: 2m 10s
999:	learn: 0.0801962	total: 21m 53s	remaining: 0us


CatBoostClassifier(depth=6, devices='0', iterations=1000, learning_rate=0.1, loss_function='MultiLogloss', nan_mode='Min', random_seed=42, task_type='GPU', verbose=100)

### Предсказания и сохранение сабмита

In [8]:
# Test features without the customer_id key, wrapped in a CatBoost pool
test_pool = Pool(
    data=test_main.drop('customer_id').to_pandas(),
    cat_features=cat_features,
)

# Raw formula values for every test row
test_predict = full_model.predict(test_pool, prediction_type='RawFormulaVal')

# Prediction column names mirror the target names with the prefix swapped
predict_columns = [name.replace('target_', 'predict_') for name in target_columns]

# Submission layout: customer_id first, then one column per prediction
submit = test_main.select('customer_id').hstack(
    pl.DataFrame(test_predict, schema=predict_columns)
)

# Write the submission file
submit.write_parquet('../submissions/exp002_catboost_1000iter.parquet')

print(f'Сабмит: {submit.shape}')
submit.head(3)

Сабмит: (250000, 42)


customer_id,predict_1_1,predict_1_2,predict_1_3,predict_1_4,predict_1_5,predict_2_1,predict_2_2,predict_2_3,predict_2_4,predict_2_5,predict_2_6,predict_2_7,predict_2_8,predict_3_1,predict_3_2,predict_3_3,predict_3_4,predict_3_5,predict_4_1,predict_5_1,predict_5_2,predict_6_1,predict_6_2,predict_6_3,predict_6_4,predict_6_5,predict_7_1,predict_7_2,predict_7_3,predict_8_1,predict_8_2,predict_8_3,predict_9_1,predict_9_2,predict_9_3,predict_9_4,predict_9_5,predict_9_6,predict_9_7,predict_9_8,predict_10_1
i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1750001,-7.332218,-6.299495,-4.842557,-4.867542,-7.386244,-5.00224,-5.326583,-7.864383,-4.692032,-7.274273,-6.165732,-8.546608,-11.841169,-2.648119,-4.623307,-8.403746,-9.816422,-11.20148,-4.581812,-6.05982,-6.522009,-6.884865,-6.287058,-5.093145,-7.024809,-10.478626,-3.111942,-4.656291,-4.933049,-6.552387,-3.815468,-3.447692,-5.830013,-2.456217,-3.769044,-10.820624,-7.215448,-0.349959,-2.275141,-7.344312,-0.532022
1750002,-5.523283,-5.269433,-3.395585,-4.150186,-6.464329,-3.740087,-4.25903,-7.781865,-4.495006,-5.935803,-5.901087,-9.068531,-14.59801,-2.930967,-5.440595,-7.50088,-9.971712,-12.677142,-6.74989,-5.411628,-6.621785,-6.04922,-5.350833,-6.151602,-6.578981,-10.23646,-3.16174,-4.61147,-6.624071,-4.075221,-2.680757,-5.368947,-6.762131,-1.179617,-3.92596,-5.864417,-4.375507,-0.843226,-2.131622,-10.783713,-1.18645
1750003,-5.483051,-5.294465,-4.271373,-3.462061,-4.67623,-5.38953,-4.804943,-6.744537,-5.774792,-5.832038,-5.746157,-12.028043,-12.178956,-2.170234,-4.286224,-7.456886,-8.729267,-12.570523,-4.167843,-5.67511,-8.990753,-5.129736,-6.55882,-6.850206,-5.615061,-10.869659,-6.033784,-3.687506,-6.029813,-3.478728,-3.560814,-4.581912,-4.981161,-3.541798,-4.38541,-10.208579,-8.46083,-1.382998,-3.28305,-7.538651,-0.237438


### Загрузка extra features

In [10]:
# Read the extra feature tables for the train and test sets
train_extra, test_extra = (
    pl.read_parquet(path)
    for path in (
        '../data/raw/train_extra_features.parquet',
        '../data/raw/test_extra_features.parquet',
    )
)

print(f'Train extra: {train_extra.shape}')
print(f'Test extra: {test_extra.shape}')

Train extra: (750000, 2242)
Test extra: (250000, 2242)


In [11]:
# Attach the extra features to the main features by customer_id
train_full = train_main.join(train_extra, on='customer_id', how='left')
test_full = test_main.join(test_extra, on='customer_id', how='left')

# `train_full` is later indexed with row positions computed on `train_main` and
# paired with labels by position, so the join must neither duplicate nor reorder
# its rows. For `test_full` only the row count matters: a duplicated key would
# put extra rows into the submission.
assert train_full.height == train_main.height, 'join changed the number of train rows'
assert (train_full['customer_id'] == train_main['customer_id']).all(), (
    'join changed the train row order'
)
assert test_full.height == test_main.height, 'join changed the number of test rows'

# Feature list for the joined table (everything except customer_id)
feature_columns_full = [col for col in train_full.columns if col != 'customer_id']

print(f'Train full: {train_full.shape}')
print(f'Test full: {test_full.shape}')
print(f'Всего признаков: {len(feature_columns_full)}')

Train full: (750000, 2441)
Test full: (250000, 2441)
Всего признаков: 2440


In [12]:
# Reuse the existing train_idx / val_idx so the validation rows are the same
# ones used for the main-only model
X_train_f, X_val_f = (
    train_full[rows].select(feature_columns_full).to_pandas()
    for rows in (train_idx, val_idx)
)

# Pools over the widened feature set; labels are the ones from the earlier split
train_pool_f = Pool(data=X_train_f, label=y_train, cat_features=cat_features)
val_pool_f = Pool(data=X_val_f, label=y_val, cat_features=cat_features)

# Same hyperparameters as the main-only model, so the scores are comparable
model_f = CatBoostClassifier(
    loss_function='MultiLogloss',
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    nan_mode='Min',
    early_stopping_rounds=100,
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=100,
)

model_f.fit(train_pool_f, eval_set=val_pool_f)



0:	learn: 0.4840313	test: 0.4840415	best: 0.4840415 (0)	total: 1.58s	remaining: 26m 21s
100:	learn: 0.0855444	test: 0.0861755	best: 0.0861755 (100)	total: 2m 37s	remaining: 23m 22s
200:	learn: 0.0831657	test: 0.0844912	best: 0.0844912 (200)	total: 5m 14s	remaining: 20m 50s
300:	learn: 0.0818619	test: 0.0837660	best: 0.0837660 (300)	total: 7m 49s	remaining: 18m 10s
400:	learn: 0.0810707	test: 0.0833768	best: 0.0833768 (400)	total: 10m 20s	remaining: 15m 26s
500:	learn: 0.0805574	test: 0.0831411	best: 0.0831411 (500)	total: 12m 46s	remaining: 12m 43s
600:	learn: 0.0801479	test: 0.0829953	best: 0.0829953 (600)	total: 15m 10s	remaining: 10m 4s
700:	learn: 0.0797823	test: 0.0828743	best: 0.0828743 (700)	total: 17m 35s	remaining: 7m 30s
800:	learn: 0.0794394	test: 0.0827588	best: 0.0827588 (800)	total: 20m	remaining: 4m 58s
900:	learn: 0.0791332	test: 0.0826763	best: 0.0826763 (900)	total: 22m 25s	remaining: 2m 27s
999:	learn: 0.0788326	test: 0.0825988	best: 0.0825988 (999)	total: 24m 48s	re

CatBoostClassifier(depth=6, devices='0', early_stopping_rounds=100, iterations=1000, learning_rate=0.1, loss_function='MultiLogloss', nan_mode='Min', random_seed=42, task_type='GPU', verbose=100)

### Локальный скор на extra фичах

In [15]:
# Score the main + extra model on its validation pool (raw formula values)
val_predict_f = model_f.predict(val_pool_f, prediction_type='RawFormulaVal')
val_score_f = roc_auc_score(y_val, val_predict_f, average='macro')

# Compare against `val_score`, the main-only validation score computed earlier
# in this notebook, rather than a copied literal that goes stale whenever the
# baseline is retrained. The `+` format flag prints the real sign of the
# difference, so a regression shows up as '-' instead of '+-'.
print(f'Local Val (main only):      {val_score:.6f}')
print(f'Local Val (main + extra):   {val_score_f:.6f}')
print(f'Прирост:                    {val_score_f - val_score:+.6f}')

Local Val (main only):      0.818632
Local Val (main + extra):   0.828181
Прирост:                    +0.009549


### Обучение на всех 750k с extra-фичами, предсказания и сохранение сабмита

In [16]:
# Pool over the complete joined training table.
# Features and labels are picked through `feature_columns_full` / `target_columns`,
# the same lists the validation run uses, so the label column order is exactly
# the order of `target_columns` (which the submission's predict_* names are derived from).
full_train_pool_f = Pool(
    train_full.select(feature_columns_full).to_pandas(),
    label=target.select(target_columns).to_pandas(),
    cat_features=cat_features
)

# No eval set here, hence no early stopping: train all 1000 iterations
full_model_f = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='MultiLogloss',
    nan_mode='Min',
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=100
)

full_model_f.fit(full_train_pool_f)



0:	learn: 0.4839985	total: 1.96s	remaining: 32m 33s
100:	learn: 0.0856071	total: 3m 16s	remaining: 29m 11s
200:	learn: 0.0833481	total: 6m 32s	remaining: 25m 59s
300:	learn: 0.0820430	total: 9m 47s	remaining: 22m 44s
400:	learn: 0.0812556	total: 12m 58s	remaining: 19m 23s
500:	learn: 0.0806441	total: 16m 5s	remaining: 16m 1s
600:	learn: 0.0802139	total: 19m 8s	remaining: 12m 42s
700:	learn: 0.0798810	total: 22m 10s	remaining: 9m 27s
800:	learn: 0.0796250	total: 25m 8s	remaining: 6m 14s
900:	learn: 0.0793331	total: 28m 9s	remaining: 3m 5s
999:	learn: 0.0790779	total: 31m 6s	remaining: 0us


CatBoostClassifier(depth=6, devices='0', iterations=1000, learning_rate=0.1, loss_function='MultiLogloss', nan_mode='Min', random_seed=42, task_type='GPU', verbose=100)

In [18]:
# Joined test features without the customer_id key, wrapped in a CatBoost pool
test_pool_f = Pool(
    data=test_full.drop('customer_id').to_pandas(),
    cat_features=cat_features,
)

# Raw formula values for every test row
test_predict_f = full_model_f.predict(test_pool_f, prediction_type='RawFormulaVal')

# Rebuild the prediction column names from the target names
predict_columns = [name.replace('target_', 'predict_') for name in target_columns]

# Submission layout: customer_id first, then one column per prediction
submit_f = test_full.select('customer_id').hstack(
    pl.DataFrame(test_predict_f, schema=predict_columns)
)

# Write the submission file
submit_f.write_parquet('../submissions/exp003_catboost_extra_features.parquet')

print(f'Сабмит: {submit_f.shape}')

Сабмит: (250000, 42)
