In [1]:
import polars as pl
import numpy as np
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Read the raw parquet inputs: main and extra feature tables for both splits,
# plus the training targets.
target = pl.read_parquet('../data/raw/train_target.parquet')
train_main = pl.read_parquet('../data/raw/train_main_features.parquet')
train_extra = pl.read_parquet('../data/raw/train_extra_features.parquet')
test_main = pl.read_parquet('../data/raw/test_main_features.parquet')
test_extra = pl.read_parquet('../data/raw/test_extra_features.parquet')

# Column groups are picked by name prefix.
cat_features = [name for name in train_main.columns if name.startswith('cat_feature')]
target_columns = [name for name in target.columns if name.startswith('target')]

# Cast the categorical columns to Int32; one expression is reused for both splits.
cat_as_int32 = pl.col(cat_features).cast(pl.Int32)
train_main = train_main.with_columns(cat_as_int32)
test_main = test_main.with_columns(cat_as_int32)

# Attach the extra features to the main ones (left join keyed on customer_id).
join_kwargs = dict(on='customer_id', how='left')
train_full = train_main.join(train_extra, **join_kwargs)
test_full = test_main.join(test_extra, **join_kwargs)

print(f'Train: {train_full.shape}, Test: {test_full.shape}')

Train: (750000, 2441), Test: (250000, 2441)


In [2]:
# Numeric features: every column except the key, the categorical columns and
# the row-level aggregates this notebook derives from the numeric columns.
# Excluding the derived names keeps this cell idempotent: without it, a re-run
# would treat 'null_count' (and 'num_mean' / 'num_std', once they exist) as
# raw inputs and feed them back into the aggregates.
derived_columns = {'null_count', 'num_mean', 'num_std'}
num_features = [
    col for col in train_full.columns
    if col != 'customer_id'
    and not col.startswith('cat_feature')
    and col not in derived_columns
]

# Per-row number of missing numeric values. The expression is built once and
# applied to both splits so train and test cannot drift apart.
null_count_expr = pl.sum_horizontal(
    [pl.col(c).is_null().cast(pl.Int32) for c in num_features]
).alias('null_count')

train_full = train_full.with_columns(null_count_expr)
test_full = test_full.with_columns(null_count_expr)

print(f'Новый признак: null_count')
print(train_full.select('null_count').describe())

Новый признак: null_count
shape: (9, 2)
┌────────────┬─────────────┐
│ statistic  ┆ null_count  │
│ ---        ┆ ---         │
│ str        ┆ f64         │
╞════════════╪═════════════╡
│ count      ┆ 750000.0    │
│ null_count ┆ 0.0         │
│ mean       ┆ 1370.376988 │
│ std        ┆ 401.208412  │
│ min        ┆ 464.0       │
│ 25%        ┆ 1052.0      │
│ 50%        ┆ 1304.0      │
│ 75%        ┆ 1633.0      │
│ max        ┆ 2372.0      │
└────────────┴─────────────┘


In [3]:
# Row-wise mean and standard deviation across the numeric feature columns.
# NOTE(review): the recorded describe() output shows a few null num_std values;
# presumably rows with fewer than two non-null numeric inputs — confirm.
row_stats = [
    pl.mean_horizontal([pl.col(name) for name in num_features]).alias('num_mean'),
    (
        pl.concat_list([pl.col(name) for name in num_features])
        .list.eval(pl.element().std())
        .list.first()
        .alias('num_std')
    ),
]

# The same expressions are applied to both splits.
train_full = train_full.with_columns(row_stats)
test_full = test_full.with_columns(row_stats)

print('Новые признаки: num_mean, num_std')
print(train_full.select(['num_mean', 'num_std']).describe())

Новые признаки: num_mean, num_std
shape: (9, 3)
┌────────────┬───────────┬────────────┐
│ statistic  ┆ num_mean  ┆ num_std    │
│ ---        ┆ ---       ┆ ---        │
│ str        ┆ f64       ┆ f64        │
╞════════════╪═══════════╪════════════╡
│ count      ┆ 750000.0  ┆ 749959.0   │
│ null_count ┆ 0.0       ┆ 41.0       │
│ mean       ┆ -0.01191  ┆ 0.646844   │
│ std        ┆ 0.115554  ┆ 0.671791   │
│ min        ┆ -1.099871 ┆ 0.048756   │
│ 25%        ┆ -0.063703 ┆ 0.357135   │
│ 50%        ┆ -0.030433 ┆ 0.517561   │
│ 75%        ┆ 0.018041  ┆ 0.744472   │
│ max        ┆ 33.753505 ┆ 166.163467 │
└────────────┴───────────┴────────────┘


In [4]:
# Frequency encoding of the categorical features: for each categorical column,
# add '<col>_freq' = share of train rows carrying that category value.
# Frequencies are computed on train only and then mapped onto both splits.
n_train = len(train_full)
n_test = len(test_full)

for col in cat_features:
    freq_name = f'{col}_freq'

    # Idempotence guard: if this cell is re-run, the column already exists and
    # the join below would add a suffixed duplicate instead of replacing it.
    if freq_name in train_full.columns:
        train_full = train_full.drop(freq_name)
    if freq_name in test_full.columns:
        test_full = test_full.drop(freq_name)

    # Category frequencies on train.
    freq = train_full.group_by(col).agg(pl.len().alias('_cnt'))
    freq = freq.with_columns((pl.col('_cnt') / n_train).alias(freq_name)).drop('_cnt')

    # Categories present in test but absent from train get a null frequency
    # (no matching row in `freq`).
    train_full = train_full.join(freq, on=col, how='left')
    test_full = test_full.join(freq, on=col, how='left')

# `freq` has one row per category, so the left joins must not change row counts.
assert len(train_full) == n_train, 'frequency join changed the number of train rows'
assert len(test_full) == n_test, 'frequency join changed the number of test rows'

print(f'Добавлено {len(cat_features)} частотных признаков')
print(f'Train: {train_full.shape}, Test: {test_full.shape}')

Добавлено 67 частотных признаков
Train: (750000, 2511), Test: (250000, 2511)


In [5]:
# All model inputs: every column except the customer_id key.
feature_columns_fe = [col for col in train_full.columns if col != 'customer_id']

# Labels are taken from `target` by ROW POSITION below, so `target` must be
# row-aligned with `train_full`. `train_full` has been through many joins by
# this point; fail loudly here rather than silently train on shuffled labels.
assert len(target) == len(train_full), 'target and train_full differ in length'
if 'customer_id' in target.columns:
    assert np.array_equal(
        train_full['customer_id'].to_numpy(),
        target['customer_id'].to_numpy(),
    ), 'target rows are not in the same customer_id order as train_full'

# Split (same indices as before: fixed random_state).
train_idx, val_idx = train_test_split(np.arange(len(train_full)), test_size=0.2, random_state=42)

X_train = train_full[train_idx].select(feature_columns_fe).to_pandas()
X_val = train_full[val_idx].select(feature_columns_fe).to_pandas()
y_train = target[train_idx].select(target_columns).to_pandas()
y_val = target[val_idx].select(target_columns).to_pandas()

# Pools: the cat_features list is unchanged, all engineered features are numeric.
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(X_val, label=y_val, cat_features=cat_features)

# Model: same parameters as EXP-004.
model_fe = CatBoostClassifier(
    iterations=3000,
    depth=6,
    learning_rate=0.05,
    loss_function='MultiLogloss',
    nan_mode='Min',
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=100,
    early_stopping_rounds=100
)

model_fe.fit(train_pool, eval_set=val_pool)



0:	learn: 0.5796405	test: 0.5796362	best: 0.5796362 (0)	total: 1.62s	remaining: 1h 21m 4s
100:	learn: 0.0879936	test: 0.0882682	best: 0.0882682 (100)	total: 2m 40s	remaining: 1h 16m 46s
200:	learn: 0.0855703	test: 0.0861986	best: 0.0861986 (200)	total: 5m 19s	remaining: 1h 14m 15s
300:	learn: 0.0841534	test: 0.0851357	best: 0.0851357 (300)	total: 8m 1s	remaining: 1h 11m 53s
400:	learn: 0.0831748	test: 0.0844931	best: 0.0844931 (400)	total: 10m 40s	remaining: 1h 9m 14s
500:	learn: 0.0823917	test: 0.0840259	best: 0.0840259 (500)	total: 13m 20s	remaining: 1h 6m 34s
600:	learn: 0.0818170	test: 0.0837142	best: 0.0837142 (600)	total: 15m 57s	remaining: 1h 3m 40s
700:	learn: 0.0813630	test: 0.0834918	best: 0.0834918 (700)	total: 18m 30s	remaining: 1h 42s
800:	learn: 0.0810096	test: 0.0833228	best: 0.0833228 (800)	total: 21m 1s	remaining: 57m 42s
900:	learn: 0.0807030	test: 0.0831785	best: 0.0831785 (900)	total: 23m 30s	remaining: 54m 45s
1000:	learn: 0.0804554	test: 0.0830738	best: 0.0830738 

CatBoostClassifier(depth=6, devices='0', early_stopping_rounds=100, iterations=3000, learning_rate=0.05, loss_function='MultiLogloss', nan_mode='Min', random_seed=42, task_type='GPU', verbose=100)

In [None]:
# Local validation score of the baseline run (EXP-004, no feature engineering).
# Kept in one place so the printed value and the computed delta cannot diverge.
BASELINE_VAL_AUC = 0.831442

# Raw (pre-sigmoid) scores are enough here: ROC AUC depends only on the ranking.
val_predict_fe = model_fe.predict(val_pool, prediction_type='RawFormulaVal')
# Macro average: unweighted mean of the per-target AUCs.
val_score_fe = roc_auc_score(y_val, val_predict_fe, average='macro')

print(f'Local Val EXP-004 (без FE):  {BASELINE_VAL_AUC:.6f}')
print(f'Local Val EXP-005 (с FE):    {val_score_fe:.6f}')
print(f'Прирост:                     {val_score_fe - BASELINE_VAL_AUC:+.6f}')
# NOTE(review): the recorded run reports best iteration 2999 with iterations=3000,
# i.e. early stopping never triggered and the model may still have been improving;
# confirm EXP-004 was trained under the same budget before reading the delta as final.
print(f'Best iteration: {model_fe.get_best_iteration()}')

Local Val EXP-004 (без FE):  0.831442
Local Val EXP-005 (с FE):    0.831116
Прирост:                     -0.000326
Best iteration: 2999


: 