In [1]:

from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides library deprecation notices and pandas
# chained-assignment warnings raised by later cells.
warnings.simplefilter('ignore')

In [2]:
# Read the raw train and test tables into polars DataFrames.
train = pl.read_csv('/home/zhenghao/kaggle/train.csv')
test = pl.read_csv('/home/zhenghao/kaggle/test.csv')

# Add a 'Response' column to the test set, filled with 0 and typed Int64,
# so that it has a 'Response' column just like the train set.
test = test.with_columns(Response=pl.lit(0).cast(pl.Int64))

In [3]:
# Stack train on top of test so every encoding below is applied to both
# tables in exactly the same way.
df = pl.concat([train, test])

# Integer codes for the string-valued columns.
gender_codes = {'Male': 0, 'Female': 1}
vehicle_age_codes = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
vehicle_damage_codes = {'No': 0, 'Yes': 1}

df = df.with_columns(
    pl.col('Gender').replace(gender_codes).cast(pl.Int32),
    pl.col('Vehicle_Age').replace(vehicle_age_codes).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace(vehicle_damage_codes).cast(pl.Int32),
    # These columns are cast to integers.
    *[
        pl.col(name).cast(int)
        for name in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel')
    ],
)

# Interaction features: concatenate 'Previously_Insured' with a partner column
# as text, then label-encode the combined string with pd.factorize (codes are
# assigned in order of first appearance).
insured_text = df['Previously_Insured'].cast(str)
interaction_columns = []
for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
    combined = (insured_text + df[partner].cast(str)).to_numpy()
    codes, _ = pd.factorize(combined)
    interaction_columns.append(
        pl.Series(codes).alias(f'Previously_Insured_{partner}')
    )
df = df.with_columns(interaction_columns)

# Split the stacked frame back into its train / test parts as pandas frames.
# The train row count is captured once, before `train` is rebound.
n_train_rows = train.shape[0]
train = df[:n_train_rows].to_pandas()
test = df[n_train_rows:].to_pandas()

train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Previously_Insured_Annual_Premium,Previously_Insured_Vehicle_Age,Previously_Insured_Vehicle_Damage,Previously_Insured_Vintage
0,0,0,21,1,35,0,1,1,65101,124,187,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1,1,1,0,1
2,2,1,25,1,14,1,0,0,38043,152,254,0,2,2,1,2
3,3,1,35,1,1,0,1,1,2630,156,76,0,3,0,0,3
4,4,1,36,1,15,1,1,0,31951,152,294,0,4,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,11504793,0,48,1,6,0,1,1,27412,26,218,0,5210,0,0,144
11504794,11504794,1,26,1,36,0,0,1,29509,152,115,1,23274,4,0,176
11504795,11504795,1,29,1,32,1,0,0,2630,152,189,0,18,2,1,456
11504796,11504796,1,51,1,28,0,1,1,48443,26,274,1,14121,0,0,124


In [4]:
## train model
# 5-fold stratified cross-validation of a CatBoost classifier. Each fold
# records its validation ROC AUC in `aucs` and its class-1 probabilities for
# the test set in `preds` (averaged later to form the submission).
aucs = []
preds = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Every column except the row id and the target is a model input. The list and
# its positional lookups are fold-independent, so compute them once.
feature_cols = [c for c in train.columns if c not in ['id', 'Response']]
feature_pos = train.columns.get_indexer(feature_cols)
target_pos = train.columns.get_loc('Response')

# The test features do not change between folds either: build the frame and
# its Pool a single time instead of once per fold.
X_test = test[feature_cols]
X_test_pool = Pool(X_test, cat_features=feature_cols)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    # StratifiedKFold yields positional row indices, so select with .iloc;
    # label-based .loc is only equivalent when the index is a default
    # RangeIndex.
    X_train = train.iloc[train_idx, feature_pos]
    y_train = train.iloc[train_idx, target_pos]
    X_valid = train.iloc[valid_idx, feature_pos]
    y_valid = train.iloc[valid_idx, target_pos]

    # All feature columns are declared categorical to CatBoost.
    X_train_pool = Pool(X_train, y_train, cat_features=feature_cols)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=feature_cols)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='Logloss',
        learning_rate=0.084708066913,
        iterations=4600,
        depth=8,
        random_strength=0.064286863309,
        l2_leaf_reg=0.7732321956643,
        task_type='GPU',
        devices='1:2:3:4:5:6',
        random_seed=42,
        verbose=False
    )

    # Early stopping monitors the eval metric (Logloss) on the validation
    # pool; progress is logged every 1000 iterations.
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)

    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

# Mean and standard deviation of the per-fold validation AUCs.
print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')

### Fold 1 Training ###
0:	learn: 0.5508404	test: 0.5508754	best: 0.5508754 (0)	total: 539ms	remaining: 41m 21s
1000:	learn: 0.2407191	test: 0.2403677	best: 0.2403677 (1000)	total: 8m 44s	remaining: 31m 26s
2000:	learn: 0.2389666	test: 0.2401627	best: 0.2401627 (2000)	total: 17m 17s	remaining: 22m 27s
3000:	learn: 0.2374210	test: 0.2400902	best: 0.2400901 (2999)	total: 25m 51s	remaining: 13m 46s
bestTest = 0.240081504
bestIteration = 3387
Shrink model to first 3388 iterations.
Fold 1 AUC: 0.89518

### Fold 2 Training ###
0:	learn: 0.5542217	test: 0.5541692	best: 0.5541692 (0)	total: 511ms	remaining: 39m 9s
1000:	learn: 0.2407050	test: 0.2407709	best: 0.2407707 (999)	total: 8m 35s	remaining: 30m 55s
2000:	learn: 0.2389363	test: 0.2405415	best: 0.2405415 (2000)	total: 17m 6s	remaining: 22m 13s
3000:	learn: 0.2373972	test: 0.2404866	best: 0.2404862 (2999)	total: 25m 35s	remaining: 13m 38s
bestTest = 0.2404792728
bestIteration = 3403
Shrink model to first 3404 iterations.
Fold 2 AUC: 0.894

In [6]:
# Build the submission: one row per test id, with 'Response' set to the mean
# of the per-fold test probabilities collected in `preds`.
if len(preds) == 0:
    # np.mean of an empty list is NaN, which would silently write an all-NaN
    # submission file; fail loudly instead.
    raise RuntimeError('preds is empty: run the training cell before building the submission')

# .assign returns a new frame, avoiding assignment into a frame derived from
# `test` (the chained-assignment pattern pandas warns about).
submission = test[['id']].assign(Response=np.mean(preds, axis=0))
submission.to_csv('submission_5_autoparam.csv', index=False)
submission

Unnamed: 0,id,Response
0,11504798,0.004967
1,11504799,0.665279
2,11504800,0.243507
3,11504801,0.000087
4,11504802,0.221603
...,...,...
7669861,19174659,0.191297
7669862,19174660,0.000171
7669863,19174661,0.000503
7669864,19174662,0.580391
