In [1]:
from catboost import CatBoostClassifier, Pool
from hyperopt import hp, tpe, fmin, Trials
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

In [2]:
train = pl.read_csv('C:/Users/16010/Desktop/Deep learning from Scratch/kaggle-S4E7/data/train.csv')
test = pl.read_csv('C:/Users/16010/Desktop/Deep learning from Scratch/kaggle-S4E7/data/test.csv')

# 在测试集中添加 'Response' 列，值为 0，类型为 Int64
test = test.with_columns(pl.lit(0).cast(pl.Int64).alias('Response'))

In [3]:
df = pl.concat([train, test])

df = df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(int),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(int),
    pl.col('Policy_Sales_Channel').cast(int)
])

df = df.with_columns([
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Annual_Premium'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Annual_Premium'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Age'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Age'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vehicle_Damage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vehicle_Damage'),
    (pl.Series(pd.factorize((df['Previously_Insured'].cast(str) + df['Vintage'].cast(str)).to_numpy())[0])).alias('Previously_Insured_Vintage')
])

train = df[:train.shape[0]].to_pandas()
test = df[train.shape[0]:].to_pandas()

train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Previously_Insured_Annual_Premium,Previously_Insured_Vehicle_Age,Previously_Insured_Vehicle_Damage,Previously_Insured_Vintage
0,0,0,21,1,35,0,1,1,65101,124,187,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1,1,1,0,1
2,2,1,25,1,14,1,0,0,38043,152,254,0,2,2,1,2
3,3,1,35,1,1,0,1,1,2630,156,76,0,3,0,0,3
4,4,1,36,1,15,1,1,0,31951,152,294,0,4,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,11504793,0,48,1,6,0,1,1,27412,26,218,0,5210,0,0,144
11504794,11504794,1,26,1,36,0,0,1,29509,152,115,1,23274,4,0,176
11504795,11504795,1,29,1,32,1,0,0,2630,152,189,0,18,2,1,456
11504796,11504796,1,51,1,28,0,1,1,48443,26,274,1,14121,0,0,124


In [4]:
'''使用 polars 库读取 CSV 数据。
合并训练集和测试集以便统一预处理。
对类别特征进行编码，如性别、地区代码、车辆年龄、车辆损坏情况等。
创建一些组合特征以增加模型的表达能力。
将预处理后的数据分割回训练集和测试集，并转换为 Pandas 数据框。
Previously_Insured 和 Annual_Premium 的组合

组合方式：将 Previously_Insured 和 Annual_Premium 转换为字符串形式后相加。
编码方式：使用 pd.factorize 对组合后的字符串进行编码。
生成新特征 Previously_Insured_Annual_Premium。
Previously_Insured 和 Vehicle_Age 的组合

组合方式：将 Previously_Insured 和 Vehicle_Age 转换为字符串形式后相加。
编码方式：使用 pd.factorize 对组合后的字符串进行编码。
生成新特征 Previously_Insured_Vehicle_Age。
Previously_Insured 和 Vehicle_Damage 的组合

组合方式：将 Previously_Insured 和 Vehicle_Damage 转换为字符串形式后相加。
编码方式：使用 pd.factorize 对组合后的字符串进行编码。
生成新特征 Previously_Insured_Vehicle_Damage。
Previously_Insured 和 Vintage 的组合

组合方式：将 Previously_Insured 和 Vintage 转换为字符串形式后相加。
编码方式：使用 pd.factorize 对组合后的字符串进行编码。
生成新特征 Previously_Insured_Vintage。
通过这种方式生成的组合特征，能够捕捉到原始特征之间的相互关系，提高模型的表达能力，从而提升模型的预测性能。'''

'使用 polars 库读取 CSV 数据。\n合并训练集和测试集以便统一预处理。\n对类别特征进行编码，如性别、地区代码、车辆年龄、车辆损坏情况等。\n创建一些组合特征以增加模型的表达能力。\n将预处理后的数据分割回训练集和测试集，并转换为 Pandas 数据框。\nPreviously_Insured 和 Annual_Premium 的组合\n\n组合方式：将 Previously_Insured 和 Annual_Premium 转换为字符串形式后相加。\n编码方式：使用 pd.factorize 对组合后的字符串进行编码。\n生成新特征 Previously_Insured_Annual_Premium。\nPreviously_Insured 和 Vehicle_Age 的组合\n\n组合方式：将 Previously_Insured 和 Vehicle_Age 转换为字符串形式后相加。\n编码方式：使用 pd.factorize 对组合后的字符串进行编码。\n生成新特征 Previously_Insured_Vehicle_Age。\nPreviously_Insured 和 Vehicle_Damage 的组合\n\n组合方式：将 Previously_Insured 和 Vehicle_Damage 转换为字符串形式后相加。\n编码方式：使用 pd.factorize 对组合后的字符串进行编码。\n生成新特征 Previously_Insured_Vehicle_Damage。\nPreviously_Insured 和 Vintage 的组合\n\n组合方式：将 Previously_Insured 和 Vintage 转换为字符串形式后相加。\n编码方式：使用 pd.factorize 对组合后的字符串进行编码。\n生成新特征 Previously_Insured_Vintage。\n通过这种方式生成的组合特征，能够捕捉到原始特征之间的相互关系，提高模型的表达能力，从而提升模型的预测性能。'

In [5]:
'''aucs = []
preds = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    print(f'### Fold {fold+1} Training ###')

    X_train = train.loc[train_idx, [c for c in train.columns if c not in ['id', 'Response']]]
    y_train = train.loc[train_idx, 'Response']
    X_valid = train.loc[valid_idx, X_train.columns]
    y_valid = train.loc[valid_idx, 'Response']
    X_test = test[X_train.columns]

    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)
    X_test_pool = Pool(X_test, cat_features=X_test.columns.values)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=0.05,
        iterations=5000,
        depth=9,
        random_strength=0,
        l2_leaf_reg=0.5,
        task_type='GPU',
        devices='1:2:3:4:5:6',
        random_seed=42,
        verbose=False
    )

    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=200)

    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    auc = roc_auc_score(y_valid, pred_valid)
    aucs.append(auc)

    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

print(f'\nOverall AUC: {np.mean(aucs):.5f} +/- {np.std(aucs):.5f}')'''

"aucs = []\npreds = []\n\nskf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n\nfor fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):\n    print(f'### Fold {fold+1} Training ###')\n\n    X_train = train.loc[train_idx, [c for c in train.columns if c not in ['id', 'Response']]]\n    y_train = train.loc[train_idx, 'Response']\n    X_valid = train.loc[valid_idx, X_train.columns]\n    y_valid = train.loc[valid_idx, 'Response']\n    X_test = test[X_train.columns]\n\n    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)\n    X_valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)\n    X_test_pool = Pool(X_test, cat_features=X_test.columns.values)\n\n    model = CatBoostClassifier(\n        loss_function='Logloss',\n        eval_metric='AUC',\n        learning_rate=0.05,\n        iterations=5000,\n        depth=9,\n        random_strength=0,\n        l2_leaf_reg=0.5,\n        task_type='GPU',\n        devic

In [6]:
'''使用 StratifiedKFold 进行5折交叉验证，以确保各折中类别比例相同。
在每一折中，分割训练和验证数据，并创建 CatBoost 数据池。
定义 CatBoostClassifier 模型，并设置参数如损失函数、评估指标、学习率、迭代次数、树深度等。
训练模型，并在验证集上评估模型性能（AUC）。
记录每折的预测结果和AUC值。'''

'使用 StratifiedKFold 进行5折交叉验证，以确保各折中类别比例相同。\n在每一折中，分割训练和验证数据，并创建 CatBoost 数据池。\n定义 CatBoostClassifier 模型，并设置参数如损失函数、评估指标、学习率、迭代次数、树深度等。\n训练模型，并在验证集上评估模型性能（AUC）。\n记录每折的预测结果和AUC值。'

In [7]:
X = train.drop(['id', 'Response'], axis=1)
y = train['Response']
X_test = test.drop(['id', 'Response'], axis=1)

In [8]:
# 定义超参数优化目标函数
def hyperopt_objective(params):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    
    train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)

    model = CatBoostClassifier(
        loss_function='Logloss',
        eval_metric='AUC',
        learning_rate=params['learning_rate'],
        iterations=int(params['iterations']),
        depth=int(params['depth']),
        l2_leaf_reg=params['l2_leaf_reg'],
        random_strength=params['random_strength'],
        task_type='GPU',
        devices='0',
        random_seed=42,
        verbose=False
    )

    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=200, verbose=False)
    y_pred = model.predict_proba(valid_pool)[:, 1]
    auc = roc_auc_score(y_valid, y_pred)

    return -auc

# 定义参数空间
param_space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
    'iterations': hp.quniform('iterations', 4000, 6000, 200),
    'depth': hp.quniform('depth', 8, 11, 1),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 0.1, 1),
    'random_strength': hp.uniform('random_strength', 0, 1)
}


In [9]:
# 使用 Hyperopt 进行超参数优化
trials = Trials()
best = fmin(
    fn=hyperopt_objective,
    space=param_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
    rstate=np.random.default_rng(42)
)

# 转换超参数为整数
best['iterations'] = int(best['iterations'])
best['depth'] = int(best['depth'])
print("Best parameters found: ", best)

  0%|                                                                                                                                              | 0/20 [00:00<?, ?trial/s, best loss=?]

Default metric period is 5 because AUC
 is/are not implemented for GPU



  5%|█████▌                                                                                                         | 1/20 [21:48<6:54:16, 1308.24s/trial, best loss: -0.8950924446423459]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 10%|███████████                                                                                                    | 2/20 [45:37<6:53:45, 1379.19s/trial, best loss: -0.8951119947355725]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 15%|████████████████▎                                                                                            | 3/20 [1:32:21<9:35:12, 2030.16s/trial, best loss: -0.8951509122554926]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 20%|█████████████████████▊                                                                                       | 4/20 [2:22:02<10:41:28, 2405.56s/trial, best loss: -0.895310614255763]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 25%|███████████████████████████▎                                                                                 | 5/20 [3:06:55<10:27:17, 2509.20s/trial, best loss: -0.895310614255763]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 30%|████████████████████████████████▋                                                                            | 6/20 [3:55:18<10:16:39, 2642.83s/trial, best loss: -0.895310614255763]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 35%|██████████████████████████████████████▌                                                                       | 7/20 [4:08:31<7:21:36, 2038.18s/trial, best loss: -0.895310614255763]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 40%|███████████████████████████████████████████▌                                                                 | 8/20 [4:40:04<6:38:25, 1992.11s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 45%|█████████████████████████████████████████████████                                                            | 9/20 [4:53:00<4:55:30, 1611.88s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 50%|██████████████████████████████████████████████████████                                                      | 10/20 [5:19:30<4:27:31, 1605.13s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 55%|███████████████████████████████████████████████████████████▍                                                | 11/20 [6:30:10<6:01:44, 2411.61s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 60%|████████████████████████████████████████████████████████████████▊                                           | 12/20 [7:01:29<4:59:54, 2249.29s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 65%|██████████████████████████████████████████████████████████████████████▏                                     | 13/20 [7:23:01<3:48:36, 1959.43s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 70%|███████████████████████████████████████████████████████████████████████████▌                                | 14/20 [7:48:33<3:03:01, 1830.22s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 75%|█████████████████████████████████████████████████████████████████████████████████                           | 15/20 [8:14:24<2:25:31, 1746.20s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 80%|██████████████████████████████████████████████████████████████████████████████████████▍                     | 16/20 [8:24:46<1:33:51, 1407.88s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 85%|███████████████████████████████████████████████████████████████████████████████████████████▊                | 17/20 [8:47:12<1:09:27, 1389.10s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 90%|█████████████████████████████████████████████████████████████████████████████████████████████████▏          | 18/20 [9:41:18<1:04:54, 1947.13s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



 95%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌     | 19/20 [10:03:03<29:14, 1754.33s/trial, best loss: -0.8953288856657594]

Default metric period is 5 because AUC
 is/are not implemented for GPU



100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [10:30:22<00:00, 1891.12s/trial, best loss: -0.8953288856657594]
Best parameters found:  {'depth': 8, 'iterations': 4600, 'l2_leaf_reg': 0.7732321956643607, 'learning_rate': 0.08470806691352324, 'random_strength': 0.06428686330954925}


In [10]:
# 使用最佳参数训练最终模型并预测
train_pool = Pool(X, y, cat_features=X.columns.values)
test_pool = Pool(X_test, cat_features=X_test.columns.values)

model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=best['learning_rate'],
    iterations=best['iterations'],
    depth=best['depth'],
    random_strength=best['random_strength'],
    l2_leaf_reg=best['l2_leaf_reg'],
    task_type='GPU',
    devices='0',
    random_seed=42,
    verbose=1000
)

model.fit(train_pool, verbose=1000)


# 预测测试集
test_preds = model.predict_proba(test_pool)[:, 1]

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 725ms	remaining: 55m 32s
1000:	total: 10m 21s	remaining: 37m 13s
2000:	total: 20m 40s	remaining: 26m 51s
3000:	total: 31m 4s	remaining: 16m 33s
4000:	total: 41m 30s	remaining: 6m 12s
4599:	total: 47m 45s	remaining: 0us


In [11]:
submission = test[['id']]
submission['Response'] = test_preds
submission.to_csv('C:/Users/16010/Desktop/Deep learning from Scratch/kaggle-S4E7/submission/submission.csv', index=False)
submission

Unnamed: 0,id,Response
0,11504798,0.005448
1,11504799,0.669330
2,11504800,0.249240
3,11504801,0.000071
4,11504802,0.215197
...,...,...
7669861,19174659,0.187353
7669862,19174660,0.000130
7669863,19174661,0.000458
7669864,19174662,0.558409
