In [1]:
import polars as pl
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score





In [2]:
df = pl.read_csv('/home/zhenghao/kaggle/train.csv')  # 替换为你的数据路径
df = df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(pl.Int32),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(pl.Int32),
    pl.col('Policy_Sales_Channel').cast(pl.Int32)
])

In [3]:
df_pandas = df.to_pandas()

In [4]:
X = df_pandas.drop('Response', axis=1)  # 替换'target'为你的目标列名
y = df_pandas['Response']  # 替换'target'为你的目标列名


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
def objective(trial):
    param = {
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-4, 1e1, log=True),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 10),
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": 0,
        "task_type": "GPU"  # 使用GPU进行加速
    }

    model = CatBoostClassifier(**param)
    model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100, verbose=False)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

# 使用Optuna进行超参数搜索
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# 输出最优超参数
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2024-07-11 19:50:38,931] A new study created in memory with name: no-name-6adc023f-4c3b-429f-9b8e-70773daca2f8
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-07-11 19:50:48,615] Trial 0 finished with value: 0.8769278909672484 and parameters: {'iterations': 213, 'depth': 7, 'learning_rate': 0.004041690330476372, 'l2_leaf_reg': 0.7906166088756248, 'border_count': 43, 'bagging_temperature': 0.595490226437705, 'random_strength': 1.4233544723388736, 'one_hot_max_size': 3}. Best is trial 0 with value: 0.8769278909672484.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-07-11 19:51:17,799] Trial 1 finished with value: 0.878521139002851 and parameters: {'iterations': 992, 'depth': 7, 'learning_rate': 0.05786747820025674, 'l2_leaf_reg': 0.19239530025397358, 'border_count': 100, 'bagging_temperature': 0.5246168868195776, 'random_strength': 9.637136513457401, 'one_hot_max_size': 7}. Best is trial 1 with value: 0.878521139002851.
Defau

Best trial:
  Value: 0.8807667234545581
  Params: 
    iterations: 959
    depth: 10
    learning_rate: 0.08721525726369066
    l2_leaf_reg: 9.655749947475009
    border_count: 236
    bagging_temperature: 0.37414911758345976
    random_strength: 1.2986868492796555
    one_hot_max_size: 5


In [8]:
best_params = study.best_params
final_model = CatBoostClassifier(**best_params)
final_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=100)



0:	learn: 0.5363170	test: 0.5361937	best: 0.5361937 (0)	total: 482ms	remaining: 7m 41s
1:	learn: 0.4401349	test: 0.4400047	best: 0.4400047 (1)	total: 935ms	remaining: 7m 27s
2:	learn: 0.3833957	test: 0.3832568	best: 0.3832568 (2)	total: 1.38s	remaining: 7m 18s
3:	learn: 0.3475033	test: 0.3473876	best: 0.3473876 (3)	total: 1.79s	remaining: 7m 8s
4:	learn: 0.3247883	test: 0.3246848	best: 0.3246848 (4)	total: 2.21s	remaining: 7m 2s
5:	learn: 0.3099776	test: 0.3098950	best: 0.3098950 (5)	total: 2.61s	remaining: 6m 54s
6:	learn: 0.2996358	test: 0.2995747	best: 0.2995747 (6)	total: 3.02s	remaining: 6m 51s
7:	learn: 0.2919192	test: 0.2918734	best: 0.2918734 (7)	total: 3.46s	remaining: 6m 51s
8:	learn: 0.2863526	test: 0.2863239	best: 0.2863239 (8)	total: 3.87s	remaining: 6m 48s
9:	learn: 0.2818731	test: 0.2818605	best: 0.2818605 (9)	total: 4.31s	remaining: 6m 49s
10:	learn: 0.2784710	test: 0.2784610	best: 0.2784610 (10)	total: 4.73s	remaining: 6m 47s
11:	learn: 0.2757601	test: 0.2757666	best: 

<catboost.core.CatBoostClassifier at 0x7fea07dc2270>

In [9]:
# 预测和评估
final_preds = final_model.predict(X_test)
final_accuracy = accuracy_score(y_test, final_preds)
print(f"Final model accuracy: {final_accuracy}")

Final model accuracy: 0.8807349975662332


In [10]:
test_df = pl.read_csv('/home/zhenghao/kaggle/test.csv')  # 替换为你的test.csv文件路径
test_df = test_df.with_columns([
    pl.col('Gender').replace({'Male': 0, 'Female': 1}).cast(pl.Int32),
    pl.col('Region_Code').cast(pl.Int32),
    pl.col('Vehicle_Age').replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).cast(pl.Int32),
    pl.col('Vehicle_Damage').replace({'No': 0, 'Yes': 1}).cast(pl.Int32),
    pl.col('Annual_Premium').cast(pl.Int32),
    pl.col('Policy_Sales_Channel').cast(pl.Int32)
])
test_df_pandas = test_df.to_pandas()
test_preds = final_model.predict(test_df_pandas)

submission = pd.DataFrame({'id': test_df_pandas['id'], 'target': test_preds})  # 替换'id'和'target'为你的实际列名
submission.to_csv('sub.csv', index=False)