In [1]:
# Import Packages
from catboost import CatBoostClassifier, Pool
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler

In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
# Load Data
train = pd.read_csv('/kaggle/input/playground-series-s4e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e7/test.csv')

# Add 'Response' Column to Test Data
test['Response'] = 0

train.head(), test.head()

(   id  Gender  Age  Driving_License  Region_Code  Previously_Insured  \
 0   0    Male   21                1         35.0                   0   
 1   1    Male   43                1         28.0                   0   
 2   2  Female   25                1         14.0                   1   
 3   3  Female   35                1          1.0                   0   
 4   4  Female   36                1         15.0                   1   
 
   Vehicle_Age Vehicle_Damage  Annual_Premium  Policy_Sales_Channel  Vintage  \
 0    1-2 Year            Yes         65101.0                 124.0      187   
 1   > 2 Years            Yes         58911.0                  26.0      288   
 2    < 1 Year             No         38043.0                 152.0      254   
 3    1-2 Year            Yes          2630.0                 156.0       76   
 4    1-2 Year             No         31951.0                 152.0      294   
 
    Response  
 0         0  
 1         1  
 2         0  
 3         0  
 4 

In [4]:
# Prepare Data
def prepare_data(df):
    copied_df = df.copy()

    copied_df['Gender'] = copied_df['Gender'].replace({'Male': 0, 'Female': 1}).astype(int)
    copied_df['Region_Code'] = copied_df['Region_Code'].astype(int)
    copied_df['Vehicle_Age'] = copied_df['Vehicle_Age'].replace({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}).astype(int)
    copied_df['Vehicle_Damage'] = copied_df['Vehicle_Damage'].replace({'No': 0, 'Yes': 1}).astype(int)
    copied_df['Policy_Sales_Channel'] = copied_df['Policy_Sales_Channel'].astype(int)
    copied_df['Annual_Premium'] = copied_df['Annual_Premium'].astype(int)
    
    # Derived Variables (2024-07-30)
    copied_df['Previously_Insured_Annual_Premium'] = pd.factorize((copied_df['Previously_Insured'].astype(str) + copied_df['Annual_Premium'].astype(str)).to_numpy())[0]
    copied_df['Previously_Insured_Vehicle_Age'] = pd.factorize((copied_df['Previously_Insured'].astype(str) + copied_df['Vehicle_Age'].astype(str)).to_numpy())[0]
    copied_df['Previously_Insured_Vehicle_Damage'] = pd.factorize((copied_df['Previously_Insured'].astype(str) + copied_df['Vehicle_Damage'].astype(str)).to_numpy())[0]
    copied_df['Previously_Insured_Vintage'] = pd.factorize((copied_df['Previously_Insured'].astype(str) + copied_df['Vintage'].astype(str)).to_numpy())[0]

    return copied_df

In [5]:
# Prepare Train / Test Data
train = prepare_data(train)
test = prepare_data(test)

train

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Previously_Insured_Annual_Premium,Previously_Insured_Vehicle_Age,Previously_Insured_Vehicle_Damage,Previously_Insured_Vintage
0,0,0,21,1,35,0,1,1,65101,124,187,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1,1,1,0,1
2,2,1,25,1,14,1,0,0,38043,152,254,0,2,2,1,2
3,3,1,35,1,1,0,1,1,2630,156,76,0,3,0,0,3
4,4,1,36,1,15,1,1,0,31951,152,294,0,4,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11504793,11504793,0,48,1,6,0,1,1,27412,26,218,0,5210,0,0,144
11504794,11504794,1,26,1,36,0,0,1,29509,152,115,1,23274,4,0,176
11504795,11504795,1,29,1,32,1,0,0,2630,152,189,0,18,2,1,456
11504796,11504796,1,51,1,28,0,1,1,48443,26,274,1,14121,0,0,124


In [6]:
'''
# Normalization (2024-07-30)
scaler = MinMaxScaler()
train[['Age', 'Annual_Premium']] = scaler.fit_transform(train[['Age', 'Annual_Premium']])
test[['Age', 'Annual_Premium']] = scaler.transform(test[['Age', 'Annual_Premium']])

print(train[['Age', 'Annual_Premium']].head())
'''

"\n# Normalization (2024-07-30)\nscaler = MinMaxScaler()\ntrain[['Age', 'Annual_Premium']] = scaler.fit_transform(train[['Age', 'Annual_Premium']])\ntest[['Age', 'Annual_Premium']] = scaler.transform(test[['Age', 'Annual_Premium']])\n\nprint(train[['Age', 'Annual_Premium']].head())\n"

In [7]:
'''
# Ensure categorical variables are treated as strings (2024-07-30)
categorical_features = ['Gender', 'Region_Code', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel',
                        'Previously_Insured_Annual_Premium', 'Previously_Insured_Vehicle_Age',
                        'Previously_Insured_Vehicle_Damage', 'Previously_Insured_Vintage']

for feature in categorical_features:
    train[feature] = train[feature].astype(str)
    test[feature] = test[feature].astype(str)
'''

"\n# Ensure categorical variables are treated as strings (2024-07-30)\ncategorical_features = ['Gender', 'Region_Code', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel',\n                        'Previously_Insured_Annual_Premium', 'Previously_Insured_Vehicle_Age',\n                        'Previously_Insured_Vehicle_Damage', 'Previously_Insured_Vintage']\n\nfor feature in categorical_features:\n    train[feature] = train[feature].astype(str)\n    test[feature] = test[feature].astype(str)\n"

In [8]:
# HYPER PARAMETERS

# KFold
n_splits = 5

# CatBoost
loss_function = 'Logloss'
eval_metric = 'AUC'
learning_rate = 0.075
iterations = 1000
depth = 5
random_strength = 0
l2_leaf_reg = 0.45
random_seed = 42
verbose = False
bagging_temperature = 0.5

# Model Fitting
early_stopping_rounds = 200

# GPU or CPU
task_type = 'GPU'

In [9]:
# Train Model
preds = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print("Start Training...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    X_train = train.loc[train_idx, [c for c in train.columns if c not in ['id', 'Response']]]
    y_train = train.loc[train_idx, 'Response']
    X_valid = train.loc[valid_idx, X_train.columns]
    y_valid = train.loc[valid_idx, 'Response']

    X_train_pool = Pool(X_train, y_train, cat_features=X_train.columns.values)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=X_valid.columns.values)

    model = CatBoostClassifier(
        loss_function=loss_function,                 # 기존: Logloss
        eval_metric=eval_metric,                     # 기존: AUC
        learning_rate=learning_rate,                 # 기존: 0.05
        iterations=iterations,                       # 기존: 5000
        depth=depth,                                 # 기존: 9
        random_strength=random_strength,             # 기존: 0
        l2_leaf_reg=l2_leaf_reg,                     # 기존: 0.5
        task_type=task_type,                         # 기존: GPU
        random_seed=random_seed,                     # 기존: 42
        verbose=verbose,                             # 기존: False
        bagging_temperature=bagging_temperature      # 기존: 없음
    )

    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=early_stopping_rounds)

    # Perform Prediction
    X_test = test[X_train.columns]
    X_test_pool = Pool(X_test, cat_features=X_test.columns.values)
    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    # Output Accuracy
    auc = roc_auc_score(y_valid, pred_valid)
    print(f'Fold {fold+1} AUC: {auc:.5f}\n')

Start Training...


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8669865	best: 0.8669865 (0)	total: 1.37s	remaining: 22m 48s
999:	test: 0.8944119	best: 0.8944119 (999)	total: 22m 52s	remaining: 0us
bestTest = 0.8944119215
bestIteration = 999
Fold 1 AUC: 0.89441



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8658625	best: 0.8658625 (0)	total: 1.16s	remaining: 19m 21s
999:	test: 0.8939996	best: 0.8939996 (998)	total: 22m 53s	remaining: 0us
bestTest = 0.8939995766
bestIteration = 998
Shrink model to first 999 iterations.
Fold 2 AUC: 0.89400



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8664176	best: 0.8664176 (0)	total: 1.16s	remaining: 19m 19s
999:	test: 0.8943298	best: 0.8943298 (999)	total: 22m 40s	remaining: 0us
bestTest = 0.8943297863
bestIteration = 999
Fold 3 AUC: 0.89433



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8655235	best: 0.8655235 (0)	total: 1.28s	remaining: 21m 18s
999:	test: 0.8941033	best: 0.8941033 (999)	total: 22m 50s	remaining: 0us
bestTest = 0.8941033483
bestIteration = 999
Fold 4 AUC: 0.89410



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8674299	best: 0.8674299 (0)	total: 1.16s	remaining: 19m 15s
999:	test: 0.8947929	best: 0.8947929 (999)	total: 22m 43s	remaining: 0us
bestTest = 0.8947929144
bestIteration = 999
Fold 5 AUC: 0.89479



In [10]:
'''
# Train Model (Normalize Version)
preds = []

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print("Start Training...")

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train['Response'])):
    X_train = train.loc[train_idx, [c for c in train.columns if c not in ['id', 'Response']]]
    y_train = train.loc[train_idx, 'Response']
    X_valid = train.loc[valid_idx, X_train.columns]
    y_valid = train.loc[valid_idx, 'Response']

    X_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    X_valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

    model = CatBoostClassifier(
        loss_function=loss_function,                 # 기존: Logloss
        eval_metric=eval_metric,                     # 기존: AUC
        learning_rate=learning_rate,                 # 기존: 0.05
        iterations=iterations,                       # 기존: 5000
        depth=depth,                                 # 기존: 9
        random_strength=random_strength,             # 기존: 0
        l2_leaf_reg=l2_leaf_reg,                     # 기존: 0.5
        task_type=task_type,                         # 기존: GPU
        random_seed=random_seed,                     # 기존: 42
        verbose=verbose,                             # 기존: False
        bagging_temperature=bagging_temperature      # 기존: 없음
    )

    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=1000, early_stopping_rounds=early_stopping_rounds)

    # Perform prediction
    X_test = test[X_train.columns]
    X_test_pool = Pool(X_test, cat_features=categorical_features)
    pred_valid = model.predict_proba(X_valid_pool)[:, 1]
    preds.append(model.predict_proba(X_test_pool)[:, 1])

    # Output Accuracy
    auc = roc_auc_score(y_valid, pred_valid)
    print(f'Fold {fold+1} AUC: {auc:.5f}\n')
'''

'\n# Train Model (Normalize Version)\npreds = []\n\nskf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)\n\nprint("Start Training...")\n\nfor fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[\'Response\'])):\n    X_train = train.loc[train_idx, [c for c in train.columns if c not in [\'id\', \'Response\']]]\n    y_train = train.loc[train_idx, \'Response\']\n    X_valid = train.loc[valid_idx, X_train.columns]\n    y_valid = train.loc[valid_idx, \'Response\']\n\n    X_train_pool = Pool(X_train, y_train, cat_features=categorical_features)\n    X_valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)\n\n    model = CatBoostClassifier(\n        loss_function=loss_function,                 # 기존: Logloss\n        eval_metric=eval_metric,                     # 기존: AUC\n        learning_rate=learning_rate,                 # 기존: 0.05\n        iterations=iterations,                       # 기존: 5000\n        depth=depth,                             

In [11]:
# Create Submission
submission = test[['id']]
submission['Response'] = np.mean(preds, axis=0)

submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,id,Response
0,11504798,0.015542
1,11504799,0.103709
2,11504800,0.020480
3,11504801,0.000348
4,11504802,0.302103
...,...,...
7669861,19174659,0.048475
7669862,19174660,0.000598
7669863,19174661,0.000677
7669864,19174662,0.096565
