In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from category_encoders import TargetEncoder
from tqdm import tqdm
from lightgbm import LGBMClassifier
from category_encoders import TargetEncoder

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [2]:
# 데이터 샘플링
train = train_origin.sample(frac=0.01, random_state = 42)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id'])

# 범주형 변수 인코딩
def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

# 범주형 변수 타겟 인코딩
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
train.loc[:,cat_columns] = train.loc[:,cat_columns].astype('category')

target_encoder = TargetEncoder()
train[cat_columns] = target_encoder.fit_transform(train[cat_columns],train['Response'])

#수치형 변수: Age, Region_Code, Annual Premium, Policy_Sales_Channel, Vintage -> normalize
scaler = MinMaxScaler()
num_columns = ['Age', 'Annual_Premium']
train[num_columns] = scaler.fit_transform(train[num_columns])

# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response']

In [10]:
def modeling(model, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(X_skf_train, y_skf_train)

        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    test_preds = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')


    return train_scores, valid_scores, test_auc

In [14]:
# LightGBM
print("\nLightGBM:")
best_param = {'lambda_l1': 0.1, 'lambda_l2': 0.1, 'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 20}
lgbm_model = LGBMClassifier(**best_param, random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:01<00:07,  1.78s/it]

Fold 1: Train ROC AUC: 0.8877, Validation ROC AUC: 0.8741
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 711
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:03<00:04,  1.50s/it]

Fold 2: Train ROC AUC: 0.8881, Validation ROC AUC: 0.8719
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 710
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:05<00:03,  1.99s/it]

Fold 3: Train ROC AUC: 0.8861, Validation ROC AUC: 0.8775
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:11<00:03,  3.42s/it]

Fold 4: Train ROC AUC: 0.8889, Validation ROC AUC: 0.8702
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 708
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:12<00:00,  2.53s/it]

Fold 5: Train ROC AUC: 0.8878, Validation ROC AUC: 0.8715
Average Train ROC AUC: 0.8877
Average Validation ROC AUC: 0.8730





Test ROC AUC: 0.8776


In [15]:
# LightGBM
print("\nLightGBM:")
params_optuna = {'num_leaves': 31, 'max_depth': 4, 'learning_rate': 0.09052003360634207, 'n_estimators': 200, 'lambda_l1': 0.3182477714588782, 'lambda_l2': 0.4879151959075913}
lgbm_model = lgbm_model = LGBMClassifier(**params_optuna, random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:02<00:10,  2.63s/it]

Fold 1: Train ROC AUC: 0.8862, Validation ROC AUC: 0.8750
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 711
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:06<00:10,  3.38s/it]

Fold 2: Train ROC AUC: 0.8862, Validation ROC AUC: 0.8726
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009327 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 710
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:08<00:05,  2.85s/it]

Fold 3: Train ROC AUC: 0.8848, Validation ROC AUC: 0.8785
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:11<00:02,  2.69s/it]

Fold 4: Train ROC AUC: 0.8877, Validation ROC AUC: 0.8710
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 708
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:13<00:00,  2.67s/it]

Fold 5: Train ROC AUC: 0.8864, Validation ROC AUC: 0.8714
Average Train ROC AUC: 0.8863
Average Validation ROC AUC: 0.8737





Test ROC AUC: 0.8778
