In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from category_encoders import TargetEncoder
from tqdm import tqdm
from lightgbm import LGBMClassifier

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [36]:
# 데이터 샘플링
train = train_origin.sample(frac=0.01, random_state = 42)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id'])

# 범주형 변수 인코딩
def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

#수치형 변수: Annual, Region_Code, Annual_Premium, Policy_Sales_Channel, Vintage -> normalize
scaler = MinMaxScaler()
num_columns = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
train[num_columns] = scaler.fit_transform(train[num_columns])

# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response']

In [37]:
def modeling(model, X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(X_skf_train, y_skf_train)

        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    test_preds = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')


    return train_scores, valid_scores, test_auc

In [38]:
# LightGBM
print("\nLightGBM:")
lgbm_model = LGBMClassifier(random_state=42)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.030284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:03<00:15,  3.78s/it]

Fold 1: Train ROC AUC: 0.8873, Validation ROC AUC: 0.8659
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:05<00:08,  2.71s/it]

Fold 2: Train ROC AUC: 0.8931, Validation ROC AUC: 0.8621
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:07<00:04,  2.46s/it]

Fold 3: Train ROC AUC: 0.8920, Validation ROC AUC: 0.8688
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:09<00:02,  2.23s/it]

Fold 4: Train ROC AUC: 0.8910, Validation ROC AUC: 0.8598
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 712
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:12<00:00,  2.45s/it]

Fold 5: Train ROC AUC: 0.8926, Validation ROC AUC: 0.8607
Average Train ROC AUC: 0.8912
Average Validation ROC AUC: 0.8634





Test ROC AUC: 0.8667


In [39]:
from category_encoders import TargetEncoder

# 데이터 샘플링
train = train_origin.sample(frac=0.01, random_state = 42)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id'])

# 범주형 변수 인코딩
def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

# 범주형 변수 타겟 인코딩
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
train.loc[:,cat_columns] = train.loc[:,cat_columns].astype('category')

target_encoder = TargetEncoder()
train[cat_columns] = target_encoder.fit_transform(train[cat_columns],train['Response'])

#수치형 변수+타겟인코딩 변수
scaler = MinMaxScaler()
num_columns = ['Age', 'Annual_Premium', 'Region_Code', 'Policy_Sales_Channel', 'Vintage']
train[num_columns] = scaler.fit_transform(train[num_columns])


# XGBoost에서 발생하는 문제 해결
train.columns = train.columns.str.replace('[', '').str.replace(']', '').str.replace('<', '')

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response']

In [40]:
# LightGBM
lgbm_model = LGBMClassifier(random_state=42)
print("\nLightGBM:")
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070066 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  20%|██        | 1/5 [00:02<00:11,  2.88s/it]

Fold 1: Train ROC AUC: 0.8950, Validation ROC AUC: 0.8734
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 709
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  40%|████      | 2/5 [00:04<00:06,  2.33s/it]

Fold 2: Train ROC AUC: 0.8974, Validation ROC AUC: 0.8711
[LightGBM] [Info] Number of positive: 9054, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001228 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 707
[LightGBM] [Info] Number of data points in the train set: 73630, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122966 -> initscore=-1.964636
[LightGBM] [Info] Start training from score -1.964636


Folds:  60%|██████    | 3/5 [00:06<00:03,  1.88s/it]

Fold 3: Train ROC AUC: 0.8942, Validation ROC AUC: 0.8775
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds:  80%|████████  | 4/5 [00:08<00:02,  2.17s/it]

Fold 4: Train ROC AUC: 0.8974, Validation ROC AUC: 0.8692
[LightGBM] [Info] Number of positive: 9055, number of negative: 64576
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002073 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 706
[LightGBM] [Info] Number of data points in the train set: 73631, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122978 -> initscore=-1.964526
[LightGBM] [Info] Start training from score -1.964526


Folds: 100%|██████████| 5/5 [00:10<00:00,  2.02s/it]

Fold 5: Train ROC AUC: 0.8970, Validation ROC AUC: 0.8706
Average Train ROC AUC: 0.8962
Average Validation ROC AUC: 0.8724





Test ROC AUC: 0.8772
