In [65]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from category_encoders import TargetEncoder
from tqdm import tqdm
from lightgbm import LGBMClassifier
from category_encoders import TargetEncoder

# Load the raw training data from CSV.
# The location is an absolute, machine-specific path; change TRAIN_CSV_PATH
# when running this notebook on another machine.
TRAIN_CSV_PATH = '/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv'
train_origin = pd.read_csv(TRAIN_CSV_PATH)

In [67]:
# Data sampling: the full training set is used. To iterate faster, swap the
# assignment below for the commented-out 1% sample.
#train = train_origin.sample(frac=0.01, random_state = 42)
train = train_origin

# Drop the 'id' column, which is not needed for prediction.
# NOTE(review): the original comment also mentioned 'Annual_Premium', but only
# 'id' is dropped here.
train = train.drop(['id'], axis=1)

# Encode the categorical variables that have only 2-3 classes.
def encoding(train):
    """Map the low-cardinality categorical columns to integer codes.

    The columns 'Gender', 'Vehicle_Age' and 'Vehicle_Damage' are replaced by
    the integer codes listed in ``mappings`` below. Values that do not appear
    in a mapping become NaN (behaviour of ``Series.map`` with a dict).

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the three columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the three columns encoded. The input frame is left
        unmodified, so calling this function never corrupts the caller's data.
    """
    mappings = {
        'Gender': {'Male': 0, 'Female': 1},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
        'Vehicle_Damage': {'No': 0, 'Yes': 1},
    }

    # Work on a copy: the previous version overwrote the caller's columns in
    # place while also returning the same object.
    encoded = train.copy()
    for column, mapping in mappings.items():
        encoded[column] = encoded[column].map(mapping)

    return encoded

# Apply the integer mappings defined in encoding().
train = encoding(train)

# Disabled alternative: encode the same columns through pandas category codes.
# Kept as an inert string literal for reference.
'''(같은 건데 왠지 위 방식이 미세하게 잘 나온다)
cat_columns_simple = ['Gender','Vehicle_Age','Vehicle_Damage']
train[cat_columns_simple] = train[cat_columns_simple].astype('category')
train['Gender'] = train['Gender'].cat.codes
train['Vehicle_Age'] = train['Vehicle_Age'].cat.codes
train['Vehicle_Damage'] = train['Vehicle_Damage'].cat.codes
'''

# Previously_Insured interaction terms (disabled experiment, inert string literal).
'''
train['Previously_Insured_Annual_Premium'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Annual_Premium'].astype(str))[0]
train['Previously_Insured_Vehicle_Age'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vehicle_Age'].astype(str))[0]
train['Previously_Insured_Vehicle_Damage'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vehicle_Damage'].astype(str))[0]
train['Previously_Insured_Vintage'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vintage'].astype(str))[0]
'''

# Age interaction terms: bucket Age into 7 bins, then multiply the bin code
# with each partner column to build 'Age_x_<partner>'.
train['Age_bins'] = pd.cut(train['Age'], bins=7).cat.codes
for partner in ('Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured'):
    train['Age_x_' + partner] = train['Age_bins'] * train[partner]

# Target-encode the high-cardinality categorical variables.
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
# Assign the columns directly instead of through `train.loc[:, cat_columns] = ...`:
# on pandas >= 2.0 the .loc form writes into the existing arrays and keeps the
# old numeric dtype, so the cast to 'category' would be silently lost.
train[cat_columns] = train[cat_columns].astype('category')

# `cols` is passed explicitly so the encoder always acts on exactly these
# columns rather than on whatever it infers from the column dtypes.
# NOTE(review): the encoder (and the scaler below) are fitted on every row,
# including rows that modeling() later holds out for validation/test, so the
# target leaks into those rows and the reported AUC is likely optimistic.
# Fitting the encoder inside each training fold would avoid this.
target_encoder = TargetEncoder(cols=cat_columns)
train[cat_columns] = target_encoder.fit_transform(train[cat_columns], train['Response'])

# Min-max scale the numeric columns and the target-encoded columns
# (MinMaxScaler, not z-score standardisation).
scaler = MinMaxScaler()
num_columns = ['Age', 'Annual_Premium','Region_Code', 'Policy_Sales_Channel', 'Vintage']
train[num_columns] = scaler.fit_transform(train[num_columns])

# Separate predictors and target; the train/valid/test split is done in modeling().
X = train.drop(['Response'], axis=1)
y = train['Response']

In [68]:
# Display the first rows of `train` to inspect the result of the preprocessing cell.
train.head()

Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Age_bins,Age_x_Vehicle_Age,Age_x_Vehicle_Damage,Age_x_Previously_Insured
0,0,0.015385,1,0.565024,0,1,1,0.116218,0.349785,0.151074,0,0,0,0,0
1,0,0.353846,1,1.0,0,2,1,0.104702,0.377687,0.474082,1,2,4,2,0
2,1,0.076923,1,0.345656,1,0,0,0.06588,0.045898,0.109383,0,0,0,0,0
3,1,0.230769,1,0.446617,0,1,1,0.0,0.372504,0.210514,0,1,1,1,0
4,1,0.246154,1,0.233017,1,1,0,0.054547,0.045898,0.231829,0,1,1,0,1


In [69]:
def modeling(model, X, y, test_size=0.2, n_splits=5, random_state=42):
    """Score ``model`` with stratified k-fold CV and a hold-out test set.

    The data is first split into a training part and a hold-out test part.
    The training part is then split into ``n_splits`` stratified folds; for
    every fold the model is fitted on the training indices and ROC AUC is
    computed on both the fold's training and validation rows. Per-fold and
    average scores are printed, followed by the hold-out ROC AUC.

    The same ``model`` instance is re-fitted in every fold, so the hold-out
    score is produced by the fit from the last fold (not by a model trained
    on the whole training part).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the returned probabilities is used as the positive-class score.
    X, y : pandas objects
        Predictors and target; both are indexed positionally with ``.iloc``.
    test_size : float, default 0.2
        Passed to ``train_test_split`` for the hold-out split.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed used for both the hold-out split and the fold shuffling.

    Returns
    -------
    tuple
        ``(train_scores, valid_scores, test_auc)`` where the first two are
        lists with one ROC AUC per fold and the last is the hold-out ROC AUC.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    train_scores = []
    valid_scores = []

    # tqdm wraps the fold generator so progress is shown once per fold.
    folds = tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds")
    for fold, (train_index, valid_index) in enumerate(folds, 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(X_skf_train, y_skf_train)

        # Score on the rows the model was fitted on (to gauge over-fitting) ...
        train_preds = model.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        # ... and on the fold's unseen validation rows.
        valid_preds = model.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

        print(f'Fold {fold}: Train ROC AUC: {train_auc:.4f}, Validation ROC AUC: {valid_auc:.4f}')

    print(f'Average Train ROC AUC: {sum(train_scores)/len(train_scores):.4f}')
    print(f'Average Validation ROC AUC: {sum(valid_scores)/len(valid_scores):.4f}')

    # Hold-out evaluation uses the model state left by the final fold's fit.
    test_preds = model.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_preds)
    print(f'Test ROC AUC: {test_auc:.4f}')

    return train_scores, valid_scores, test_auc

In [70]:
# LightGBM: build the classifier from the chosen hyper-parameters and
# evaluate it with modeling().
print("\nLightGBM:")
best_param = {
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'learning_rate': 0.1,
    'max_depth': -1,
    'n_estimators': 100,
    'num_leaves': 20,
    'force_col_wise': True,
}
lgbm_model = LGBMClassifier(random_state=42, **best_param)
lgbm_train_scores, lgbm_valid_scores, lgbm_test_auc = modeling(lgbm_model, X, y)


LightGBM:


Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 905500, number of negative: 6457570
[LightGBM] [Info] Total Bins 759
[LightGBM] [Info] Number of data points in the train set: 7363070, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122979 -> initscore=-1.964521
[LightGBM] [Info] Start training from score -1.964521


Folds:  20%|██        | 1/5 [02:57<11:51, 177.89s/it]

Fold 1: Train ROC AUC: 0.8771, Validation ROC AUC: 0.8776
[LightGBM] [Info] Number of positive: 905500, number of negative: 6457570
[LightGBM] [Info] Total Bins 757
[LightGBM] [Info] Number of data points in the train set: 7363070, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122979 -> initscore=-1.964521
[LightGBM] [Info] Start training from score -1.964521


Folds:  40%|████      | 2/5 [05:43<08:32, 170.73s/it]

Fold 2: Train ROC AUC: 0.8774, Validation ROC AUC: 0.8768
[LightGBM] [Info] Number of positive: 905500, number of negative: 6457570
[LightGBM] [Info] Total Bins 760
[LightGBM] [Info] Number of data points in the train set: 7363070, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122979 -> initscore=-1.964521
[LightGBM] [Info] Start training from score -1.964521
