In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from category_encoders import TargetEncoder
from tqdm import tqdm
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [2]:
# 데이터 샘플링
train = train_origin.sample(frac=0.01, random_state = 42)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id'])

# 범주형 변수(2~3개 클래스) 인코딩
def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

#Age 교차항
train['Age_bins'] = pd.cut(train['Age'], bins=7).cat.codes
train['Age_x_Vehicle_Age'] = train['Age_bins'] * train['Vehicle_Age']
train['Age_x_Vehicle_Damage'] = train['Age_bins'] * train['Vehicle_Damage']
train['Age_x_Previously_Insured'] = train['Age_bins'] * train['Previously_Insured']

# 범주형 변수 타겟 인코딩
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
train.loc[:,cat_columns] = train.loc[:,cat_columns].astype('category')

target_encoder = TargetEncoder()
train[cat_columns] = target_encoder.fit_transform(train[cat_columns],train['Response'])

#수치형 변수 + 타겟 인코딩 변수 표준화
scaler = MinMaxScaler()
num_columns = ['Age', 'Annual_Premium','Region_Code', 'Policy_Sales_Channel', 'Vintage']
train[num_columns] = scaler.fit_transform(train[num_columns])

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response']

In [3]:
# 하이퍼파라미터 공간 정의
def sample_hyperparameters():
    space = {
        'num_leaves': np.random.choice([20, 31, 50, 100]),
        'max_depth': np.random.choice([-1, 20]),
        'learning_rate': np.exp(np.random.uniform(np.log(0.01), np.log(0.2))),
        'n_estimators': np.random.choice([30, 50, 100, 200, 300]),
        'lambda_l1': np.random.uniform(0.0, 0.2),
        'lambda_l2': np.random.uniform(0.0, 0.2)
    }
    return space

# LGBMClassifier 생성 함수
def create_lgbm_classifier(params):
    return LGBMClassifier(
        num_leaves=params['num_leaves'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        lambda_l1=params['lambda_l1'],
        lambda_l2=params['lambda_l2'],
        random_state=np.random.randint(0, 10000),
        verbose=-1  # 로그 출력을 최소화하기 위해 verbose 설정
    )

# K-Fold 교차 검증을 사용하여 Soft Voting Classifier를 생성하고 AUC를 계산하는 함수
def evaluate_voting_classifier_kfold(num_of_clf, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits)
    aucs = []

    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        classifiers = []
        for _ in range(num_of_clf):
            params = sample_hyperparameters()
            lgbm_clf = create_lgbm_classifier(params)
            classifiers.append(('lgbm_' + str(np.random.randint(0, 10000)), lgbm_clf))

        voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
        voting_clf_soft.fit(X_train, y_train)
        test_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
        test_auc = roc_auc_score(y_valid, test_preds)
        aucs.append(test_auc)

    mean_auc = np.mean(aucs)
    print(f'Soft Voting Classifier(n={num_of_clf}) Mean AUC: {mean_auc:.4f}')
    return mean_auc

# 결과 저장용 DataFrame
results = []

# num_of_clf에 따른 AUC 계산
for num_of_clf in [1, 10, 20]:
    auc = evaluate_voting_classifier_kfold(num_of_clf)
    results.append({'num_of_clf': num_of_clf, 'AUC': auc})

# 결과 DataFrame 생성
results_df = pd.DataFrame(results)
results_df

Soft Voting Classifier(n=1) Mean AUC: 0.8699
Soft Voting Classifier(n=10) Mean AUC: 0.8737
Soft Voting Classifier(n=20) Mean AUC: 0.8739


Unnamed: 0,num_of_clf,AUC
0,1,0.869932
1,10,0.87365
2,20,0.873906


In [5]:
from scipy.stats import lognorm

# 하이퍼파라미터 공간 정의
def sample_hyperparameters():
    # learning_rate : 0.1에서 극값을 가지고 0.01보다 크고 0.2보다 큰 값이 거의 없도록 분포 설정
    s = 0.4  # shape parameter (표준 편차)
    scale = np.exp(-2)  # 위치 파라미터 (확률 밀도가 최대가 되는 위치의 로그 스케일)
    learning_rate = lognorm(s, scale=scale).rvs()
    while learning_rate < 0.01 or learning_rate > 0.2:
        learning_rate = lognorm(s, scale=scale).rvs()

    space = {
        'num_leaves': np.random.choice([20, 31, 50, 100]),
        'max_depth': np.random.choice([-1, 20]),
        'learning_rate': learning_rate,  # 로그 정규 분포에서 샘플링
        'n_estimators': np.random.choice([30, 50, 100, 200, 300]),
        'lambda_l1': np.random.uniform(0.0, 0.2),
        'lambda_l2': np.random.uniform(0.0, 0.2)
    }
    return space

# LGBMClassifier 생성 함수
def create_lgbm_classifier(params):
    return LGBMClassifier(
        num_leaves=params['num_leaves'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        lambda_l1=params['lambda_l1'],
        lambda_l2=params['lambda_l2'],
        random_state=np.random.randint(0, 10000),
        verbose=-1  # 로그 출력을 최소화하기 위해 verbose 설정
    )

# Soft Voting Classifier를 생성하고 AUC를 계산하는 함수
def modeling_voting_classifier(num_of_clf, X, y):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    train_scores = []
    valid_scores = []

    for fold, (train_index, valid_index) in enumerate(tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="Folds"), 1):
        X_skf_train, X_skf_valid = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_skf_train, y_skf_valid = y_train.iloc[train_index], y_train.iloc[valid_index]

        classifiers = []
        for _ in range(num_of_clf):
            params = sample_hyperparameters()
            lgbm_clf = create_lgbm_classifier(params)
            classifiers.append(('lgbm_' + str(np.random.randint(0, 1000000)), lgbm_clf))

        voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
        voting_clf_soft.fit(X_skf_train, y_skf_train)

        train_preds = voting_clf_soft.predict_proba(X_skf_train)[:, 1]
        train_auc = roc_auc_score(y_skf_train, train_preds)
        train_scores.append(train_auc)

        valid_preds = voting_clf_soft.predict_proba(X_skf_valid)[:, 1]
        valid_auc = roc_auc_score(y_skf_valid, valid_preds)
        valid_scores.append(valid_auc)

    #print(f'Average Train ROC AUC: {np.mean(train_scores):.4f}')
    #print(f'Average Validation ROC AUC: {np.mean(valid_scores):.4f}')

    voting_clf_soft.fit(X_train, y_train)
    y_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
    valid_auc = roc_auc_score(y_valid, y_preds)

    print(f'Soft Voting Classifier(n={num_of_clf}) Mean AUC: {valid_auc:.4f}')

    return train_scores, valid_scores, valid_auc

# 결과 저장용 DataFrame
results = []

# num_of_clf에 따른 AUC 계산
for num_of_clf in [1, 2, 4, 8, 16, 32, 64, 128]:
    train_scores, valid_scores, valid_auc = modeling_voting_classifier(num_of_clf, X, y)
    results.append({'num_of_clf': num_of_clf, 'Train AUC': np.mean(train_scores), 'Valid AUC': np.mean(valid_scores), 'Valid AUC': valid_auc})

# 결과 DataFrame 생성
results_df = pd.DataFrame(results)
results_df

Folds: 100%|██████████| 5/5 [00:15<00:00,  3.05s/it]


Soft Voting Classifier(n=1) Mean AUC: 0.8777


Folds: 100%|██████████| 5/5 [00:26<00:00,  5.31s/it]


Soft Voting Classifier(n=2) Mean AUC: 0.8749


Folds: 100%|██████████| 5/5 [00:31<00:00,  6.40s/it]


Soft Voting Classifier(n=4) Mean AUC: 0.8769


Folds: 100%|██████████| 5/5 [01:03<00:00, 12.77s/it]


Soft Voting Classifier(n=8) Mean AUC: 0.8767


Folds: 100%|██████████| 5/5 [02:10<00:00, 26.04s/it]


Soft Voting Classifier(n=16) Mean AUC: 0.8776


Folds: 100%|██████████| 5/5 [03:45<00:00, 45.09s/it]


Soft Voting Classifier(n=32) Mean AUC: 0.8781


Folds: 100%|██████████| 5/5 [07:40<00:00, 92.11s/it]


Soft Voting Classifier(n=64) Mean AUC: 0.8775


Folds: 100%|██████████| 5/5 [24:33<00:00, 294.68s/it]


Soft Voting Classifier(n=128) Mean AUC: 0.8778


Unnamed: 0,num_of_clf,Train AUC,Valid AUC
0,1,0.900441,0.877714
1,2,0.945759,0.874911
2,4,0.925566,0.876869
3,8,0.935281,0.876735
4,16,0.91947,0.877564
5,32,0.925944,0.878059
6,64,0.926415,0.877487
7,128,0.928309,0.877783
