In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from category_encoders import TargetEncoder
from tqdm import tqdm
from category_encoders import TargetEncoder

# Read the data
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where the CSV exists at exactly this location.
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [2]:
# Draw a reproducible 1% random sample of the raw data and discard the 'id'
# column. Only 'id' is removed here; 'Annual_Premium' stays in the frame.
train = (
    train_origin
    .sample(frac=0.01, random_state=42)
    .drop(columns=['id'])
)

# Encode the low-cardinality (2-3 class) categorical columns as integers
def encoding(train):
    """Return a copy of ``train`` with three string columns mapped to integers.

    Columns handled: 'Gender', 'Vehicle_Age' and 'Vehicle_Damage'. Any value
    that is not a key of the corresponding mapping becomes NaN, because that
    is what ``Series.map`` does with a dict.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing the three columns above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    column_mappings = {
        'Gender': {'Male': 0, 'Female': 1},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
        'Vehicle_Damage': {'No': 0, 'Yes': 1},
    }

    # Work on a copy so the caller's frame is not silently modified.
    encoded = train.copy()
    for column, mapping in column_mappings.items():
        encoded[column] = encoded[column].map(mapping)

    return encoded

# Replace the string values of Gender / Vehicle_Age / Vehicle_Damage with integers.
train = encoding(train)

# Disabled alternative, kept as a bare string literal so it is never executed:
# encode the same three columns through pandas category codes. The note inside
# the string says, roughly: "same thing, but the approach above somehow scores
# marginally better".
# NOTE(review): category codes presumably follow sorted category order, which
# would assign different integers than the explicit mappings — confirm.
'''(같은 건데 왠지 위 방식이 미세하게 잘 나온다)
cat_columns_simple = ['Gender','Vehicle_Age','Vehicle_Damage']
train[cat_columns_simple] = train[cat_columns_simple].astype('category')
train['Gender'] = train['Gender'].cat.codes
train['Vehicle_Age'] = train['Vehicle_Age'].cat.codes
train['Vehicle_Damage'] = train['Vehicle_Damage'].cat.codes
'''

# Previously_Insured interaction terms
# Disabled experiment, also a bare string literal that is never executed:
# factorize the string concatenation of Previously_Insured with four other columns.
'''
train['Previously_Insured_Annual_Premium'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Annual_Premium'].astype(str))[0]
train['Previously_Insured_Vehicle_Age'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vehicle_Age'].astype(str))[0]
train['Previously_Insured_Vehicle_Damage'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vehicle_Damage'].astype(str))[0]
train['Previously_Insured_Vintage'] = pd.factorize(train['Previously_Insured'].astype(str) + train['Vintage'].astype(str))[0]
'''

# Age interaction terms: cut Age into 7 bins, keep the integer bin code, then
# multiply that code by each partner column.
train['Age_bins'] = pd.cut(train['Age'], bins=7).cat.codes
for partner in ('Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured'):
    train['Age_x_' + partner] = train['Age_bins'] * train[partner]

# Target-encode the high-cardinality categorical columns
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
# Assign whole columns (not `.loc[:, cols] = ...`): on pandas 2.x the `.loc` form
# writes into the existing columns and keeps their numeric dtype, so the cast to
# 'category' would be lost.
train[cat_columns] = train[cat_columns].astype('category')

# Name the columns explicitly. When `cols` is omitted the encoder only picks up
# object/category columns, so a lost dtype cast would silently turn this step
# into a no-op.
# NOTE(review): the encoder is fitted on every sampled row, including rows that end
# up in the validation split created later, so the validation score is optimistic
# (target leakage). Fitting on the training rows only would remove that.
target_encoder = TargetEncoder(cols=cat_columns)
train[cat_columns] = target_encoder.fit_transform(train[cat_columns], train['Response'])

# Min-max scale the numeric columns together with the target-encoded ones
# (MinMaxScaler, i.e. range scaling rather than z-score standardisation).
num_columns = ['Age', 'Annual_Premium','Region_Code', 'Policy_Sales_Channel', 'Vintage']
scaler = MinMaxScaler()
train[num_columns] = scaler.fit_transform(train[num_columns])

# Separate the target from the predictors
y = train['Response']
X = train.drop(columns=['Response'])

In [3]:
# Hold out 20% of the rows for validation; the seed fixes the partition.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier

# Factory for an LGBMClassifier with library defaults
def create_lgbm_classifier(random_state):
    """Build an LGBMClassifier seeded with ``random_state`` and ``verbose=-1``.

    All other hyperparameters are left at the library defaults.
    """
    settings = {'random_state': random_state, 'verbose': -1}
    return LGBMClassifier(**settings)

# Build N classifiers that differ only in their random_state
num_of_clf = 10
classifiers = [(f'lgbm_{i}', create_lgbm_classifier(i)) for i in range(num_of_clf)]

# Soft voting: the ensemble averages the members' predicted class probabilities.
# The ensemble is fitted once. The original cell trained two identically
# configured VotingClassifier objects, which only doubled the runtime.
voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
voting_clf_soft.fit(X_train, y_train)

# Alias kept so the name `voting_clf` is still defined for any later cell.
voting_clf = voting_clf_soft

# Hard class predictions on the validation split
y_pred = voting_clf.predict(X_valid)

# Positive-class probabilities and the validation ROC AUC. The value printed is
# an AUC, so the label says AUC rather than accuracy.
test_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
test_auc = roc_auc_score(y_valid, test_preds)
print(f'Soft Voting Classifier(n={num_of_clf}) AUC: {test_auc:.4f}')

Soft Voting Classifier(n=10) Accuracy: 0.8773


In [5]:
# Factory for an LGBMClassifier with explicit hyperparameters
def create_lgbm_classifier(random_state, n_estimators, learning_rate, max_depth):
    """Build an LGBMClassifier from the four given hyperparameters.

    ``verbose=-1`` is always passed to keep LightGBM's log output to a minimum.
    """
    settings = dict(
        random_state=random_state,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        verbose=-1,  # keep log output to a minimum
    )
    return LGBMClassifier(**settings)

# Build N classifiers with different random_state values and randomly drawn hyperparameters
num_of_clf = 30
classifiers = []

# Dedicated, seeded generator so the drawn hyperparameters are the same on every run.
param_rng = np.random.RandomState(42)

for i in range(num_of_clf):
    random_state = i
    # Clamp to at least 1: the normal draws can come out zero or negative, which
    # is not a usable tree count and is not the small depth limit intended here.
    n_estimators = max(1, int(100 + 50 * param_rng.randn()))
    learning_rate = np.abs(0.1 * param_rng.randn())
    max_depth = max(1, int(3 + 3 * param_rng.randn()))
    classifiers.append(('lgbm_' + str(i), create_lgbm_classifier(random_state, n_estimators, learning_rate, max_depth)))

# Soft voting: average the members' predicted probabilities
voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
voting_clf_soft.fit(X_train, y_train)

# Validation ROC AUC of the ensemble
test_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
test_auc = roc_auc_score(y_valid, test_preds)
print(f'Soft Voting Classifier(n={num_of_clf}) AUC: {test_auc:.4f}')


Soft Voting Classifier(n=30) AUC: 0.8783


In [6]:
# Factory for an LGBMClassifier with explicit hyperparameters
def create_lgbm_classifier(random_state, n_estimators, learning_rate, max_depth):
    """Return an LGBMClassifier configured with the given hyperparameters.

    Parameters are forwarded unchanged; ``verbose=-1`` is added to keep
    LightGBM's log output to a minimum.
    """
    classifier = LGBMClassifier(
        verbose=-1,  # keep log output to a minimum
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        random_state=random_state,
    )
    return classifier

# Build a soft-voting ensemble of randomly configured LightGBM models and return its AUC
def evaluate_voting_classifier(num_of_clf, seed=None):
    """Fit a soft-voting ensemble of ``num_of_clf`` LightGBM models and score it.

    Each member gets ``random_state=i`` plus hyperparameters drawn from normal
    distributions (``n_estimators`` and ``max_depth`` are clamped to at least 1).
    The ensemble is fitted on the module-level ``X_train`` / ``y_train`` and
    scored with ROC AUC on ``X_valid`` / ``y_valid``. The score is also printed.

    Parameters
    ----------
    num_of_clf : int
        Number of ensemble members; must be at least 1.
    seed : int or None, optional
        Seed for the hyperparameter draws. With ``None`` (the default) the
        draws come from NumPy's global random state, as before.

    Returns
    -------
    float
        Validation ROC AUC of the ensemble.

    Raises
    ------
    ValueError
        If ``num_of_clf`` is smaller than 1.
    """
    if num_of_clf < 1:
        raise ValueError('num_of_clf must be at least 1')

    # `np.random` and `RandomState` both expose `randn`, so one code path serves both.
    rng = np.random if seed is None else np.random.RandomState(seed)

    classifiers = []
    for i in range(num_of_clf):
        n_estimators = max(1, int(100 + 50 * rng.randn()))  # at least 1
        learning_rate = np.abs(0.1 * rng.randn())
        max_depth = max(1, int(3 + 3 * rng.randn()))  # at least 1
        classifiers.append(('lgbm_' + str(i), create_lgbm_classifier(i, n_estimators, learning_rate, max_depth)))

    voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
    voting_clf_soft.fit(X_train, y_train)
    test_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
    test_auc = roc_auc_score(y_valid, test_preds)

    print(f'Soft Voting Classifier(n={num_of_clf}) AUC: {test_auc:.4f}')
    return test_auc

# One record per ensemble size; the list is turned into a DataFrame at the end
results = []

# Compute the AUC for each ensemble size
for num_of_clf in [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]:
    # Named `auc_score` so the `auc` function imported from sklearn.metrics is not overwritten.
    auc_score = evaluate_voting_classifier(num_of_clf)
    results.append({'num_of_clf': num_of_clf, 'AUC': auc_score})

# Build and display the results table
results_df = pd.DataFrame(results)
results_df

Soft Voting Classifier(n=1) AUC: 0.8722
Soft Voting Classifier(n=10) AUC: 0.8758
Soft Voting Classifier(n=20) AUC: 0.8772
Soft Voting Classifier(n=30) AUC: 0.8771
Soft Voting Classifier(n=40) AUC: 0.8770
Soft Voting Classifier(n=50) AUC: 0.8774
Soft Voting Classifier(n=60) AUC: 0.8771
Soft Voting Classifier(n=70) AUC: 0.8764
Soft Voting Classifier(n=80) AUC: 0.8777
Soft Voting Classifier(n=90) AUC: 0.8767
Soft Voting Classifier(n=100) AUC: 0.8775


Unnamed: 0,num_of_clf,AUC
0,1,0.872247
1,10,0.875844
2,20,0.877238
3,30,0.877091
4,40,0.876991
5,50,0.87737
6,60,0.877089
7,70,0.876373
8,80,0.877734
9,90,0.876705


In [8]:
# Hyperparameter search space and sampling
def sample_hyperparameters():
    """Draw one random LightGBM hyperparameter set from NumPy's global random state.

    Returns a dict with the keys 'num_leaves', 'max_depth', 'learning_rate',
    'n_estimators', 'lambda_l1' and 'lambda_l2'. The learning rate is drawn
    log-uniformly between 0.01 and 0.2; the two lambdas uniformly between 0 and 2.
    """
    draw = np.random

    # The draws are made in the same order as the dictionary keys below.
    leaves = draw.choice([20, 31, 50, 100])
    depth = draw.choice([-1, 20])
    log_rate = draw.uniform(np.log(0.01), np.log(0.2))
    trees = draw.choice([100, 200, 300])
    l1_penalty = draw.uniform(0.0, 2.0)
    l2_penalty = draw.uniform(0.0, 2.0)

    return {
        'num_leaves': leaves,
        'max_depth': depth,
        'learning_rate': np.exp(log_rate),
        'n_estimators': trees,
        'lambda_l1': l1_penalty,
        'lambda_l2': l2_penalty,
    }

# Factory for an LGBMClassifier built from a sampled hyperparameter dict
def create_lgbm_classifier(params):
    """Build an LGBMClassifier from a hyperparameter dictionary.

    ``params`` must contain 'num_leaves', 'max_depth', 'learning_rate',
    'n_estimators', 'lambda_l1' and 'lambda_l2'. The model's ``random_state``
    is drawn from NumPy's global random state (an integer in [0, 10000)), and
    ``verbose=-1`` keeps LightGBM's log output to a minimum.
    """
    forwarded_keys = (
        'num_leaves',
        'max_depth',
        'learning_rate',
        'n_estimators',
        'lambda_l1',
        'lambda_l2',
    )
    settings = {key: params[key] for key in forwarded_keys}
    settings['random_state'] = np.random.randint(0, 10000)
    settings['verbose'] = -1  # keep log output to a minimum
    return LGBMClassifier(**settings)

# Build a soft-voting ensemble from randomly sampled hyperparameters and return its AUC
def evaluate_voting_classifier(num_of_clf):
    """Fit a soft-voting ensemble of ``num_of_clf`` LightGBM models and score it.

    Every member is built from an independent ``sample_hyperparameters()`` draw.
    The ensemble is fitted on the module-level ``X_train`` / ``y_train`` and
    scored with ROC AUC on ``X_valid`` / ``y_valid``. The score is also printed.

    Parameters
    ----------
    num_of_clf : int
        Number of ensemble members.

    Returns
    -------
    float
        Validation ROC AUC of the ensemble.
    """
    # Estimator names come from the member index. Names drawn at random from
    # [0, 10000) can collide, and VotingClassifier rejects duplicate names.
    classifiers = [
        ('lgbm_' + str(idx), create_lgbm_classifier(sample_hyperparameters()))
        for idx in range(num_of_clf)
    ]

    voting_clf_soft = VotingClassifier(estimators=classifiers, voting='soft')
    voting_clf_soft.fit(X_train, y_train)
    test_preds = voting_clf_soft.predict_proba(X_valid)[:, 1]
    test_auc = roc_auc_score(y_valid, test_preds)

    print(f'Soft Voting Classifier(n={num_of_clf}) AUC: {test_auc:.4f}')
    return test_auc

# One record per ensemble size; the list is turned into a DataFrame at the end
results = []

# Compute the AUC for each ensemble size
for num_of_clf in [1, 10, 20, 30, 40, 50]:
    # Named `auc_score` so the `auc` function imported from sklearn.metrics is not overwritten.
    auc_score = evaluate_voting_classifier(num_of_clf)
    results.append({'num_of_clf': num_of_clf, 'AUC': auc_score})

# Build and display the results table
results_df = pd.DataFrame(results)
results_df

Soft Voting Classifier(n=1) AUC: 0.8767
Soft Voting Classifier(n=10) AUC: 0.8780
Soft Voting Classifier(n=20) AUC: 0.8775
Soft Voting Classifier(n=30) AUC: 0.8775
Soft Voting Classifier(n=40) AUC: 0.8777
Soft Voting Classifier(n=50) AUC: 0.8782


Unnamed: 0,num_of_clf,AUC
0,1,0.876709
1,10,0.877982
2,20,0.877496
3,30,0.8775
4,40,0.877729
5,50,0.878233
