In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from lightgbm import LGBMClassifier
from tqdm import tqdm

# Read the data
train_origin = pd.read_csv('/Users/jaesolshin/내 드라이브/2024-2/Google ML Bootcamp2024/data/playground1/train.csv')

In [2]:
from category_encoders import TargetEncoder

# 데이터 샘플링
train = train_origin.sample(frac=0.01, random_state = 42)

# 예측에 필요 없는 'id'와 'Annual_Premium' 변수를 드롭
train = train.drop(columns=['id'])

# 범주형 변수 인코딩
def encoding(train):
    gender_mapping = {'Male': 0, 'Female': 1}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    vehicle_damage_mapping = {'No': 0, 'Yes': 1}

    train['Gender'] = train['Gender'].map(gender_mapping)
    train['Vehicle_Age'] = train['Vehicle_Age'].map(vehicle_age_mapping)
    train['Vehicle_Damage'] = train['Vehicle_Damage'].map(vehicle_damage_mapping)

    return train

train = encoding(train)

# 범주형 변수 타겟 인코딩
cat_columns = ['Region_Code', 'Policy_Sales_Channel', 'Vintage']
train.loc[:,cat_columns] = train.loc[:,cat_columns].astype('category')

target_encoder = TargetEncoder()
train[cat_columns] = target_encoder.fit_transform(train[cat_columns],train['Response'])

#수치형 변수: Age, 'Annual_Premium' -> normalize
scaler = MinMaxScaler()
num_columns = ['Age', 'Annual_Premium']
train[num_columns] = scaler.fit_transform(train[num_columns])

# 예측변수 분리 및 train, valid set 분리
X = train.drop(['Response'], axis=1)
y = train['Response']

In [4]:
#pip install hyperopt

Collecting hyperopt
  Using cached hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting future (from hyperopt)
  Using cached future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting py4j (from hyperopt)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
Using cached future-1.0.0-py3-none-any.whl (491 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, future, hyperopt
Successfully installed future-1.0.0 hyperopt-0.2.7 py4j-0.10.9.7
Note: you may need to restart the kernel to use updated packages.


In [7]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

# 목적 함수 정의
def objective(params):
    params = {
        'num_leaves': int(params['num_leaves']),
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
        'n_estimators': int(params['n_estimators']),
        'lambda_l1': params['lambda_l1'],
        'lambda_l2': params['lambda_l2']
    }
    clf = LGBMClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc').mean()
    return {'loss': -score, 'status': STATUS_OK}

# 하이퍼파라미터 공간 설정
space = {
    'num_leaves': hp.choice('num_leaves', [20, 31, 50, 100]),
    'max_depth': hp.choice('max_depth', [-1, 20]),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'n_estimators': hp.choice('n_estimators', [100, 200, 300]),
    'lambda_l1': hp.uniform('lambda_l1', 0.0, 2.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 2.0)
}

# Hyperopt 최적화 실행
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials, rstate=np.random.default_rng(42))

# 최적의 하이퍼파라미터 출력
print("Best parameters found by Hyperopt:", best)

                                                      

[LightGBM] [Info] Number of positive: 11313, number of negative: 80725
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.078268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 709                      
[LightGBM] [Info] Number of data points in the train set: 92038, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122917 -> initscore=-1.965096
[LightGBM] [Info] Start training from score -1.965096 
[LightGBM] [Info] Number of positive: 11313, number of negative: 80725
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 711                      
[LightGBM] [Info] Number of data points in the train set: 92038, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScor