BayesianOptimization2

In [2]:
import os
import pandas as pd 
import numpy as np
import multiprocessing # 여러 개의 일꾼 (cpu)들에게 작업을 분산시키는 역할
from multiprocessing import Pool 
from functools import partial # 함수가 받는 인자들 중 몇개를 고정 시켜서 새롭게 파생된 함수를 형성하는 역할
from data_loader_v2 import data_loader_v2
from tqdm import tqdm_notebook
from sklearn.ensemble import RandomForestClassifier
import joblib # 모델을 저장하고 불러오는 역할

In [3]:
# Input locations: the per-file train/test folders and the training-label CSV.
train_folder, test_folder = 'data/train/', 'data/test/'
train_label_path = 'data/train_label.csv'

In [4]:
# os.listdir returns entries in arbitrary order; sort them so the file order
# (and therefore the row order of the frames built from these lists) is the
# same on every run and every machine.
train_list = sorted(os.listdir(train_folder))
test_list = sorted(os.listdir(test_folder))
# The first CSV column becomes the index of the label table.
train_label = pd.read_csv(train_label_path, index_col=0)

In [5]:
# Provided by Dacon.
def data_loader_all_v2(func, files, folder='', train_label=None, event_time=10, nrows=60, processes=None):
    """Load every entry of ``files`` in parallel and concatenate the results.

    ``func`` is called once per entry as
    ``func(entry, folder=..., train_label=..., event_time=..., nrows=...)``
    inside a worker process, so it must be picklable (defined at module
    level) and must return an object ``pandas.concat`` accepts.

    Args:
        func: Per-file loader.
        files: Iterable of entries handed to ``func`` one at a time.
        folder, train_label, event_time, nrows: Forwarded unchanged to
            ``func`` as keyword arguments.
        processes: Number of worker processes; defaults to
            ``multiprocessing.cpu_count()``.

    Returns:
        The concatenation of all per-file results, in the order of ``files``.

    Raises:
        ValueError: If ``files`` is empty.

    Note:
        When this is called from a script on a platform that starts workers
        by re-importing the main module, the *call site* has to sit under
        ``if __name__ == '__main__':``.  Guarding inside this function does
        not help: it only left ``df_list`` undefined whenever the function
        was imported from another module.
    """
    files = list(files)
    if not files:
        # Fail before spinning up a pool; pd.concat would raise ValueError anyway.
        raise ValueError('No files to load: `files` is empty.')

    func_fixed = partial(func, folder=folder, train_label=train_label,
                         event_time=event_time, nrows=nrows)
    if processes is None:
        processes = multiprocessing.cpu_count()

    # The context manager tears the pool down even if a worker raises.
    # Results are fully collected inside the block because leaving it
    # terminates the workers.
    with Pool(processes=processes) as pool:
        df_list = list(pool.imap(func_fixed, files))

    return pd.concat(df_list)

In [6]:
# Build the training frame: each entry of train_list is handed to
# data_loader_v2 together with the label table, and the per-file results are
# combined by data_loader_all_v2.
train = data_loader_all_v2(
    data_loader_v2,
    train_list,
    folder=train_folder,
    train_label=train_label,
    event_time=10,
    nrows=60,
)

In [8]:
# Build the test frame the same way, without labels.
# NOTE(review): event_time is 20 here but 10 for the training load — confirm
# the difference is intentional.
test = data_loader_all_v2(
    data_loader_v2,
    test_list,
    folder=test_folder,
    train_label=None,
    event_time=20,
    nrows=60,
)

In [10]:
# Split the training frame: the 'label' column is the target, every other
# column is a feature.
y_train = train['label']
X_train = train.drop(columns=['label'])

lgbm 최적화

In [51]:
import matplotlib
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import lightgbm as lgbm

In [52]:
# Short aliases; the objective function and the final fit read these globals.
X, y = X_train, y_train

In [60]:
# Objective function handed to the Bayesian optimiser.
def lgbm_cv(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Return the mean 5-fold cross-validated accuracy of a LightGBM classifier.

    Every argument arrives as a float, so integer-valued hyperparameters are
    rounded and the fractions / penalties are clamped before the model is
    built.  The module-level ``X`` and ``y`` are used as the data.

    Returns:
        Mean accuracy over the 5 folds.  Accuracy is a higher-is-better
        score and the optimiser maximises its target, so the value is
        returned as-is.  Negating it (appropriate only for an error metric
        such as RMSE) would steer the search towards the worst parameters.
    """
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero; no bagging_freq is set here — confirm
    # whether this dimension of the search actually changes the model.
    model = lgbm.LGBMClassifier(
        num_leaves=int(round(num_leaves)),
        feature_fraction=max(min(feature_fraction, 1), 0),
        bagging_fraction=max(min(bagging_fraction, 1), 0),
        max_depth=int(round(max_depth)),
        lambda_l1=max(lambda_l1, 0),
        lambda_l2=max(lambda_l2, 0),
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
    )
    accuracy = cross_val_score(model, X, y, scoring='accuracy', cv=5).mean()
    return accuracy

In [61]:
# Search interval (lower, upper) for each input of the objective function.
pbounds = dict(
    num_leaves=(24, 45),
    feature_fraction=(0.1, 0.9),
    bagging_fraction=(0.8, 1),
    max_depth=(5, 8.99),
    lambda_l1=(0, 5),
    lambda_l2=(0, 3),
    min_split_gain=(0.001, 0.1),
    min_child_weight=(5, 50),
)

In [62]:
# Create the optimiser: it searches lgbm_cv over the intervals in pbounds,
# with a fixed random_state so repeated runs start from the same seed.
lgbmBO = BayesianOptimization(
    f=lgbm_cv,
    pbounds=pbounds,
    verbose=2,
    random_state=1,
)

In [None]:
# Run the Bayesian optimisation loop.
# n_iter is set to 10 to keep the run time down.
# The 'ei' acquisition function is used.
# xi=0.01 raises the strength of exploration slightly.
lgbmBO.maximize(
    init_points=2,
    n_iter=10,
    acq='ei',
    xi=0.01,
)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------


In [None]:
lgbmBO.max # Inspect the parameter values that were found

파라미터 적용

In [None]:
# Build the final classifier from the best point recorded by the optimiser.
# The raw values are floats, so the same rounding / clamping as in the
# objective function is applied before they reach LightGBM.
best_params = lgbmBO.max['params']
fit_lgbm = lgbm.LGBMClassifier(
    num_leaves=int(round(best_params['num_leaves'])),
    feature_fraction=max(min(best_params['feature_fraction'], 1), 0),
    bagging_fraction=max(min(best_params['bagging_fraction'], 1), 0),
    max_depth=int(round(best_params['max_depth'])),
    lambda_l1=max(best_params['lambda_l1'], 0),
    lambda_l2=max(best_params['lambda_l2'], 0),
    min_split_gain=best_params['min_split_gain'],
    min_child_weight=best_params['min_child_weight'],
)

In [None]:
# Train the configured classifier on the full training data and keep whatever
# fit() returns under model_lgbm.
model_lgbm = fit_lgbm.fit(X=X, y=y)