포스팅 [링크](https://visitor-badge.laobi.icu/badge?page_id=jihyun22.github.io/데이콘리뷰/psychology-02/)

## 01~2 data_encoding.ipynb 참고

In [1]:
import pandas as pd
import numpy as np

In [2]:
# 데이터 로드
train=pd.read_csv('data/train.csv', index_col=0)
test=pd.read_csv('data/test_x.csv', index_col=0)
submission=pd.read_csv('data/sample_submission.csv', index_col=0)
# 범주형 -> 이산형
X = pd.get_dummies(train.drop('voted', axis = 1))
y = train['voted']
test = pd.get_dummies(test)

In [3]:
# 칼럼 개수 변화
print("X : {}\ntest : {}".format(X.shape, test.shape))
# 인코딩 확인
print("Encoding Success") if list(X.columns) == list(test.columns) else list(test.columns)

X : (45532, 100)
test : (11383, 100)
Encoding Success


## 03 데이터 전처리

In [4]:
# nan 값 메꾸기
X = X.fillna(X.mean())
# 중복 값 제거
X.drop_duplicates(keep='first', inplace = True)
# 비교 -> nan 없음
X.shape

(45532, 100)

In [5]:
from sklearn.preprocessing import MinMaxScaler
# 데이터 스케일링 -> 민맥스/스텐다드 모두 성능 비슷함
scaler=MinMaxScaler()
scaler.fit(X)
X=scaler.transform(X)
# 테스트 데이터도 동일 스케일러로
test=scaler.transform(test)

In [6]:
X

array([[5.00000000e-01, 1.40020340e-04, 7.50000000e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.57670567e-04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [7.50000000e-01, 6.61989656e-04, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [7.50000000e-01, 2.35300453e-04, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.99096703e-04, 5.00000000e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.00000000e-01, 1.95117101e-04, 1.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

## 04 모델링(베이지안)

In [34]:
import lightgbm as lgbm
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_validate

In [49]:
#목적함수 생성
def lgbm_cv(learning_rate, num_leaves, max_depth, min_child_weight, colsample_bytree, feature_fraction, bagging_fraction, lambda_l1, lambda_l2):
    model = lgbm.LGBMClassifier(learning_rate=learning_rate,
                                n_estimators = 300,
                                #boosting = 'dart',
                                num_leaves = int(round(num_leaves)),
                                max_depth = int(round(max_depth)),
                                min_child_weight = int(round(min_child_weight)),
                                colsample_bytree = colsample_bytree,
                                feature_fraction = max(min(feature_fraction, 1), 0),
                                bagging_fraction = max(min(bagging_fraction, 1), 0),
                                lambda_l1 = max(lambda_l1, 0),
                                lambda_l2 = max(lambda_l2, 0)
                               )
    scoring = {'roc_auc_score': make_scorer(roc_auc_score)}
    result = cross_validate(model, X, y, cv=5, scoring=scoring)
    auc_score = result["test_roc_auc_score"].mean()
    return auc_score

In [50]:
# 입력값의 탐색 대상 구간
pbounds = {'learning_rate' : (0.0001, 0.05),
           'num_leaves': (300, 600),
           'max_depth': (2, 25),
           'min_child_weight': (30, 100),
           'colsample_bytree': (0, 0.99),
           'feature_fraction': (0.0001, 0.99),
           'bagging_fraction': (0.0001, 0.99),
           'lambda_l1' : (0, 0.99),
           'lambda_l2' : (0, 0.99),
          }

In [51]:
#객체 생성
lgbmBO = BayesianOptimization(f = lgbm_cv, pbounds = pbounds, verbose = 2, random_state = 0 )

In [52]:
# 반복적으로 베이지안 최적화 수행
# acq='ei'사용
# xi=0.01 로 exploration의 강도를 조금 높임
lgbmBO.maximize(init_points=5, n_iter = 20, acq='ei', xi=0.01)

|   iter    |  target   | baggin... | colsam... | featur... | lambda_l1 | lambda_l2 | learni... | max_depth | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6986  [0m | [0m 0.5434  [0m | [0m 0.708   [0m | [0m 0.5968  [0m | [0m 0.5394  [0m | [0m 0.4194  [0m | [0m 0.03233 [0m | [0m 12.06   [0m | [0m 92.42   [0m | [0m 589.1   [0m |
| [0m 2       [0m | [0m 0.6865  [0m | [0m 0.3797  [0m | [0m 0.7838  [0m | [0m 0.5237  [0m | [0m 0.5624  [0m | [0m 0.9163  [0m | [0m 0.003645[0m | [0m 4.004   [0m | [0m 31.42   [0m | [0m 549.8   [0m |
| [95m 3       [0m | [95m 0.7     [0m | [95m 0.7704  [0m | [95m 0.8613  [0m | [95m 0.9688  [0m | [95m 0.7912  [0m | [95m 0.4569  [0m | [95m 0.03905 [0m | [95m 4.72    [0m | [95m 74.79   [0m | [95m 343.0   [0m |
| [0m 4       [0m | [0m 0.6999  [0m | [0m 0.9352  

In [53]:
# 찾은 파라미터 값 확인
lgbmBO.max

{'target': 0.7042782365476333,
 'params': {'bagging_fraction': 0.9515683153667026,
  'colsample_bytree': 0.7860120233288207,
  'feature_fraction': 0.9362075859090412,
  'lambda_l1': 0.7111030183072032,
  'lambda_l2': 0.5642765168754059,
  'learning_rate': 0.011407920284082697,
  'max_depth': 6.090247578634,
  'min_child_weight': 33.33272426081254,
  'num_leaves': 380.181539974917}}

In [54]:
#파라미터 적용
fit_lgbm = lgbm.LGBMClassifier(learning_rate=lgbmBO.max['params']['learning_rate'],
                               num_leaves = int(round(lgbmBO.max['params']['num_leaves'])),
                               max_depth = int(round(lgbmBO.max['params']['max_depth'])),
                               min_child_weight = int(round(lgbmBO.max['params']['min_child_weight'])),
                               colsample_bytree=lgbmBO.max['params']['colsample_bytree'],
                               feature_fraction = max(min(lgbmBO.max['params']['feature_fraction'], 1), 0),
                               bagging_fraction = max(min(lgbmBO.max['params']['bagging_fraction'], 1), 0),
                               lambda_l1 = lgbmBO.max['params']['lambda_l1'],
                               lambda_l2 = lgbmBO.max['params']['lambda_l2']
                               )

In [55]:
model = fit_lgbm.fit(X,y)

## 04 모델 적용

In [56]:
import joblib
joblib.dump(model, 'lgbmBO_201006.pkl')

['lgbmBO_201006.pkl']

In [57]:
pred_y = model.predict(test)

In [58]:
submission['voted']=pred_y
submission.to_csv('lgbmBO_201006.csv')