In [3]:
# ============================================= [ setting ] ==========================================================
import pandas as pd  # 데이터 분석 라이브러리
import numpy as np  # 계산 라이브러리
from tqdm import tqdm  # 진행바
from sklearn.metrics import roc_auc_score  # AUC 스코어 계산
from sklearn.model_selection import KFold  # K-fold CV
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리
from functools import partial  # 함수 변수 고정
import lightgbm as lgb  # LightGBM 라이브러리
import eli5
from eli5.sklearn import PermutationImportance

Using TensorFlow backend.


In [73]:
tidy_train = pd.read_csv("/Users/gimjiseong/Downloads/[ DACON ] Game_Behavior_DataAnalysis_Comp./tidy_train.csv")

In [74]:
x_train = tidy_train.iloc[:, 0:49]
y_train = tidy_train.loc[:, "winner"]

In [75]:
x_train = x_train.set_index("game_id")

In [76]:
# ========================================================== [ modeling ] ===========================================================
## + [lightGBM modeling ] ==================================
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda,
           bagging_fraction, x_data = None, y_data = None, n_splits = 5, output = 'score'):
    score = 0
    kf = KFold(n_splits = n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]

        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves),
            learning_rate = learning_rate,
            n_estimators = int(n_estimators),
            subsample = np.clip(subsample, 0, 1),
            colsample_bytree = np.clip(colsample_bytree, 0, 1),
            reg_alpha = reg_alpha,
            reg_lambda = reg_lambda,
            bagging_fraction = np.clip(bagging_fraction, 0, 1)
        )

        model.fit(x_train, y_train)
        models.append(model)

        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred) / n_splits

    if output == 'score':
        return score
    if output == 'model':
        return models

In [8]:
func_fixed = partial(lgb_cv, x_data=x_train, y_data=y_train, n_splits = 5, output='score')

lgbBO = BayesianOptimization(
    func_fixed,
    {   'num_leaves': (16, 1024),  # num_leaves,       범위(16~1024)
        'learning_rate': (0.0001, 0.1),  # learning_rate,    범위(0.0001~0.1)
        'n_estimators': (16, 1024),  # n_estimators,     범위(16~1024)
        'subsample': (0, 1),  # subsample,        범위(0~1)
        'colsample_bytree': (0, 1),  # colsample_bytree, 범위(0~1)
        'reg_alpha': (0, 10),  # reg_alpha,        범위(0~10)
        'reg_lambda': (0, 50),  # reg_lambda,       범위(0~50)
        "bagging_fraction": (0, 1)
    },
    random_state = 21  # 시드 고정
)

lgbBO.maximize(init_points = 5, n_iter = 30)  # 처음 5회 랜덤 값으로 score 계산 후 50회 최적화

|   iter    |  target   | baggin... | colsam... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6566  [0m | [0m 0.04872 [0m | [0m 0.2891  [0m | [0m 0.07212 [0m | [0m 37.79   [0m | [0m 223.6   [0m | [0m 0.5077  [0m | [0m 15.11   [0m | [0m 0.6639  [0m |
| [95m 2       [0m | [95m 0.6698  [0m | [95m 0.3081  [0m | [95m 0.5836  [0m | [95m 0.00705 [0m | [95m 890.3   [0m | [95m 150.3   [0m | [95m 1.781   [0m | [95m 24.8    [0m | [95m 0.8637  [0m |
| [0m 3       [0m | [0m 0.656   [0m | [0m 0.7589  [0m | [0m 0.9705  [0m | [0m 0.07595 [0m | [0m 403.3   [0m | [0m 428.0   [0m | [0m 7.134   [0m | [0m 13.53   [0m | [0m 0.8541  [0m |
| [0m 4       [0m | [0m 0.6629  [0m | [0m 0.9132  [0m | [0m 0.7608  [0m | [0m 0.05172 [0m | [0m 185.2   [0m | [0m 317.1   [0m 

In [77]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'],
    params['learning_rate'],
    params['n_estimators'],
    params['subsample'],
    params['colsample_bytree'],
    params['reg_alpha'],
    params['reg_lambda'],
    params["bagging_fraction"],
    x_data = x_train, y_data = y_train, n_splits = 5, output = 'model')

In [78]:
tidy_test = pd.read_csv("/Users/gimjiseong/Downloads/[ DACON ] Game_Behavior_DataAnalysis_Comp./tidy_test.csv")
x_test = tidy_test.copy() 

In [79]:
x_test = x_test.set_index("game_id")

In [80]:
preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis = 0)

In [81]:
# ========================================================== [ output ] ===========================================================
submission = pd.read_csv('/Users/gimjiseong/Downloads/[ DACON ] Game_Behavior_DataAnalysis_Comp./data/sample_submission.csv', index_col = 0)
submission['winner'] = submission['winner'] + pred
submission.head(20)

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.664486
38873,0.643187
38874,0.50932
38875,0.22385
38876,0.591681
38877,0.472641
38878,0.757168
38879,0.391858
38880,0.441628
38881,0.444267


In [82]:
submission.to_csv('/Users/gimjiseong/Downloads/[ DACON ] Game_Behavior_DataAnalysis_Comp./output14.csv')