In [2]:
# ============================================= [ setting ] ==========================================================
import pandas as pd  # 데이터 분석 라이브러리
import numpy as np  # 계산 라이브러리
from tqdm import tqdm  # 진행바
from sklearn.metrics import roc_auc_score  # AUC 스코어 계산
from sklearn.model_selection import KFold  # K-fold CV
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리
from functools import partial  # 함수 변수 고정
import lightgbm as lgb  # LightGBM 라이브러리


In [4]:
# ============================================= [ pre-processing ] ==============================================

def species_converter(string):
    if string == 'T':
        return 0
    elif string == 'P':
        return 1
    elif string == 'Z':
        return 2
    else:
        raise ValueError


def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection',
              'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0

    species = df.groupby(['game_id', 'player']).species.unique()
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()

    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)
        df = df.fillna(0)

        df_P0_species = pd.DataFrame([species_converter(df.loc[0]['species'][0])], columns=['P0_species'])
        df_P1_species = pd.DataFrame([species_converter(df.loc[1]['species'][0])], columns=['P1_species'])
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T

        df_delta_event = delta_event.copy()
        for column in df.columns:
            df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
        df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_species, df_P0_event, df_P1_species, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'

        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)

    return x_data, y_data


train = pd.read_csv('C:/Users/HSystem/Desktop/data/train.csv')
x_train, y_train = data_preparation(train, answer=True)
x_train.head()

100%|██████████| 38872/38872 [05:07<00:00, 126.49it/s]


Unnamed: 0_level_0,P0_species,P0_Ability,P0_AddToControlGroup,P0_Camera,P0_ControlGroup,P0_GetControlGroup,P0_Right Click,P0_Selection,P0_SetControlGroup,P1_species,...,P1_Selection,P1_SetControlGroup,delta_Ability,delta_AddToControlGroup,delta_Camera,delta_ControlGroup,delta_GetControlGroup,delta_Right Click,delta_Selection,delta_SetControlGroup
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,34.0,2.0,444.0,0.0,24.0,35.0,50.0,3.0,0,...,57.0,1.0,0.0,2.0,19.0,0.0,21.0,7.0,-7.0,2.0
1,1,77.0,1.0,627.0,0.0,162.0,160.0,186.0,10.0,0,...,116.0,8.0,10.0,1.0,-231.0,0.0,131.0,29.0,70.0,2.0
2,1,69.0,6.0,413.0,0.0,99.0,160.0,90.0,14.0,2,...,232.0,9.0,-16.0,1.0,-312.0,-2.0,-10.0,-44.0,-142.0,5.0
3,0,82.0,0.0,713.0,0.0,132.0,276.0,180.0,6.0,1,...,148.0,19.0,-7.0,0.0,325.0,0.0,-578.0,8.0,32.0,-13.0
4,0,57.0,1.0,430.0,0.0,224.0,177.0,67.0,10.0,2,...,126.0,8.0,21.0,-3.0,158.0,0.0,125.0,71.0,-59.0,2.0


In [5]:
# ========================================================== [ modeling ] ===========================================================
## + [lightGBM modeling ] ==================================
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda,
           bagging_fraction, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    kf = KFold(n_splits=n_splits)
    models = []
    for train_index, valid_index in kf.split(x_data):
        x_train, y_train = x_data.iloc[train_index], y_data[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data[valid_index]

        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves),
            learning_rate = learning_rate,
            n_estimators = int(n_estimators),
            subsample = np.clip(subsample, 0, 1),
            colsample_bytree = np.clip(colsample_bytree, 0, 1),
            reg_alpha = reg_alpha,
            reg_lambda = reg_lambda,
            bagging_fraction = np.clip(bagging_fraction, 0, 1),
            feature_fraction = np.clip(feature_fraction, 0.5, 1)
        )

        model.fit(x_train, y_train)
        models.append(model)

        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid
        score += roc_auc_score(true, pred) / n_splits

    if output == 'score':
        return score
    if output == 'model':
        return models
    
func_fixed = partial(lgb_cv, 
                     x_data = x_train, 
                     y_data = y_train, 
                     n_splits = 5, 
                     output='score')

In [6]:
lgbBO = BayesianOptimization(
    func_fixed,
    {
        'num_leaves': (16, 1024),  
        'learning_rate': (0.0001, 0.1),
        'n_estimators': (16, 1024),
        'subsample': (0, 1),  
        'colsample_bytree': (0, 1), 
        'reg_alpha': (0, 10),  
        'reg_lambda': (0, 50),  
        "bagging_fraction": (0, 1)
    },
    random_state = 21  
)

lgbBO.maximize(init_points = 5, n_iter = 30) # n_iter = 30 steps

|   iter    |  target   | baggin... | colsam... | featur... | learni... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6397  [0m | [0m 0.04872 [0m | [0m 0.2891  [0m | [0m 0.8605  [0m | [0m 0.002259[0m | [0m 223.6   [0m | [0m 67.18   [0m | [0m 3.023   [0m | [0m 33.2    [0m | [0m 0.3081  [0m |
| [95m 2       [0m | [95m 0.6446  [0m | [95m 0.5836  [0m | [95m 0.06957 [0m | [95m 0.9337  [0m | [95m 0.01341 [0m | [95m 195.5   [0m | [95m 515.9   [0m | [95m 8.637   [0m | [95m 37.95   [0m | [95m 0.9705  [0m |
| [0m 3       [0m | [0m 0.6369  [0m | [0m 0.7593  [0m | [0m 0.3843  [0m | [0m 0.7044  [0m | [0m 0.07136 [0m | [0m 288.8   [0m | [0m 876.9   [0m | [0m 9.132   [0m | [0m 38.04   [0m | [0m 0.5167  [0m |
| [0m 4       [0m | [0m 0.6376  [0m | [0m 0.1678  

In [8]:
params = lgbBO.max['params']
models = lgb_cv(
    params['num_leaves'],
    params['learning_rate'],
    params['n_estimators'],
    params['subsample'],
    params['colsample_bytree'],
    params['reg_alpha'],
    params['reg_lambda'],
    params["bagging_fraction"],
    x_data = x_train, y_data = y_train, n_splits = 5, output = 'model')

test = pd.read_csv('C:/Users/HSystem/Desktop/data/test.csv')
x_test, _ = data_preparation(test, answer=False)

preds = []
for model in models:
    pred = model.predict_proba(x_test)[:, 1]
    preds.append(pred)
pred = np.mean(preds, axis=0)

100%|██████████| 16787/16787 [02:17<00:00, 122.03it/s]


In [9]:
# ========================================================== [ output ] ===========================================================
submission = pd.read_csv('C:/Users/HSystem/Desktop/data/sample_submission.csv', index_col=0)
submission['winner'] = submission['winner'] + pred
submission.to_csv('C:/Users/HSystem/Desktop/data/submission.csv')
submission.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.614225
38873,0.501006
38874,0.439308
38875,0.302744
38876,0.463035


In [None]:
# ======================================= [ Deeplearning model ] ========================================

In [15]:
from keras import models 
from keras import layers

In [64]:
model = models.Sequential()
model.add(layers.Dense(64, input_dim = 26, activation = "relu"))
model.add(layers.Dense(64, activation = "relu"))
model.add(layers.Dense(1, activation = "sigmoid"))
model.compile(optimizer  = "adam",
             loss  = "binary_crossentropy", 
             metrics = ["accuracy"])

In [65]:
history = model.fit(x_train, y_train,
                   epochs = 100,
                   batch_size = 32,
                   validation_split = 0.2)

Train on 31097 samples, validate on 7775 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100


Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [66]:
print('\nAccuracy: {:.4f}'.format(model.evaluate(x_train, y_train)[1]))


Accuracy: 0.6006
