## Dacon  3회 게임 행동 데이터 분석 경진대회
## 팀 : 도발하려던건 아니었습니다만
## 2020년 4월 20일

## 1. 라이브러리 및 데이터
## Library & Data

In [None]:
# 라이브러리 설치
import os                                            # 디렉토리 설정
os.chdir("/data")
import warnings                                      # 경고 메세지 무시
warnings.filterwarnings('ignore')
import pandas as pd                                  # 데이터 조작, 분석
import numpy as np                                   # 행렬 연산
import random                                        # 난수 생성
random.seed(2020)
random_seed = 2020
import time                                          # 시간 측정
import re                                            # 정규표현식

from sklearn.model_selection import train_test_split # train, validation 데이터 나누기
from sklearn import metrics                          # AUC 측정
!pip install catboost
from catboost import CatBoostClassifier, Pool        # CatBoost 모델링
import lightgbm as lgb                               # lightGBM 모델링
from sklearn.model_selection import KFold            # K-fold CV    
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization           # 베이지안 최적화 라이브러리  
from functools import partial                        # 함수 변수 고정

# Load the raw train/test event logs (relative paths, resolved against the
# working directory set by os.chdir("/data") earlier in this cell)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K     |████████████████████████████████| 64.4MB 53kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22
Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/b5/26/9842333adbb8f17bcb3d699400a8b1ccd

## 2. 데이터 전처리
## Data Cleansing & Pre-Processing

In [None]:
# Response-variable preprocessing
def preprocess_y(data, exchange_player=False):
  """Extract one `winner` label per game from the event-level log.

  Args:
    data: DataFrame with at least the columns `game_id` and `winner`;
      rows are de-duplicated on that pair, keeping first-appearance order.
    exchange_player: when True, the labels are followed by their
      complement (1 - winner), i.e. the labels of the same games with
      the two players swapped, so the result is twice as long.

  Returns:
    Series of labels with a fresh 0..k-1 index.
  """
  y = data.drop_duplicates(['game_id', 'winner']).winner.reset_index(drop=True)
  if exchange_player:
    # Series.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the original labels and the swapped-player labels.
    y = pd.concat([y, 1 - y]).reset_index(drop=True)
  return y

In [None]:
# Explanatory-variable preprocessing
def preprocess_X(data, exchange_player=False):
  """Aggregate the event-level log into one feature row per game.

  Args:
    data: event-level DataFrame. The code reads the columns `game_id`,
      `player`, `time`, `species`, `event` and `event_contents`.
    exchange_player: when True, a second copy of every row is appended in
      which the `0_*` and `1_*` feature columns trade places (index
      shifted by n), doubling the number of rows.

  Returns:
    DataFrame indexed by game_id with its columns sorted by name.
  """

  # Create DataFrame X with one index entry per game_id (game_id.min() .. game_id.max())
  n = data.game_id.max() + 1
  X = pd.DataFrame(index=range(n)[data.game_id.min():])

  # time feature: time of the last logged row of each game
  X['time'] = data.drop_duplicates(['game_id'],keep='last').set_index('game_id').time
  # Integer part * 60 + two decimal digits, i.e. the value is treated as
  # minutes.seconds and converted to seconds (presumably the raw format — confirm).
  # NOTE(review): this is float arithmetic followed by a truncating astype(int),
  # so e.g. 0.29 evaluates to 28.99... and becomes 28 — consider rounding.
  X['time'] = (X.time*100//100*60 + X.time*100%100).astype(int)

  # species dummy variables, one set per player (taken from each player's first row per game)
  X = pd.concat([pd.get_dummies(data[data.player == 0].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'0_protoss','T':'0_terran','Z':'0_zerg'}),
                pd.get_dummies(data[data.player == 1].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'1_protoss','T':'1_terran','Z':'1_zerg'}),
                X],axis=1)
  
  # event count: number of logged rows per (player, game)
  contents = data.loc[:,['player','game_id','time']].groupby(['player', 'game_id']).count().unstack(level=0)
  contents.columns = ['0_event', '1_event']
  X['0_event'], X['1_event'] = contents['0_event'], contents['1_event']

  # event count / time
  X['0_event_per_sec'], X['1_event_per_sec'] = X['0_event'] /X.time, X['1_event'] /X.time

  # Per-event-type counts (Ability, AddToControlGroup, Camera, ControlGroup, GetControlGroup, Right Click, Selection, SetControlGroup)
  contents = data.loc[:,['player','event','game_id','time']].groupby(['player', 'event', 'game_id']).count().unstack(level=[0,1]).fillna(0).astype(int)
  # Positional rename: assumes the unstacked columns come out as player 0 then
  # player 1, each with every event type in sorted order — TODO confirm that
  # every event type occurs for both players, otherwise the lengths differ.
  contents.columns = ['0_'+x for x in sorted(data.event.unique())] + ['1_'+x for x in sorted(data.event.unique())]
  for i in contents.columns:
    X[i] = contents[i]

  # For event == Camera: Euclidean distance between consecutive 2-D coordinates
  # parsed from event_contents, aggregated as sum, min, median and max.
  # The slicing x[4:comma] / x[comma+2:len-1] assumes strings shaped like
  # 'At (x, y)' — TODO confirm against the raw data.
  def move_sum(i):
    # A single-row group has no consecutive pair, so the sum over the empty diff is 0
    return sum(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
  def move_min(i):
    # min() of an empty sequence would raise, hence the explicit single-row guard
    if len(i) == 1:
      return 0
    return min(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
  def move_median(i):
    # Same single-row guard as move_min
    if len(i) == 1:
      return 0
    return np.median(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 + 
                             np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
  def move_max(i):
    # Same single-row guard as move_min
    if len(i) == 1:
      return 0
    return max(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
  contents = (data[data.event == 'Camera'].loc[:,['player','game_id','event_contents']].
              groupby(['player','game_id'])).agg([move_sum,move_min,move_median,move_max]).unstack(level=0)
  # Positional rename: statistic-major, player-minor (0_move_sum, 1_move_sum, 0_move_min, ...)
  contents.columns = [y+x for x in ['sum','min','median','max'] for y in ['0_move_','1_move_']]
  for i in contents.columns:
    X[i] = contents[i].fillna(0)

  # move_sum restricted to rows with time < 0.3 (the first 30 seconds under the minutes.seconds reading)
  contents = (data[(data.time < 0.3) & (data.event == 'Camera')].loc[:,['player','game_id','event_contents']].
              groupby(['player','game_id'])).agg(move_sum).unstack(level=0)
  contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
  # Unlike the block above, missing values are not filled here
  for i in contents.columns:
    X[i] = contents[i]

  # For event == Ability: per-code count columns built from event_contents
  contents = pd.DataFrame(data.event_contents[(data.event == 'Ability')].map(lambda x: x[x.find('(')+1:x.find(')')]))  # keep only the text between '(' and ')' (the hex code)
  # game_id / player are aligned back from `data` by row index; count is a constant 1 to be counted
  contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
  # Empty frame listing a 0_ and a 1_ column for every code, so both players get every column
  contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
  contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
  # Flatten the (count, player, code) column tuples to '<player>_<code>'
  contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
  contents_X = pd.concat([contents_X, contents])
  for i in contents_X.columns:
    X[i] = contents_X[i]
    X[i] = X[i].fillna(0).astype(int)

  # Ability code counts / time
  for i in contents_X.columns:
    X[i+'_div_time'] = X[i] /X.time

  # For event == Selection: per-item count columns built from event_contents.
  # The nested re.sub calls delete whitespace-prefixed bracketed tags of 7, 6 and 5
  # characters, then brackets, spaces and quotes are stripped, leaving a comma-separated list.
  # NOTE(review): the patterns are non-raw string literals containing '\s' and '\[';
  # newer Python versions warn about such escapes — consider r'' prefixes.
  contents = data[data.event == 'Selection'].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                                replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
  contents = contents.str.split(',')
  # Explode the lists: t[i] holds the i-th item of every row that has one, keeping the original row index
  max_num = max(contents.map(lambda x: len(x)))
  t = [0 for x in range(max_num)]
  for i in range(max_num):
    t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
  contents = pd.concat([t[i] for i in range(max_num)])
  # Same count / unstack / flatten pattern as for the Ability codes
  contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
  contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
  contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
  contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
  contents_X = pd.concat([contents_X, contents])
  for i in contents_X.columns:
    X[i] = contents_X[i]
    X[i] = X[i].fillna(0).astype(int)

  # Selection item counts / time
  for i in contents_X.columns:
    X[i+'_div_time'] = X[i] /X.time

  # Selection item counts restricted to rows with time < 0.3; same pipeline as above,
  # stored under a '_30sec' suffix
  contents = data[(data.time < 0.3) & (data.event == 'Selection')].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                                                      replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
  contents = contents.str.split(',')
  max_num = max(contents.map(lambda x: len(x)))
  t = [0 for x in range(max_num)]
  for i in range(max_num):
    t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
  contents = pd.concat([t[i] for i in range(max_num)])
  contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
  contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
  contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
  contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
  contents_X = pd.concat([contents_X, contents])
  for i in contents_X.columns:
    X[i+'_30sec'] = contents_X[i]
    X[i+'_30sec'] = X[i+'_30sec'].fillna(0).astype(int)

  # For event == Right Click whose event_contents starts with 'Target': per-target-name count columns
  contents = pd.DataFrame(data.event_contents[(data.event == 'Right Click') & (data.event_contents.map(lambda x: str(x)[:6]) == 'Target')].map(lambda x: x[x.find(':')+2:x.find(' [')]))  # keep only the text between ': ' and ' [' (the target name)
  contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
  contents_X = pd.DataFrame(columns=[x+y for x in ['0_Target_','1_Target_'] for y in contents.event_contents.unique()])
  contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
  contents.columns = contents.columns.map(lambda x: str(x[1])+'_Target_'+x[2])
  contents_X = pd.concat([contents_X, contents])
  for i in contents_X.columns:
    X[i] = contents_X[i]
    X[i] = X[i].fillna(0).astype(int)

  # Sort the columns by name
  X = X[sorted(X.columns)]

  # Build X1 with players 0 and 1 swapped and stack it under X, doubling the data.
  # The relabelling relies on the sorted order being [all 0_* columns, all 1_* columns, 'time']
  # with equally many 0_* and 1_* columns: each half receives the other half's names.
  # NOTE(review): X1.index is given exactly n labels, which only matches X's row
  # count when game_id starts at 0 (see the index built at the top) — confirm for new inputs.
  if (exchange_player == True):
    c = X.shape[1]//2
    X1 = X.copy()
    X1.columns = list(X.columns[c:2*c])+list(X.columns[:c])+['time']
    X1.index = [x+n for x in range(n)]
    X = pd.concat([X, X1])

  return X

In [None]:
# Run the preprocessing: y is the response, X / test_X are the feature tables.
# The training side is built with the player-swapped copy appended.
y = preprocess_y(train, exchange_player=True)
X = preprocess_X(train, exchange_player=True)
test_X = preprocess_X(test, exchange_player=False)

# Release the raw event logs to save memory
del train
del test

# Keep only the columns that both feature tables share
train_only_cols = set(X.columns) - set(test_X.columns)
test_only_cols = set(test_X.columns) - set(X.columns)
X.drop(columns=list(train_only_cols), inplace=True)
test_X.drop(columns=list(test_only_cols), inplace=True)

## 3. 탐색적 자료분석
## Exploratory Data Analysis


In [None]:
# 입력하세요.

## 4. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling

In [None]:
# CatBoost modelling
def catboost_modeling(x_train, y_train, x_test, grow_policy, depth, learning_rate, l2_leaf_reg, random_seed, n, n_splits=10):
  """Train repeated K-fold CatBoost models and average their test predictions.

  For each of the `n` repeats a K-fold split is drawn with seed
  `random_seed + i`; one CatBoostClassifier is fitted per fold, using the
  held-out fold as `eval_set` for AUC-based early stopping. The positive-class
  probabilities on `x_test` are averaged over all `n_splits * n` models.

  Side effects: reads 'sample_submission.csv' from the working directory and
  writes 'CatBoost_<grow_policy>_<depth>.csv' next to it.

  Args:
    x_train: training feature DataFrame (sliced positionally with .iloc).
    y_train: training labels, positionally aligned with `x_train`.
    x_test: test feature DataFrame; its index becomes the prediction index.
    grow_policy: CatBoost tree growing policy, e.g. 'Depthwise' or 'Lossguide'.
    depth: tree depth passed to CatBoost.
    learning_rate: learning rate passed to CatBoost.
    l2_leaf_reg: L2 regularisation strength passed to CatBoost.
    random_seed: base seed; repeat i uses random_seed + i for both the fold
      split and the model.
    n: number of times the whole K-fold procedure is repeated.
    n_splits: number of folds per repeat (default 10, the original setting).

  Returns:
    Series of averaged predicted probabilities indexed like `x_test`.
  """

  # Positional label array, so the fold indices line up even when y_train
  # carries a non-default index
  y_values = np.asarray(y_train)

  # Float accumulator for the averaged test predictions
  test_pred = pd.Series(0.0, index=x_test.index)
  n_models = n_splits * n

  # Repeat the K-fold modelling n times
  for i in range(n):
    # shuffle=True is required for random_state to take effect: without it
    # older scikit-learn silently reused identical folds in every repeat and
    # scikit-learn >= 0.24 raises a ValueError.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed+i)
    for train_index, valid_index in kf.split(x_train):
      train_X, train_y = x_train.iloc[train_index], y_values[train_index]
      valid_X, valid_y = x_train.iloc[valid_index], y_values[valid_index]

      model = CatBoostClassifier(eval_metric = 'AUC',              # evaluate with AUC
                                 iterations = 25000,               # at most 25000 iterations
                                 metric_period = 25000,            # suppress intermediate output
                                 early_stopping_rounds = 1000,     # stop after 1000 iterations without AUC improvement
                                 task_type = 'GPU',                # train on GPU
                                 grow_policy = grow_policy,        # how tree nodes are grown
                                                                   # 1) Depthwise (split level by level until the given depth)
                                                                   # 2) Lossguide (split the node with the largest loss change first)
                                 depth = depth,                    # tree depth
                                 learning_rate = learning_rate,    # learning rate
                                 l2_leaf_reg = l2_leaf_reg,        # L2 regularisation
                                 random_seed = random_seed+i,      # fixed seed for this repeat
                                 )
      # Fit with the held-out fold as the early-stopping evaluation set
      model.fit(train_X, train_y, eval_set=(valid_X, valid_y))

      # Add this model's share of the averaged positive-class probability
      test_pred += model.predict_proba(x_test)[:,1] / n_models

  # Save the result in the working directory, shaped like the sample submission
  sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
  submission = pd.DataFrame(data=test_pred, columns=sample_submission.columns, index=sample_submission.index)
  submission.to_csv('CatBoost_'+grow_policy+'_'+str(depth)+'.csv', index=True)

  return test_pred

## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [None]:
# data1, data2, data3, data4 생성

In [None]:
# Run 1: Depthwise trees, depth 10; n=2 repeats, so seeds 2014 and 2015
data1 = catboost_modeling(
    X, y, test_X,
    grow_policy='Depthwise',
    depth=10,
    learning_rate=0.02423,
    l2_leaf_reg=20.35,
    random_seed=2014,
    n=2,
)

0:	learn: 0.6885680	test: 0.6280163	best: 0.6280163 (0)	total: 93.1ms	remaining: 38m 47s
bestTest = 0.7472958565
bestIteration = 4700
Shrink model to first 4701 iterations.
0:	learn: 0.6879591	test: 0.6358847	best: 0.6358847 (0)	total: 90.5ms	remaining: 37m 42s
bestTest = 0.7568647861
bestIteration = 15705
Shrink model to first 15706 iterations.
0:	learn: 0.6868256	test: 0.6478515	best: 0.6478515 (0)	total: 88.4ms	remaining: 36m 50s
bestTest = 0.7744044662
bestIteration = 5953
Shrink model to first 5954 iterations.
0:	learn: 0.6864004	test: 0.6316850	best: 0.6316850 (0)	total: 87.2ms	remaining: 36m 18s
bestTest = 0.7607246339
bestIteration = 11089
Shrink model to first 11090 iterations.
0:	learn: 0.6787407	test: 0.6275454	best: 0.6275454 (0)	total: 85ms	remaining: 35m 24s
bestTest = 0.7645706534
bestIteration = 11940
Shrink model to first 11941 iterations.
0:	learn: 0.6873725	test: 0.6234113	best: 0.6234113 (0)	total: 84.3ms	remaining: 35m 7s
bestTest = 0.7502387166
bestIteration = 106

In [None]:
# Run 2: Lossguide trees, depth 8; n=2 repeats, so seeds 2014 and 2015
data2 = catboost_modeling(
    X, y, test_X,
    grow_policy='Lossguide',
    depth=8,
    learning_rate=0.01063,
    l2_leaf_reg=5.127,
    random_seed=2014,
    n=2,
)

0:	learn: 0.6479527	test: 0.6260457	best: 0.6260457 (0)	total: 30.1ms	remaining: 12m 31s
bestTest = 0.7482405305
bestIteration = 11591
Shrink model to first 11592 iterations.
0:	learn: 0.6407955	test: 0.6331550	best: 0.6331550 (0)	total: 29.4ms	remaining: 12m 15s
bestTest = 0.7595637441
bestIteration = 12962
Shrink model to first 12963 iterations.
0:	learn: 0.6467297	test: 0.6461743	best: 0.6461743 (0)	total: 30.9ms	remaining: 12m 52s
bestTest = 0.774276495
bestIteration = 11624
Shrink model to first 11625 iterations.
0:	learn: 0.6429494	test: 0.6330369	best: 0.6330369 (0)	total: 30.2ms	remaining: 12m 35s
bestTest = 0.7614810467
bestIteration = 16178
Shrink model to first 16179 iterations.
0:	learn: 0.6409373	test: 0.6283141	best: 0.6283141 (0)	total: 32.3ms	remaining: 13m 27s
bestTest = 0.763282001
bestIteration = 13004
Shrink model to first 13005 iterations.
0:	learn: 0.6460768	test: 0.6271483	best: 0.6271483 (0)	total: 33.7ms	remaining: 14m 2s
bestTest = 0.747095108
bestIteration = 

In [None]:
# Run 3: Depthwise trees, depth 12; n=2 repeats, so seeds 2022 and 2023
data3 = catboost_modeling(
    X, y, test_X,
    grow_policy='Depthwise',
    depth=12,
    learning_rate=0.01564,
    l2_leaf_reg=49.99,
    random_seed=2022,
    n=2,
)

0:	learn: 0.7239439	test: 0.6154970	best: 0.6154970 (0)	total: 227ms	remaining: 1h 34m 40s
bestTest = 0.7495038509
bestIteration = 14828
Shrink model to first 14829 iterations.
0:	learn: 0.7305130	test: 0.6290831	best: 0.6290831 (0)	total: 222ms	remaining: 1h 32m 36s
bestTest = 0.7569420934
bestIteration = 8193
Shrink model to first 8194 iterations.
0:	learn: 0.7249723	test: 0.6403514	best: 0.6403514 (0)	total: 199ms	remaining: 1h 22m 45s
bestTest = 0.7764061093
bestIteration = 9936
Shrink model to first 9937 iterations.
0:	learn: 0.7275343	test: 0.6299198	best: 0.6299198 (0)	total: 196ms	remaining: 1h 21m 33s
bestTest = 0.7600096464
bestIteration = 12420
Shrink model to first 12421 iterations.
0:	learn: 0.7243306	test: 0.6415314	best: 0.6415314 (0)	total: 192ms	remaining: 1h 20m 6s
bestTest = 0.7662849426
bestIteration = 10495
Shrink model to first 10496 iterations.
0:	learn: 0.7235512	test: 0.6212602	best: 0.6212602 (0)	total: 170ms	remaining: 1h 10m 41s
bestTest = 0.7492182255
bestI

In [None]:
# Run 4: Lossguide trees, depth 16; n=2 repeats, so seeds 2022 and 2023
data4 = catboost_modeling(
    X, y, test_X,
    grow_policy='Lossguide',
    depth=16,
    learning_rate=0.01213,
    l2_leaf_reg=5.027,
    random_seed=2022,
    n=2,
)

0:	learn: 0.6349732	test: 0.6138771	best: 0.6138771 (0)	total: 112ms	remaining: 46m 27s
bestTest = 0.7470026612
bestIteration = 12194
Shrink model to first 12195 iterations.
0:	learn: 0.6483004	test: 0.6382686	best: 0.6382686 (0)	total: 106ms	remaining: 44m 1s
bestTest = 0.7585081458
bestIteration = 10716
Shrink model to first 10717 iterations.
0:	learn: 0.6501396	test: 0.6520075	best: 0.6520075 (0)	total: 105ms	remaining: 43m 39s
bestTest = 0.7750359476
bestIteration = 8516
Shrink model to first 8517 iterations.
0:	learn: 0.6464623	test: 0.6434762	best: 0.6434762 (0)	total: 108ms	remaining: 45m 6s
bestTest = 0.7626478672
bestIteration = 14248
Shrink model to first 14249 iterations.
0:	learn: 0.6499757	test: 0.6428963	best: 0.6428963 (0)	total: 110ms	remaining: 45m 45s
bestTest = 0.7640561461
bestIteration = 16115
Shrink model to first 16116 iterations.
0:	learn: 0.6478974	test: 0.6284932	best: 0.6284932 (0)	total: 106ms	remaining: 44m 18s
bestTest = 0.7469660044
bestIteration = 7765
S

## 6. 결과 및 결언
## Conclusion & Discussion

In [None]:
# Final ensemble: average each pair of runs, then blend the pairs with
# weights 1/3 (data1, data2) and 2/3 (data3, data4)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
mean_12 = (data1 + data2) / 2
mean_34 = (data3 + data4) / 2
data_final = pd.DataFrame(mean_12 * 1 / 3 + mean_34 * 2 / 3)
data_final.columns = sample_submission.columns
data_final.to_csv('data_final.csv', index=True)
data_final

*  이번 대회의 행동데이터를 전처리를 하여 만들어질 데이터는 sparse할 것이라 예상하였습니다.  
*  따라서 EDA 를 통한 Feature Engineering 보다는, 가공되지 않은 raw 데이터에 포함 된 정보를 최대한 정형화된 형태로 피쳐를 생성하는 것에 집중했습니다.
* 모델 학습 단계에서 모든 observation의 정보를 이용하고, 과적합 방지를 위해서 10 - fold 로 모델을 학습시키고 평균을 내는것이 성능을 많이 높일 수 있다는 것을 확인할 수 있었습니다. 
* player1 과 player2 를 스왑하여 observation을 두배로 만들어 주는것이 성능 향상에 도움이 되었습니다.
* catboost 알고리즘의 grow_policy 옵션의 파라미터 값("Lossguide"와 "Depthwise")에 따라 베이지안 최적화를 팀원과 함께 해본 결과, depth와 l2_leaf_reg 등의 최적값은 다르게 나왔습니다.
* 실험을 해 본 결과, 파라미터의 다양성이 모델의 성능을 향상시킨다는 것을 알게 되어, 총 네가지 조합의 파라미터로 앙상블을 했습니다. (data1, data2, data3, data4)
* 또한 각 조합마다 랜덤시드를 2번 바꿔주었기에 (catboost_modeling 사용자 함수에서의 n 파라미터) 총 8회의 10-fold 학습을 수행했습니다. (fold 단위로는 4가지 조합 × 2개 시드 × 10 fold = 80개의 모델)
* 대회기간 내내 코랩의 GPU를 이용했기 때문에 재현 시 결과는 차이가 날 수 있으나, 차이는 미미할 것으로 예상합니다. (catboost의 깃허브에서 이에 대해 언급하고 있습니다: https://github.com/catboost/catboost/issues/546 )
