In [1]:
# 팀이름: 포티나이너스 

In [2]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=2e11c525e65a9c7ed3c5f7fcd3129540ce02a09ebc40231676659474289f7f55
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [3]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 89kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [4]:
#  라이브러리 로드
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
import re
from tqdm import tqdm                       # 진행바
from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV 
from sklearn.model_selection import StratifiedKFold, cross_val_score
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시

import itertools

from sklearn.preprocessing import LabelEncoder

from xgboost import plot_importance
from xgboost import XGBClassifier

from catboost import CatBoostClassifier

In [None]:
# 학습, 테스트 데이터 로드
train = pd.read_csv('/data/train.csv')
test = pd.read_csv('/data/test.csv')

In [None]:
train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"


In [None]:
# event가 Ability 일 때 ability_code 계산
def get_ablility_code(x): 
    try:
        return x.split('-')[0].strip('( )')
    except:
        return np.nan

In [None]:
# event가 Camera 일 때 x, y 좌표 계산
def get_camera_pos(x, pos):
        try:
            return float(x.strip('at ( )').split(',')[pos])
        except BaseException as e:
            return np.nan

In [None]:
train.loc[train['event']=='Ability','ability_code'] = train.loc[train['event']=='Ability','event_contents'].apply(lambda x: get_ablility_code(x))
test.loc[test['event']=='Ability','ability_code'] = test.loc[test['event']=='Ability','event_contents'].apply(lambda x: get_ablility_code(x))

In [None]:
train.loc[train['event']=='Camera','camera_pos_x'] = train.loc[train['event']=='Camera','event_contents'].apply(lambda x: get_camera_pos(x,0))
train.loc[train['event']=='Camera','camera_pos_y'] = train.loc[train['event']=='Camera','event_contents'].apply(lambda x: get_camera_pos(x,1))
test.loc[test['event']=='Camera','camera_pos_x'] = test.loc[test['event']=='Camera','event_contents'].apply(lambda x: get_camera_pos(x,0))
test.loc[test['event']=='Camera','camera_pos_y'] = test.loc[test['event']=='Camera','event_contents'].apply(lambda x: get_camera_pos(x,1))

In [None]:
train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents,ability_code,camera_pos_x,camera_pos_y
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)",,145.25,21.507812
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)",,22.75,147.007812
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]'],,,
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV,1360.0,,
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)",,142.996094,24.503906


In [None]:
# 유닛코드 별 미네랄 정보 (딕셔너리)
unit_mineral={
    '15E0' : 50,    # TrainProbe
    '1586' : 100,   #
    '15A8' : 150,   #
    '15A0' : 150,   #
    '1581' : 125,   #
    '15C0' : 200,   #
    '15C1' : 25,    #
    '1580' : 100,   #
    '15A4' : 250,   #
    '15C3' : 275,   #
    '1585' : 50,    #
    '15A2' : 350,   #
    '15C2' : 300,   #
    '15D2' : 150,   #
    '1583' : 50,    # 
    '15A9' : 250,   #
    '2720' : 400,   #
    '1360' : 50,    #
    '13E0' : 50,    #
    '1420' : 100,   #
    '13E1' : 50,    #
    '13E3' : 150,   #
    '1421' : 150,   #
    '1407' : 150,   #
    '1426' : 150,   #
    '1422' : 100,   #
    '1424' : 150,   #
    '13E2' : 100,   #
    '1423' : 400,   #
    '1401' : 150,   #
    '1404' : 300,   #
    '1418' : 75,    #
    '1820' : 50,    #
    '1822' : 100,   #
    '1821' : 50,    # 
    '1823' : 100,   #
    '1829' : 75,    #
    '1BA0' : 50,    #
    '4020' : 25,    #
    '1824' : 100,   #
    '182B' : 150,   #
    '182E' : 100,   #
    '1840' : 150,   #
    '182A' : 100,   #
    '1E60' : 150,   #
    '920'  : 25     #
}

In [None]:
# 유닛코드 별 가스 정보
unit_gas = {
    '1586'  :  25 ,    #- TrainAdept/지상
    '15A8'  :  150,    #- TrainOracle/x(마법)
    '15A0'  :  100,    #- TrainPhoenix/공중
    '1581'  :  50 ,    #- TrainStalker/지상
    '15C1'  :  75 ,    #- TrainObserver/x
    '15A4'  :  150,    #- TrainVoidRay/지상,공중
    '15C3'  :  100,    #- TrainImmortal/지상
    '1585'  :  100,    #- TrainSentry/지상,공중 (마법)
    '15A2'  :  250,    #- TrainCarrier/지상,공중
    '15C2'  :  200,    #- TrainColossus/지상
    '15D2'  :  150,    #- TrainDisruptor/x(마법)
    '1583'  :  150,    #- TrainHighTemplar/x(마법)
    '15A9'  :  200,    #- TrainTempest/공중
    '2720'  :  100,    #- TrainMother/지상
    '1420'  :  100,    #- TrainMedivac/x
    '13E1'  :  50 ,    #- TrainReaper/지상
    '13E3'  :  25 ,    #- TrainMarauder/x
    '1421'  :  100,    #- TrainBanshee/지상
    '1407'  :  100,    #- TrainCyclone/지상,공중
    '1426'  :  100,    #- TrainLiberator/공중,지상
    '1422'  :  200,    #- TrainRaven/마법*
    '1424'  :  75 ,    #- TrainViking/지상,공중
    '13E2'  :  50 ,    #- TrainGhost/지상,공중
    '1423'  :  300,    #- TrainBattlecruiser/지상,공중
    '1401'  :  125,    #- Crucio지상
    '1404'  :  200,    #- Thor지상공중
    '1823'  :  50 ,    #- MorphHydralisk/지상,공중
    '1829'  :  25 ,    #- MorphRoach/지상
    '1BA0'  :  50,    # MorphToOverseer/x
    '4020'  :  75 ,    #- MorphToRavager/지상
    '1824'  :  100,    #- MorphMutalisk/지상,공중
    '182B'  :  100,    #- MorphCorruptor/공중
    '182E'  :  75 ,    #- MorphSwarmHost/지상
    '1840'  :  150,    #- MorphToBroodLord/지상
    '182A'  :  150,    #- MorphInfestor/x(마법)
    '920' : 25 #- Baneling지상   
}

In [None]:
# 건물 코드 리스트
building_0 = ['0_1541','0_1021','0_1543','0_1023','0_1541','0_1547','0_102A','0_1260','0_1261','0_102B','0_1024','0_1025','0_154E','0_12A0','0_154D','0_16E4','0_1549','0_16EE','0_1546','0_1020','0_16EF','0_12A1','0_12E1','0_16EA','0_16ED','0_1026','0_102D','0_16E3','0_12E0','0_16E5','0_16E0','0_16E6','0_154B','0_154C','0_154A','0_16E8','0_1542','0_16E2','0_1545','0_1028','0_1022','0_1540','0_102F','0_1029','0_16E9','0_2180','0_16E7']
building_1 = ['1_1541','1_1021','1_1543','1_1023','1_1541','1_1547','1_102A','1_1260','1_1261','1_102B','1_1024','1_1025','1_154E','1_12A0','1_154D','1_16E4','1_1549','1_16EE','1_1546','1_1020','1_16EF','1_12A1','1_12E1','1_16EA','1_16ED','1_1026','1_102D','1_16E3','1_12E0','1_16E5','1_16E0','1_16E6','1_154B','1_154C','1_154A','1_16E8','1_1542','1_16E2','1_1545','1_1028','1_1022','1_1540','1_102F','1_1029','1_16E9','1_2180','1_16E7']

In [None]:
# 건물 별 미네랄 정보
building_mineral ={
    '1541'  :  100,   #   BuildPylon                
    '1021'  :  100,   #   BuildSupplyDepot       
    '1543'  :  150,   #   BuildGateway           
    '1023'  :  150,   #   BuildBarracks          
    '1547'  :  150,   #   BuildPhotonCannon      
    '102A'  :  200,   #  BuildFactory           
    '1260'  :   50,   #   BuildBarracksTechLab   
    '1261'  :  50,   #    BuildBarracksReactor   
    '102B'  :  150,   #    BuildStarport         
    '1024'  :  125,   #   BuildEngineeringBay    
    '1025'  :  100,   #   BuildMissileTurret     
    '154E'  :  150,   #    BuildCyberneticsCore   
    '12A0'  :  50,   #    BuildFactoryTechLab    
    '154D'  :  200,   #    BuildRoboticsFacility 
    '1544'  :  150,   #   BuildForge             
    '16E4'  :   75,   #  BuildEvolutionChamber  
    '1549'  :  150,   #    BuildStargate         
    '16EE'  :  100,   #   BuildSpineCrawler      
    '1546'  :  150,   #    BuildTwilightCouncil  
    '1020'  :  400,   #   BuildCommandCenter     
    '16EF'  :  100,   #   BuildSporeCrawler      
    '12A1'  :  50,   #    BuildFactoryReactor    
    '12E1'  :  50,   #    BuildStarportReactor   
    '16EA'  :  100,   #    BuildBanelingNest      
    '16ED'  :  150,   #  BuildRoachWarren       
    '1026'  :  100,   #   BuildBunker            
    '102D'  :  150,   #    BuildArmory           
    '16E3'  :  200,   #   BuildSpawningPool         
    '12E0'  :  50,   #    BuildStarportTechLab   
    '16E5'  :  100,   #    BuildHydraliskDen     
    '16E0'  :  400,   #    BuildHatchery            
    '16E6'  :  200,   #    BuildSpire            
    '154B'  :  100,   #     BuildDarkShrine      
    '154C'  :  150,   #     BuildRoboticsBay     
    '154A'  :  150,   #     BuildTemplarArchive  
    '16E8'  :  100,   #    BuildInfestationPit   
    '1542'  :  100,   #   BuildAssimilator          
    '16E2'  :  75,   #  BuildExtractor            
    '1545'  :  300,   #    BuildFleetBeacon      
    '1028'  :  100,   #    BuildSensorTower       
    '1022'  :  75,   #  BuildRefinery             
    '1540'  :  400,   #   BuildNexus                
    '102F'  :  200,   #    BuildFusionCore       
    '1029'  :  150,   #    BuildGhostAcademy      
    '16E9'  :  150,   #    BuildNydusNetwork     
    '2180'  :  150,   #   BuildNydusCanal           
    '16E7'  :  150   #    BuildUltraliskCavern  
}

In [None]:
# 건물 별 가스 정보
building_gas ={  
    '102A' : 50   ,   #  BuildFactory           
    '1260' : 50   ,   #   BuildBarracksTechLab   
    '1261' : 50    ,  #    BuildBarracksReactor   
    '102B' : 100  ,   #    BuildStarport         
    '12A0' : 50   ,  #    BuildFactoryTechLab    
    '154D' : 20  ,   #    BuildRoboticsFacility 
    '1549' : 15  ,   #    BuildStargate            
    '1546' : 100  ,   #    BuildTwilightCouncil    
    '12A1' : 50   ,  #    BuildFactoryReactor    
    '12E1' : 50   ,  #    BuildStarportReactor   
    '16EA' : 50   ,   #    BuildBanelingNest            
    '102D' : 100  ,   #    BuildArmory               
    '12E0' : 50   ,  #    BuildStarportTechLab   
    '16E5' : 100  ,   #    BuildHydraliskDen            
    '16E6' : 200  ,   #    BuildSpire            
    '154B' : 25  ,   #     BuildDarkShrine      
    '154C' : 15  ,   #     BuildRoboticsBay     
    '154A' : 20  ,   #     BuildTemplarArchive  
    '16E8' : 100  ,   #    BuildInfestationPit         
    '1545' : 200  ,   #    BuildFleetBeacon      
    '1028' : 50   ,   #    BuildSensorTower                 
    '102F' : 200  ,   #    BuildFusionCore       
    '1029' : 50   ,   #    BuildGhostAcademy     
    '16E9' : 200  ,   #    BuildNydusNetwork            
    '16E7' : 200       #    BuildUltraliskCavern  
}

In [None]:
# 일꾼 코드 리스트 추가
worker_0 = ['0_1360', '0_15E0', '0_1820']
worker_1 = ['1_1360', '1_15E0', '1_1820']
worker = worker_0 + worker_1

# 일꾼 선택 정보 리스트
target_worker_0 = ['0_Target_SCV','0_Target_Probe','0_Target_Drone']
target_worker_1 = ['1_Target_SCV','1_Target_Probe','1_Target_Drone']

target_worker=target_worker_0 + target_worker_1

# 일꾼 공격 정보 리스트 
attack_worker_0 = ['0_Attack_SCV', '0_Attack_Probe', '0_Attack_Drone']
attack_worker_1 = ['1_Attack_SCV', '1_Attack_Probe', '1_Attack_Drone']
attack_worker = attack_worker_0 + attack_worker_1


# 유닛 코드 리스트(프, 테, 저) 추가 

# 프로토스 유닛 코드
              #  Adept  Oracle  Phoenix  Stalker  WarpPrism  Observer  Zealot  VoidRay  Immortal  Sentry  Carrier  Colossus  Disruptor  HighTemplar  Tempest  MothershipCore
unit_code_P_0 = ['0_1586', '0_15A8', '0_15A0', '0_1581', '0_15C0', '0_15C1', '0_1580', '0_15A4', '0_15C3', '0_1585', '0_15A2', '0_15C2', '0_15D2', '0_1583', '0_15A9', '0_2720']
unit_code_P_1 = ['1_1586', '1_15A8', '1_15A0', '1_1581', '1_15C0', '1_15C1', '1_1580', '1_15A4', '1_15C3', '1_1585', '1_15A2', '1_15C2', '1_15D2', '1_1583', '1_15A9', '1_2720']          

# 테란 유닛 코드 
              #  Marine  Medivac  Reaper  Marauder  Banshee  Cyclone  Liberator  Raven  Viking  Ghost  Battlecruiser  Crucio  Thor  Mine
unit_code_T_0 = ['0_13E0', '0_1420', '0_13E1', '0_13E3', '0_1421', '0_1407', '0_1426', '0_1422', '0_1424', '0_13E2', '0_1423', '0_1401', '0_1404', '0_1418']
unit_code_T_1 = ['1_13E0', '1_1420', '1_13E1', '1_13E3', '1_1421', '1_1407', '1_1426', '1_1422', '1_1424', '1_13E2', '1_1423', '1_1401', '1_1404', '1_1418']

# 저그 유닛 코드
              #  Overlord  Zergling  Hydralisk  Roach  ToOverseer  ToRavager  Mutalisk  Corruptor  SwarmHost  ToBroodLord  Infestor  Baneling 
unit_code_Z_0 = ['0_1822', '0_1821', '0_1823', '0_1829', '0_1BA0', '0_4020', '0_1824', '0_182B', '0_182E', '0_1840', '0_182A', '0_1E60', '0_920']
unit_code_Z_1 = ['1_1822', '1_1821', '1_1823', '1_1829', '1_1BA0', '1_4020', '1_1824', '1_182B', '1_182E', '1_1840', '1_182A', '1_1E60', '1_920']

# 지상 유닛 코드    # Stalker Zealot Immortal Sentry Colossus Disruptor HighTemplar Marine Reaper Marauder Crucio Ghost Thor Viking Cyclone Zergling Beneling Queen Hydralisk Roach Infestor SwarmHost
unit_ground_0=['0_1581', '0_1580', '0_15C3', '0_1585', '0_15C2', '0_15D2', '0_1583', '0_13E0', '0_13E1', '0_13E3', '0_1401', '0_13E2', '0_1404', '0_1424', '0_1407', '0_1821', '0_920', '0_1E60', '0_1823', '0_1829', '0_182A', '0_182E']
unit_ground_1=['1_1581', '1_1580', '1_15C3', '1_1585', '1_15C2', '1_15D2', '1_1583', '1_13E0', '1_13E1', '1_13E3', '1_1401', '1_13E2', '1_1404', '1_1424', '1_1407', '1_1821', '1_920', '1_1E60', '1_1823', '1_1829', '1_182A', '1_182E']

# 공중 유닛 코드    # Oracle Phoenix WarpPrism VoidRay Carrier Tempest Mother Banshee Raven Viking Battlecruiser Medivac Mutalisk Corruptor ToBroodLord
unit_sky_0 = ['0_15A8', '0_15A0', '0_15C0', '0_15A4', '0_15A2', '0_15A9', '0_2720',  '0_1421', '0_1422', '0_1424', '0_1423', '0_1420', '0_1824', '0_182B', '0_1840']
unit_sky_1 = ['1_15A8', '1_15A0', '1_15C0', '1_15A4', '1_15A2', '1_15A9', '1_2720',  '1_1421', '1_1422', '1_1424', '1_1423', '1_1420', '1_1824', '1_182B', '1_1840']

# 지상 유닛 공격이 가능한 유닛 코드   #Stalker Zealot VoidRay Immortal Sentry Carrier Colossus HighTemplar Tempest Mother Marine Reaper Marauder Crucio Ghost Thor Banshee Viking Battlecruiser Cyclone Zergling Beneling Queen Hydralisk Roach Mutalisk ToBroodLord SwarmHost
unit_can_atk_ground_0 = ['0_1581', '0_1580', '0_15A4', '0_15C3', '0_1585', '0_15A2', '0_15C2', '0_1583', '0_15A9', '0_2720',  '0_13E0', '0_13E1', '0_13E3', '0_1401', '0_13E2', '0_1404', '0_1421', '0_1424', '0_1423', '0_1407',   '0_1821', '0_920', '0_1E60', '0_1823', '0_1829', '0_1824', '0_1840', '0_182E']
unit_can_atk_ground_1 = ['1_1581', '1_1580', '1_15A4', '1_15C3', '1_1585', '1_15A2', '1_15C2', '1_1583', '1_15A9', '1_2720',  '1_13E0', '1_13E1', '1_13E3', '1_1401', '1_13E2', '1_1404', '1_1421', '1_1424', '1_1423', '1_1407',   '1_1821', '1_920', '1_1E60', '1_1823', '1_1829', '1_1824', '1_1840', '1_182E']

# 공중 유닛 공격이 가능한 유닛 코드     # TrainPhoenix TrainStalker TrainVoidRay TrainSentry TrainCarrier TrainHighTemplar TrainTempest TrainMarine TrainGhost Thor TrainViking TrainBattlecruiser TrainCyclone Queen MorphHydralisk MorphMutalisk MorphCorruptor MorphToBroodLord
unit_can_atk_sky_0 = ['0_15A0', '0_1581', '0_15A4', '0_1585', '0_15A2', '0_1583', '0_15A9', '0_13E0', '0_13E2', '0_1404', '0_1424', '0_1423', '0_1407', '0_1E60', '0_1823', '0_1824', '0_182B', '0_1840']
unit_can_atk_sky_1 = ['1_15A0', '1_1581', '1_15A4', '1_1585', '1_15A2', '1_1583', '1_15A9', '1_13E0', '1_13E2', '1_1404', '1_1424', '1_1423', '1_1407', '1_1E60', '1_1823', '1_1824', '1_182B', '1_1840']

# 지상 유닛 공격이 불가능한 유닛 코드
unit_cannot_atk_ground_0 = ['0_15A0', '0_15C2', '0_15A9', '0_1586', '0_13E1', '0_13E1', '0_13E1', '0_182B' ]
unit_cannot_atk_ground_1 = ['1_15A0', '1_15C2', '1_15A9', '1_1586', '1_13E1', '1_13E1', '1_13E1', '1_182B' ]

# 공중 유닛 공격이 불가능한 유닛 코드
unit_cannot_atk_sky_0 = ['0_1586', '0_1581', '0_1580', '0_15C3', '0_2720', '0_13E1', '0_1421', '0_1401', '0_1821', '0_1829', '0_4020', '0_182E', '0_1840', '0_920']
unit_cannot_atk_sky_1 = ['1_1586', '1_1581', '1_1580', '1_15C3', '1_2720', '1_13E1', '1_1421', '1_1401', '1_1821', '1_1829', '1_4020', '1_182E', '1_1840', '1_920']

total_unit_code = unit_code_P_0 + unit_code_P_1 + unit_code_T_0 + unit_code_T_1 + unit_code_Z_0 + unit_code_Z_1

In [None]:
# events 칼럼의 unique 값
events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']

In [None]:
# game_id 를 기준으로 두 데이터 프레임을 병합시키는 함수
def merge_data(source, df):
    """
    merge_data함수에 넘겨지는 source는 game_id, player column을 가져야 합니다.
    각 항목을 player_0, player_1로 dataframe에 merge 시키는 역할
    
    source data
    game_id player ability_count
    1       0      10
    1       1      20
    
    merge되었을 때 df 값
    game_id player_0 player_1 ability_count_player_0 ability_count_player_1
    0       0        1        10                     20
    """
    new_c = [c for c in source.columns if c not in ['game_id','player']]
    
    source.columns = ['game_id', 'player_0'] + [f'{c}_player_0'for c in new_c]
    df = df.merge(source, on=['game_id','player_0'], how='left')
    
    source.columns = ['game_id', 'player_1'] + [f'{c}_player_1'for c in new_c]
    df = df.merge(source, on=['game_id','player_1'], how='left')
    return df

In [None]:
# 게임별 플레이어의 종족을 P vs T, P vs Z, T vs Z 로 swap이 필요한지 확인하는 함수
def need_swap(row):
    """
    player swap이 필요한지 확인 (저테프 를 프테저로)
    T vs P -> P vs T
    Z vs P -> P vs Z
    Z vs T -> T vs Z
    """

    if row['species_0'] == 'T' and row['species_1'] == 'P':
        return 1
    elif row['species_0'] == 'Z' and row['species_1'] == 'P':
        return 1
    elif row['species_0'] == 'Z' and row['species_1'] == 'T':
        return 1
    else:
        return 0

In [None]:
# winner 칼럼을 0->1, 1->0 로 바꿔주는 함수
def winner_swap(org_winner):
    if org_winner == 0:
        return 1
    elif org_winner == 1:
        return 0
    else:
        raise ValueError('Incorrect Number')

In [None]:
# swap칼럼의 값이 1인 row의 player_0, player_1 과 연관이 있는 모든 칼럼을 swap 해주는 함수 (player0, player1 swap)
def swap(train, unit_code_unique, target_unique, selection_unique, attack_unique, answer=False):
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']

    swaped_index = train.loc[train.swap == 1].index.values

    try:
        temp_species = train.loc[train.swap == 1, 'species_0']
        train.loc[train.swap == 1, 'species_0'] = train.loc[train.swap == 1, 'species_1']
        train.loc[train.swap == 1, 'species_1'] = temp_species
    except KeyError as e:
            pass

    try:
        temp_event_count = train.loc[train.swap == 1, 'event_count_0']
        train.loc[train.swap == 1, 'event_count_0'] = train.loc[train.swap == 1, 'event_count_1']
        train.loc[train.swap == 1, 'event_count_1'] = temp_event_count
    except KeyError as e:
            pass

    try:
        temp_event_per_sec = train.loc[train.swap == 1, 'event_per_sec_0']
        train.loc[train.swap == 1, 'event_per_sec_0'] = train.loc[train.swap == 1, 'event_per_sec_1']
        train.loc[train.swap == 1, 'event_per_sec_1'] = temp_event_per_sec
    except KeyError as e:
            pass

    try:
        temp_worker = train.loc[train.swap == 1, 'worker_0']
        train.loc[train.swap == 1, 'worker_0'] = train.loc[train.swap == 1, 'worker_1']
        train.loc[train.swap == 1, 'worker_1'] = temp_worker
    except KeyError as e:
            pass
            
    try:
        temp_worker_attack = train.loc[train.swap == 1, 'worker_attack_0']
        train.loc[train.swap == 1, 'worker_attack_0'] = train.loc[train.swap == 1, 'worker_attack_1']
        train.loc[train.swap == 1, 'worker_attack_1'] = temp_worker_attack
    except KeyError as e:
            pass
        
    try:
        temp_building_count = train.loc[train.swap == 1, 'building_count_0']
        train.loc[train.swap == 1, 'building_count_0'] = train.loc[train.swap == 1, 'building_count_1']
        train.loc[train.swap == 1, 'building_count_1'] = temp_building_count
    except KeyError as e:
            pass
    try:
        temp_unit_mineral = train.loc[train.swap == 1, 'building_mineral_0']
        train.loc[train.swap == 1, 'building_mineral_0'] = train.loc[train.swap == 1, 'building_mineral_1']
        train.loc[train.swap == 1, 'building_mineral_1'] = temp_unit_mineral
    except KeyError as e:
            pass

    try:
        temp_building_gas = train.loc[train.swap == 1, 'building_gas_0']
        train.loc[train.swap == 1, 'building_gas_0'] = train.loc[train.swap == 1, 'building_gas_1']
        train.loc[train.swap == 1, 'building_gas_1'] = temp_building_gas
    except KeyError as e:
            pass

    try:
        temp_target_worker = train.loc[train.swap == 1, 'target_worker_0']
        train.loc[train.swap == 1, 'target_worker_0'] = train.loc[train.swap == 1, 'target_worker_1']
        train.loc[train.swap == 1, 'target_worker_1'] = temp_target_worker
    except KeyError as e:
            pass

    try:
        temp_unit_count = train.loc[train.swap == 1, 'unit_count_0']
        train.loc[train.swap == 1, 'unit_count_0'] = train.loc[train.swap == 1, 'unit_count_1']
        train.loc[train.swap == 1, 'unit_count_1'] = temp_unit_count
    except KeyError as e:
            pass

    try:
        temp_unit_mineral = train.loc[train.swap == 1, 'unit_mineral_0']
        train.loc[train.swap == 1, 'unit_mineral_0'] = train.loc[train.swap == 1, 'unit_mineral_1']
        train.loc[train.swap == 1, 'unit_mineral_1'] = temp_unit_mineral
    except KeyError as e:
            pass

    try:
        temp_unit_gas = train.loc[train.swap == 1, 'unit_gas_0']
        train.loc[train.swap == 1, 'unit_gas_0'] = train.loc[train.swap == 1, 'unit_gas_1']
        train.loc[train.swap == 1, 'unit_gas_1'] = temp_unit_gas
    except KeyError as e:
            pass

    try:
        temp_sum_30sec = train.loc[train.swap == 1, '0_move_sum_30sec']
        train.loc[train.swap == 1, '0_move_sum_30sec'] = train.loc[train.swap == 1, '1_move_sum_30sec']
        train.loc[train.swap == 1, '1_move_sum_30sec'] = temp_sum_30sec
    except KeyError as e:
            pass

    for kind in ['unit_ground_count', 'unit_sky_count', 'unit_can_atk_ground_count', 'unit_can_atk_sky_count', 'unit_cannot_atk_ground_count', 'unit_cannot_atk_sky_count']:
        try:
            temp_kind1 = train.loc[train.swap == 1, kind + '_0']
            train.loc[train.swap == 1, kind + '_0'] = train.loc[train.swap == 1, kind + '_1']
            train.loc[train.swap == 1, kind + '_1'] = temp_kind1

            train.loc[train.swap == 1, kind + '_diff'] = (train.loc[train.swap == 1, kind + '_diff']) * -1
        except KeyError as e:
            print('key error!!! ======', e)

    for event in events:
        try:
            # 이벤트들
            temp_event = train.loc[train.swap == 1, event + '_count_0'] 
            train.loc[train.swap == 1, event + '_count_0']  = train.loc[train.swap == 1, event + '_count_1'] 
            train.loc[train.swap == 1, event + '_count_1']  = temp_event
            # diff 는 -1을 해준다
            train.loc[train.swap == 1, event + '_diff'] = (train.loc[train.swap == 1, event + '_diff']) * -1
        except KeyError as e:
            print('key error!!! ======', e)

    for i in ['x', 'y']:
        for j in ['std', 'min', 'max', 'mean']:
            try:
                temp_camera_pos = train.loc[train.swap == 1, "camera_pos_" + i + "_" + j + "_player_0"] 
                train.loc[train.swap == 1, "camera_pos_" + i + "_" + j + "_player_0"] = train.loc[train.swap == 1, "camera_pos_" + i + "_" + j + "_player_1"] 
                train.loc[train.swap == 1, "camera_pos_" + i + "_" + j + "_player_1"]  = temp_camera_pos
            except KeyError as e:
                print('key error!!! ======', e)

    for func in ['sum','min','median','max']:
        try:
            temp_time = train.loc[train.swap == 1, '0_move_' + func ] 
            train.loc[train.swap == 1, '0_move_' + func ] = train.loc[train.swap == 1, '1_move_' + func ] 
            train.loc[train.swap == 1, '1_move_' + func ] = temp_time  
        except KeyError as e:
            print('key error!!! ======', e)

    for unit_code in unit_code_unique:
        try:
            temp_unit_code = train.loc[train.swap == 1, '0_' + unit_code]    
            train.loc[train.swap == 1, '0_' + unit_code]   = train.loc[train.swap == 1, '1_' + unit_code]  
            train.loc[train.swap == 1, '1_' + unit_code]  = temp_unit_code

            train.loc[train.swap == 1, unit_code + '_diff'] = (train.loc[train.swap == 1, unit_code + '_diff']) * -1

            temp_unit_code_div_time = train.loc[train.swap == 1, '0_' + unit_code + '_div_time']
            train.loc[train.swap == 1, '0_' + unit_code + '_div_time'] = train.loc[train.swap == 1, '1_' + unit_code + '_div_time']
            train.loc[train.swap == 1, '1_' + unit_code + '_div_time'] = temp_unit_code_div_time
        except KeyError as e:
            print('key error!!! ======', e)

    for attack in attack_unique:
        try:
            temp_attack = train.loc[train.swap == 1, '0_Attack_' + attack]    
            train.loc[train.swap == 1, '0_Attack_' + attack]   = train.loc[train.swap == 1, '1_Attack_' + attack]  
            train.loc[train.swap == 1, '1_Attack_' + attack]  = temp_attack
        except KeyError as e:
            print('key error!!! ======', e)

    for target in target_unique:
        try:
            temp_target = train.loc[train.swap == 1, '0_Target_' + target]     
            train.loc[train.swap == 1, '0_Target_' + target] = train.loc[train.swap == 1, '1_Target_' + target] 
            train.loc[train.swap == 1, '1_Target_' + target]  = temp_target
      
        except KeyError as e:
            print('key error!!! ======', e)

    for selection in selection_unique:
        try:
            temp_selection = train.loc[train.swap == 1, '0_' + selection]     
            train.loc[train.swap == 1, '0_' + selection] = train.loc[train.swap == 1, '1_' + selection] 
            train.loc[train.swap == 1, '1_' + selection]  = temp_selection

            train.loc[train.swap == 1, selection + '_diff'] = (train.loc[train.swap == 1, selection + '_diff']) * -1

            temp_selection_div_time = train.loc[train.swap == 1, '0_' + selection + '_div_time']
            train.loc[train.swap == 1, '0_' + selection + '_div_time'] = train.loc[train.swap == 1, '1_' + selection + '_div_time']
            train.loc[train.swap == 1, '1_' + selection + '_div_time'] = temp_selection_div_time
        except KeyError as e:
            print('key error!!! ======', e)

    try:
        train.loc[train.swap == 1, 'event_per_sec_diff'] = (train.loc[train.swap == 1, 'event_per_sec_diff']) * -1
    except KeyError as e:
            pass
    try:
        train.loc[train.swap == 1, 'worker_diff'] = (train.loc[train.swap == 1, 'worker_diff']) * -1
    except KeyError as e:
            pass
    try:
        train.loc[train.swap == 1, 'worker_attack_diff'] = (train.loc[train.swap == 1, 'worker_attack_diff']) * -1
    except KeyError as e:
            pass
    try:
        train.loc[train.swap == 1, 'building_count_diff'] = (train.loc[train.swap == 1, 'building_count_diff']) * -1
    except KeyError as e:
            pass

    # winner 반전
    if answer:
        train.loc[train.swap == 1, 'winner'] = train.loc[train.swap == 1, 'winner'].apply(lambda x: winner_swap(x))

    train.drop(labels='swap',axis=1, inplace=True)
    return swaped_index

In [None]:
# 종족의 unique값 확인
train['species'].unique()

array(['T', 'Z', 'P'], dtype=object)

In [None]:
# Label Encoding

species_encoder = LabelEncoder()
species_encoder.fit(train['species'].unique())

#Encoding 결과 확인
# 0:프, 1:테, 2:저
species_encoder.classes_

array(['P', 'T', 'Z'], dtype=object)

In [None]:
# event == Camera일 때 event_contents의 2차원 좌표 간 euclidean distance sum, min, median, max 구하는 함수
def move_sum(i):
    return sum(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
def move_min(i):
    if len(i) == 1:
      return 0
    return min(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
def move_median(i):
    if len(i) == 1:
      return 0
    return np.median(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 + 
                             np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
def move_max(i):
    if len(i) == 1:
      return 0
    return max(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))

In [None]:
# 전처리를 해주는 함수
def data_preparation(train, answer=False):

    # game_id unique 값을 칼럼으로 설정한 데이터 프레임 생성
    df_train = pd.DataFrame(columns=['game_id'])
    df_train.game_id = train.game_id.unique()
    
    df_train['player_0'] = 0
    df_train['player_1'] = 1

    #시간을 초단위로 변경
    df_train['time'] = np.array(train[train.shift(-1).game_id != train.game_id].time)
    df_train['time'] = (df_train.time*100//100*60 + df_train.time*100%100).astype(int) 

    # player 종족 추출
    df = train[train.player==0]
    df_train['species_0']=np.array(df[df.shift(-1).game_id != df.game_id].species)
    df = train[train.player==1]
    df_train['species_1']=np.array(df[df.shift(-1).game_id != df.game_id].species)

    # 설정한 종족에 따라 swap이 필요한지 확인 후 swap칼럼 생성 (PTZ 순서로 안되어 있으면 1로 설정됨)
    df_train['swap'] = df_train.apply(lambda row: need_swap(row), axis=1)

    # 전체 event의 count 추출    
    df = train[train.player==0]
    df_train['event_count_0'] = np.array(df.game_id.value_counts()[df.game_id.unique()])
    df = train[train.player==1]
    df_train['event_count_1'] = np.array(df.game_id.value_counts()[df.game_id.unique()])

    # 이벤트당 평균 소요 시간 추출
    df_train['event_per_sec_0'], df_train['event_per_sec_1'] = df_train['event_count_0'] /df_train.time, df_train['event_count_1'] /df_train.time  

    # event의 unique 값 count 추출   
    for event in events:
        df = train[(train.player==0)&(train.event==event)]
        df = pd.DataFrame(df.game_id.value_counts()[df.game_id.unique()]).rename({'game_id':event+'_count_0'}, axis = 1)
        df['game_id']= np.array(df.index)
        df_train = pd.merge(df_train, df, on='game_id', how='left')
    
        df = train[(train.player==1)&(train.event==event)]
        df = pd.DataFrame(df.game_id.value_counts()[df.game_id.unique()]).rename({'game_id':event+'_count_1'}, axis = 1)
        df['game_id']= np.array(df.index)
        df_train = pd.merge(df_train, df, on='game_id', how='left')
    df_train = df_train.fillna(0)

    # player 별 Camera x, y 좌표의 표준편차, 최소값, 최댓값, 평균값을 추출
    temp = train.groupby(['game_id','player'])[['camera_pos_x','camera_pos_y']].agg(['std','min','max','mean']).reset_index()
    temp.columns = ['game_id','player'] + [c[0]+'_'+c[1] for c in itertools.product(['camera_pos_x','camera_pos_y'],['std','min','max','mean'])]
    df_train = merge_data(temp, df_train)

    # game_id 열의 값을 index로 만듦
    df_train.index = df_train['game_id']
    df_train = df_train.drop(labels='game_id',axis=1)

    # move_sum,move_min,move_median,move_max 호출
    contents = (train[train.event == 'Camera'].loc[:,['player','game_id','event_contents']].groupby(['player','game_id'])).agg([move_sum,move_min,move_median,move_max]).unstack(level=0)
    contents.columns = [y+x for x in ['sum','min','median','max'] for y in ['0_move_','1_move_']]
    for i in contents.columns:
        df_train[i] = contents[i].fillna(0)

    # 30초 이내 move_sum
    contents = (train[(train.time < (train.event == 'Camera'))].loc[:,['player','game_id','event_contents']].groupby(['player','game_id'])).agg(move_sum).unstack(level=0)
    contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
    for i in contents.columns:
        df_train[i] = contents[i]

    # event칼럼의 값이 Ability일 때 event_contents의 16진수 코드만 추출 후 각 코드의 count 값을 새로운 칼럼으로 추출
    df_total_unit_code = []
    contents = pd.DataFrame(train.event_contents[(train.event == 'Ability')].map(lambda x: x[x.find('(')+1:x.find(')')]))  
    contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
    unit_code_unique = contents.event_contents.unique().copy()
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        if i in worker:
            df_total_unit_code.append(i)
            df_train[i] = contents_X[i]
            df_train[i] = df_train[i].fillna(0).astype(int)
        if i in total_unit_code:
            df_total_unit_code.append(i)
            df_train[i] = contents_X[i]
            df_train[i] = df_train[i].fillna(0).astype(int)
        df_train[i] = contents_X[i]
        df_train[i] = df_train[i].fillna(0).astype(int)

    # event == Ability일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        df_train[i+'_div_time'] = df_train[i] /df_train.time

    # 프로브, SCV, 드론 3종류의 일꾼은 하나의 칼럼으로 만듦
    worker_count_0 = 0
    worker_count_1 = 0
    for i in worker_0:
        try:
            worker_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in worker_1:
        try:
            worker_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['worker_0'] = worker_count_0
    df_train['worker_1'] = worker_count_1


    # event == Ability이고 event contents 가 Attack Target: '유닛' 일 때 Target에 대한 count 값을 칼럼으로 만듦
    contents = pd.DataFrame(train.event_contents[(train.event == 'Ability') & (train.event_contents.map(lambda x: str(x)[8:22]) == 'Attack; Target')].map(lambda x: x[x.find(':')+2:x.find(' [')]))  # Attack Target 만 추출
    contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
    attack_unique = contents.event_contents.unique().copy()
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_Attack_','1_Attack_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_Attack_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        df_train[i] = contents_X[i]
        df_train[i] = df_train[i].fillna(0).astype(int)

    # 프로브, SCV, 드론 3종류의 일꾼은 하나의 칼럼으로 만듦
    worker_attack_0 = 0
    worker_attack_1 = 0
    for i in attack_worker_0:
        try:
            worker_attack_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in attack_worker_1:
        try:
            worker_attack_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['worker_attack_0'] = worker_attack_0
    df_train['worker_attack_1'] = worker_attack_1

    # 플레이어 별 지상유닛의 count 값을 새로운 칼럼으로 
    unit_ground_count_0 = 0
    unit_ground_count_1 = 0
    for i in unit_ground_0:
        try:
            unit_ground_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_ground_1:
        try:
            unit_ground_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_ground_count_0'] = unit_ground_count_0
    df_train['unit_ground_count_1'] = unit_ground_count_1

    # 플레이어 별 공중유닛의 count 값을 새로운 칼럼으로
    unit_sky_count_0 = 0
    unit_sky_count_1 = 0
    for i in unit_sky_0:
        try:
            unit_sky_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_sky_1:
        try:
            unit_sky_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_sky_count_0'] = unit_sky_count_0
    df_train['unit_sky_count_1'] = unit_sky_count_1

    # 플레이어 별 지상유닛을 공격 가능한 유닛의 count 값을 새로운 칼럼으로
    unit_can_atk_ground_count_0 = 0
    unit_can_atk_ground_count_1 = 0
    for i in unit_can_atk_ground_0:
        try:
            unit_can_atk_ground_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_can_atk_ground_1:
        try:
            unit_can_atk_ground_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_can_atk_ground_count_0'] = unit_can_atk_ground_count_0
    df_train['unit_can_atk_ground_count_1'] = unit_can_atk_ground_count_1

    # 플레이어 별 공중유닛을 공격 가능한 유닛의 count 값을 새로운 칼럼으로
    unit_can_atk_sky_count_0 = 0
    unit_can_atk_sky_count_1 = 0
    for i in unit_can_atk_sky_0:
        try:
            unit_can_atk_sky_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_can_atk_sky_1:
        try:
            unit_can_atk_sky_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_can_atk_sky_count_0'] = unit_can_atk_sky_count_0
    df_train['unit_can_atk_sky_count_1'] = unit_can_atk_sky_count_1

    # 플레이어 별 지상유닛을 공격할 수 없는 유닛의 count 값을 새로운 칼럼으로
    unit_cannot_atk_ground_count_0 = 0
    unit_cannot_atk_ground_count_1 = 0
    for i in unit_cannot_atk_ground_0:
        try:
            unit_cannot_atk_ground_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_cannot_atk_ground_1:
        try:
            unit_cannot_atk_ground_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_cannot_atk_ground_count_0'] = unit_cannot_atk_ground_count_0
    df_train['unit_cannot_atk_ground_count_1'] = unit_cannot_atk_ground_count_1

    # 플레이어 별 공중유닛을 공격할 수 없는 유닛의 count 값을 새로운 칼럼으로
    unit_cannot_atk_sky_count_0 = 0
    unit_cannot_atk_sky_count_1 = 0
    for i in unit_cannot_atk_sky_0: 
        try:
            unit_cannot_atk_sky_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in unit_cannot_atk_sky_1:
        try:
            unit_cannot_atk_sky_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['unit_cannot_atk_sky_count_0'] = unit_cannot_atk_sky_count_0
    df_train['unit_cannot_atk_sky_count_1'] = unit_cannot_atk_sky_count_1
    
    # event칼럼의 값이 Right Click일 때 event_contents의 Target유닛을 추출 후 각 유닛의 count 값을 새로운 칼럼으로 추출
    df_total_unit_target =[]
    contents = pd.DataFrame(train.event_contents[(train.event == 'Right Click') & (train.event_contents.map(lambda x: str(x)[:6]) == 'Target')].map(lambda x: x[x.find(':')+2:x.find(' [')]))  # event_contents의 Target만 추출
    contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
    target_unique = contents.event_contents.unique().copy()
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_Target_','1_Target_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_Target_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        if i in target_worker:
            df_total_unit_target.append(i)
            df_train[i] = contents_X[i]
            df_train[i] = df_train[i].fillna(0).astype(int)
        df_train[i] = contents_X[i]
        df_train[i] = df_train[i].fillna(0).astype(int)
    
    # 프로브, SCV, 드론 3종류의 일꾼은 하나의 칼럼으로 만듦
    target_worker_count_0 = 0
    target_worker_count_1 = 0
    for i in target_worker_0:
        try:
            target_worker_count_0 += df_train[i]
        except KeyError as e:
            print('key error!!! ======', e)
    for j in target_worker_1:
        try:
            target_worker_count_1 += df_train[j]
        except KeyError as e:
            print('key error!!! ======', e)

    df_train['target_worker_0'] = target_worker_count_0
    df_train['target_worker_1'] = target_worker_count_1

    # event == Selection일 때 event_contents 더미 변수 생성, 카운트
    contents = train[train.event == 'Selection'].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                            replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
    contents = contents.str.split(',')
    max_num = max(contents.map(lambda x: len(x)))
    t = [0 for x in range(max_num)]
    for i in range(max_num):
        t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))

    contents = pd.concat([t[i] for i in range(max_num)])
    contents['game_id'], contents['player'], contents['count'] = train.game_id, train.player, 1
    selection_unique = contents.event_contents.unique().copy()
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        df_train[i] = contents_X[i]
        df_train[i] = df_train[i].fillna(0).astype(int)

    # event == Selection일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        df_train[i+'_div_time'] = df_train[i] /df_train.time

    # 플레이어별 일꾼을 제외한 전체 유닛의 count 칼럼 추가
    unit_count_0 = 0
    unit_count_1 = 0
    for j in unit_code_P_0 + unit_code_T_0 + unit_code_Z_0:
        try:
            unit_count_0 += df_train[j]
        except KeyError:    # 데이터에 해당 유닛 코드가 없을 때
            unit_count_0 += 0

    for j in unit_code_P_1 + unit_code_T_1 + unit_code_Z_1:
        try:
            unit_count_1 += df_train[j]
        except KeyError:
            unit_count_1 += 0

    df_train['unit_count_0'], df_train['unit_count_1'] = unit_count_0, unit_count_1

    # 플레이어별 전체 건물의 count 칼럼 추가
    building_count_0 = 0
    building_count_1 = 0
    for i in building_0:
        try:
            building_count_0 += df_train[i]
        except KeyError:    # 데이터에 해당 유닛 코드가 없을 때
            building_count_0 += 0


    for j in building_1:
        try:
            building_count_1 += df_train[j]
        except KeyError:
            building_count_1 += 0


    df_train['building_count_0'], df_train['building_count_1'] = building_count_0, building_count_1

    # 플레이어별 일꾼을 제외한 전체 유닛의 미네랄 칼럼 추가 (unit_count * 유닛별 미네랄)
    unit_mineral_0 = 0
    unit_mineral_1 = 0
    for i in unit_code_P_0 + unit_code_T_0 + unit_code_Z_0:
        try:
            unit_mineral_0 += df_train[i] * unit_mineral[i.split('_')[1]]
        except KeyError:
            unit_mineral_0 +=0

    for i in unit_code_P_1 + unit_code_T_1 + unit_code_Z_1:
        try:
            unit_mineral_1 += df_train[i] * unit_mineral[i.split('_')[1]]
        except KeyError:
            unit_mineral_1 +=0

    df_train['unit_mineral_0'] = unit_mineral_0
    df_train['unit_mineral_1'] = unit_mineral_1

    # 플레이어별 일꾼을 제외한 전체 유닛의 가스 칼럼 추가 (unit_count * 유닛별 가스)
    unit_gas_0 = 0
    unit_gas_1 = 0
    for i in unit_code_P_0 + unit_code_T_0 + unit_code_Z_0:
        try:
            unit_gas_0 += df_train[i] * unit_gas[i.split('_')[1]]
        except KeyError:
            unit_gas_0 +=0
    
    for i in unit_code_P_1 + unit_code_T_1 + unit_code_Z_1:
        try:
            unit_gas_1 += df_train[i] * unit_gas[i.split('_')[1]]
        except KeyError:
            unit_gas_1 +=0
    
    df_train['unit_gas_0'] = unit_gas_0
    df_train['unit_gas_1'] = unit_gas_1

    # 플레이어별 전체 건물의 미네랄 칼럼 추가 (building_count * 건물별 미네랄)
    building_mineral_0 = 0
    building_mineral_1 = 0
    for i in building_0:
        try:
            building_mineral_0 += df_train[i] * building_mineral[i.split('_')[1]]
        except KeyError:
            building_mineral_0 +=0

    for i in building_1:
        try:
            building_mineral_1 += df_train[i] * building_mineral[i.split('_')[1]]
        except KeyError:
            building_mineral_1 +=0

    df_train['building_mineral_0'] = building_mineral_0
    df_train['building_mineral_1'] = building_mineral_1

    # 플레이어별 전체 건물의 가스 칼럼 추가 (building_count * 건물별 가스)
    building_gas_0 = 0
    building_gas_1 = 0
    for i in building_0:
        try:
            building_gas_0 += df_train[i] * building_gas[i.split('_')[1]]
        except KeyError:
            building_gas_0 +=0
    
    for i in building_1:
        try:
            building_gas_1 += df_train[i] * building_gas[i.split('_')[1]]
        except KeyError:
            building_gas_1 +=0
    
    df_train['building_gas_0'] = building_gas_0
    df_train['building_gas_1'] = building_gas_1

    # 3종류의 일꾼은 하나의 칼럼으로 묶었으므로 일꾼 한종류의 칼럼은 삭제
    try:
        df_train.drop(labels=worker, axis=1, inplace=True)
    except KeyError as e:
        pass
    try:
        df_train.drop(labels=target_worker, axis=1, inplace=True)
    except KeyError as e:
        pass
    try:
        df_train.drop(labels=attack_worker, axis=1, inplace=True)
    except KeyError as e:
        pass

    # event 칼럼의 count 값의 차이를 새로운 컬럼으로 만듦
    for event in events:
        try:
            df_train[event + '_diff'] = df_train[event+'_count_0'] - df_train[event+'_count_1']
        except KeyError as e:
            pass
    # event 별 평균 시간의 차이
    try:
        df_train['event_per_sec_diff'] = df_train['event_per_sec_0'] - df_train['event_per_sec_1']
    except KeyError as e:
            pass
    # 일꾼 수의 차이
    try:
        df_train['worker_diff'] = df_train['worker_0'] - df_train['worker_1']
    except KeyError as e:
            pass
    # 건물 수의 차이
    try:
        df_train['building_count_diff'] = df_train['building_count_0'] - df_train['building_count_1']
    except KeyError as e:
            pass
    # 일꾼 공격 횟수의 차이
    try:    
        df_train['worker_attack_diff'] = df_train['worker_attack_0'] - df_train['worker_attack_1']
    except KeyError as e:
            pass

    # 지상유닛, 공중유닛, 지상유닛 공격가능 유닛, 공중유닛 공격가능 유닛, 지상유닛 공격 불가능 유닛, 공중유닛 공격 불가능 유닛 수의 차이
    for kind in ['unit_ground_count', 'unit_sky_count', 'unit_can_atk_ground_count', 'unit_can_atk_sky_count', 'unit_cannot_atk_ground_count', 'unit_cannot_atk_sky_count']:
        try:
            df_train[kind + '_diff'] = df_train[kind + '_0'] - df_train[kind + '_1']
        except KeyError as e:
          pass
    
    # 일꾼을 제외한 유닛별 count 차이
    for unit_code in unit_code_unique:
        try:
            df_train[unit_code + '_diff'] = df_train['0_' + unit_code] - df_train['1_' + unit_code]
        except KeyError as e:
          pass

    # 일꾼을 제외한 유닛별 selection 횟수 차이
    for selection in selection_unique:
        try:
            df_train[selection + '_diff'] = df_train['0_' + selection] - df_train['1_' + selection]
        except KeyError as e:
            pass

    # train데이터를 함수에 넣었을 경우 winner칼럼 추가
    if answer:
        df_train['winner'] = np.array(train[train.shift(-1).game_id != train.game_id].winner)

    # 종족 레이블 인코딩 (LabelEncoder 사용)
    df_train['species_0'] = species_encoder.transform(df_train['species_0'])
    df_train['species_1'] = species_encoder.transform(df_train['species_1'])  

    # player swap
    swaped_index = swap(df_train, unit_code_unique, target_unique, selection_unique, attack_unique, answer=answer)

    # 전쟁 유형 추가 (프프, 프테, 프저, 테테, 테저, 저저 6종류의 유형으로 새로운 칼럼 추가)
    df_train['species_war_kind'] = df_train['species_0'].astype(str) + '_' + df_train['species_1'].astype(str)
    df_train['species_war_kind'] = df_train['species_war_kind'].map({'0_0':0,'0_1':1,'0_2':2, '1_1':3,'1_2':4,'2_2':5})

    # 쓸모없는 칼럼 삭제
    df_train.drop(labels=['player_0', 'player_1'], axis=1, inplace=True)
    
    # train, test set인지에 따라 Label(winner) 칼럼이 제외할 지 말지 결정 후 리턴
    if answer:
        x_data = df_train.drop(labels='winner', axis=1, inplace=False)
    else:
        x_data = df_train.iloc[:, :]
    y_data = pd.Series()

    x_data = x_data[sorted(x_data.columns)]

    if answer:
        y_data = df_train['winner']
    return x_data, y_data, swaped_index, unit_code_unique, target_unique, selection_unique, attack_unique

In [None]:
# 학습 데이터 전처리
x_train, y_train, _, unit_code_unique, target_unique, selection_unique, attack_unique = data_preparation(train, answer=True)



In [None]:
x_train.head()

Unnamed: 0_level_0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,0_1023_div_time,0_1024,0_1024_div_time,0_1025,0_1025_div_time,0_1026,0_1026_div_time,0_1028,0_1028_div_time,0_1029,0_1029_div_time,0_102A,0_102A_div_time,0_102B,0_102B_div_time,0_102D,0_102D_div_time,0_102F,0_102F_div_time,0_103E,0_103E_div_time,0_1060,0_1060_div_time,0_1080,0_1080_div_time,0_1081,0_1081_div_time,0_10C0,0_10C0_div_time,0_10E0,...,event_count_1,event_per_sec_0,event_per_sec_1,event_per_sec_diff,species_0,species_1,species_war_kind,target_worker_0,target_worker_1,time,unit_can_atk_ground_count_0,unit_can_atk_ground_count_1,unit_can_atk_ground_count_diff,unit_can_atk_sky_count_0,unit_can_atk_sky_count_1,unit_can_atk_sky_count_diff,unit_cannot_atk_ground_count_0,unit_cannot_atk_ground_count_1,unit_cannot_atk_ground_count_diff,unit_cannot_atk_sky_count_0,unit_cannot_atk_sky_count_1,unit_cannot_atk_sky_count_diff,unit_count_0,unit_count_1,unit_gas_0,unit_gas_1,unit_ground_count_0,unit_ground_count_1,unit_ground_count_diff,unit_mineral_0,unit_mineral_1,unit_sky_count_0,unit_sky_count_1,unit_sky_count_diff,worker_0,worker_1,worker_attack_0,worker_attack_1,worker_attack_diff,worker_diff
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,5,0,0.0,1,0.002252,4,0.009009,2,0.004505,2,0.004505,1,0.002252,0,0.0,0,0.0,0,0.0,0,0.0,1,0.002252,1,0.002252,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,548,1.333333,1.234234,0.099099,1,1,3,0,0,444,3,7,-4,3,5,-2,0,0,0,0,2,-2,5,7,100,250,3,7,-4,325,550,1,0,1,9,6,0,0,0,3
7,13,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,655,2.600446,1.462054,1.138393,0,2,2,1,0,448,0,3,-3,2,3,-1,6,0,6,4,0,4,8,7,600,0,0,3,-3,1000,850,4,0,4,22,23,4,0,4,-1
14,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,815,1.631579,3.06391,-1.432331,0,0,0,2,0,266,0,4,-4,0,3,-3,3,0,3,3,4,-1,3,4,75,250,0,3,-3,300,775,0,1,-1,7,8,0,0,0,-1
16,4,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,1155,2.063439,1.928214,0.135225,0,1,1,2,2,599,1,5,-4,0,5,-5,3,0,3,4,0,4,5,7,175,200,0,5,-5,900,450,2,2,0,16,9,2,0,2,7
18,6,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,393,1.616949,1.332203,0.284746,2,2,5,0,1,295,3,6,-3,0,0,0,0,0,0,3,6,-3,4,9,0,0,3,6,-3,250,600,0,0,0,6,6,0,0,0,0


In [None]:
# 테스트 데이터 전처리 (swap한 row는 1-prob 해줘야 하므로 미리 swap된 index를 받아놓아야 함)
x_test, _, swaped_index, _, _, _, _ = data_preparation(test, answer=False)



In [None]:
# 메모리 절약을 위한 train, test 데이터 삭제
del train
del test

In [None]:
swaped_index[:10]

array([38873, 38875, 38882, 38885, 38890, 38892, 38898, 38900, 38901,
       38906])

In [None]:
print(x_train.shape)
print(y_train.shape)

(19436, 3524)
(19436,)


In [None]:
# 칼럼이 1종류의 값으로만 이루어진 칼럼 추출 (쓸모없는 칼럼 추출)
useless = []
for c in x_train.columns:
    try:
        if x_train[c].nunique()<2:
            useless.append(c)
            print("useless", c)
    except:
        print("exception:", c)

useless 0_11E0
useless 0_11E0_div_time
useless 0_152C
useless 0_152C_div_time
useless 0_152F
useless 0_152F_div_time
useless 0_1780
useless 0_1780_div_time
useless 0_17C0
useless 0_17C0_div_time
useless 0_1826
useless 0_1826_div_time
useless 0_182A
useless 0_182A_div_time
useless 0_182B
useless 0_182B_div_time
useless 0_1840
useless 0_1840_div_time
useless 0_1860
useless 0_1860_div_time
useless 0_1880
useless 0_1880_div_time
useless 0_18A0
useless 0_18A0_div_time
useless 0_18C0
useless 0_18C0_div_time
useless 0_1980
useless 0_1980_div_time
useless 0_1A63
useless 0_1A63_div_time
useless 0_1AE0
useless 0_1AE0_div_time
useless 0_1B00
useless 0_1B00_div_time
useless 0_1B23
useless 0_1B23_div_time
useless 0_1B60
useless 0_1B60_div_time
useless 0_1BE3
useless 0_1BE3_div_time
useless 0_2061
useless 0_2061_div_time
useless 0_2101
useless 0_2101_div_time
useless 0_2240
useless 0_2240_div_time
useless 0_2D80
useless 0_2D80_div_time
useless 0_2DA0
useless 0_2DA0_div_time
useless 0_3161
useless 0_

In [None]:
# 쓸모없는 칼럼 drop
x_train.drop(labels=useless, axis=1, inplace=True)

In [None]:
x_train

Unnamed: 0_level_0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,0_1023_div_time,0_1024,0_1024_div_time,0_1025,0_1025_div_time,0_1026,0_1026_div_time,0_1028,0_1028_div_time,0_1029,0_1029_div_time,0_102A,0_102A_div_time,0_102B,0_102B_div_time,0_102D,0_102D_div_time,0_102F,0_102F_div_time,0_103E,0_103E_div_time,0_1060,0_1060_div_time,0_1080,0_1080_div_time,0_1081,0_1081_div_time,0_10C0,0_10C0_div_time,0_10E0,...,event_count_1,event_per_sec_0,event_per_sec_1,event_per_sec_diff,species_0,species_1,species_war_kind,target_worker_0,target_worker_1,time,unit_can_atk_ground_count_0,unit_can_atk_ground_count_1,unit_can_atk_ground_count_diff,unit_can_atk_sky_count_0,unit_can_atk_sky_count_1,unit_can_atk_sky_count_diff,unit_cannot_atk_ground_count_0,unit_cannot_atk_ground_count_1,unit_cannot_atk_ground_count_diff,unit_cannot_atk_sky_count_0,unit_cannot_atk_sky_count_1,unit_cannot_atk_sky_count_diff,unit_count_0,unit_count_1,unit_gas_0,unit_gas_1,unit_ground_count_0,unit_ground_count_1,unit_ground_count_diff,unit_mineral_0,unit_mineral_1,unit_sky_count_0,unit_sky_count_1,unit_sky_count_diff,worker_0,worker_1,worker_attack_0,worker_attack_1,worker_attack_diff,worker_diff
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,5,0,0.0,1,0.002252,4,0.009009,2,0.004505,2,0.004505,1,0.002252,0,0.0,0,0.0,0,0.0,0,0.0,1,0.002252,1,0.002252,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,548,1.333333,1.234234,0.099099,1,1,3,0,0,444,3,7,-4,3,5,-2,0,0,0,0,2,-2,5,7,100,250,3,7,-4,325,550,1,0,1,9,6,0,0,0,3
7,13,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,655,2.600446,1.462054,1.138393,0,2,2,1,0,448,0,3,-3,2,3,-1,6,0,6,4,0,4,8,7,600,0,0,3,-3,1000,850,4,0,4,22,23,4,0,4,-1
14,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,815,1.631579,3.063910,-1.432331,0,0,0,2,0,266,0,4,-4,0,3,-3,3,0,3,3,4,-1,3,4,75,250,0,3,-3,300,775,0,1,-1,7,8,0,0,0,-1
16,4,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,1155,2.063439,1.928214,0.135225,0,1,1,2,2,599,1,5,-4,0,5,-5,3,0,3,4,0,4,5,7,175,200,0,5,-5,900,450,2,2,0,16,9,2,0,2,7
18,6,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,393,1.616949,1.332203,0.284746,2,2,5,0,1,295,3,6,-3,0,0,0,0,0,0,3,6,-3,4,9,0,0,3,6,-3,250,600,0,0,0,6,6,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38865,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,58,2.256410,1.487179,0.769231,0,2,2,0,0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,0,0,2
38866,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,79,1.702703,2.135135,-0.432432,1,1,3,0,0,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0
38867,2,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,831,0.910359,1.655378,-0.745020,0,1,1,1,0,502,4,3,1,2,3,-1,4,0,4,6,0,6,10,3,800,0,1,3,-2,1700,150,5,0,5,6,11,0,0,0,-5
38868,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,0.00000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,806,1.295492,1.345576,-0.050083,0,1,1,1,3,599,3,15,-12,1,9,-8,0,15,-15,3,6,-3,3,15,150,425,2,15,-13,625,900,1,0,1,11,7,0,6,-6,4


In [None]:
# x_train, x_test에만 있는 컬럼 삭제
x_train.drop(set(x_train.columns) - set(x_test.columns), axis=1, inplace=True)
x_test.drop(set(x_test.columns) - set(x_train.columns), axis=1, inplace=True)

In [None]:
x_train.info

<bound method DataFrame.info of          0_  0_1000  ...  worker_attack_diff  worker_diff
game_id              ...                                 
0         5       0  ...                   0            3
7        13       0  ...                   4           -1
14        0       0  ...                   0           -1
16        4       0  ...                   2            7
18        6       0  ...                   0            0
...      ..     ...  ...                 ...          ...
38865     0       0  ...                   0            2
38866     0       0  ...                   0            0
38867     2       0  ...                   0           -5
38868     0       0  ...                  -6            4
38871    18       0  ...                   7          -15

[19436 rows x 3258 columns]>

In [None]:
x_test.info

<bound method DataFrame.info of          0_  0_1000  ...  worker_attack_diff  worker_diff
game_id              ...                                 
38872     4       0  ...                   1           -1
38873     1       0  ...                   0            6
38874     1       0  ...                   0            4
38875    11       0  ...                  -4           -2
38876     0       0  ...                   0            2
...      ..     ...  ...                 ...          ...
55654     4       0  ...                   1           12
55655    11       0  ...                   6           -1
55656     6       0  ...                   0           -2
55657     0       0  ...                   0            2
55658     4       0  ...                   0           -2

[16787 rows x 3258 columns]>

In [None]:
# 지금부터 player 0, 1을 swap한 뒤 학습데이터에 행병합하여 학습데이터를 2배로 만드는 과정
x_train_copy = x_train.copy()

In [None]:
y_train_copy = y_train.copy()

In [None]:
# 전체 데이터를 swap하기 위해 값이 1인 swap 칼럼 추가
x_train_copy['swap'] = 1

In [None]:
# 복사한 학습용 피쳐데이터 swap
_ = swap(x_train_copy, unit_code_unique, target_unique, selection_unique, attack_unique, answer=False)



In [None]:
# 학습용 레이블 데이터는 전부 winner_swap
y_train_copy = y_train_copy.apply(lambda x: winner_swap(x))

In [None]:
# 기존 학습용 피쳐 데이터의 row 개수와 인덱스 + 1 값 구함
n = x_train.shape[0]
game_id_max = x_train.index.max() + 1

In [None]:
# 복사본의 인덱스가 기존 학습데이터의 인덱스 다음부터 쭉 이어지도록 만듦 
x_train_copy.index = [game_id_max + x for x in range(n)]
y_train_copy.index = [game_id_max + x for x in range(n)]

In [None]:
x_train_copy

Unnamed: 0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,0_1023_div_time,0_1024,0_1024_div_time,0_1025,0_1025_div_time,0_1026,0_1026_div_time,0_1028,0_1028_div_time,0_1029,0_1029_div_time,0_102A,0_102A_div_time,0_102B,0_102B_div_time,0_102D,0_102D_div_time,0_102F,0_102F_div_time,0_103E,0_103E_div_time,0_1060,0_1060_div_time,0_1080,0_1080_div_time,0_1081,0_1081_div_time,0_10C0,0_10C0_div_time,0_10E0,...,event_count_1,event_per_sec_0,event_per_sec_1,event_per_sec_diff,species_0,species_1,species_war_kind,target_worker_0,target_worker_1,time,unit_can_atk_ground_count_0,unit_can_atk_ground_count_1,unit_can_atk_ground_count_diff,unit_can_atk_sky_count_0,unit_can_atk_sky_count_1,unit_can_atk_sky_count_diff,unit_cannot_atk_ground_count_0,unit_cannot_atk_ground_count_1,unit_cannot_atk_ground_count_diff,unit_cannot_atk_sky_count_0,unit_cannot_atk_sky_count_1,unit_cannot_atk_sky_count_diff,unit_count_0,unit_count_1,unit_gas_0,unit_gas_1,unit_ground_count_0,unit_ground_count_1,unit_ground_count_diff,unit_mineral_0,unit_mineral_1,unit_sky_count_0,unit_sky_count_1,unit_sky_count_diff,worker_0,worker_1,worker_attack_0,worker_attack_1,worker_attack_diff,worker_diff
38872,4,1,0.002252,0,0.000000,3,0.006757,2,0.004505,1,0.002252,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,1,0.002252,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,...,592,1.234234,1.333333,-0.099099,1,1,3,0,0,444,7,3,4,5,3,2,0,0,0,2,0,2,7,5,250,100,7,3,4,550,325,0,1,-1,6,9,0,0,0,-3
38873,15,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,1165,1.462054,2.600446,-1.138393,2,0,2,0,1,448,3,0,3,3,2,1,0,6,-6,0,4,-4,7,8,0,600,3,0,3,850,1000,0,4,-4,23,22,0,4,-4,1
38874,3,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,434,3.063910,1.631579,1.432331,0,0,0,0,2,266,4,0,4,3,0,3,0,3,-3,4,3,1,4,3,250,75,3,0,3,775,300,1,0,1,8,7,0,0,0,1
38875,5,0,0.000000,2,0.003339,6,0.010017,0,0.000000,2,0.003339,1,0.001669,0,0.000000,0,0.000000,0,0.0,0,0.000000,1,0.001669,1,0.001669,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,...,1236,1.928214,2.063439,-0.135225,1,0,1,2,2,599,5,1,4,5,0,5,0,3,-3,0,4,-4,7,5,200,175,5,0,5,450,900,2,2,0,9,16,0,2,-2,-7
38876,8,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,477,1.332203,1.616949,-0.284746,2,2,5,1,0,295,6,3,3,0,0,0,0,0,0,6,3,3,9,4,0,0,6,3,3,600,250,0,0,0,6,6,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58303,0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,88,1.487179,2.256410,-0.769231,2,0,2,0,0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,-2
58304,0,0,0.000000,0,0.000000,1,0.027027,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,63,2.135135,1.702703,0.432432,1,1,3,0,0,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0
58305,3,0,0.000000,1,0.001992,2,0.003984,4,0.007968,1,0.001992,1,0.001992,1,0.001992,0,0.000000,0,0.0,0,0.000000,1,0.001992,1,0.001992,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,457,1.655378,0.910359,0.745020,1,0,1,0,1,502,3,4,-1,3,2,1,0,4,-4,0,6,-6,3,10,0,800,3,1,2,150,1700,0,5,-5,11,6,0,0,0,5
58306,0,0,0.000000,1,0.001669,3,0.005008,2,0.003339,1,0.001669,1,0.001669,4,0.006678,2,0.003339,0,0.0,1,0.001669,1,0.001669,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,776,1.345576,1.295492,0.050083,1,0,1,3,1,599,15,3,12,9,1,8,15,0,15,6,3,3,15,3,425,150,15,2,13,900,625,0,1,-1,7,11,6,0,6,-4


In [None]:
y_train_copy

38872    0
38873    0
38874    0
38875    0
38876    0
        ..
58303    0
58304    0
58305    1
58306    0
58307    0
Name: winner, Length: 19436, dtype: int64

In [None]:
# 복사하고 swap 한 학습 데이터를 기존 학습데이터와 행 병합
x_train = pd.concat([x_train, x_train_copy])
y_train = pd.concat([y_train, y_train_copy])

In [None]:
x_train

Unnamed: 0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,0_1023_div_time,0_1024,0_1024_div_time,0_1025,0_1025_div_time,0_1026,0_1026_div_time,0_1028,0_1028_div_time,0_1029,0_1029_div_time,0_102A,0_102A_div_time,0_102B,0_102B_div_time,0_102D,0_102D_div_time,0_102F,0_102F_div_time,0_103E,0_103E_div_time,0_1060,0_1060_div_time,0_1080,0_1080_div_time,0_1081,0_1081_div_time,0_10C0,0_10C0_div_time,0_10E0,...,event_count_1,event_per_sec_0,event_per_sec_1,event_per_sec_diff,species_0,species_1,species_war_kind,target_worker_0,target_worker_1,time,unit_can_atk_ground_count_0,unit_can_atk_ground_count_1,unit_can_atk_ground_count_diff,unit_can_atk_sky_count_0,unit_can_atk_sky_count_1,unit_can_atk_sky_count_diff,unit_cannot_atk_ground_count_0,unit_cannot_atk_ground_count_1,unit_cannot_atk_ground_count_diff,unit_cannot_atk_sky_count_0,unit_cannot_atk_sky_count_1,unit_cannot_atk_sky_count_diff,unit_count_0,unit_count_1,unit_gas_0,unit_gas_1,unit_ground_count_0,unit_ground_count_1,unit_ground_count_diff,unit_mineral_0,unit_mineral_1,unit_sky_count_0,unit_sky_count_1,unit_sky_count_diff,worker_0,worker_1,worker_attack_0,worker_attack_1,worker_attack_diff,worker_diff
0,5,0,0.0,1,0.002252,4,0.009009,2,0.004505,2,0.004505,1,0.002252,0,0.000000,0,0.000000,0,0.0,0,0.000000,1,0.002252,1,0.002252,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,548,1.333333,1.234234,0.099099,1,1,3,0,0,444,3,7,-4,3,5,-2,0,0,0,0,2,-2,5,7,100,250,3,7,-4,325,550,1,0,1,9,6,0,0,0,3
7,13,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,655,2.600446,1.462054,1.138393,0,2,2,1,0,448,0,3,-3,2,3,-1,6,0,6,4,0,4,8,7,600,0,0,3,-3,1000,850,4,0,4,22,23,4,0,4,-1
14,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,815,1.631579,3.063910,-1.432331,0,0,0,2,0,266,0,4,-4,0,3,-3,3,0,3,3,4,-1,3,4,75,250,0,3,-3,300,775,0,1,-1,7,8,0,0,0,-1
16,4,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,1155,2.063439,1.928214,0.135225,0,1,1,2,2,599,1,5,-4,0,5,-5,3,0,3,4,0,4,5,7,175,200,0,5,-5,900,450,2,2,0,16,9,2,0,2,7
18,6,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,393,1.616949,1.332203,0.284746,2,2,5,0,1,295,3,6,-3,0,0,0,0,0,0,3,6,-3,4,9,0,0,3,6,-3,250,600,0,0,0,6,6,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58303,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,88,1.487179,2.256410,-0.769231,2,0,2,0,0,39,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,-2
58304,0,0,0.0,0,0.000000,1,0.027027,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,63,2.135135,1.702703,0.432432,1,1,3,0,0,37,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0
58305,3,0,0.0,1,0.001992,2,0.003984,4,0.007968,1,0.001992,1,0.001992,1,0.001992,0,0.000000,0,0.0,0,0.000000,1,0.001992,1,0.001992,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,457,1.655378,0.910359,0.745020,1,0,1,0,1,502,3,4,-1,3,2,1,0,4,-4,0,6,-6,3,10,0,800,3,1,2,150,1700,0,5,-5,11,6,0,0,0,5
58306,0,0,0.0,1,0.001669,3,0.005008,2,0.003339,1,0.001669,1,0.001669,4,0.006678,2,0.003339,0,0.0,1,0.001669,1,0.001669,0,0.000000,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,...,776,1.345576,1.295492,0.050083,1,0,1,3,1,599,15,3,12,9,1,8,15,0,15,6,3,3,15,3,425,150,15,2,13,900,625,0,1,-1,7,11,6,0,6,-4


In [None]:
# CatBoost 모델링
def cat_cv(learning_rate, n_estimators, subsample, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    # StratifiedKFold 사용 
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []
    for train_index, valid_index in skf.split(x_data, y_data):
        x_train, y_train = x_data.iloc[train_index], y_data.iloc[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data.iloc[valid_index]
        
        model = CatBoostClassifier(
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            reg_lambda = reg_lambda,
            random_seed = 4321
        )
        
        # 모델 학습
        model.fit(x_train, y_train)

        # 학습된 모델 list에 추가
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid

        # 모델의 auc score 계산
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [None]:
models1 = cat_cv(
    0.06104986994977514, 
    1021.5002260187719, 
    0.47862788174573134, 
    20.696786965610887, 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
105:	learn: 0.6193651	total: 21.8s	remaining: 3m 8s
106:	learn: 0.6192091	total: 22s	remaining: 3m 8s
107:	learn: 0.6189480	total: 22.2s	remaining: 3m 7s
108:	learn: 0.6186930	total: 22.4s	remaining: 3m 7s
109:	learn: 0.6183669	total: 22.6s	remaining: 3m 7s
110:	learn: 0.6180732	total: 22.9s	remaining: 3m 7s
111:	learn: 0.6178059	total: 23.1s	remaining: 3m 7s
112:	learn: 0.6176490	total: 23.3s	remaining: 3m 6s
113:	learn: 0.6174153	total: 23.5s	remaining: 3m 6s
114:	learn: 0.6171783	total: 23.6s	remaining: 3m 6s
115:	learn: 0.6168669	total: 23.8s	remaining: 3m 6s
116:	learn: 0.6164656	total: 24s	remaining: 3m 5s
117:	learn: 0.6161691	total: 24.3s	remaining: 3m 5s
118:	learn: 0.6158436	total: 24.5s	remaining: 3m 5s
119:	learn: 0.6156991	total: 24.7s	remaining: 3m 5s
120:	learn: 0.6155712	total: 24.9s	remaining: 3m 4s
121:	learn: 0.6153036	total: 25s	remaining: 3m 4s
122:	learn: 0.6151171	total: 25.2s	remaining: 3m 4s
123:	learn: 0.614816

In [None]:
# 모델들의 평균 점수 저장
preds1 = []
for model in models1:
    pred = model.predict_proba(x_test)[:, 1]
    preds1.append(pred)
pred1 = np.mean(preds1, axis=0)

In [None]:
# LGBM 모델링
def lgb_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree, reg_alpha, reg_lambda, x_data=None, y_data=None, n_splits=5, output='score'):
    score = 0
    # StratifiedKFold 사용 
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []
    for train_index, valid_index in skf.split(x_data, y_data):
        x_train, y_train = x_data.iloc[train_index], y_data.iloc[train_index]
        x_valid, y_valid = x_data.iloc[valid_index], y_data.iloc[valid_index]
        
        model = lgb.LGBMClassifier(
            num_leaves = int(num_leaves), 
            learning_rate = learning_rate, 
            n_estimators = int(n_estimators), 
            subsample = np.clip(subsample, 0, 1), 
            colsample_bytree = np.clip(colsample_bytree, 0, 1), 
            reg_alpha = reg_alpha, 
            reg_lambda = reg_lambda,
            random_seed = 4321
        )
        
        # 모델 학습
        model.fit(x_train, y_train)
        # 학습된 모델 list에 추가
        models.append(model)
        
        pred = model.predict_proba(x_valid)[:, 1]
        true = y_valid

        # 모델의 auc score 계산
        score += roc_auc_score(true, pred)/n_splits
    
    if output == 'score':
        return score
    if output == 'model':
        return models

In [None]:
models2 = lgb_cv(
    304.64533877086365, 
    0.08152489470734282, 
    790.0482006660721, 
    0.4062287125072225, 
    0.07080287595563761, 
    1.9309431163985868, 
    49.36736630465849, 
    x_data=x_train, y_data=y_train, n_splits=5, output='model')

In [None]:
# 모델들의 평균 점수 저장
preds2 = []
for model in models2:
    pred = model.predict_proba(x_test)[:, 1]
    preds2.append(pred)
pred2 = np.mean(preds2, axis=0)

In [None]:
# sample_submission 로드
submission1 = pd.read_csv('/data/sample_submission.csv', index_col=0)
# 복사본 생성
submission2 = submission1.copy()

In [None]:
submission1['winner'] = submission1['winner'] + pred1 # Catboost의 예측값 저장

In [None]:
submission1.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.541086
38873,0.4011
38874,0.469982
38875,0.888404
38876,0.669156


In [None]:
submission2['winner'] = submission2['winner'] + pred2 # LGBM의 예측값 저장

In [None]:
submission2.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.420803
38873,0.26456
38874,0.427226
38875,0.962379
38876,0.742474


In [None]:
# swap된 index를 1-prob
for i in swaped_index:
    submission1.loc[i, :] = 1 - submission1.loc[i, :]
    submission2.loc[i, :] = 1 - submission2.loc[i, :]

In [None]:
submission1.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.541086
38873,0.5989
38874,0.469982
38875,0.111596
38876,0.669156


In [None]:
submission2.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.420803
38873,0.73544
38874,0.427226
38875,0.037621
38876,0.742474


In [None]:
# Catboost:6 LGBM:4의 비율로 앙상블
submission1['winner'] = submission1['winner']*0.6 + submission2['winner']*0.4

In [None]:
submission1.head()

Unnamed: 0_level_0,winner
game_id,Unnamed: 1_level_1
38872,0.492973
38873,0.653516
38874,0.45288
38875,0.082006
38876,0.698484


In [None]:
# 최종 결과 저장
submission1.to_csv('/data/submission_master_submit.csv')