## Import & Load

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from localfunc import reduce_ram_usage
from lightgbm import LGBMRegressor
import gc
from category_encoders import BinaryEncoder

In [2]:
print("Data loading...")
train = pd.read_csv("./data/train_V2.csv")
test = pd.read_csv("./data/test_V2.csv")

Data loading...


### Check (just in case)

In [4]:
test.isnull().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
dtype: int64

## Pre-Processing

In [3]:
train_bak = train.copy()
train_bak.shape             #테스트 및 백업. 실행시키지 말 것!!!!

(4446966, 29)

### Drop null column

In [55]:
print("Pre-Processing...")
train = train_bak.copy()  #--
for i in train.columns.to_list() :
    train.drop(index=train[train[i].isnull()==True].index, inplace=True)

train_bak_null = train.copy() #--

Pre-Processing...


### Drop Outlier

In [115]:

print("Droping Outliers...")
train = train_bak_null.copy() #--
vip_features = ["assists","boosts","DBNOs","heals","kills","killStreaks","walkDistance", "revives", "roadKills", "vehicleDestroys"]
dpidx_sum=0

for col in (vip_features + ["damageDealt","longestKill", "rideDistance", "swimDistance","weaponsAcquired", "matchDuration"]):
    dpidx = train[train[col]>train[col].quantile(0.999)].index
    train.drop(index=dpidx, inplace=True)
    dpidx_sum += len(dpidx)
     
for col in vip_features:
    dpidx = train[train["walkDistance"]<train[col]].index
    train.drop(index=dpidx, inplace=True)
    dpidx_sum += len(dpidx)
    

dpidx = train[ train.groupby('matchId')['kills'].transform('max')  > train.groupby('matchId')['Id'].transform('count')  ].index
train.drop(index=dpidx, inplace=True)
dpidx_sum += len(dpidx)
               
dpidx=train[ (train['rideDistance']==0) & (train['roadKills']>0)  ].index   
train.drop(index=dpidx, inplace=True)
dpidx_sum += len(dpidx)
           
print(f"{dpidx_sum} Columns Deleted") 

del dpidx_sum, dpidx
gc.collect()             

Droping Outliers...
50012 Columns Deleted


### Encoding (필요가 있는지는 잘 모르겠음)

In [83]:

train = train_bak_null.copy() #--

mapper = lambda x: 'normal' if ('normal' in x) or ('crash' in x)or ('flare' in x)else x 
train["matchType"]=train["matchType"].apply(mapper)

mapper = lambda x: 'solo' if ('solo' in x) else 'duo' if ('duo' in x) else 'normal' if ('normal' in x) else 'squad' 
train["matchType"]=train["matchType"].apply(mapper)

train = pd.concat([train,pd.get_dummies(train["matchType"])], axis=1)

del mapper

### Make Columns

    - groupId
    - matchId
    - matchType
    - maxPlace (≈numGroups)

In [106]:
#unleaked Killplace
#train.groupby(["matchId"])["kills"]

sort = train.sort_values(["matchId", 'kills'])
sort

#
# killplace = 한 매치 안에서 킬을 한 순위

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,duo,normal,solo,squad
7931,a121348062f67a,7bd08592bb25e2,0000a43bce5eec,0,1,0.0,0,0,1,41,...,0,0,2723.00,3,1502,0.6667,0,0,0,1
101714,731b2318b2adf2,8ebb76fe544c27,0000a43bce5eec,0,2,0.0,0,0,4,45,...,0,0,1241.00,3,1471,0.6296,0,0,0,1
134000,503ae3c191575b,97364d4199bf30,0000a43bce5eec,0,0,0.0,0,0,0,78,...,0,0,37.38,1,1476,0.1111,0,0,0,1
310967,8bc0d095f488b4,767819928e6279,0000a43bce5eec,1,0,81.9,0,0,0,67,...,0,0,93.47,2,1481,0.2593,0,0,0,1
329124,2267f810807e22,e8ff1c0fe7f6aa,0000a43bce5eec,0,0,0.0,0,0,0,84,...,0,0,45.05,2,1460,0.0741,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4219004,50dcec33bee519,3f8b442285e75a,fffe92232706aa,0,1,429.2,3,0,3,8,...,0,0,1911.00,9,0,0.5357,0,0,0,1
445384,682f6e6f66470f,abc744463b2dd4,fffe92232706aa,0,1,724.7,6,0,2,4,...,0,0,761.00,5,0,0.4286,0,0,0,1
1118609,56ea6b32521f5b,809d8288457e3d,fffe92232706aa,1,6,594.4,2,1,6,3,...,0,0,2706.00,7,0,1.0000,0,0,0,1
304989,54e7365e18a0b6,6cfbeead9e1050,fffe92232706aa,1,3,617.9,4,0,1,1,...,0,0,3344.00,7,0,0.9643,0,0,0,1


In [131]:
train["KillPlace"] = train.groupby("matchId")["kills"].transform('rank', ascending=False, method='dense')

In [132]:
train[train.matchId=="a10357fd1a4a91"][['matchId','kills','killPlace','KillLPlace']].sort_values(["kills"]).tail(10)

Unnamed: 0,matchId,kills,killPlace,KillLPlace
2471436,a10357fd1a4a91,3,12,4.0
1182186,a10357fd1a4a91,3,9,4.0
687216,a10357fd1a4a91,3,11,4.0
1386393,a10357fd1a4a91,3,7,4.0
1912681,a10357fd1a4a91,4,6,3.0
3958182,a10357fd1a4a91,4,4,3.0
4171661,a10357fd1a4a91,4,3,3.0
1584671,a10357fd1a4a91,4,5,3.0
2162806,a10357fd1a4a91,5,2,2.0
3692199,a10357fd1a4a91,7,1,1.0


## Predict - Evaluation

### Parameter Turning

### Prediction

In [92]:
features = train.drop(["Id","groupId", "killPlace", "matchType","matchId","winPlacePerc"], axis=1) #all
target = train["winPlacePerc"]
train_X, test_X, train_y, test_y = train_test_split(features, target, test_size=0.2, random_state=589)


In [94]:
print("Fitting...")
model = LGBMRegressor()
model.fit(train_X, train_y)

In [95]:
pred_y = model.predict(test_X)

In [96]:
print("MAE : ",np.round(mean_absolute_error(pred_y, test_y),6))

0.09097761953948993

### one more thing : 

In [None]:
0.09071973631904867

### Dummy

        사용한 데이터: 29x4490000의 배그 승률예측 데이터.

__1,__
>편의상 칼럼을 주요 칼럼 4개만 뽑음.

-  df A: 뽑은 칼럼의 이상치 제거(quantile 0.99)
-  df B: 대조군



df A,B를 train set과 validation set로 분리 후 fit-predict. 시드는 서로 같게.    
그 결과, __이상치를 제거한 A가 B보다 성능 악화__ (6개의 시드 시행 전부에서)

-----------
__2,__
>이번엔 전체 데이터에 대해서 동일한 시험 수행.

-  df A: 다수 칼럼의 이상치 제거(quantile 0.99)
-  df B: 대조군

실험 1에서 사용한 6개의 시드 그대로 fit-predict 시행.   
결과는 __이상치를 제거한 A가 B보다 성능이 대체로 우수했음.__ 


-  df C: 모든 칼럼을 사용하되, 실험 1에서 사용한 컬럼 4개에서만 이상치 제거.

마찬가지 실험 결과, __C가 대조군 B보다 성능이 좋았음.__     
실험 1과 상반되는 결과.

-----------



        실험 결과 : 데이터의 크기가 작았을 때는 이상치 제거가 성능을 떨어트렸음.    
                    반면 데이터의 크기가 크다면 이상치 제거가 성능을 올림.
                    크기가 작은 데이터와 동일한 이상치만을 제거했음에도 데이터의 크기가 커지면 오히려 성능을 올리는 요인으로 바뀜.
어째서일까?


사용한 model : lgbm, linear.