In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
base_path = '../data/raw/'

df_train = pd.read_csv(base_path + 'train_V2.csv')
df_test = pd.read_csv(base_path + 'test_V2.csv')
submission = pd.read_csv(base_path + 'sample_submission_V2.csv')

# Preprocessing

In [34]:
# 결측치 확인
df_train[df_train.isnull().any(axis=1)]

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
2744604,f70c74418bb064,12dfbede33f92b,224a123c53e008,0,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0,0.0,0,0,


In [53]:
# 결측치 제거
train = df_train.dropna(axis=0)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4446965 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int64  
 4   boosts           int64  
 5   damageDealt      float64
 6   DBNOs            int64  
 7   headshotKills    int64  
 8   heals            int64  
 9   killPlace        int64  
 10  killPoints       int64  
 11  kills            int64  
 12  killStreaks      int64  
 13  longestKill      float64
 14  matchDuration    int64  
 15  matchType        object 
 16  maxPlace         int64  
 17  numGroups        int64  
 18  rankPoints       int64  
 19  revives          int64  
 20  rideDistance     float64
 21  roadKills        int64  
 22  swimDistance     float64
 23  teamKills        int64  
 24  vehicleDestroys  int64  
 25  walkDistance     float64
 26  weaponsAcquired  int64  
 27  winPoints   

In [158]:
train.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'kills', 'killStreaks', 'longestKill', 'matchType', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'vehicleDestroys',
       'walkDistance', 'weaponsAcquired', 'winPlacePerc'],
      dtype='object')

In [54]:
# 학습에 사용할 컬럼을 추출
train = train.drop(columns=['Id','killPlace','killPoints','matchDuration',\
    'matchId','rankPoints','teamKills','winPoints','groupId','numGroups',\
        'maxPlace'])

In [57]:
train.head()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,matchType,revives,rideDistance,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc
0,0,0,0.0,0,0,0,0,0,0.0,squad-fpp,0,0.0,0,0.0,0,244.8,1,0.4444
1,0,0,91.47,0,0,0,0,0,0.0,squad-fpp,0,0.0045,0,11.04,0,1434.0,5,0.64
2,1,0,68.0,0,0,0,0,0,0.0,duo,0,0.0,0,0.0,0,161.8,2,0.7755
3,0,0,32.9,0,0,0,0,0,0.0,squad-fpp,0,0.0,0,0.0,0,202.7,3,0.1667
4,0,0,100.0,0,0,0,1,1,58.53,solo-fpp,0,0.0,0,0.0,0,49.75,2,0.1875


# Feature Engineering

## matchType Ordinal encoding

In [59]:
# custom match
train.loc[(train.matchType.str.contains('normal'))|\
    (train.matchType.str.contains('flare'))|\
        (train.matchType.str.contains('crash')), 'matchType'] = 'custom'

In [60]:
# standard match
train.loc[train.matchType.str.contains('solo'), 'matchType'] = 'solo'
train.loc[train.matchType.str.contains('duo'), 'matchType'] = 'duo'
train.loc[train.matchType.str.contains('squad'), 'matchType'] = 'squad'

In [106]:
# Nominal Encoding
train_OHE = pd.get_dummies(train, columns=['matchType'])
train_OHE.head(10)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,revives,...,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,matchType_custom,matchType_duo,matchType_solo,matchType_squad
0,0,0,0.0,0,0,0,0,0,0.0,0,...,0,0.0,0,244.8,1,0.4444,0,0,0,1
1,0,0,91.47,0,0,0,0,0,0.0,0,...,0,11.04,0,1434.0,5,0.64,0,0,0,1
2,1,0,68.0,0,0,0,0,0,0.0,0,...,0,0.0,0,161.8,2,0.7755,0,1,0,0
3,0,0,32.9,0,0,0,0,0,0.0,0,...,0,0.0,0,202.7,3,0.1667,0,0,0,1
4,0,0,100.0,0,0,0,1,1,58.53,0,...,0,0.0,0,49.75,2,0.1875,0,0,1,0
5,0,0,100.0,1,1,0,1,1,18.44,0,...,0,0.0,0,34.7,1,0.037,0,0,0,1
6,0,0,0.0,0,0,0,0,0,0.0,0,...,0,0.0,0,13.5,1,0.0,0,0,0,1
7,0,0,8.538,0,0,0,0,0,0.0,0,...,0,0.0,0,1089.0,6,0.7368,0,0,1,0
8,0,0,51.6,0,0,0,0,0,0.0,0,...,0,0.0,0,799.9,4,0.3704,0,0,0,1
9,0,0,37.27,0,0,0,0,0,0.0,0,...,0,0.0,0,65.67,1,0.2143,0,0,0,1


## feature scaling

In [112]:
X = train_OHE.drop(columns='winPlacePerc')
y = train_OHE.winPlacePerc

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
temp = scaler.fit_transform(X.loc[:,:'weaponsAcquired'])
temp

array([[0.        , 0.        , 0.        , ..., 0.        , 0.00949573,
        0.00423729],
       [0.        , 0.        , 0.01382557, ..., 0.        , 0.05562452,
        0.02118644],
       [0.04545455, 0.        , 0.01027811, ..., 0.        , 0.00627618,
        0.00847458],
       ...,
       [0.        , 0.        , 0.00892684, ..., 0.        , 0.03059348,
        0.01694915],
       [0.        , 0.12121212, 0.02726723, ..., 0.        , 0.10659426,
        0.03389831],
       [0.        , 0.06060606, 0.04050786, ..., 0.        , 0.04825446,
        0.02118644]])

In [113]:
pd.DataFrame(temp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.009496,0.004237
1,0.000000,0.000000,0.013826,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,1.105380e-07,0.0,0.002888,0.0,0.055625,0.021186
2,0.045455,0.000000,0.010278,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.006276,0.008475
3,0.000000,0.000000,0.004973,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.007863,0.012712
4,0.000000,0.000000,0.015115,0.000000,0.000000,0.0000,0.013889,0.05,0.053501,0.000000,0.000000e+00,0.0,0.000000,0.0,0.001930,0.008475
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446960,0.000000,0.000000,0.000000,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,3.173667e-02,0.0,0.000000,0.0,0.039527,0.012712
4446961,0.000000,0.030303,0.006673,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,0.000000e+00,0.0,0.000000,0.0,0.003169,0.025424
4446962,0.000000,0.000000,0.008927,0.000000,0.000000,0.0000,0.000000,0.00,0.000000,0.000000,0.000000e+00,0.0,0.000571,0.0,0.030593,0.016949
4446963,0.000000,0.121212,0.027267,0.018868,0.015625,0.0250,0.027778,0.05,0.090037,0.051282,0.000000e+00,0.0,0.000000,0.0,0.106594,0.033898


In [121]:
X.loc[:,:'weaponsAcquired'] = temp[:, :]

In [135]:
X.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'kills', 'killStreaks', 'longestKill', 'revives', 'rideDistance',
       'roadKills', 'swimDistance', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'matchType_custom', 'matchType_duo',
       'matchType_solo', 'matchType_squad'],
      dtype='object')

In [136]:
train_OHE.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'kills', 'killStreaks', 'longestKill', 'revives', 'rideDistance',
       'roadKills', 'swimDistance', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPlacePerc', 'matchType_custom', 'matchType_duo',
       'matchType_solo', 'matchType_squad'],
      dtype='object')

In [134]:
y

0          0.4444
1          0.6400
2          0.7755
3          0.1667
4          0.1875
            ...  
4446961    0.1786
4446962    0.2935
4446963    0.4815
4446964    0.8000
4446965    0.5464
Name: winPlacePerc, Length: 4446965, dtype: float64

# Training

In [124]:
# 학습을 위한 라이브러리 세팅
from sklearn.linear_model import LinearRegression   # 1. Linear Regression 
from sklearn.linear_model import Lasso              # 2. Lasso
from sklearn.linear_model import Ridge              # 3. Ridge
from xgboost.sklearn import XGBRegressor            # 4. XGBoost
from lightgbm.sklearn import LGBMRegressor          # 5. LightGBM

# 평가 지표
from sklearn.metrics import mean_absolute_error

In [132]:
def training(m, t, target):
    model = m
    model.fit(t, target)
    pred_train = model.predict(t)
    mae_train = mean_absolute_error(target, pred_train)
    return mae_train

In [133]:
print(" 1. Linear Regression : %.4f" % training(LinearRegression(), X, y))
print(" 2. Lasso             : %.4f" % training(Lasso(), X, y))
print(" 3. Ridge             : %.4f" % training(Ridge(), X, y))
print(" 4. XGBoost           : %.4f" % training(XGBRegressor(), X, y))
print(" 5. LigthGBM          : %.4f" % training(LGBMRegressor(), X, y))

 1. Linear Regression : 0.1247
 2. Lasso             : 0.2679
 3. Ridge             : 0.1247
 4. XGBoost           : 0.0986
 5. LigthGBM          : 0.0989


In [None]:
# Hyper-parameter tuning

# GridSearchCV
from sklearn.model_selection import GridSearchCV

parma_grid = {
    "max_depth" : [],
    "learning_rate" : [],
    "n_estimators" : [],
}

# Test  
training set과 같은 전처리를 해줘야 함.

In [139]:
test = df_test.copy()

In [140]:
# 결측치 확인
df_test[df_test.isnull().any(axis=1)]

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints


In [141]:
# 사용할 컬럼만 추출
test = test.drop(columns=['Id','killPlace','killPoints','matchDuration',\
    'matchId','rankPoints','teamKills','winPoints','groupId','numGroups',\
        'maxPlace'])

In [142]:
# custom match
test.loc[(test.matchType.str.contains('normal'))|\
    (test.matchType.str.contains('flare'))|\
        (test.matchType.str.contains('crash')), 'matchType'] = 'custom'

# standard match
test.loc[test.matchType.str.contains('solo'), 'matchType'] = 'solo'
test.loc[test.matchType.str.contains('duo'), 'matchType'] = 'duo'
test.loc[test.matchType.str.contains('squad'), 'matchType'] = 'squad'

In [143]:
test

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,matchType,revives,rideDistance,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired
0,0,0,51.46,0,0,0,0,0,0.00,squad,0,0.0,0,0.000,0,588.00,1
1,0,4,179.10,0,0,2,2,1,361.90,duo,2,4669.0,0,0.000,0,2017.00,6
2,1,0,23.40,0,0,4,0,0,0.00,squad,0,0.0,0,0.000,0,787.80,4
3,0,0,65.52,0,0,0,0,0,0.00,duo,0,0.0,0,0.000,0,1812.00,3
4,0,4,330.20,1,2,1,3,1,60.06,squad,1,0.0,0,0.000,0,2963.00,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1934169,1,2,381.00,3,0,7,4,2,69.27,squad,1,0.0,0,0.000,0,1799.00,5
1934170,0,0,0.00,0,0,0,0,0,0.00,squad,0,0.0,0,0.000,0,1195.00,3
1934171,0,0,91.96,0,0,3,1,1,25.38,squad,0,0.0,0,7.798,0,3327.00,3
1934172,1,2,138.60,0,0,12,1,1,67.89,duo,1,0.0,0,0.000,0,1893.00,4


In [144]:
# Ordinal Encoding
test_OHE = pd.get_dummies(test, columns=['matchType'])
test_OHE.head(10)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,kills,killStreaks,longestKill,revives,rideDistance,roadKills,swimDistance,vehicleDestroys,walkDistance,weaponsAcquired,matchType_custom,matchType_duo,matchType_solo,matchType_squad
0,0,0,51.46,0,0,0,0,0,0.0,0,0.0,0,0.0,0,588.0,1,0,0,0,1
1,0,4,179.1,0,0,2,2,1,361.9,2,4669.0,0,0.0,0,2017.0,6,0,1,0,0
2,1,0,23.4,0,0,4,0,0,0.0,0,0.0,0,0.0,0,787.8,4,0,0,0,1
3,0,0,65.52,0,0,0,0,0,0.0,0,0.0,0,0.0,0,1812.0,3,0,1,0,0
4,0,4,330.2,1,2,1,3,1,60.06,1,0.0,0,0.0,0,2963.0,4,0,0,0,1
5,0,0,0.0,0,0,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0,0,0,0,1
6,0,3,470.7,3,2,17,5,1,57.61,0,0.0,0,0.0,0,1000.0,4,0,1,0,0
7,0,0,68.61,0,0,0,0,0,0.0,0,0.0,0,0.0,0,1217.0,5,0,0,0,1
8,0,0,0.0,0,0,0,0,0,0.0,0,2355.0,0,0.0,0,1390.0,7,0,0,0,1
9,0,0,67.32,0,0,0,0,0,0.0,0,0.0,0,0.0,0,1634.0,5,0,0,0,1


In [147]:
X_test = test_OHE.copy()

In [148]:
# feature scaling
scaler = MinMaxScaler()
temp2 = scaler.fit_transform(X_test.loc[:,:'weaponsAcquired'])
pd.DataFrame(temp2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.000000,0.000000,0.008261,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.039437,0.006536
1,0.000000,0.166667,0.028753,0.000000,0.00000,0.026667,0.034483,0.066667,0.360458,0.10,0.114717,0.0,0.000000,0.0,0.135278,0.039216
2,0.037037,0.000000,0.003757,0.000000,0.00000,0.053333,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.052837,0.026144
3,0.000000,0.000000,0.010519,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.121529,0.019608
4,0.000000,0.166667,0.053010,0.016949,0.04878,0.013333,0.051724,0.066667,0.059821,0.05,0.000000,0.0,0.000000,0.0,0.198726,0.026144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1934169,0.037037,0.083333,0.061166,0.050847,0.00000,0.093333,0.068966,0.133333,0.068994,0.05,0.000000,0.0,0.000000,0.0,0.120657,0.032680
1934170,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.0,0.000000,0.0,0.080148,0.019608
1934171,0.000000,0.000000,0.014763,0.000000,0.00000,0.040000,0.017241,0.066667,0.025279,0.00,0.000000,0.0,0.002384,0.0,0.223139,0.019608
1934172,0.037037,0.083333,0.022251,0.000000,0.00000,0.160000,0.017241,0.066667,0.067620,0.05,0.000000,0.0,0.000000,0.0,0.126962,0.026144


In [150]:
X_test.columns

Index(['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'kills', 'killStreaks', 'longestKill', 'revives', 'rideDistance',
       'roadKills', 'swimDistance', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'matchType_custom', 'matchType_duo',
       'matchType_solo', 'matchType_squad'],
      dtype='object')

In [151]:
X_test.loc[:,:'weaponsAcquired'] = temp2[:, :]

In [154]:
reg = XGBRegressor()
reg.fit(X, y)
result = reg.predict(X_test)
print(result)

[0.36024776 0.98117596 0.53439146 ... 0.81388414 0.87325823 0.11511708]


In [156]:
submission['winPlacePerc'] = result
submission.to_csv('submission.csv', index=False)

# Check

In [5]:
df_train.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [None]:
df_train['average_speed'] = (df_train.ridedistance + df_train.swimDistance + df_train.walkDistance)/df_train.matchDuration