In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook
import sklearn.preprocessing

In [2]:
# DataFrame display options: show up to 30 columns, and let pandas
# auto-detect the output width (None) instead of using a fixed width.
pd.options.display.max_columns = 30
pd.options.display.width = None

In [4]:
# Load the solo-match feature table. The two integer columns get compact
# integer dtypes; every listed ratio column and the target (winPlacePerc)
# are read as float64.
# NOTE(review): 'DBNOs_ratio' and 'revives_ratio' are listed here but do not
# appear in the data.columns output for this file -- confirm they are needed.
data = pd.read_csv(
    './data/final4/final4_solo_Data.csv',
    dtype={
        'matchType': 'int8',
        'match_size': 'int16',
        **dict.fromkeys(
            [
                'assists_ratio',
                'boosts_ratio',
                'headshotKills_ratio',
                'heals_ratio',
                'kills_ratio',
                'killStreaks_ratio',
                'longestKill_ratio',
                'rideDistance_ratio',
                'swimDistance_ratio',
                'walkDistance_ratio',
                'weaponsAcquired_ratio',
                'headshotKillrate_ratio',
                'DBNOs_ratio',
                'revives_ratio',
                'winPlacePerc',
            ],
            'float64',
        ),
    },
)

In [12]:
# Display the column index of the loaded frame to see which features are available.
data.columns

Index(['Id', 'groupId', 'matchId', 'matchType', 'assists_mean', 'boosts_mean',
       'headshotKills_mean', 'heals_mean', 'kills_mean', 'killStreaks_mean',
       'weaponsAcquired_mean', 'longestKill_mean_log', 'rideDistance_mean_log',
       'swimDistance_mean_log', 'walkDistance_mean_log', 'match_size',
       'headshotKillrate_mean', 'assists_ratio', 'boosts_ratio',
       'headshotKills_ratio', 'heals_ratio', 'kills_ratio',
       'killStreaks_ratio', 'longestKill_ratio', 'rideDistance_ratio',
       'swimDistance_ratio', 'walkDistance_ratio', 'weaponsAcquired_ratio',
       'headshotKillrate_ratio', 'winPlacePerc'],
      dtype='object')

In [5]:
# Keep only the modelling columns: the identifier columns and matchType are
# removed from the frame before summarising.
slic_data = data.drop(columns=['Id', 'matchId', 'groupId', 'matchType'])

# Summary statistics of the remaining columns (rendered as the cell output).
slic_data.describe()

Unnamed: 0,assists_mean,boosts_mean,headshotKills_mean,heals_mean,kills_mean,killStreaks_mean,weaponsAcquired_mean,longestKill_mean_log,rideDistance_mean_log,swimDistance_mean_log,walkDistance_mean_log,match_size,headshotKillrate_mean,assists_ratio,boosts_ratio,headshotKills_ratio,heals_ratio,kills_ratio,killStreaks_ratio,longestKill_ratio,rideDistance_ratio,swimDistance_ratio,walkDistance_ratio,weaponsAcquired_ratio,headshotKillrate_ratio,winPlacePerc
count,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699377.0,699376.0
mean,0.050203,0.978549,0.22013,0.900147,0.853229,0.45898,3.486635,1.413872,1.369525,0.20624,5.754798,94.158121,0.110137,0.87721,0.922388,0.894323,0.907035,0.918156,0.960482,0.906922,0.935919,0.788744,0.957869,0.983591,0.41562,0.477227
std,0.217276,1.653971,0.546894,1.981955,1.328474,0.535215,2.267194,1.766546,2.866547,0.879869,1.913742,9.204865,0.271372,4.346539,1.563128,2.244054,1.979411,1.429372,1.123605,1.83673,3.568739,5.109672,1.055321,0.629195,1.043982,0.29488
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.587108,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109486,0.527473,0.0,0.2211
50%,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,6.156555,96.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.511597,0.926752,0.0,0.4691
75%,0.0,1.0,0.0,1.0,1.0,1.0,5.0,3.011605,0.0,0.0,7.326466,97.0,0.0,0.0,1.253333,0.0,1.033708,1.114943,2.0625,0.934645,0.0,0.0,1.570616,1.361702,0.0,0.7312
max,1.0,9.0,4.0,17.0,9.0,2.0,12.0,5.703449,9.101084,5.68833,8.51198,100.0,1.0,100.0,25.526316,28.0,33.73913,11.675676,5.684211,19.82748,99.0,100.00001,17.0,4.6,14.5,1.0


In [9]:
# Hold out 30% of the rows for evaluation. The seed is pinned so the split --
# and every metric computed from it -- is identical on each Restart & Run All.
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    slic_data, train_size=0.7, random_state=SPLIT_SEED
)

# NOTE(review): this cell used to call round(train_df, 4) / round(test_df, 4)
# without assigning the result, so nothing was ever rounded. Those no-op calls
# are removed; if 4-decimal rounding is actually wanted, assign the result
# (e.g. train_df = train_df.round(4)).

# Drop rows containing missing values. Reassigning instead of using
# dropna(inplace=True) avoids mutating a frame derived from slic_data, which
# can trigger pandas' SettingWithCopyWarning and makes the cell non-idempotent.
train_df = train_df.dropna()
print(train_df.isnull().any().any())  # expected: False
test_df = test_df.dropna()
print(test_df.isnull().any().any())  # expected: False

# Separate the target (winPlacePerc) from the feature columns.
train_y = train_df['winPlacePerc'].to_numpy()
train_x = train_df.drop(columns=['winPlacePerc'])

test_y = test_df['winPlacePerc'].to_numpy()
test_x = test_df.drop(columns=['winPlacePerc'])
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)

False
False
(489562, 25) (209814, 25) (489562,) (209814,)


In [None]:
# Gradient-boosted tree regressor fitted on the training features.
# 'reg:squarederror' is the current name of the squared-error objective; the
# former 'reg:linear' spelling is deprecated in XGBoost.
# NOTE(review): learning_rate=0.005 combined with only 100 trees looks very
# conservative and may leave the model under-fitted -- confirm this is intended.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.005,
    colsample_bytree=0.3,  # each tree is built on 30% of the feature columns
    max_depth=5,
    n_estimators=100,
    gamma=0,
)
xg_reg.fit(train_x, train_y)

In [15]:
# Mean absolute error on both partitions. scikit-learn metrics take
# (y_true, y_pred) in that order; MAE itself is symmetric, but keeping the
# documented order avoids silent mistakes if sample_weight or an asymmetric
# metric is substituted later.
train_pred = xg_reg.predict(train_x)
test_pred = xg_reg.predict(test_x)
print('mae train: ', mean_absolute_error(train_y, train_pred))
print('mae test: ', mean_absolute_error(test_y, test_pred))

mae train:  0.1860932565537033
mae test:  0.18575588249362082
