## PUBG Modelling

플레이어들이 전장에서 벌이는 모든 행위를 분류하여 승리를 예측하기 위한 프로젝트

## Column 별 정보
- ASSISTS : 자신에 의해 사살되지는 못했지만 자신의 데미지 지분율이 가장 높은 적의 수
- BOOSTS : 도핑 아이템 사용 횟수(에너지 드링크, 진통제, 아드레날린 주사기)
- DAMAGEDEALT : 적에게 가한 전체 데미지
- DBNOS : 기절시킨 적의 수
- GROUPID : 경기 내 그룹 식별 ID
- HEADSHOTKILLS : 헤드샷으로 죽인 적의 수
- HEALS : 회복 아이템 사용 횟수(붕대, 구급상자, 의료용 키트)
- ID : 해당 데이터 ID
- KILLPLACE : 경기 내 죽인 적의 수 기준 순위
- KILLPOINTS : 유저의 ELO레이팅(죽인 적 수 기반)
- KILLS : 경기 내 적을 죽인 수
- KILLSTREAKS : 짧은 시간 내에 연속으로 적을 죽인 수
- LONGESTKILL : 사살한 적까지의 거리의 최대값
- MATCHDURATION : 경기가 진행된 시간(단위:초)
- MATCHID : 경기 식별 ID
- MATCHTYPE : 경기 모드( ex: 솔로, 듀오, 스쿼드 )
- MAXPLACE : 경기 내 총 인원 수
- NUMGROUPS : 경기 내 실제 참여 그룹 수
- RANKPOINTS : 유저의 ELO 레이팅
- REVIVES : 유저가 팀원을 부활시킨 횟수
- RIDEDISTANCE : 이동수단을 통해 이동한 거리(단위:m)
- ROADKILLS : 이동수단으로 살해한 적의 수
- SWIMDISTANCE : 수영으로 이동한 거리(단위:m)
- TEAMKILLS : 같은 팀원을 살해한 수
- VEHICLEDESTROYS : 파괴한 이동수단의 수
- WALKDISTANCE : 도보로 이동한 거리(단위:m)
- WEAPONSACQUIRED : 획득한 무기의 수
- WINPOINTS : 유저의 ELO 레이팅(승리 횟수 기반)
- WINPLACEPERC : 현재 경기에서의 백분위 기반 유저의 순위(종속변수)

## 라이브러리 및 데이터 불러오기

In [1]:
# 데이터 분석
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# 학습을 위한 라이브러리 세팅
from sklearn.linear_model import LinearRegression   
from sklearn.linear_model import Lasso              
from sklearn.linear_model import Ridge             
from xgboost.sklearn import XGBRegressor            
from lightgbm.sklearn import LGBMRegressor 
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from lightgbm.sklearn import LGBMClassifier        
from sklearn.metrics import mean_absolute_error
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from functools import partial

# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Hyper parameter tuning
import optuna
import sklearn.metrics as metrics

# others 
import scipy as sp
from scipy import stats
import sys
import gc
import os
print(os.listdir("./data/pubg-finish-placement-prediction/"))
import warnings                      
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


['test_V2.csv', 'sample_submission_V2.csv', 'train_V2.csv']


In [2]:
# Load the train split, the test split and the sample submission file from the
# local copy of the "pubg-finish-placement-prediction" data directory.
train = pd.read_csv("./data/pubg-finish-placement-prediction/train_V2.csv")
test = pd.read_csv("./data/pubg-finish-placement-prediction/test_V2.csv")
submission = pd.read_csv("./data/pubg-finish-placement-prediction/sample_submission_V2.csv")

## 함수

In [3]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    Only plain NumPy integer and float columns are touched. Everything else
    (bool, object/string, datetime, categorical, pandas extension dtypes) is
    left unchanged: those columns either cannot be compared against numeric
    limits or would be silently turned into floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    verbose : bool, default False
        When True, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    float16 keeps only about three significant decimal digits, so float
    columns lose precision when they are downcast to it.
    """
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32)

    if verbose:
        start_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Skip anything that is not a NumPy signed/unsigned integer or float.
        if not isinstance(col_type, np.dtype) or col_type.kind not in "iuf":
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            for candidate in small_floats:
                limits = np.finfo(candidate)
                if limits.min < c_min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or the column has no finite min/max).
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_ints if col_type.kind == "i" else unsigned_ints
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a value equal to the dtype limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    if verbose:
        end_mem = df.memory_usage().sum() / 1024**2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        if start_mem > 0:
            print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
def sorted_corr(data, column) :
    df_corr = data.corr()
    df_corr = df_corr.apply(lambda x: round(x ,2))
    df_corr = df_corr.unstack()
    df_corr = pd.DataFrame(df_corr[column][df_corr[column]<1].sort_values(ascending=False), columns=['Correlation'])
    df_corr = df_corr.style.background_gradient(cmap='coolwarm_r')
    return df_corr

In [5]:
def toVIF(features):
    """Build a table with the variance inflation factor of every column.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix; each column is scored against all the others.

    Returns
    -------
    pandas.DataFrame
        Two columns: "feature" (the column names) and "VIF".
    """
    design = features.values
    vif_scores = []
    for position in range(features.shape[1]):
        vif_scores.append(variance_inflation_factor(design, position))
    return pd.DataFrame({"feature": features.columns, "VIF": vif_scores})

In [6]:
def OLS_summary(features, target):
    """Fit an OLS regression of ``target`` on ``features`` and print its summary.

    A constant (intercept) column is added to ``features`` before fitting.
    Nothing is returned; the ``summary2()`` report is printed.
    """
    design = sm.add_constant(features)
    fitted = sm.OLS(target, design).fit()
    report = fitted.summary2()
    print(report)

In [7]:
from sklearn.linear_model import LinearRegression   # 1. Linear Regression
from sklearn.linear_model import Lasso              # 2. Lasso
from sklearn.linear_model import Ridge              # 3. Ridge
from xgboost.sklearn import XGBRegressor            # 4. XGBoost
from lightgbm.sklearn import LGBMRegressor          # 5. LightGBM
from sklearn.metrics import mean_absolute_error


def _training_mae(model, target, features):
    """Fit ``model`` on the data and return its in-sample MAE, rounded to 4 decimals.

    The score is computed on the same rows the model was fitted on, so it is
    a training error and not an estimate of generalisation performance.
    """
    fitted = model.fit(features, target)
    pred = fitted.predict(features)
    return np.round(mean_absolute_error(target, pred), 4)


def Linear_training_mae(target, features):
    """Training MAE of a default ``LinearRegression``. Note the (target, features) order."""
    return _training_mae(LinearRegression(), target, features)


def Lasso_training_mae(target, features):
    """Training MAE of a default ``Lasso``. Note the (target, features) order."""
    return _training_mae(Lasso(), target, features)


def Ridge_training_mae(target, features):
    """Training MAE of a default ``Ridge``. Note the (target, features) order."""
    return _training_mae(Ridge(), target, features)


def XGB_training_mae(target, features):
    """Training MAE of a default ``XGBRegressor``. Note the (target, features) order."""
    return _training_mae(XGBRegressor(), target, features)


def LGBM_training_mae(target, features):
    """Training MAE of a default ``LGBMRegressor``. Note the (target, features) order."""
    return _training_mae(LGBMRegressor(), target, features)


def trainings_mae(target, features):
    """Print the training MAE of all five baseline models, one line per model."""
    print("1. Linear Regression\t, train=%.4f" % Linear_training_mae(target, features))
    print("2. Lasso\t\t, train=%.4f" % Lasso_training_mae(target, features))
    print("3. Ridge\t\t, train=%.4f" % Ridge_training_mae(target, features))
    print("4. XGBoost\t\t, train=%.4f" % XGB_training_mae(target, features))
    print("5. LightGBM\t\t, train=%.4f" % LGBM_training_mae(target, features))

In [8]:
def one_hot_encoding(data):
    """Return ``data`` with the "matchType" column one-hot encoded.

    The dummy-encoded frame is passed through ``reduce_mem_usage`` before it
    is returned.
    """
    return reduce_mem_usage(pd.get_dummies(data, columns=["matchType"]))

In [9]:
def ordinal_encoding(data):
    """Return a copy of ``data`` with "matchType" replaced by an integer code.

    Codes run from 1 to 16 in the order listed below. Values that are not in
    the list become NaN, which is how ``Series.map`` treats unmapped values.
    The encoded copy is passed through ``reduce_mem_usage`` before it is
    returned; the input frame is not modified.
    """
    ordered_match_types = (
        'solo',
        'solo-fpp',
        'duo',
        'duo-fpp',
        'squad',
        'squad-fpp',
        'normal-duo',
        'normal-duo-fpp',
        'normal-solo',
        'normal-solo-fpp',
        'normal-squad',
        'normal-squad-fpp',
        'crashfpp',
        'crashtpp',
        'flarefpp',
        'flaretpp',
    )
    codes = {name: code for code, name in enumerate(ordered_match_types, start=1)}

    encoded = data.copy()
    encoded['matchType'] = encoded['matchType'].map(codes)
    return reduce_mem_usage(encoded)

In [10]:
# optuna RandomForest
def optimizer_RF(trial, X, y, K):
    """Optuna objective: K-fold mean absolute error of a random forest regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and ``max_features``.
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values aligned with ``X``.
    K : int
        Number of folds for ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold validation MAE (lower is better).
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 8, 30)
    # 1.0 (use every feature) replaces the former 'auto' choice: for a
    # regressor the two are equivalent, but 'auto' is no longer accepted by
    # current scikit-learn releases.
    max_features = trial.suggest_categorical("max_features", [1.0, 'sqrt', 'log2'])
    evaluation_metric = mean_absolute_error

    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  max_features=max_features,
                                  n_jobs=-1,
                                  random_state=0xC0FFEE)

    # Folds are contiguous slices of the rows (KFold is not shuffled here).
    folds = KFold(n_splits=K)
    scores = []

    for train_idx, val_idx in folds.split(X, y):
        X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        scores.append(evaluation_metric(y_val, preds))

    return np.mean(scores)

In [11]:
# Optuna LightGBM
def optimizer_LGBM(trial, X, y, K):
    """Optuna objective: K-fold mean absolute error of a LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyper-parameters.
    X : pandas.DataFrame
        Feature matrix (indexed positionally with ``.iloc``).
    y : pandas.Series
        Target values aligned with ``X``.
    K : int
        Number of folds for ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold validation MAE (lower is better).
    """
    import os
    # Local import: early stopping is configured through a callback because
    # the ``early_stopping_rounds`` fit keyword is not accepted by current
    # LightGBM releases.
    from lightgbm import early_stopping

    param = {
        'objective': 'regression',  # regression task
        'verbose': 0,
        'max_depth': trial.suggest_int('max_depth', 8, 20),
        # suggest_float(..., log=True) is the non-deprecated spelling of
        # suggest_loguniform and samples from the same distribution.
        'learning_rate': trial.suggest_float("learning_rate", 1e-8, 1e-2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): per the LightGBM parameter docs, bagging (subsample)
        # only takes effect when subsample_freq > 0, which is not set here,
        # so this value presumably has no influence on training. Confirm, and
        # set subsample_freq here and in the final model if bagging is wanted.
        'subsample': trial.suggest_float('subsample', 0.4, 1, log=True)
        # "device" : 'gpu'
    }

    model = LGBMRegressor(**param, n_jobs=os.cpu_count())
    evaluation_metric = mean_absolute_error

    # Folds are contiguous slices of the rows (KFold is not shuffled here).
    folds = KFold(n_splits=K)
    scores = []

    for train_idx, val_idx in folds.split(X, y):
        X_train, y_train = X.iloc[train_idx, :], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx, :], y.iloc[val_idx]

        # Stop adding trees once the validation metric has not improved for
        # 25 consecutive rounds.
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  callbacks=[early_stopping(stopping_rounds=25)])
        preds = model.predict(X_val)
        scores.append(evaluation_metric(y_val, preds))

    return np.mean(scores)

In [12]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as float32 with non-finite results set to 0.

    The count columns used as denominators (kills, heals, boosts, ...) can be
    0, which would otherwise produce ``inf`` (x / 0) or ``NaN`` (0 / 0).
    Both are replaced by 0 so the features can be fed to estimators that
    reject non-finite input. A missing value in either operand also maps to 0.
    """
    ratio = numerator.astype(np.float32) / denominator.astype(np.float32)
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def feature_enginnering(df):
    """Build the model feature frame from a raw train/test frame.

    Steps:
      1. drop the "Id" and "groupId" identifier columns,
      2. ordinal-encode "matchType" (via ``ordinal_encoding``),
      3. add per-match percentile ranks and distance ratio features,
      4. drop "matchId" once the per-match rankings are done.

    "weaponsAcquired" and "totalDistance" end up holding per-match percentile
    ranks rather than raw values. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the raw columns (including "Id", "groupId", "matchId").

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered features and without the id columns.
    """
    # matchId has to survive until the per-match rankings are computed; it is
    # dropped at the very end instead of together with the other ids.
    df = df.drop(["Id", "groupId"], axis=1)
    df = ordinal_encoding(df)
    # df = df.drop(['killPlace'], axis=1)
    # df = df.drop(['damageDealt'], axis=1)
    # df = df.drop(['killPoints','rankPoints','winPoints'], axis=1)
    # df = df.drop(['numGroups'], axis=1)

    match_ids = df['matchId'].to_numpy()

    def match_rank(values):
        # Percentile rank of each row among the rows of the same match.
        return values.groupby(match_ids).rank(pct=True).to_numpy()

    ratio_denominators = (
        ('duration', 'matchDuration'),
        ('kills', 'kills'),
        ('heals', 'heals'),
        ('boosts', 'boosts'),
    )

    def add_ratio_features(prefix, distance):
        # Adds <prefix>_duration, <prefix>_kills, <prefix>_heals, <prefix>_boosts.
        for suffix, denominator in ratio_denominators:
            df[prefix + '_' + suffix] = _safe_ratio(distance, df[denominator])

    # modify existing columns
    df['killPerc'] = match_rank(df['kills'])
    df['killPlacePerc'] = match_rank(df['killPlace'])
    df['weaponsAcquired'] = match_rank(df['weaponsAcquired'])
    df.loc[(df['rankPoints'] == -1), 'rankPoints'] = 0

    # create new columns
    # Distances are widened to float32 first: after reduce_mem_usage they may
    # be float16, whose maximum (65504) the summed distance can exceed.
    walk = df['walkDistance'].astype(np.float32)
    ride = df['rideDistance'].astype(np.float32)
    swim = df['swimDistance'].astype(np.float32)
    total = walk + ride + swim

    add_ratio_features('walkDistance', walk)
    df['walkDistancePerc'] = match_rank(walk)

    add_ratio_features('rideDistance', ride)
    add_ratio_features('swimDistance', swim)

    df['totalDistance'] = total
    add_ratio_features('totalDistance', total)
    # The raw total is only needed for the ratios above; keep its rank.
    df['totalDistance'] = match_rank(total)

    # Widen before adding so the sum cannot wrap around in a small int dtype.
    df['killPoints_rankPoints'] = df['killPoints'].astype(np.int32) + df['rankPoints']

    df['kills_headshotKills'] = _safe_ratio(df['kills'], df['headshotKills'])
    df['killStreaks_kills'] = _safe_ratio(df['killStreaks'], df['kills'])

    return df.drop(['matchId'], axis=1)

## 전처리

In [13]:
# Downcast the numeric columns of both frames to smaller dtypes to save memory.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [14]:
# Print the column names and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Id               object 
 1   groupId          object 
 2   matchId          object 
 3   assists          int8   
 4   boosts           int8   
 5   damageDealt      float16
 6   DBNOs            int8   
 7   headshotKills    int8   
 8   heals            int8   
 9   killPlace        int8   
 10  killPoints       int16  
 11  kills            int8   
 12  killStreaks      int8   
 13  longestKill      float16
 14  matchDuration    int16  
 15  matchType        object 
 16  maxPlace         int8   
 17  numGroups        int8   
 18  rankPoints       int16  
 19  revives          int8   
 20  rideDistance     float16
 21  roadKills        int8   
 22  swimDistance     float16
 23  teamKills        int8   
 24  vehicleDestroys  int8   
 25  walkDistance     float16
 26  weaponsAcquired  int16  
 27  winPoints   

In [15]:
# Count the missing values in each column.
train.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64

In [16]:
# Drop every row that contains at least one missing value.
train = train.dropna(axis=0)

In [17]:
# Re-count the missing values per column after dropping incomplete rows.
train.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       0
dtype: int64