<a href="https://colab.research.google.com/github/Haebuk/kuggle/blob/main/pubg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Reference
- [competition](https://www.kaggle.com/c/pubg-finish-placement-prediction/code?competitionId=10335&sortBy=voteCount
)
- [1st place discussion](https://www.kaggle.com/c/pubg-finish-placement-prediction/discussion/79161)
- [LigthGBM Baseline](https://www.kaggle.com/chocozzz/lightgbm-baseline)

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc, sys
gc.enable()
import os
import time
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb

### Feature Engineering

In [2]:
def feature_engineering(is_train=True, debug=True):
    test_idx = None
    if is_train:
        print('preprocessing train.csv')
        if debug == True:
            df = pd.read_csv('/content/drive/MyDrive/input/pubg/train_V2.csv', nrow=10000)
        else:
            df = pd.read_csv('/content/drive/MyDrive/input/pubg/train_V2.csv')

        df = df[df['maxPlace'] > 1]
    else:
        print('processing test.csv')
        df = pd.read_csv('/content/drive/MyDrive/input/pubg/test_V2.csv')
        test_idx = df.Id

    print('remove some columns')
    target = 'winPlacePerc'

    print('Adding Features')

    df['headshotrate'] = df['kills']/df['headshotKills']
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df["skill"] = df["headshotKills"] + df["roadKills"]

    df[df == np.Inf] = np.NaN
    df[df == np.NINF] = np.NaN

    print("Removing Na's From DF")
    df.fillna(0, inplace=True)

    features = list(df.columns)
    features.remove('Id')
    features.remove('matchId')
    features.remove('groupId')
    features.remove('matchType')

    y = None

    if is_train:
        print('get target')
        y = np.array(df.groupby(['matchId', 'groupId'])[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    print('get group mean feature')
    agg = df.groupby(['matchId', 'groupId'])[features].agg('mean')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    if is_train:
        df_out = agg.reset_index()[['matchId', 'groupId']]
    else:
        df_out = df[['matchId', 'groupId']]

    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank,  suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])

    print('get group max feature')
    agg = df.groupby(['matchId', 'groupId'])[features].agg('max')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])

    print('get group min feature')
    agg = df.groupby(['matchId', 'groupId'])[features].agg('min')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])

    print('get group size feature')
    agg = df.groupby(['matchId', 'groupId']).size().reset_index(name='group_size')
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])

    print('get match mean feature')
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

    print('get match size feature')
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    df_out.drop(['matchId', 'groupId'], axis=1, inplace=True)

    X = df_out

    feature_names = list(df_out.columns)

    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y, feature_names, test_idx

In [3]:
x_train, y_train, train_columns, _ = feature_engineering(True, False)

preprocessing train.csv
remove some columns
Adding Features
Removing Na's From DF
get target
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature


In [4]:
x_test, _, _, test_idx = feature_engineering(False, True)

processing test.csv
remove some columns
Adding Features
Removing Na's From DF
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature


In [5]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum()
    print("Memory usage of dataframe in {:.2f} MB",format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # np.iinfo: 정수형 타입의 데이터에 명시한 데이터 타입만큼의 메모리 할당
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                # np.finfo
                elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
            else:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum()
    print('Memory Usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

x_train = reduce_mem_usage(x_train)
x_test = reduce_mem_usage(x_test)


Memory usage of dataframe in {:.2f} MB 4021060096
Memory Usage after optimization is: 1348624632.00 MB
Decreased by 66.5%
Memory usage of dataframe in {:.2f} MB 3837401216
Memory Usage after optimization is: 1113111432.00 MB
Decreased by 71.0%


In [6]:
import lightgbm as lgb

In [7]:
train_index = round(int(x_train.shape[0] * 0.8))
tr_X = x_train[:train_index]
val_X= x_train[train_index:]
tr_y = y_train[:train_index]
val_y = y_train[train_index:]
gc.collect()

# lightgbm model custom
def run_lgb(train_X, train_y, val_X, val_y, x_Ztest):
    params = {'objective': 'regression',
              'metric': 'mae',
              'n_estimators': 20000,
              'early_stopping_rounds': 200,
              'num_leaves': 31,
              'learning_rate': 0.05,
              'bagging_fraction': 0.7,
              'bagging_seed': 0,
              'num_threads': 4,
              'colsample_bytree': 0.7}

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval],
                      early_stopping_rounds=200, verbose_eval=1000)
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

# model training
pred_test, model = run_lgb(tr_X, tr_y, val_X, val_y, x_test)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 893861
[LightGBM] [Info] Number of data points in the train set: 1621395, number of used features: 247
[LightGBM] [Info] Start training from score 0.499778
Training until validation scores don't improve for 200 rounds
[1000]	training's l1: 0.0323387	valid_1's l1: 0.0386112
[2000]	training's l1: 0.0281774	valid_1's l1: 0.0380709
[3000]	training's l1: 0.0252599	valid_1's l1: 0.0378424
[4000]	training's l1: 0.0230324	valid_1's l1: 0.0377257
Early stopping, best iteration is:
[4295]	training's l1: 0.0224639	valid_1's l1: 0.0377215


In [14]:
df_sub = pd.read_csv('/content/drive/MyDrive/input/pubg/sample_submission_V2.csv')
df_test = pd.read_csv('/content/drive/MyDrive/input/pubg/test_V2.csv')
df_sub['winPlacePerc'] = pred_test
# 몇 가지 열 복원
df_sub = df_sub.merge(df_test[['Id', 'matchId', 'groupId', 'maxPlace', 'numGroups']],
                      on='Id', how='left')

#
df_sub_group = df_sub.groupby(['matchId', 'groupId']).first().reset_index()
df_sub_group['rank'] = df_sub.groupby(['matchId'])['winPlacePerc'].rank()
df_sub_group = df_sub_group.merge(
    df_sub_group.groupby('matchId')['rank'].max().to_frame('max_rank').reset_index(),
    on='matchId', how='left'
)
df_sub_group['adjusted_perc'] = (df_sub_group['rank'] - 1) / (df_sub_group['numGroups'] - 1)

df_sub = df_sub.merge(df_sub_group[['adjusted_perc', 'matchId', 'groupId']],
                      on = ['matchId', 'groupId'], how='left')
df_sub['winPlacePerc'] = df_sub['adjusted_perc']

df_sub.loc[df_sub.maxPlace == 0, 'winPlacePerc'] = 0
df_sub.loc[df_sub.maxPlace == 1, 'winPlacePerc'] = 1

subset = df_sub.loc[df_sub.maxPlace > 1]
gap = 1.0 / (subset.maxPlace.values - 1)
new_perc = np.around(subset.winPlacePerc.values / gap) * gap
df_sub.loc[df_sub.maxPlace > 1, 'winPlacePerc'] = new_perc

df_sub.loc[(df_sub.maxPlace > 1) & (df_sub.numGroups == 1), 'winPlacePerc'] = 0
assert df_sub['winPlacePerc'].isnull().sum() == 0

df_sub[['Id', 'winPlacePerc']].to_csv('submission_adjusted.csv', index=False)


![image](https://user-images.githubusercontent.com/68543150/119942983-62b28680-bfcd-11eb-8894-e61e85cdf9e5.png)
