<a href="https://colab.research.google.com/github/Haebuk/kuggle/blob/main/pubg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

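Reference notebooks for the Kaggle "PUBG Finish Placement Prediction" competition (sorted by vote count): https://www.kaggle.com/c/pubg-finish-placement-prediction/code?competitionId=10335&sortBy=voteCount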


In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import gc, sys
gc.enable()

In [None]:
def feature_engineering(is_train=True, debug=True,
                        data_dir='/content/drive/MyDrive/input/pubg'):
    """Load a PUBG CSV and build group- and match-level aggregate features.

    The raw per-player rows are extended with a handful of ratio/sum
    features, then aggregated per ``(matchId, groupId)`` (mean, max, min and
    the within-match percentile rank of each) and per ``matchId`` (mean),
    together with the group and match sizes.

    Output columns are named ``<feature>_<stat>``, ``<feature>_<stat>_rank``
    (``stat`` in mean/max/min), ``group_size``, ``<feature>_match_mean`` and
    ``match_size``; every column name is unique.

    Args:
        is_train: When True read ``train_V2.csv``, drop rows whose
            ``maxPlace`` is not greater than 1 and return one row per group
            together with the target.  When False read ``test_V2.csv`` and
            return one row per player (input order preserved).
        debug: Only consulted when ``is_train`` is True; limits the read to
            the first 10000 rows of the training file.
        data_dir: Directory that contains ``train_V2.csv`` / ``test_V2.csv``.

    Returns:
        A tuple ``(X, y, feature_names, test_idx)`` where ``X`` is the
        feature DataFrame, ``y`` is a float64 array with the per-group mean
        of ``winPlacePerc`` (``None`` for test data), ``feature_names`` is
        ``list(X.columns)`` and ``test_idx`` is the ``Id`` column of the test
        file (``None`` for training data).
    """
    group_keys = ['matchId', 'groupId']
    id_columns = ('Id', 'matchId', 'groupId', 'matchType')
    target = 'winPlacePerc'

    test_idx = None
    if is_train:
        print('preprocessing train.csv')
        # The read_csv keyword is ``nrows``; ``None`` means "read everything".
        df = pd.read_csv(f'{data_dir}/train_V2.csv',
                         nrows=10000 if debug else None)
        # Take an explicit copy so the column assignments below operate on
        # an independent frame rather than on a filtered slice.
        df = df[df['maxPlace'] > 1].copy()
    else:
        print('processing test.csv')
        df = pd.read_csv(f'{data_dir}/test_V2.csv')
        test_idx = df['Id']

    print('remove some columns')

    print('Adding Features')

    # NOTE(review): 'headshotrate' is kills / headshotKills and
    # 'headshotKills_over_weapons' is headshotKills / kills.  The names look
    # inconsistent with the formulas; both are kept unchanged so existing
    # feature names and values stay stable -- confirm the intent.
    df['headshotrate'] = df['kills'] / df['headshotKills']
    df['killStreakrate'] = df['killStreaks'] / df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df['walkDistance'] + df['swimDistance']
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_weapons'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df['skill'] = df['headshotKills'] + df['roadKills']

    # Divisions by zero yield +/-inf (x/0) or NaN (0/0).  Map the infinities
    # to NaN first so that the fill below zeroes all of them.  Lower-case
    # np.inf / np.nan are used because np.Inf, np.NINF and np.NaN no longer
    # exist in NumPy 2.0.  Only numeric columns can hold infinities.
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

    print("Removing Na's From DF")
    df = df.fillna(0)

    features = [col for col in df.columns if col not in id_columns]

    y = None
    if is_train:
        print('get target')
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)
        features.remove(target)

    # groupby sorts its keys, so in training mode the rows of ``df_out``
    # follow the same (matchId, groupId) order as ``y`` above.
    group_size = df.groupby(group_keys).size().reset_index(name='group_size')
    if is_train:
        df_out = group_size[group_keys]
    else:
        df_out = df[group_keys]

    grouped = df.groupby(group_keys)[features]
    for stat in ('mean', 'max', 'min'):
        print(f'get group {stat} feature')
        agg = grouped.agg(stat)
        # Percentile rank of each group's statistic among the groups of the
        # same match.
        agg_rank = agg.groupby('matchId')[features].rank(pct=True)
        # Label the columns explicitly instead of relying on merge-overlap
        # suffixes; this keeps the max and min columns distinct.
        df_out = df_out.merge(agg.add_suffix(f'_{stat}').reset_index(),
                              how='left', on=group_keys)
        df_out = df_out.merge(agg_rank.add_suffix(f'_{stat}_rank').reset_index(),
                              how='left', on=group_keys)

    print('get group size feature')
    df_out = df_out.merge(group_size, how='left', on=group_keys)

    print('get match mean feature')
    agg = (df.groupby(['matchId'])[features].agg('mean')
           .add_suffix('_match_mean').reset_index())
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    print('get match size feature')
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    X = df_out.drop(group_keys, axis=1)
    feature_names = list(X.columns)

    # Release the large intermediates before handing control back.
    del df, df_out, agg, agg_rank, grouped, group_size
    gc.collect()

    return X, y, feature_names, test_idx

In [None]:
# Build the training features from train_V2.csv with the debug row limit off.
x_train, y_train, train_columns, _ = feature_engineering(is_train=True, debug=False)

preprocessing train.csv
remove some columns
Adding Features
Removing Na's From DF
get target
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature


In [None]:
# Build the test features; the test branch does not consult the debug flag,
# so test_V2.csv is read without a row limit.
x_test, _, _, test_idx = feature_engineering(is_train=False, debug=True)

processing test.csv
remove some columns
Adding Features
Removing Na's From DF
get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
