In [15]:
import gc
import time
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [16]:
train = pd.read_csv('../input/pubg-finish-placement-prediction/train_V2.csv')

In [17]:
# Ideas to experiment

def original(df):
    return df

def items(df):
    df['items'] = df['heals'] + df['boosts']
    return df

def players_in_team(df):
    agg = df.groupby('groupId').size().to_frame('players_in_team')
    df = df.merge(agg, on='groupId')
    return df

def total_distance(df):
    df['total_distance'] = df['walkDistance'] + df['rideDistance'] + df['swimDistance']
    return df

def headshots_over_kills(df):
    df['headshots_over_kills'] = df['headshotKills'] / df['kills']
    df['headshots_over_kills'] = df['headshots_over_kills'].fillna(0)
    return df

def killPlace_over_maxPlace(df):
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    return df

def walkDistance_over_heals(df):
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_heals'] = df['walkDistance_over_heals'].replace(np.inf, 0)
    return df

def walkDistance_over_kills(df):
    df['workDistance_ove_kills'] = df['walkDistance'] / df['kills']
    df['workDistance_ove_kills'] = df['workDistance_ove_kills'].fillna(0)
    df['workDistance_ove_kills'] = df['workDistance_ove_kills'].replace(np.inf, 0)
    return df

def teamwork(df):
    df['teamwork'] = df['assists'] + df['revives']
    return df
    
def match_mode_classifier(mt):
    if 'solo' in mt:
        return 'solo'
    elif 'duo' in mt:
        return 'duo'
    elif 'squad' in mt:
        return 'squad'
    else:
        return 'others'

In [18]:
# My Idea to experiment
def match_mode(df):
    df['match_mode'] = df['matchType'].map(lambda mt: match_mode_classifier(mt))
    dummies = pd.get_dummies(df['match_mode'])
    df = df.join(dummies)
    df = df.drop('match_mode', axis=1)
    return df

In [19]:
# Function to run experiments
def run_experiments(functions):
    results = []
    for function in functions:
        start = time.time()
        score = run_experiment(function)
        execution_time = time.time() - start
        result = {
            'name': function.__name__,
            'score': score,
            'exection time': f'{round(execution_time, 2)}s'
        }
        print(result)
        results.append(result)
        gc.collect()
    return pd.DataFrame(results, columns=['name', 'score', 'execution time']).sort_values(by='score')

In [20]:
# Function to run a experiment
def run_experiment(function):
    df = train.copy()
    df = function(df)
    
    target = 'winPlacePerc'
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
    cols_to_fit = [
        col
        for col in df.columns
        if col not in cols_to_drop
    ]
    
    X = df[cols_to_fit]
    y = df[target].fillna(df[target].mean())
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)
    
    model = LGBMRegressor(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return mean_absolute_error(y_pred, y_valid)

In [21]:
# Run Experiments!
run_experiments([
    teamwork,
    match_mode,
    original,
    items,
    players_in_team,
    total_distance,
    headshots_over_kills,
    killPlace_over_maxPlace,
    walkDistance_over_heals,
    walkDistance_over_kills,
])

{'name': 'teamwork', 'score': 0.06042156842004747, 'exection time': '15.53s'}
{'name': 'match_mode', 'score': 0.05993997473505046, 'exection time': '21.26s'}
{'name': 'original', 'score': 0.06040569264166316, 'exection time': '18.88s'}
{'name': 'items', 'score': 0.060371289267722485, 'exection time': '17.78s'}
{'name': 'players_in_team', 'score': 0.059477319089875226, 'exection time': '36.89s'}
{'name': 'total_distance', 'score': 0.06034799153683099, 'exection time': '19.06s'}
{'name': 'headshots_over_kills', 'score': 0.06040569264166316, 'exection time': '17.02s'}
{'name': 'killPlace_over_maxPlace', 'score': 0.06016599459918434, 'exection time': '17.35s'}
{'name': 'walkDistance_over_heals', 'score': 0.06050697145399489, 'exection time': '16.4s'}
{'name': 'walkDistance_over_kills', 'score': 0.06043715341366822, 'exection time': '17.57s'}


Unnamed: 0,name,score,execution time
4,players_in_team,0.059477,
1,match_mode,0.05994,
7,killPlace_over_maxPlace,0.060166,
5,total_distance,0.060348,
3,items,0.060371,
2,original,0.060406,
6,headshots_over_kills,0.060406,
0,teamwork,0.060422,
9,walkDistance_over_kills,0.060437,
8,walkDistance_over_heals,0.060507,


In [22]:
# Ideas to experiment
def min_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId','groupId'])[features].min()
    return df.merge(agg, suffixes=['', '_min'], how='left', on=['matchId', 'groupId'])

def max_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].max()
    return df.merge(agg, suffixes=['', '_max'], how='left', on=['matchId', 'groupId'])

def sum_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].sum()
    return df.merge(agg, suffixes=['', '_sum'], how='left', on=['matchId', 'groupId'])

def median_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].median()
    return df.merge(agg, suffixes=['', '_median'], how='left', on=['matchId', 'groupId'])

def mean_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean()
    return df.merge(agg, suffixes=['', '_mean'], how='left', on=['matchId', 'groupId'])

def rank_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean()
    agg = agg.groupby('matchId')[features].rank(pct=True)
    return df.merge(agg, suffixes=['', '_mean_rank'], how='left', on=['matchId', 'groupId'])

In [23]:
# Run Experiments!
run_experiments([
    original,
    min_by_team,
    max_by_team,
    sum_by_team,
    median_by_team,
    mean_by_team,
    rank_by_team
])

{'name': 'original', 'score': 0.06040569264166316, 'exection time': '15.09s'}
{'name': 'min_by_team', 'score': 0.05710791111924572, 'exection time': '45.82s'}
{'name': 'max_by_team', 'score': 0.049371616400010886, 'exection time': '46.42s'}
{'name': 'sum_by_team', 'score': 0.052975908732634705, 'exection time': '47.41s'}
{'name': 'median_by_team', 'score': 0.05293310586643201, 'exection time': '43.85s'}
{'name': 'mean_by_team', 'score': 0.05221497300384134, 'exection time': '49.43s'}
{'name': 'rank_by_team', 'score': 0.04553478309255459, 'exection time': '60.17s'}


Unnamed: 0,name,score,execution time
6,rank_by_team,0.045535,
2,max_by_team,0.049372,
5,mean_by_team,0.052215,
4,median_by_team,0.052933,
3,sum_by_team,0.052976,
1,min_by_team,0.057108,
0,original,0.060406,


In [25]:
import eli5
from eli5.sklearn import PermutationImportance

target = 'winPlacePerc'
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
cols_to_fit = [col for col in train.columns if col not in cols_to_drop]

X = train[cols_to_fit]
y = train[target].fillna(train[target].mean())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

model = LGBMRegressor(random_state=0)
model.fit(X_train, y_train)

perm = PermutationImportance(model, random_state=42).fit(X_valid, y_valid)
eli5.show_weights(perm, feature_names=list(cols_to_fit))

  return f(*args, **kwds)


Weight,Feature
1.3475  ± 0.0029,killPlace
0.3965  ± 0.0007,walkDistance
0.2445  ± 0.0002,kills
0.0512  ± 0.0002,numGroups
0.0271  ± 0.0001,matchDuration
0.0226  ± 0.0000,maxPlace
0.0127  ± 0.0001,killStreaks
0.0061  ± 0.0001,boosts
0.0056  ± 0.0000,rideDistance
0.0028  ± 0.0001,weaponsAcquired


In [27]:
train.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')

In [26]:
def run_promising_preprocesses(df):
    # Caution! There are dependencies to run.
    df = run_promissing_aggregates(df)
    df = run_promissing_conversions(df)
    df = run_promissing_creations(df)
    return df

def run_promissing_aggregates(df):
    # Common
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    
    # Aggregates
#     agg_min = df.groupby(['matchId','groupId'])[features].min()
#     agg_max = df.groupby(['matchId', 'groupId'])[features].max()
#     agg_sum = df.groupby(['matchId', 'groupId'])[features].sum()
#     agg_median = df.groupby(['matchId', 'groupId'])[features].median()
    agg_mean = df.groupby(['matchId', 'groupId'])[features].mean()
    agg_rank = agg_mean.groupby('matchId')[features].rank(pct=True)
    
    # Merge
#     df = df.merge(agg_min, suffixes=['', '_min'], how='left', on=['matchId', 'groupId'])
#     df = df.merge(agg_max, suffixes=['', '_max'], how='left', on=['matchId', 'groupId'])
#     df = df.merge(agg_sum, suffixes=['', '_sum'], how='left', on=['matchId', 'groupId'])
#     df = df.merge(agg_median, suffixes=['', '_median'], how='left', on=['matchId', 'groupId'])
#     df = df.merge(agg_mean, suffixes=['', '_mean'], how='left', on=['matchId', 'groupId'])
    df = df.merge(agg_rank, suffixes=['', '_rank'], how='left', on=['matchId', 'groupId'])
    
    return df

def run_promissing_conversions(df):
    df = match_mode(df) # One-Hot encoding 'matchMode'
    return df

def run_promissing_creations(df):
    df = players_in_team(df) # Add 'players_in_team'
    df = killPlace_over_maxPlace(df) # Add 'killPlace_over_maxPlace'
    df = total_distance(df) # Add 'total_distance'
    df = items(df) # Add 'total_distance'
    return df

In [28]:
# Run Promising Preprocesses
train_preprocessed = run_promising_preprocesses(train.copy())
train_preprocessed

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,weaponsAcquired_rank,winPoints_rank,duo,others,solo,squad,players_in_team,killPlace_over_maxPlace,total_distance,items
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.00,0,0,0,60,...,0.192308,0.192308,0,0,0,1,4,2.142857,244.8000,0
1,7516514fbd1091,4d4b580de459be,a10357fd1a4a91,0,0,0.00,0,0,0,62,...,0.192308,0.192308,0,0,0,1,4,2.214286,48.2800,0
2,c56d45be16aa86,4d4b580de459be,a10357fd1a4a91,0,0,318.00,2,1,0,6,...,0.192308,0.192308,0,0,0,1,4,0.214286,342.8000,0
3,100eef17c4d773,4d4b580de459be,a10357fd1a4a91,0,0,90.75,0,0,0,61,...,0.192308,0.192308,0,0,0,1,4,2.178571,96.0800,0
4,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.860000,0.520000,0,0,0,1,4,2.192308,1445.0445,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4446961,d05b0c4b2ff311,8248fa2552457b,88c002b589d411,0,0,203.50,0,0,0,32,...,0.603261,0.505435,0,0,1,0,1,0.336842,1559.0000,0
4446962,894c01c8e4524f,c33e793af077f9,deb3a91c03d0f3,0,0,30.10,0,0,0,58,...,0.731959,0.427835,0,0,1,0,1,0.585859,2146.0000,0
4446963,d29bfa313ad766,ac3f1b4a56e5ad,2f3b1af94739b3,0,0,22.68,0,0,0,89,...,0.126316,0.505263,0,0,1,0,1,0.927083,40.2500,0
4446964,f4197cf374e6c0,408cdb5c46b2ac,ee854b837376d9,0,1,44.15,0,0,0,69,...,0.881720,0.505376,0,0,1,0,1,0.741935,81.7000,1


In [29]:
# Evaluate
def evaluate(df):
    target = 'winPlacePerc'
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
    cols_to_fit = [col for col in df.columns if col not in cols_to_drop]
    X = df[cols_to_fit]
    y = df[target].fillna(df[target].mean())

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

    model = LGBMRegressor(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return mean_absolute_error(y_pred, y_valid)

evaluate(train_preprocessed)

0.04501140003633366

In [30]:
# Output Memory Usage
import sys

print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    if not var_name.startswith("_"):
        print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(eval(var_name)),'|'))

|            Variable Name|    Memory|
 ------------------------------------ 
|                       In|       344|
|            LGBMRegressor|      2000|
|                      Out|       368|
|    PermutationImportance|      1464|
|                        X| 853817624|
|                  X_train| 667044824|
|                  X_valid| 222348424|
|             cols_to_drop|       104|
|              cols_to_fit|       264|
|                     eli5|        80|
|                 evaluate|       136|
|                     exit|        56|
|                       gc|        80|
|              get_ipython|        64|
|     headshots_over_kills|       136|
|                    items|       136|
|  killPlace_over_maxPlace|       136|
|               match_mode|       136|
|    match_mode_classifier|       136|
|              max_by_team|       136|
|      mean_absolute_error|       136|
|             mean_by_team|       136|
|           median_by_team|       136|
|              min_by_tea

In [31]:
# Fitting with promissing data
target = 'winPlacePerc'
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
cols_to_fit = [col for col in train_preprocessed.columns if col not in cols_to_drop]

X = train_preprocessed[cols_to_fit]
y = train_preprocessed[target].fillna(train_preprocessed[target].mean())

model = LGBMRegressor(random_state=0)
model.fit(X, y)

LGBMRegressor(random_state=0)

In [33]:
# Import Test Data
test = pd.read_csv('../input/pubg-finish-placement-prediction/test_V2.csv')

In [34]:
# Run Promising Preprocesses in Test
test_preprocessed = run_promising_preprocesses(test.copy())
test_preprocessed

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,weaponsAcquired_rank,winPoints_rank,duo,others,solo,squad,players_in_team,killPlace_over_maxPlace,total_distance,items
0,9329eb41e215eb,676b23c24e70d6,45b576ab7daa7f,0,0,51.46,0,0,0,73,...,0.178571,0.517857,0,0,0,1,4,2.607143,588.000,0
1,d6267a32c5709c,676b23c24e70d6,45b576ab7daa7f,0,0,0.00,0,0,0,71,...,0.178571,0.517857,0,0,0,1,4,2.535714,3243.500,0
2,b896f8954a92e2,676b23c24e70d6,45b576ab7daa7f,1,0,74.20,1,0,0,72,...,0.178571,0.517857,0,0,0,1,4,2.571429,386.300,0
3,2f134f2c7be198,676b23c24e70d6,45b576ab7daa7f,0,0,0.00,0,0,0,70,...,0.178571,0.517857,0,0,0,1,4,2.500000,913.000,0
4,639bd0dcd7bda8,430933124148dd,42a9a0b906c928,0,4,179.10,0,0,2,11,...,0.872340,0.510638,1,0,0,0,2,0.229167,6686.000,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1934169,dadb017a444602,97572de2a4da68,00eaf1db913030,0,0,100.00,0,0,0,40,...,0.161290,0.505376,0,0,1,0,1,0.425532,91.190,0
1934170,4e876c8d185011,7e3ffcf069160d,d75584b6a89a04,0,0,42.57,0,0,0,66,...,0.115789,0.905263,0,0,1,0,1,0.673469,299.600,0
1934171,130020cd922eb9,355623b5def3e6,bdd686c2da1b5a,0,0,11.02,0,0,0,79,...,0.244898,0.505102,0,0,1,0,1,0.806122,117.000,0
1934172,760e8d8f9a798d,a64e0c1ca94fb2,e21d178e2e5eeb,0,0,0.00,0,0,2,57,...,0.671875,0.505208,0,0,1,0,1,0.581633,633.600,2


In [35]:
# Predict
X_test = test_preprocessed[cols_to_fit]

y_pred = model.predict(X_test)
y_pred

array([0.24713226, 0.25562407, 0.25386582, ..., 0.2174884 , 0.57150507,
       0.00722013])

In [36]:
submission = pd.DataFrame({
    'Id': test_preprocessed['Id'], # Caution!
    'winPlacePerc': y_pred
})
submission.to_csv('../output/submission.csv', index=False)