In [1]:
import gc
import time
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

In [6]:
train = pd.read_csv('../input/train_V2.csv')

In [None]:
# Ideas to experiment

def original(df):
    return df

def items(df):
    df['items'] = df['heals'] + df['boosts']
    return df

def players_in_team(df):
    agg = df.groupby('groupId').size().to_frame('group_size')
    df = df.merge(agg, how='left', on='groupId')
    return df

def players_in_match(df):
    agg = df.groupby('matchId').size().reset_index(name='match_size')
    df = df.merge(agg, how='left', on='matchId')
    return df

def total_distance(df):
    df['total_distance'] = df['walkDistance'] + df['rideDistance'] + df['swimDistance']
    return df

def total_healsandboosts(df):
    df['healsandboosts'] = df['heals'] + df['boosts']
    return df

def rankPoints(df):
    df['rankPoints'] = np.where(df['rankPoints'] <= 0 ,0 , df['rankPoints'])

def headshots_over_kills(df):
    df['headshots_over_kills'] = df['headshotKills'] / df['kills']
    df['headshots_over_kills'] = df['headshots_over_kills'].fillna(0)
    return df

def killPlace_over_maxPlace(df):
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    return df

def walkDistance_over_heals(df):
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_heals'] = df['walkDistance_over_heals'].replace(np.inf, 0)
    return df

def walkDistance_over_kills(df):
    df['workDistance_ove_kills'] = df['walkDistance'] / df['kills']
    df['workDistance_ove_kills'] = df['workDistance_ove_kills'].fillna(0)
    df['workDistance_ove_kills'] = df['workDistance_ove_kills'].replace(np.inf, 0)
    return df

def teamwork(df):
    df['teamwork'] = df['assists'] + df['revives']
    return df

In [9]:
pt_df = players_in_team(train.copy())
print(pt_df.columns)
display(pt_df.head())
pm_df = players_in_match(train.copy())
print(pm_df.columns)
display(pm_df.head())

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc', 'group_size'],
      dtype='object')


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,group_size
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0.0,0,0.0,0,0,244.8,1,1466,0.4444,4
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.0045,0,11.04,0,0,1434.0,5,0,0.64,4
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0.0,0,0.0,0,0,161.8,2,0,0.7755,2
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0.0,0,0.0,0,0,202.7,3,0,0.1667,1
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0.0,0,0.0,0,0,49.75,2,0,0.1875,1


Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc', 'match_size'],
      dtype='object')


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,match_size
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,...,0.0,0,0.0,0,0,244.8,1,1466,0.4444,96
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,...,0.0045,0,11.04,0,0,1434.0,5,0,0.64,91
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,...,0.0,0,0.0,0,0,161.8,2,0,0.7755,98
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,...,0.0,0,0.0,0,0,202.7,3,0,0.1667,91
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,...,0.0,0,0.0,0,0,49.75,2,0,0.1875,97


In [14]:
# Function to run experiments
def run_experiments(functions):
    results = []
    for function in functions:
        start = time.time()
        score = run_experiment(function)
        execution_time = time.time() - start
        result = {
            'name': function.__name__,
            'score': score,
            'exection time': f'{round(execution_time, 2)}s'
        }
        print(result)
        results.append(result)
        gc.collect()
    return pd.DataFrame(results, columns=['name', 'score', 'execution time']).sort_values(by='score')

In [15]:
# Function to run a experiment
def run_experiment(function):
    df = train.copy()
    df = function(df)
    
    target = 'winPlacePerc'
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
    cols_to_fit = [
        col
        for col in df.columns
        if col not in cols_to_drop
    ]
    
    X = df[cols_to_fit]
    y = df[target].fillna(df[target].mean())
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)
    
    model = LGBMRegressor(random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return mean_absolute_error(y_pred, y_valid)

In [16]:
# Run Experiments!
run_experiments([
    original,
    teamwork,
    items,
    players_in_team,
    players_in_match,
    total_distance,
    total_healsandboosts,
    rankPoints,
    headshots_over_kills,
    killPlace_over_maxPlace,
    walkDistance_over_heals,
    walkDistance_over_kills,
])

{'name': 'original', 'score': 0.06040569264166316, 'exection time': '25.51s'}
{'name': 'teamwork', 'score': 0.06042156842004747, 'exection time': '24.26s'}
{'name': 'items', 'score': 0.060371289267722485, 'exection time': '25.77s'}
{'name': 'players_in_team', 'score': 0.059520871232651075, 'exection time': '41.45s'}
{'name': 'players_in_match', 'score': 0.05810358595577312, 'exection time': '32.01s'}
{'name': 'total_distance', 'score': 0.06034799153683099, 'exection time': '22.66s'}
{'name': 'headshots_over_kills', 'score': 0.06040569264166316, 'exection time': '23.68s'}
{'name': 'killPlace_over_maxPlace', 'score': 0.06016599459918434, 'exection time': '23.33s'}
{'name': 'walkDistance_over_heals', 'score': 0.06050697145399489, 'exection time': '23.39s'}
{'name': 'walkDistance_over_kills', 'score': 0.06043715341366822, 'exection time': '23.65s'}


Unnamed: 0,name,score,execution time
4,players_in_match,0.058104,
3,players_in_team,0.059521,
7,killPlace_over_maxPlace,0.060166,
5,total_distance,0.060348,
2,items,0.060371,
0,original,0.060406,
6,headshots_over_kills,0.060406,
1,teamwork,0.060422,
9,walkDistance_over_kills,0.060437,
8,walkDistance_over_heals,0.060507,


In [17]:
# Ideas to experiment
def min_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId','groupId'])[features].min().reset_index()
    return df.merge(agg, suffixes=['', '_min'], how='left', on=['matchId', 'groupId'])

def max_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].max().reset_index()
    return df.merge(agg, suffixes=['', '_max'], how='left', on=['matchId', 'groupId'])

def sum_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].sum().reset_index()
    return df.merge(agg, suffixes=['', '_sum'], how='left', on=['matchId', 'groupId'])

def median_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].median().reset_index()
    return df.merge(agg, suffixes=['', '_median'], how='left', on=['matchId', 'groupId'])

def mean_by_team(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean().reset_index()
    return df.merge(agg, suffixes=['', '_mean'], how='left', on=['matchId', 'groupId'])

def rank_by_team_mean(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean().reset_index()
    agg = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    return df.merge(agg, suffixes=['', '_mean_rank'], how='left', on=['matchId', 'groupId'])

def rank_by_team_max(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean().reset_index()
    agg = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    return df.merge(agg, suffixes=['', '_max_rank'], how='left', on=['matchId', 'groupId'])

def rank_by_team_min(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].mean().reset_index()
    agg = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    return df.merge(agg, suffixes=['', '_min_rank'], how='left', on=['matchId', 'groupId'])

def rank_by_team_median(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId', 'groupId'])[features].median().reset_index()
    agg = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    return df.merge(agg, suffixes=['', '_median_rank'], how='left', on=['matchId', 'groupId'])

def mean_by_match(df):
    cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', 'matchDuration', 'winPlacePerc']
    features = [col for col in df.columns if col not in cols_to_drop]
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    return df.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])

In [None]:
test_df = rank_by_team_max(train.copy())
print(test_df.columns)
display(test_df.head())

In [18]:
# Run Experiments!
run_experiments([
    original,
    min_by_team,
    max_by_team,
    sum_by_team,
    median_by_team,
    mean_by_team,
    rank_by_team_min,
    rank_by_team_max,
    rank_by_team_median,
    rank_by_team_mean
])

{'name': 'original', 'score': 0.06040569264166316, 'exection time': '24.1s'}
{'name': 'min_by_team', 'score': 0.05710791111924572, 'exection time': '75.6s'}
{'name': 'max_by_team', 'score': 0.049371616400010886, 'exection time': '76.27s'}
{'name': 'sum_by_team', 'score': 0.05305346761468867, 'exection time': '76.69s'}
{'name': 'median_by_team', 'score': 0.05293310586643201, 'exection time': '67.1s'}
{'name': 'mean_by_team', 'score': 0.05221497300384134, 'exection time': '64.31s'}


KeyError: 'matchId'

In [25]:
import eli5
from eli5.sklearn import PermutationImportance

target = 'winPlacePerc'
cols_to_drop = ['Id', 'groupId', 'matchId', 'matchType', target]
cols_to_fit = [col for col in train.columns if col not in cols_to_drop]

X = train[cols_to_fit]
y = train[target].fillna(train[target].mean())

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

model = LGBMRegressor(random_state=0)
model.fit(X_train, y_train)

perm = PermutationImportance(model, random_state=42).fit(X_valid, y_valid)
eli5.show_weights(perm, feature_names=list(cols_to_fit))

  return f(*args, **kwds)


Weight,Feature
1.3475  ± 0.0029,killPlace
0.3965  ± 0.0007,walkDistance
0.2445  ± 0.0002,kills
0.0512  ± 0.0002,numGroups
0.0271  ± 0.0001,matchDuration
0.0226  ± 0.0000,maxPlace
0.0127  ± 0.0001,killStreaks
0.0061  ± 0.0001,boosts
0.0056  ± 0.0000,rideDistance
0.0028  ± 0.0001,weaponsAcquired


In [27]:
train.columns

Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')