In [2]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import gc
%matplotlib inline

In [30]:
# Load the raw training table and discard every row that has a missing value
df_train = pd.read_csv('train_V2.csv', nrows=None).dropna()

# Report the size that is left and preview the first rows
print(df_train.shape)
df_train.head()

(4446965, 29)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [31]:
# Define class to preprocess raw data
class DfTransformer:
    """Turn raw per-player rows into one feature row per (matchId, groupId).

    Typical use: call ``transform_train`` once on the training frame (this
    learns the imputation means), then ``transform_test`` on unseen frames and
    ``transform_prediction`` to map group-level predictions back to players.

    Note: the ``transform_*`` methods modify the frame that is passed in
    (imputed point columns and hand-engineered columns are written into it).
    Pass a copy if the raw frame must stay untouched.
    """

    # Point values below this threshold are treated as "missing" placeholders
    _MISSING_BELOW = 1e-4
    # Keys identifying one team inside one match
    _GROUP_KEYS = ['matchId', 'groupId']
    # Columns that identify a row or hold categorical text, never aggregated
    _NON_FEATURE_COLUMNS = ('Id', 'groupId', 'matchId', 'matchType')

    def __init__(self):
        # Means of the valid training values, learnt by transform_train
        self.mean_rankpoints = None
        self.mean_killpoints = None
        self.mean_winpoints = None
        # Id / matchId / groupId of the most recently transformed test frame
        self.df_test_meta = None

    def transform_train(self, df):
        """Learn the imputation means from ``df`` and build training features.

        Returns the same 3-tuple as ``feature_engineering`` (with the target
        column ``winPlacePerc`` included in the feature frame).
        """
        self.mean_rankpoints = self._mean_of_valid(df['rankPoints'])
        self.mean_killpoints = self._mean_of_valid(df['killPoints'])
        self.mean_winpoints = self._mean_of_valid(df['winPoints'])
        self._fill_missing_points(df, self.mean_rankpoints,
                                  self.mean_killpoints, self.mean_winpoints)
        return self.feature_engineering(df, is_train=True)

    def transform_test(self, df):
        """Build test features using the means learnt by ``transform_train``.

        Raises:
            ValueError: if ``transform_train`` has not been called yet (the
                means would otherwise silently be imputed as NaN).
        """
        means = (self.mean_rankpoints, self.mean_killpoints, self.mean_winpoints)
        if any(mean is None for mean in means):
            raise ValueError('transform_train must be called before transform_test')
        return self.transform_test_directly(df, *means)

    def transform_test_directly(self, df, mean_rankpoints, mean_killpoints, mean_winpoints):
        """Build test features using explicitly supplied imputation means."""
        self._fill_missing_points(df, mean_rankpoints, mean_killpoints, mean_winpoints)
        return self.feature_engineering(df, is_train=False)

    def transform_prediction(self, df_meta, y_predicted):
        """Expand group-level predictions to one row per player.

        Args:
            df_meta: the metadata frame (``matchId``, ``groupId``) returned
                alongside the test features.
            y_predicted: one prediction per row of ``df_meta``; a 2-D numpy
                array (e.g. shape ``(n, 1)``) is flattened first.

        Returns:
            DataFrame with columns ``Id`` and ``winPlacePerc`` in the row
            order of the last transformed test frame.

        Raises:
            ValueError: if no test frame has been transformed yet.
        """
        if self.df_test_meta is None:
            raise ValueError('transform_test must be called before transform_prediction')
        if isinstance(y_predicted, np.ndarray) and y_predicted.ndim > 1:
            y_predicted = y_predicted.reshape(-1)
        df_y = df_meta.copy()
        df_y['winPlacePerc'] = y_predicted
        merged = self.df_test_meta.merge(df_y, on=self._GROUP_KEYS, how='left')
        return merged[['Id', 'winPlacePerc']]

    def feature_engineering(self, df, is_train=True):
        """Aggregate player rows into group-level and match-level features.

        For every numeric feature the output holds, in this column order:
        group mean/max/min/sum each followed by its in-match percentile rank,
        group std, ``groupSize``, the target (training only), match
        mean/max/min/sum/std and ``matchSize``.

        Returns:
            (features, metadata, weights): the feature frame without the key
            columns, a frame with ``matchId``/``groupId``, and the group sizes
            as a numpy array.
        """
        self._add_player_features(df)
        # df['killPlace_over_maxPlace'] was tried as well: no improvement

        excluded = set(self._NON_FEATURE_COLUMNS)
        if is_train:
            excluded.add('winPlacePerc')
        features = [name for name in df.columns if name not in excluded]

        group_keys = self._GROUP_KEYS
        group_by = df.groupby(group_keys)
        group_by_features = group_by[features]

        # Group statistic plus its percentile rank among the groups of the match
        df_out = None
        for stat in ('mean', 'max', 'min', 'sum'):
            label = '_group' + stat.capitalize()
            df_agg = group_by_features.agg(stat)
            df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
            df_agg = df_agg.reset_index().merge(df_agg_rank, on=group_keys, how='left',
                                                suffixes=(label, label + 'Rank'))
            df_out = df_agg if df_out is None else df_out.merge(df_agg, on=group_keys, how='left')

        # Group std; it is NaN for single-player groups, hence the fill with 0
        df_agg = group_by_features.agg('std').add_suffix('_groupStd').reset_index()
        df_out = df_out.merge(df_agg, on=group_keys, how='left').fillna(0)

        # Group size (also returned separately as the sample weight)
        df_agg = group_by.size().to_frame('groupSize').reset_index()
        df_out = df_out.merge(df_agg, on=group_keys, how='left')

        # Target: taken from the first player of each group
        if is_train:
            df_agg = group_by[['winPlacePerc']].first().reset_index()
            df_out = df_out.merge(df_agg, on=group_keys, how='left')

        # Match-level statistics over all players of the match
        match_by = df.groupby('matchId')
        match_by_features = match_by[features]
        for stat in ('mean', 'max', 'min', 'sum', 'std'):
            suffix = '_match' + stat.capitalize()
            df_agg = match_by_features.agg(stat).add_suffix(suffix).reset_index()
            df_out = df_out.merge(df_agg, on='matchId', how='left')
        df_out = df_out.fillna(0)  # std is NaN for single-player matches

        # Match size = number of distinct groups in the match
        df_agg = match_by['groupId'].nunique().to_frame('matchSize').reset_index()
        df_out = df_out.merge(df_agg, on='matchId', how='left')

        # Encoding matchType (one-hot or label encoding) was tried: no improvement

        # Keep metadata of the test frame to restore per-player predictions later
        if not is_train:
            self.df_test_meta = df[['Id', 'matchId', 'groupId']]

        # Return: features, metadata, weights
        return (df_out.drop(columns=group_keys),
                df_out[group_keys],
                df_out['groupSize'].values)

    @classmethod
    def _mean_of_valid(cls, column):
        """Mean of the values in ``column`` that are above the missing threshold."""
        return column.loc[column > cls._MISSING_BELOW].mean()

    @classmethod
    def _fill_missing_points(cls, df, mean_rankpoints, mean_killpoints, mean_winpoints):
        """Replace placeholder point values in ``df`` (in place) by the given means."""
        replacements = (('rankPoints', mean_rankpoints),
                        ('killPoints', mean_killpoints),
                        ('winPoints', mean_winpoints))
        for col, fill_value in replacements:
            missing = df[col] < cls._MISSING_BELOW
            # Cast explicitly: writing a float mean into an integer column relies
            # on an implicit upcast that newer pandas versions deprecate.
            if df[col].dtype.kind != 'f':
                df[col] = df[col].astype('float64')
            df.loc[missing, col] = fill_value

    @staticmethod
    def _add_player_features(df):
        """Append the hand-engineered per-player columns to ``df`` (in place)."""
        # +5 keeps the ratios finite for players who did not walk at all
        walk_distance = df['walkDistance'] + 5
        per_distance = (('heals_over_dist', 'heals'),
                        ('boosts_over_dist', 'boosts'),
                        ('kills_over_dist', 'kills'),
                        ('headshots_over_dist', 'headshotKills'),
                        ('killStreaks_over_dist', 'killStreaks'),
                        ('damageDealt_over_dist', 'damageDealt'),
                        ('dbnos_over_dist', 'DBNOs'),
                        ('weapons_over_dist', 'weaponsAcquired'),
                        ('revives_over_dist', 'revives'))
        for new_col, source_col in per_distance:
            df[new_col] = df[source_col] / walk_distance

        # +0.001 avoids a division by zero for players without kills
        kills = df['kills'] + 0.001
        df['headshots_over_kills'] = df['headshotKills'] / kills
        df['killStreaks_over_kills'] = df['killStreaks'] / kills

        df['teamwork'] = df['assists'] + df['revives']
        df['totalDistance'] = df['walkDistance'] + df['rideDistance'] + df['swimDistance']
        df['items'] = df['heals'] + df['boosts']
        df['skills'] = df['headshotKills'] + df['roadKills']

# Smoke test for DfTransformer on six hand-edited training rows.
# Columns are addressed by name so the edits hit the intended fields even if
# the column order of the CSV changes.
df_bbb = df_train.iloc[:6, :].copy()
col_at = df_bbb.columns.get_loc
df_bbb.iloc[0, col_at('weaponsAcquired')] = 0
# Put rows 0-4 into the same match ...
df_bbb.iloc[1:5, col_at('matchId')] = df_bbb.iloc[0, col_at('matchId')]
# ... and rows 0-2 into the same group, sharing one target value
df_bbb.iloc[1:3, col_at('groupId')] = df_bbb.iloc[0, col_at('groupId')]
df_bbb.iloc[1:3, col_at('winPlacePerc')] = df_bbb.iloc[0, col_at('winPlacePerc')]
df_bbb, df_bbb_meta, bbb_weights = DfTransformer().transform_train(df_bbb)

# One output row per group, and the group sizes must account for all six players
assert len(df_bbb) == len(df_bbb_meta) == len(bbb_weights)
assert bbb_weights.sum() == 6
df_bbb

Unnamed: 0,assists_groupMean,boosts_groupMean,damageDealt_groupMean,DBNOs_groupMean,headshotKills_groupMean,heals_groupMean,killPlace_groupMean,killPoints_groupMean,kills_groupMean,killStreaks_groupMean,longestKill_groupMean,matchDuration_groupMean,maxPlace_groupMean,numGroups_groupMean,rankPoints_groupMean,revives_groupMean,rideDistance_groupMean,roadKills_groupMean,swimDistance_groupMean,teamKills_groupMean,vehicleDestroys_groupMean,walkDistance_groupMean,weaponsAcquired_groupMean,winPoints_groupMean,heals_over_dist_groupMean,boosts_over_dist_groupMean,kills_over_dist_groupMean,headshots_over_dist_groupMean,killStreaks_over_dist_groupMean,damageDealt_over_dist_groupMean,dbnos_over_dist_groupMean,weapons_over_dist_groupMean,revives_over_dist_groupMean,headshots_over_kills_groupMean,killStreaks_over_kills_groupMean,teamwork_groupMean,totalDistance_groupMean,items_groupMean,skills_groupMean,assists_groupMeanRank,boosts_groupMeanRank,damageDealt_groupMeanRank,DBNOs_groupMeanRank,headshotKills_groupMeanRank,heals_groupMeanRank,killPlace_groupMeanRank,killPoints_groupMeanRank,kills_groupMeanRank,killStreaks_groupMeanRank,longestKill_groupMeanRank,matchDuration_groupMeanRank,maxPlace_groupMeanRank,numGroups_groupMeanRank,rankPoints_groupMeanRank,revives_groupMeanRank,rideDistance_groupMeanRank,roadKills_groupMeanRank,swimDistance_groupMeanRank,teamKills_groupMeanRank,vehicleDestroys_groupMeanRank,walkDistance_groupMeanRank,weaponsAcquired_groupMeanRank,winPoints_groupMeanRank,heals_over_dist_groupMeanRank,boosts_over_dist_groupMeanRank,kills_over_dist_groupMeanRank,headshots_over_dist_groupMeanRank,killStreaks_over_dist_groupMeanRank,damageDealt_over_dist_groupMeanRank,dbnos_over_dist_groupMeanRank,weapons_over_dist_groupMeanRank,revives_over_dist_groupMeanRank,headshots_over_kills_groupMeanRank,killStreaks_over_kills_groupMeanRank,teamwork_groupMeanRank,totalDistance_groupMeanRank,items_groupMeanRank,skills_groupMeanRank,assists_groupMax,boosts_groupMax,damageDealt_grou
pMax,DBNOs_groupMax,headshotKills_groupMax,heals_groupMax,killPlace_groupMax,killPoints_groupMax,kills_groupMax,killStreaks_groupMax,longestKill_groupMax,matchDuration_groupMax,maxPlace_groupMax,numGroups_groupMax,rankPoints_groupMax,revives_groupMax,rideDistance_groupMax,roadKills_groupMax,swimDistance_groupMax,teamKills_groupMax,vehicleDestroys_groupMax,walkDistance_groupMax,weaponsAcquired_groupMax,winPoints_groupMax,heals_over_dist_groupMax,boosts_over_dist_groupMax,kills_over_dist_groupMax,headshots_over_dist_groupMax,killStreaks_over_dist_groupMax,damageDealt_over_dist_groupMax,dbnos_over_dist_groupMax,weapons_over_dist_groupMax,revives_over_dist_groupMax,headshots_over_kills_groupMax,killStreaks_over_kills_groupMax,teamwork_groupMax,totalDistance_groupMax,items_groupMax,skills_groupMax,assists_groupMaxRank,boosts_groupMaxRank,damageDealt_groupMaxRank,DBNOs_groupMaxRank,headshotKills_groupMaxRank,heals_groupMaxRank,killPlace_groupMaxRank,killPoints_groupMaxRank,kills_groupMaxRank,killStreaks_groupMaxRank,longestKill_groupMaxRank,matchDuration_groupMaxRank,maxPlace_groupMaxRank,numGroups_groupMaxRank,rankPoints_groupMaxRank,revives_groupMaxRank,rideDistance_groupMaxRank,roadKills_groupMaxRank,swimDistance_groupMaxRank,teamKills_groupMaxRank,vehicleDestroys_groupMaxRank,walkDistance_groupMaxRank,weaponsAcquired_groupMaxRank,winPoints_groupMaxRank,heals_over_dist_groupMaxRank,boosts_over_dist_groupMaxRank,kills_over_dist_groupMaxRank,headshots_over_dist_groupMaxRank,killStreaks_over_dist_groupMaxRank,damageDealt_over_dist_groupMaxRank,dbnos_over_dist_groupMaxRank,weapons_over_dist_groupMaxRank,revives_over_dist_groupMaxRank,headshots_over_kills_groupMaxRank,killStreaks_over_kills_groupMaxRank,teamwork_groupMaxRank,totalDistance_groupMaxRank,items_groupMaxRank,skills_groupMaxRank,assists_groupMin,boosts_groupMin,damageDealt_groupMin,DBNOs_groupMin,headshotKills_groupMin,heals_groupMin,killPlace_groupMin,killPoints_groupMin,kills_groupMin,killStreaks_groupMin,longe
stKill_groupMin,matchDuration_groupMin,maxPlace_groupMin,numGroups_groupMin,rankPoints_groupMin,revives_groupMin,rideDistance_groupMin,roadKills_groupMin,swimDistance_groupMin,teamKills_groupMin,vehicleDestroys_groupMin,walkDistance_groupMin,weaponsAcquired_groupMin,winPoints_groupMin,heals_over_dist_groupMin,boosts_over_dist_groupMin,kills_over_dist_groupMin,headshots_over_dist_groupMin,killStreaks_over_dist_groupMin,damageDealt_over_dist_groupMin,dbnos_over_dist_groupMin,weapons_over_dist_groupMin,revives_over_dist_groupMin,headshots_over_kills_groupMin,killStreaks_over_kills_groupMin,teamwork_groupMin,totalDistance_groupMin,items_groupMin,skills_groupMin,assists_groupMinRank,boosts_groupMinRank,damageDealt_groupMinRank,DBNOs_groupMinRank,headshotKills_groupMinRank,heals_groupMinRank,killPlace_groupMinRank,killPoints_groupMinRank,kills_groupMinRank,killStreaks_groupMinRank,longestKill_groupMinRank,matchDuration_groupMinRank,maxPlace_groupMinRank,numGroups_groupMinRank,rankPoints_groupMinRank,revives_groupMinRank,rideDistance_groupMinRank,roadKills_groupMinRank,swimDistance_groupMinRank,teamKills_groupMinRank,vehicleDestroys_groupMinRank,walkDistance_groupMinRank,weaponsAcquired_groupMinRank,winPoints_groupMinRank,heals_over_dist_groupMinRank,boosts_over_dist_groupMinRank,kills_over_dist_groupMinRank,headshots_over_dist_groupMinRank,killStreaks_over_dist_groupMinRank,damageDealt_over_dist_groupMinRank,dbnos_over_dist_groupMinRank,weapons_over_dist_groupMinRank,revives_over_dist_groupMinRank,headshots_over_kills_groupMinRank,killStreaks_over_kills_groupMinRank,teamwork_groupMinRank,totalDistance_groupMinRank,items_groupMinRank,skills_groupMinRank,assists_groupSum,boosts_groupSum,damageDealt_groupSum,DBNOs_groupSum,headshotKills_groupSum,heals_groupSum,killPlace_groupSum,killPoints_groupSum,kills_groupSum,killStreaks_groupSum,longestKill_groupSum,matchDuration_groupSum,maxPlace_groupSum,numGroups_groupSum,rankPoints_groupSum,revives_groupSum,rideDistance_groupSum,roa
dKills_groupSum,swimDistance_groupSum,teamKills_groupSum,vehicleDestroys_groupSum,walkDistance_groupSum,weaponsAcquired_groupSum,winPoints_groupSum,heals_over_dist_groupSum,boosts_over_dist_groupSum,kills_over_dist_groupSum,headshots_over_dist_groupSum,killStreaks_over_dist_groupSum,damageDealt_over_dist_groupSum,dbnos_over_dist_groupSum,weapons_over_dist_groupSum,revives_over_dist_groupSum,headshots_over_kills_groupSum,killStreaks_over_kills_groupSum,teamwork_groupSum,totalDistance_groupSum,items_groupSum,skills_groupSum,assists_groupSumRank,boosts_groupSumRank,damageDealt_groupSumRank,DBNOs_groupSumRank,headshotKills_groupSumRank,heals_groupSumRank,killPlace_groupSumRank,killPoints_groupSumRank,kills_groupSumRank,killStreaks_groupSumRank,longestKill_groupSumRank,matchDuration_groupSumRank,maxPlace_groupSumRank,numGroups_groupSumRank,rankPoints_groupSumRank,revives_groupSumRank,rideDistance_groupSumRank,roadKills_groupSumRank,swimDistance_groupSumRank,teamKills_groupSumRank,vehicleDestroys_groupSumRank,walkDistance_groupSumRank,weaponsAcquired_groupSumRank,winPoints_groupSumRank,heals_over_dist_groupSumRank,boosts_over_dist_groupSumRank,kills_over_dist_groupSumRank,headshots_over_dist_groupSumRank,killStreaks_over_dist_groupSumRank,damageDealt_over_dist_groupSumRank,dbnos_over_dist_groupSumRank,weapons_over_dist_groupSumRank,revives_over_dist_groupSumRank,headshots_over_kills_groupSumRank,killStreaks_over_kills_groupSumRank,teamwork_groupSumRank,totalDistance_groupSumRank,items_groupSumRank,skills_groupSumRank,groupSize,winPlacePerc,assists_matchMean,boosts_matchMean,damageDealt_matchMean,DBNOs_matchMean,headshotKills_matchMean,heals_matchMean,killPlace_matchMean,killPoints_matchMean,kills_matchMean,killStreaks_matchMean,longestKill_matchMean,matchDuration_matchMean,maxPlace_matchMean,numGroups_matchMean,rankPoints_matchMean,revives_matchMean,rideDistance_matchMean,roadKills_matchMean,swimDistance_matchMean,teamKills_matchMean,vehicleDestroys_matchMean,walkDistance
_matchMean,weaponsAcquired_matchMean,winPoints_matchMean,heals_over_dist_matchMean,boosts_over_dist_matchMean,kills_over_dist_matchMean,headshots_over_dist_matchMean,killStreaks_over_dist_matchMean,damageDealt_over_dist_matchMean,dbnos_over_dist_matchMean,weapons_over_dist_matchMean,revives_over_dist_matchMean,headshots_over_kills_matchMean,killStreaks_over_kills_matchMean,teamwork_matchMean,totalDistance_matchMean,items_matchMean,skills_matchMean,matchSize
0,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.333333,1466.0,0.0,0.0,0.0,0.0,0.0,0.15708,0.0,0.005155,0.0,0.0,0.0,0.333333,617.214833,0.0,0.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.333333,0.666667,0.666667,0.5,1.0,1.0,0.666667,0.666667,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,0.0,0.0,0.0,0.0,0.0,0.407674,0.0,0.01199,0.0,0.0,0.0,1,1445.0445,0,0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.666667,0.666667,0.5,1.0,1.0,0.666667,0.666667,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,0,1466.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,161.8,0,0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.333333,0.666667,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,1,0,159.47,0,0,0,164,3723.0,0,0,0.0,4401,104,98,4447.2,0,0.0045,0,11.04,0,0,1840.6,7,4398.0,0.0,0.0,0.0,0.0,0.0,0.471239,0.0,0.015465,0.0,0.0,0.0,1,1851.6445,0,0,1.0,0.666667,1.0,0.666667,0.666667,0.666667,1.0,1.0,0.5,0.5,0.5,1.0,1.0,1.0,1.0,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,1.0,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,1.0,1.0,0.666667,0.666667,3,0.4444,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225
,0.0,0.013288,0.0,0.0,0.1998,0.2,420.8189,0.0,0.0,3
1,0.0,0.0,32.9,0.0,0.0,0.0,75.0,1241.0,0.0,0.0,0.0,1436.0,31.0,30.0,1408.0,0.0,0.0,0.0,0.0,0.0,0.0,202.7,3.0,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0.0,202.7,0.0,0.0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.666667,0.666667,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0,202.7,0,0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.666667,0.666667,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0,202.7,0,0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,1.0,0.666667,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0,202.7,0,0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.333333,0.666667,0.666667,0.5,0.5,0.666667,0.666667,0.666667,1,0.1667,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225,0.0,0.013288,0.0,0.0
,0.1998,0.2,420.8189,0.0,0.0,3
2,0.0,0.0,100.0,0.0,0.0,0.0,45.0,1241.0,1.0,1.0,58.53,1424.0,97.0,95.0,1560.0,0.0,0.0,0.0,0.0,0.0,0.0,49.75,2.0,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0.0,49.75,0.0,0.0,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.5,0.333333,0.666667,0.666667,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0,49.75,0,0,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.5,0.333333,0.666667,0.666667,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0,49.75,0,0,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.666667,1.0,1.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.333333,0.666667,0.666667,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0,49.75,0,0,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.5,1.0,1.0,1.0,0.333333,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.5,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.5,0.333333,0.666667,0.666667,1,0.1875,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225,0.0,0.013288
,0.0,0.0,0.1998,0.2,420.8189,0.0,0.0,3
3,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,0.0,34.7,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,0,34.7,0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,0,34.7,0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,0,34.7,0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0.037,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,0.0,34.7,0.0,1.0,1


In [32]:
# Fit the transformer on the raw training rows and build the per-group table
df_transformer = DfTransformer()
df_train, df_train_meta, weight_train = df_transformer.transform_train(df_train)

# Optionally cache the processed tables on disk (for debugging)
persist_processed_data = False
if persist_processed_data:
    for frame, path in ((df_train, 'df_train.csv'), (df_train_meta, 'df_train_meta.csv')):
        frame.to_csv(path, index=False)
    df_train_weight = pd.DataFrame({'weight_train': weight_train})
    df_train_weight.to_csv('df_train_weight.csv', index=False)

In [3]:
# Debug shortcut: restore the processed tables from the CSV cache instead of
# recomputing them from the raw data
df_train, df_train_meta = (pd.read_csv(path)
                           for path in ('df_train.csv', 'df_train_meta.csv'))
weight_train = pd.read_csv('df_train_weight.csv')['weight_train'].values
df_train_weight = None  # only the weight array is needed from here on

In [4]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Integer columns are converted to the narrowest integer type of the same
    signedness that holds their min/max, float columns to the narrowest float
    type whose range holds their min/max, and object columns to ``category``.
    Bool, datetime, categorical and other non-numeric columns are left as is.

    Args:
        df: DataFrame to shrink; it is modified in place.
        allow_float16: when True (default) float columns may be cast to
            float16. Only the value range is checked, so precision can be
            lost (float16 keeps roughly three significant decimal digits);
            pass False to use float32 as the narrowest float type.

    Returns:
        The same DataFrame object, for call chaining.
    """
    bytes_per_mb = 1024 ** 2
    start_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float32, np.float64)
    if allow_float16:
        float_types = (np.float16,) + float_types

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Only plain numpy integer / unsigned / float columns are downcast
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        # Pick the first (narrowest) type whose range contains the column.
        # Empty, all-NaN or infinite columns match nothing and keep their dtype.
        for candidate in candidates:
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / bytes_per_mb
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# df_train = reduce_mem_usage(df_train)

In [5]:
# Split the processed table into the target vector and the feature matrix
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values

# Column names of X_train: the target is excluded so that feature_name[i]
# labels column i of X_train
feature_name = df_train.columns.drop('winPlacePerc')
df_train = None  # release the DataFrame; only the numpy arrays are used below

print(X_train.shape)
gc.collect()

(2026744, 548)


15

In [6]:
# Scale every feature column to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# NOTE(review): the scaler is fitted on all rows, including those that the
# "Prepare validation set" cell later holds out for validation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

In [7]:
# Define method to build model
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers
from keras.callbacks import EarlyStopping, LearningRateScheduler
from keras.regularizers import l2

def build_model(input_dim, hidden_layers, reg_strength, dropout_rate, lr):
    """Create and compile a fully connected regression network.

    Args:
        input_dim: number of input features.
        hidden_layers: iterable with the width of each hidden ReLU layer.
        reg_strength: L2 kernel penalty; values below 1e-6 disable it.
        dropout_rate: dropout applied after every hidden layer; values of
            1e-6 or less disable it.
        lr: learning rate passed to the Adam optimizer.

    Returns:
        The compiled Sequential model (MSE loss, weighted MAE metric).
    """
    use_l2 = not (reg_strength < 1e-6)
    use_dropout = dropout_rate > 1e-6

    net = Sequential()
    for layer_no, units in enumerate(hidden_layers):
        layer_kwargs = {
            'activation': 'relu',
            'kernel_regularizer': l2(reg_strength) if use_l2 else None,
        }
        # Only the first layer needs to be told the input width
        if layer_no == 0:
            layer_kwargs['input_dim'] = input_dim
        net.add(Dense(units, **layer_kwargs))
        if use_dropout:
            net.add(Dropout(dropout_rate))

    # Single linear unit for the regression output
    net.add(Dense(1, activation='linear'))

    net.compile(optimizers.Adam(lr), loss='mse', weighted_metrics=['mae'])
    return net

Using TensorFlow backend.


In [8]:
# Define method to fit model with callbacks
def fit_model(model, X, y, weight, validation_data=None, epochs=100,
              early_stop_patience=None, lr=None, decay_factor=None, step_size=None):
    """Fit ``model`` with optional early stopping and stepwise learning-rate decay.

    Args:
        model: compiled model exposing a Keras-style ``fit`` method.
        X, y, weight: training inputs, targets and per-sample weights.
        validation_data: passed through to ``model.fit``.
        epochs: maximum number of epochs.
        early_stop_patience: if given, stop after this many epochs without an
            improvement of ``val_weighted_mean_absolute_error`` and restore
            the best weights.
        lr: base learning rate of the decay schedule.
        decay_factor, step_size: if both are given, the learning rate of an
            epoch is ``lr * decay_factor ** floor(epoch / step_size)``.

    Returns:
        Whatever ``model.fit`` returns (the training history for Keras models).

    Raises:
        ValueError: if a decay schedule is requested without a base ``lr``
            (previously this surfaced as a TypeError inside the first epoch).
    """
    use_lr_decay = decay_factor is not None and step_size is not None
    if use_lr_decay and lr is None:
        raise ValueError('lr must be given when decay_factor and step_size are set')

    callbacks = []
    if early_stop_patience is not None:
        callbacks.append(EarlyStopping(monitor='val_weighted_mean_absolute_error', min_delta=1e-5,
                                       patience=early_stop_patience, restore_best_weights=True))
    if use_lr_decay:
        def stepped_lr(epoch, curr_lr):
            # The schedule depends on the epoch only; curr_lr is ignored
            return lr * (decay_factor ** np.floor(epoch / step_size))

        callbacks.append(LearningRateScheduler(stepped_lr, verbose=1))

    return model.fit(X, y, sample_weight=weight, validation_data=validation_data,
                     callbacks=callbacks, batch_size=20000, epochs=epochs, verbose=1)

In [9]:
# Prepare validation set: hold out a random 5% of the rows
ratio_valid = 0.05
seed_valid = 42  # fixed seed so the split (and the reported MAE) is reproducible

n_rows = X_train.shape[0]
# Guard against stale kernel state, e.g. running this cell twice
assert len(y_train) == n_rows == len(weight_train)

idx_shuffle = np.random.RandomState(seed_valid).permutation(n_rows)
idx_split = int(n_rows * ratio_valid)
idx_valid = idx_shuffle[:idx_split]
idx_train = idx_shuffle[idx_split:]

X_valid = X_train[idx_valid]
y_valid = y_train[idx_valid]
weight_valid = weight_train[idx_valid]

X_train = X_train[idx_train]
y_train = y_train[idx_train]
weight_train = weight_train[idx_train]

gc.collect()

0

In [10]:
# Hyper-parameters of the network and of its training run
hidden_layers = [1024, 512, 256, 128, 64, 32]  # widths of the hidden layers
reg_strength = dropout_rate = 0                # both regularisers switched off
lr = 1e-3                                      # initial Adam learning rate
epochs = 80                                    # upper bound on training epochs

In [11]:
# Build the network, then train it with early stopping and a learning rate
# that is multiplied by 0.85 every 10 epochs
model = build_model(input_dim=X_train.shape[1], hidden_layers=hidden_layers,
                    reg_strength=reg_strength, dropout_rate=dropout_rate, lr=lr)
fit_options = dict(validation_data=(X_valid, y_valid, weight_valid),
                   epochs=epochs, early_stop_patience=10,
                   lr=lr, decay_factor=0.85, step_size=10)
history = fit_model(model, X_train, y_train, weight_train, **fit_options)

Train on 1925407 samples, validate on 101337 samples
Epoch 1/80

Epoch 00001: LearningRateScheduler setting learning rate to 0.001.
Epoch 2/80

Epoch 00002: LearningRateScheduler setting learning rate to 0.001.
Epoch 3/80

Epoch 00003: LearningRateScheduler setting learning rate to 0.001.
Epoch 4/80

Epoch 00004: LearningRateScheduler setting learning rate to 0.001.
Epoch 5/80

Epoch 00005: LearningRateScheduler setting learning rate to 0.001.
Epoch 6/80

Epoch 00006: LearningRateScheduler setting learning rate to 0.001.
Epoch 7/80

Epoch 00007: LearningRateScheduler setting learning rate to 0.001.
Epoch 8/80

Epoch 00008: LearningRateScheduler setting learning rate to 0.001.
Epoch 9/80

Epoch 00009: LearningRateScheduler setting learning rate to 0.001.
Epoch 10/80

Epoch 00010: LearningRateScheduler setting learning rate to 0.001.
Epoch 11/80

Epoch 00011: LearningRateScheduler setting learning rate to 0.00085.
Epoch 12/80

Epoch 00012: LearningRateScheduler setting learning rate to 0

Epoch 62/80

Epoch 00062: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 63/80

Epoch 00063: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 64/80

Epoch 00064: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 65/80

Epoch 00065: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 66/80

Epoch 00066: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 67/80

Epoch 00067: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 68/80

Epoch 00068: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 69/80

Epoch 00069: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 70/80

Epoch 00070: LearningRateScheduler setting learning rate to 0.00037714951562499996.
Epoch 71/80

Epoch 00071: LearningRateScheduler setting learning rate to 0.00032057708828124994.
Epoch 72/80

Epoch 00072: Lear

In [12]:
# Save model: write the trained network to 'NN_Model.h5' in the working directory
model.save('NN_Model.h5')

In [14]:
from sklearn.metrics import mean_absolute_error

# Weighted MAE on the held-out rows, using the same per-row weights as in training
y_valid_pred = model.predict(X_valid)
mae_valid = mean_absolute_error(y_valid, y_valid_pred, sample_weight=weight_valid)
print('MAE on validation: %.4f' % mae_valid)

MAE on validation: 0.0252
