In [1]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
import gc
%matplotlib inline

In [2]:
# Read data
df_train = pd.read_csv('train_V2.csv', nrows=None)
df_test = pd.read_csv('test_V2.csv', nrows=50)

# Drop NA
df_train = df_train.dropna()
print(df_train.shape)
print(df_test.shape)
df_train.head()

(4446965, 29)
(50, 28)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [3]:
# Define class to preprocess raw data
class DfTransformer:
    def __init__(self):
        self.mean_rankpoints = None
        self.mean_killpoints = None
        self.mean_winpoints = None
        self.df_test_meta = None
    
    def transform_train(self, df):
        # Replace None values of rankPoints with mean
        col_rankpoints = df['rankPoints']
        col_rankpoints = col_rankpoints.loc[col_rankpoints > 1e-4]
        self.mean_rankpoints = col_rankpoints.mean()
        # print('The mean of non-none rankPoints is %.3f' % mean_rankpoints)
        df.loc[df['rankPoints'] < 1e-4, 'rankPoints'] = self.mean_rankpoints
        
        # Replace None values of killPoints with mean
        col_killpoints = df['killPoints']
        col_killpoints = col_killpoints.loc[col_killpoints > 1e-4]
        self.mean_killpoints = col_killpoints.mean()
        # print('The mean of non-none killPoints is %.3f' % mean_killpoints)
        df.loc[df['killPoints'] < 1e-4, 'killPoints'] = self.mean_killpoints
        
        # Replace None values of winPoints with mean
        col_winpoints = df['winPoints']
        col_winpoints = col_winpoints.loc[col_winpoints > 1e-4]
        self.mean_winpoints = col_winpoints.mean()
        # print('The mean of non-none winPoints is %.3f' % mean_winpoints)
        df.loc[df['winPoints'] < 1e-4, 'winPoints'] = self.mean_winpoints
        
        # Feature engineering
        return self.feature_engineering(df, True)
    
    def transform_test(self, df):
        # Replace None values of rankPoints, killPoints, winPoints
        df.loc[df['rankPoints'] < 1e-4, 'rankPoints'] = self.mean_rankpoints
        df.loc[df['killPoints'] < 1e-4, 'killPoints'] = self.mean_killpoints
        df.loc[df['winPoints'] < 1e-4, 'winPoints'] = self.mean_winpoints
        
        # Feature engineering
        return self.feature_engineering(df, False)
    
    def transform_prediction(self, df_meta, y_predicted):
        df_y = df_meta.copy()
        df_y['winPlacePerc'] = y_predicted
        return (self.df_test_meta.merge(df_y, on=['matchId', 'groupId'], how='left'))[['Id', 'winPlacePerc']]
        
    
    def feature_engineering(self, df, is_train=True):
        # Add hand-engineered features
        df_walkDistance = df['walkDistance'] + 5
        df['heals_over_dist'] = df['heals'] / df_walkDistance
        df['boosts_over_dist'] = df['boosts'] / df_walkDistance
        df['kills_over_dist'] = df['kills'] / df_walkDistance
        df['headshots_over_dist'] = df['headshotKills'] / df_walkDistance
        df['killStreaks_over_dist'] = df['killStreaks'] / df_walkDistance
        df['damageDealt_over_dist'] = df['damageDealt'] / df_walkDistance
        df['dbnos_over_dist'] = df['DBNOs'] / df_walkDistance
        df['weapons_over_dist'] = df['weaponsAcquired'] / df_walkDistance
        df['revives_over_dist'] = df['revives'] / df_walkDistance
        df_walkDistance = None
        df_kills = df['kills'] + 0.001
        df['headshots_over_kills'] = df['headshotKills'] / df_kills
        df['killStreaks_over_kills'] = df['killStreaks'] / df_kills
        df_kills = None
        
        features = df.columns.tolist()
        features.remove('Id')
        features.remove('groupId')
        features.remove('matchId')
        features.remove('matchType')
        if is_train: features.remove('winPlacePerc')
            
        # Add group mean and group mean rank in match
        group_by = df.groupby(['matchId','groupId'])
        group_by_features = group_by[features]
        df_agg = group_by_features.agg('mean')
        df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_out = df_agg.reset_index().merge(df_agg_rank, on=['matchId', 'groupId'], how='left',
                                            suffixes=["_groupMean", "_groupMeanRank"])

        # Add group max and group max rank in match
        df_agg = group_by_features.agg('max')
        df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_agg = df_agg.reset_index().merge(df_agg_rank, on=['matchId', 'groupId'], how='left',
                                            suffixes=["_groupMax", "_groupMaxRank"])
        df_out = df_out.merge(df_agg, on=['matchId', 'groupId'], how='left')

        # Add group min and group min rank in match
        df_agg = group_by_features.agg('min')
        df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
        df_agg = df_agg.reset_index().merge(df_agg_rank, on=['matchId', 'groupId'], how='left',
                                            suffixes=["_groupMin", "_groupMinRank"])
        df_out = df_out.merge(df_agg, on=['matchId', 'groupId'], how='left')
        group_by_features = None
        
        # Add group size
        df_agg = group_by.size().to_frame('groupSize').reset_index()
        df_out = df_out.merge(df_agg, on=['matchId', 'groupId'], how='left')

        # Add target if for training
        if is_train:
            df_agg = group_by[['winPlacePerc']].first().reset_index()
            df_out = df_out.merge(df_agg, on=['matchId', 'groupId'], how='left')
            
        # Add match mean
        group_by = df.groupby(['matchId'])
        df_agg = group_by[features].agg('mean').reset_index()
        df_out = df_out.merge(df_agg, on=['matchId'], how='left') # Original name now represents match mean
            
        # Add match size
        df_agg = group_by['groupId'].nunique().to_frame('matchSize').reset_index()
        df_out = df_out.merge(df_agg, on=['matchId'], how='left')

        # Add encoded matchType (no improvement)
        # df_agg = group_by['matchType'].first().reset_index()
        # df_out = df_out.merge(df_agg, on=['matchId'], how='left')
        # df_out = pd.get_dummies(df_out, columns=['matchType']) # Ont-hot encoding
        # df_out['matchType'] = df_out['matchType'].astype('category').cat.codes # Label encoding
        
        # Keep metadata of df_test for later restoring individual prediction
        if not is_train:
            self.df_test_meta = df[['Id', 'matchId', 'groupId']]
        
        # print(df_out)
        # Return: features, metadata, weights
        return df_out.drop(columns=['matchId', 'groupId']),\
               df_out[['matchId', 'groupId']],\
               df_out['groupSize'].values

# Unit test for DfPreprocessor
df_bbb = df_train.iloc[:6, :].copy()
df_bbb.iloc[0, 26] = 0
df_bbb.iloc[1:5, 2] = df_bbb.iloc[0, 2]
df_bbb.iloc[1:3, 1] = df_bbb.iloc[0, 1]
df_bbb.iloc[1:3, 28] = df_bbb.iloc[0, 28]
df_bbb, df_bbb_meta, bbb_weights = DfTransformer().transform_train(df_bbb)
df_bbb

Unnamed: 0,assists_groupMean,boosts_groupMean,damageDealt_groupMean,DBNOs_groupMean,headshotKills_groupMean,heals_groupMean,killPlace_groupMean,killPoints_groupMean,kills_groupMean,killStreaks_groupMean,longestKill_groupMean,matchDuration_groupMean,maxPlace_groupMean,numGroups_groupMean,rankPoints_groupMean,revives_groupMean,rideDistance_groupMean,roadKills_groupMean,swimDistance_groupMean,teamKills_groupMean,vehicleDestroys_groupMean,walkDistance_groupMean,weaponsAcquired_groupMean,winPoints_groupMean,heals_over_dist_groupMean,boosts_over_dist_groupMean,kills_over_dist_groupMean,headshots_over_dist_groupMean,killStreaks_over_dist_groupMean,damageDealt_over_dist_groupMean,dbnos_over_dist_groupMean,weapons_over_dist_groupMean,revives_over_dist_groupMean,headshots_over_kills_groupMean,killStreaks_over_kills_groupMean,assists_groupMeanRank,boosts_groupMeanRank,damageDealt_groupMeanRank,DBNOs_groupMeanRank,headshotKills_groupMeanRank,heals_groupMeanRank,killPlace_groupMeanRank,killPoints_groupMeanRank,kills_groupMeanRank,killStreaks_groupMeanRank,longestKill_groupMeanRank,matchDuration_groupMeanRank,maxPlace_groupMeanRank,numGroups_groupMeanRank,rankPoints_groupMeanRank,revives_groupMeanRank,rideDistance_groupMeanRank,roadKills_groupMeanRank,swimDistance_groupMeanRank,teamKills_groupMeanRank,vehicleDestroys_groupMeanRank,walkDistance_groupMeanRank,weaponsAcquired_groupMeanRank,winPoints_groupMeanRank,heals_over_dist_groupMeanRank,boosts_over_dist_groupMeanRank,kills_over_dist_groupMeanRank,headshots_over_dist_groupMeanRank,killStreaks_over_dist_groupMeanRank,damageDealt_over_dist_groupMeanRank,dbnos_over_dist_groupMeanRank,weapons_over_dist_groupMeanRank,revives_over_dist_groupMeanRank,headshots_over_kills_groupMeanRank,killStreaks_over_kills_groupMeanRank,assists_groupMax,boosts_groupMax,damageDealt_groupMax,DBNOs_groupMax,headshotKills_groupMax,heals_groupMax,killPlace_groupMax,killPoints_groupMax,kills_groupMax,killStreaks_groupMax,longestKill_groupMax,matchDuration_groupMax,maxPlace_groupMax,numGroups_groupMax,rankPoints_groupMax,revives_groupMax,rideDistance_groupMax,roadKills_groupMax,swimDistance_groupMax,teamKills_groupMax,vehicleDestroys_groupMax,walkDistance_groupMax,weaponsAcquired_groupMax,winPoints_groupMax,heals_over_dist_groupMax,boosts_over_dist_groupMax,kills_over_dist_groupMax,headshots_over_dist_groupMax,killStreaks_over_dist_groupMax,damageDealt_over_dist_groupMax,dbnos_over_dist_groupMax,weapons_over_dist_groupMax,revives_over_dist_groupMax,headshots_over_kills_groupMax,killStreaks_over_kills_groupMax,assists_groupMaxRank,boosts_groupMaxRank,damageDealt_groupMaxRank,DBNOs_groupMaxRank,headshotKills_groupMaxRank,heals_groupMaxRank,killPlace_groupMaxRank,killPoints_groupMaxRank,kills_groupMaxRank,killStreaks_groupMaxRank,longestKill_groupMaxRank,matchDuration_groupMaxRank,maxPlace_groupMaxRank,numGroups_groupMaxRank,rankPoints_groupMaxRank,revives_groupMaxRank,rideDistance_groupMaxRank,roadKills_groupMaxRank,swimDistance_groupMaxRank,teamKills_groupMaxRank,vehicleDestroys_groupMaxRank,walkDistance_groupMaxRank,weaponsAcquired_groupMaxRank,winPoints_groupMaxRank,heals_over_dist_groupMaxRank,boosts_over_dist_groupMaxRank,kills_over_dist_groupMaxRank,headshots_over_dist_groupMaxRank,killStreaks_over_dist_groupMaxRank,damageDealt_over_dist_groupMaxRank,dbnos_over_dist_groupMaxRank,weapons_over_dist_groupMaxRank,revives_over_dist_groupMaxRank,headshots_over_kills_groupMaxRank,killStreaks_over_kills_groupMaxRank,assists_groupMin,boosts_groupMin,damageDealt_groupMin,DBNOs_groupMin,headshotKills_groupMin,heals_groupMin,killPlace_groupMin,killPoints_groupMin,kills_groupMin,killStreaks_groupMin,longestKill_groupMin,matchDuration_groupMin,maxPlace_groupMin,numGroups_groupMin,rankPoints_groupMin,revives_groupMin,rideDistance_groupMin,roadKills_groupMin,swimDistance_groupMin,teamKills_groupMin,vehicleDestroys_groupMin,walkDistance_groupMin,weaponsAcquired_groupMin,winPoints_groupMin,heals_over_dist_groupMin,boosts_over_dist_groupMin,kills_over_dist_groupMin,headshots_over_dist_groupMin,killStreaks_over_dist_groupMin,damageDealt_over_dist_groupMin,dbnos_over_dist_groupMin,weapons_over_dist_groupMin,revives_over_dist_groupMin,headshots_over_kills_groupMin,killStreaks_over_kills_groupMin,assists_groupMinRank,boosts_groupMinRank,damageDealt_groupMinRank,DBNOs_groupMinRank,headshotKills_groupMinRank,heals_groupMinRank,killPlace_groupMinRank,killPoints_groupMinRank,kills_groupMinRank,killStreaks_groupMinRank,longestKill_groupMinRank,matchDuration_groupMinRank,maxPlace_groupMinRank,numGroups_groupMinRank,rankPoints_groupMinRank,revives_groupMinRank,rideDistance_groupMinRank,roadKills_groupMinRank,swimDistance_groupMinRank,teamKills_groupMinRank,vehicleDestroys_groupMinRank,walkDistance_groupMinRank,weaponsAcquired_groupMinRank,winPoints_groupMinRank,heals_over_dist_groupMinRank,boosts_over_dist_groupMinRank,kills_over_dist_groupMinRank,headshots_over_dist_groupMinRank,killStreaks_over_dist_groupMinRank,damageDealt_over_dist_groupMinRank,dbnos_over_dist_groupMinRank,weapons_over_dist_groupMinRank,revives_over_dist_groupMinRank,headshots_over_kills_groupMinRank,killStreaks_over_kills_groupMinRank,groupSize,winPlacePerc,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,heals_over_dist,boosts_over_dist,kills_over_dist,headshots_over_dist,killStreaks_over_dist,damageDealt_over_dist,dbnos_over_dist,weapons_over_dist,revives_over_dist,headshots_over_kills,killStreaks_over_kills,matchSize
0,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.333333,1466.0,0.0,0.0,0.0,0.0,0.0,0.15708,0.0,0.005155,0.0,0.0,0.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.333333,0.666667,0.666667,0.5,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,0.0,0.0,0.0,0.0,0.0,0.407674,0.0,0.01199,0.0,0.0,0.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.666667,0.666667,0.5,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,0,1466.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.333333,0.666667,0.666667,0.5,3,0.4444,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225,0.0,0.013288,0.0,0.0,0.1998,3
1,0.0,0.0,32.9,0.0,0.0,0.0,75.0,1241.0,0.0,0.0,0.0,1436.0,31.0,30.0,1408.0,0.0,0.0,0.0,0.0,0.0,0.0,202.7,3.0,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.333333,0.666667,0.666667,0.666667,0.666667,0.5,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.0,0.0,0.158402,0.0,0.014444,0.0,0.0,0.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,1,0.1667,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225,0.0,0.013288,0.0,0.0,0.1998,3
2,0.0,0.0,100.0,0.0,0.0,0.0,45.0,1241.0,1.0,1.0,58.53,1424.0,97.0,95.0,1560.0,0.0,0.0,0.0,0.0,0.0,0.0,49.75,2.0,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.018265,0.0,0.018265,1.826484,0.0,0.03653,0.0,0.0,0.999001,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.666667,1.0,1.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,1.0,0.666667,1.0,0.666667,0.666667,1.0,1,0.1875,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003653,0.0,0.003653,0.491225,0.0,0.013288,0.0,0.0,0.1998,3
3,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0.037,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.025189,0.025189,0.025189,2.518892,0.025189,0.025189,0.0,0.999001,0.999001,1


In [4]:
# Preprocess data
df_transformer = DfTransformer()
df_train, df_train_meta, weight_train = df_transformer.transform_train(df_train)
df_test, df_test_meta, weight_test = df_transformer.transform_test(df_test)

In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 4037274048.00 MB
Memory usage after optimization is: 960676656.00 MB
Decreased by 76.2%
Memory usage of dataframe is 99200.00 MB
Memory usage after optimization is: 22000.00 MB
Decreased by 77.8%


In [5]:
# Get X and y
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values
X_test = df_test.values
df_train = None
df_test = None
print(X_train.shape)
print(X_test.shape)
gc.collect()

(2026744, 247)
(50, 247)


196

In [7]:
# Standardize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

standardize = False
if standardize:
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

In [114]:
# Define method to get k-fold CV MAE
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

def report_mae_sklearn(model, X, y, weight, K=2):
    kf = KFold(n_splits=K, shuffle=True)
    maes = []
    for idx_train, idx_valid in kf.split(X):
        print('processing fold...')
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_valid]
        y_valid = y[idx_valid]
        weight_train = weight[idx_train]
        weight_valid = weight[idx_valid]
        
        model.fit(X_train, y_train, sample_weight=weight_train)
        y_predicted = model.predict(X_valid)
        mae = mean_absolute_error(y_valid, y_predicted, sample_weight=weight_valid)
        print('current MAE: %.5f' % mae)
        maes.append(mae)
    return np.array(maes).mean()

In [6]:
K = 2

In [116]:
# Compute MAE by 2-fold CV
mae_mean = report_mae(LinearRegression(), X_train, y_train, weight_train, K)
print('Linear Regression MAE: %.5f' % mae_mean)

processing fold...
current MAE: 0.04072
processing fold...
current MAE: 0.04065
Linear Regression MAE: 0.04069


In [8]:
# Define method to get k-fold CV MAE
import lightgbm as lgb

def report_mae_lightgbm(lgb_params, lgb_data, K=2):
    cv_metrics = lgb.cv(lgb_params, lgb_data, nfold=K, stratified=False, verbose_eval=1000)
    return cv_metrics['l1-mean'][-1]

In [9]:
lgb_data = lgb.Dataset(X_train, label=y_train, weight=weight_train)
X_train = None
y_train = None
weight_train = None
gc.collect()

lgb_params = {"objective" : "regression", "metric" : "mae", 'n_estimators':20000, 'early_stopping_rounds':200,
              "num_leaves" : 31, "learning_rate" : 0.05, "bagging_fraction" : 0.7,
               "bagging_seed" : 0, "num_threads" : 4,"colsample_bytree" : 0.7
}

mae_mean = report_mae_lightgbm(lgb_params, lgb_data, K)
print('LightGBM MAE: %.5f' % mae_mean)



LightGBM MAE: 0.02404
