In [2]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Read data
df_train = pd.read_csv('train_V2.csv', nrows=50)
df_test = pd.read_csv('test_V2.csv', nrows=50)

# Drop NA
df_train = df_train.dropna()
print(df_train.shape)
print(df_test.shape)
df_train.head()

(50, 29)
(50, 28)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [9]:
# Define class to preprocess raw data
class DfPreprocessor:
    def __init__(self):
        self.mean_rankpoints = None
        self.mean_killpoints = None
        self.mean_winpoints = None
    
    def transform_train(self, df, agg=True):
        # Replace None values of rankPoints with mean
        col_rankpoints = df['rankPoints']
        col_rankpoints = col_rankpoints.loc[col_rankpoints > 1e-4]
        self.mean_rankpoints = col_rankpoints.mean()
        # print('The mean of non-none rankPoints is %.3f' % mean_rankpoints)
        df.loc[df['rankPoints'] < 1e-4, 'rankPoints'] = self.mean_rankpoints
        
        # Replace None values of killPoints with mean
        col_killpoints = df['killPoints']
        col_killpoints = col_killpoints.loc[col_killpoints > 1e-4]
        self.mean_killpoints = col_killpoints.mean()
        # print('The mean of non-none killPoints is %.3f' % mean_killpoints)
        df.loc[df['killPoints'] < 1e-4, 'killPoints'] = self.mean_killpoints
        
        # Replace None values of winPoints with mean
        col_winpoints = df['winPoints']
        col_winpoints = col_winpoints.loc[col_winpoints > 1e-4]
        self.mean_winpoints = col_winpoints.mean()
        # print('The mean of non-none winPoints is %.3f' % mean_winpoints)
        df.loc[df['winPoints'] < 1e-4, 'winPoints'] = self.mean_winpoints
        
        # Feature engineering
        return self.feature_engineering(df, agg)
    
    def transform_test(self, df, agg=True):
        # Replace None values of rankPoints, killPoints, winPoints
        df.loc[df['rankPoints'] < 1e-4, 'rankPoints'] = self.mean_rankpoints
        df.loc[df['killPoints'] < 1e-4, 'killPoints'] = self.mean_killpoints
        df.loc[df['winPoints'] < 1e-4, 'winPoints'] = self.mean_winpoints
        
        # Feature engineering
        return self.feature_engineering(df, agg)
    
    def feature_engineering(self, df, agg):
        if agg:
            features = df.columns.tolist()
            features.remove('Id')
            features.remove('groupId')
            features.remove('matchId')
            features.remove('matchType')
            if 'winPlacePerc' in features: features.remove('winPlacePerc')
            
            # Add group mean and group mean rank in match
            df_agg = df.groupby(['matchId','groupId'])[features].agg('mean')
            df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
            df = df.merge(df_agg.reset_index(), on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMean"])
            df = df.merge(df_agg_rank, on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMeanRank"])
            
            # Add group max and group max rank in match
            df_agg = df.groupby(['matchId','groupId'])[features].agg('max')
            df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
            df = df.merge(df_agg.reset_index(), on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMax"])
            df = df.merge(df_agg_rank, on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMaxRank"])
            
            # Add group min and group min rank in match
            df_agg = df.groupby(['matchId','groupId'])[features].agg('min')
            df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
            df = df.merge(df_agg.reset_index(), on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMin"])
            df = df.merge(df_agg_rank, on=['matchId', 'groupId'], how='left', suffixes=["", "_groupMinRank"])
            
            # Add group size
            df_agg = df.groupby(['matchId','groupId']).size().to_frame('groupSize').reset_index()
            df = df.merge(df_agg, on=['matchId', 'groupId'], how='left')
            
            # Add match mean
            df_agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
            df = df.merge(df_agg, on=['matchId'], how='left', suffixes=["", "_matchMean"])
            
            # Add match size
            df_agg = df.groupby(['matchId']).size().to_frame('matchSize').reset_index()
            df = df.merge(df_agg, on=['matchId'], how='left')
            print(df_agg)
        
        df_meta = df[['Id', 'groupId', 'matchId', 'maxPlace', 'numGroups']]
        df = df.drop(columns=['Id', 'groupId', 'matchId', 'matchType'])
        # df = df.drop(columns=features)
        return df, df_meta

# Unit test for DfPreprocessor
df_bbb = df_train.iloc[:6, :].copy()
df_bbb.iloc[1:5, 2] = df_bbb.iloc[0, 2]
df_bbb.iloc[1:3, 1] = df_bbb.iloc[0, 1]
df_bbb.iloc[1:3, 28] = df_bbb.iloc[0, 28]
df_bbb, df_bbb_meta = DfPreprocessor().transform_train(df_bbb)
df_bbb

          matchId  matchSize
0  a10357fd1a4a91          5
1  bac52627a12114          1


Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,assists_groupMean,boosts_groupMean,damageDealt_groupMean,DBNOs_groupMean,headshotKills_groupMean,heals_groupMean,killPlace_groupMean,killPoints_groupMean,kills_groupMean,killStreaks_groupMean,longestKill_groupMean,matchDuration_groupMean,maxPlace_groupMean,numGroups_groupMean,rankPoints_groupMean,revives_groupMean,rideDistance_groupMean,roadKills_groupMean,swimDistance_groupMean,teamKills_groupMean,vehicleDestroys_groupMean,walkDistance_groupMean,weaponsAcquired_groupMean,winPoints_groupMean,assists_groupMeanRank,boosts_groupMeanRank,damageDealt_groupMeanRank,DBNOs_groupMeanRank,headshotKills_groupMeanRank,heals_groupMeanRank,killPlace_groupMeanRank,killPoints_groupMeanRank,kills_groupMeanRank,killStreaks_groupMeanRank,longestKill_groupMeanRank,matchDuration_groupMeanRank,maxPlace_groupMeanRank,numGroups_groupMeanRank,rankPoints_groupMeanRank,revives_groupMeanRank,rideDistance_groupMeanRank,roadKills_groupMeanRank,swimDistance_groupMeanRank,teamKills_groupMeanRank,vehicleDestroys_groupMeanRank,walkDistance_groupMeanRank,weaponsAcquired_groupMeanRank,winPoints_groupMeanRank,assists_groupMax,boosts_groupMax,damageDealt_groupMax,DBNOs_groupMax,headshotKills_groupMax,heals_groupMax,killPlace_groupMax,killPoints_groupMax,kills_groupMax,killStreaks_groupMax,longestKill_groupMax,matchDuration_groupMax,maxPlace_groupMax,numGroups_groupMax,rankPoints_groupMax,revives_groupMax,rideDistance_groupMax,roadKills_groupMax,swimDistance_groupMax,teamKills_groupMax,vehicleDestroys_groupMax,walkDistance_groupMax,weaponsAcquired_groupMax,winPoints_groupMax,assists_groupMaxRank,boosts_groupMaxRank,damageDealt_groupMaxRank,DBNOs_groupMaxRank,headshotKills_groupMaxRank,heals_groupMaxRank,killPlace_groupMaxRank,killPoints_groupMaxRank,kills_groupMaxRank,killStreaks_groupMaxRank,longestKill_groupMaxRank,matchDuration_groupMaxRank,maxPlace_groupMaxRank,numGroups_groupMaxRank,rankPoints_groupMaxRank,revives_groupMaxRank,rideDistance_groupMaxRank,roadKills_groupMaxRank,swimDistance_groupMaxRank,teamKills_groupMaxRank,vehicleDestroys_groupMaxRank,walkDistance_groupMaxRank,weaponsAcquired_groupMaxRank,winPoints_groupMaxRank,assists_groupMin,boosts_groupMin,damageDealt_groupMin,DBNOs_groupMin,headshotKills_groupMin,heals_groupMin,killPlace_groupMin,killPoints_groupMin,kills_groupMin,killStreaks_groupMin,longestKill_groupMin,matchDuration_groupMin,maxPlace_groupMin,numGroups_groupMin,rankPoints_groupMin,revives_groupMin,rideDistance_groupMin,roadKills_groupMin,swimDistance_groupMin,teamKills_groupMin,vehicleDestroys_groupMin,walkDistance_groupMin,weaponsAcquired_groupMin,winPoints_groupMin,assists_groupMinRank,boosts_groupMinRank,damageDealt_groupMinRank,DBNOs_groupMinRank,headshotKills_groupMinRank,heals_groupMinRank,killPlace_groupMinRank,killPoints_groupMinRank,kills_groupMinRank,killStreaks_groupMinRank,longestKill_groupMinRank,matchDuration_groupMinRank,maxPlace_groupMinRank,numGroups_groupMinRank,rankPoints_groupMinRank,revives_groupMinRank,rideDistance_groupMinRank,roadKills_groupMinRank,swimDistance_groupMinRank,teamKills_groupMinRank,vehicleDestroys_groupMinRank,walkDistance_groupMinRank,weaponsAcquired_groupMinRank,winPoints_groupMinRank,groupSize,assists_matchMean,boosts_matchMean,damageDealt_matchMean,DBNOs_matchMean,headshotKills_matchMean,heals_matchMean,killPlace_matchMean,killPoints_matchMean,kills_matchMean,killStreaks_matchMean,longestKill_matchMean,matchDuration_matchMean,maxPlace_matchMean,numGroups_matchMean,rankPoints_matchMean,revives_matchMean,rideDistance_matchMean,roadKills_matchMean,swimDistance_matchMean,teamKills_matchMean,vehicleDestroys_matchMean,walkDistance_matchMean,weaponsAcquired_matchMean,winPoints_matchMean,matchSize
0,0,0,0.0,0,0,0,60,1241.0,0,0,0.0,1306,28,26,1472.2,0,0.0,0,0.0,0,0,244.8,1,1466.0,0.4444,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.666667,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,1,1466.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,3,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.6,1466.0,5
1,0,0,91.47,0,0,0,57,1241.0,0,0,0.0,1777,26,25,1484.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,0.4444,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.666667,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,1,1466.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,3,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.6,1466.0,5
2,1,0,68.0,0,0,0,47,1241.0,0,0,0.0,1318,50,47,1491.0,0,0.0,0,0.0,0,0,161.8,2,1466.0,0.4444,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.666667,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,1,1466.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,3,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.6,1466.0,5
3,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.1667,0.0,0.0,32.9,0.0,0.0,0.0,75.0,1241.0,0.0,0.0,0.0,1436.0,31.0,30.0,1408.0,0.0,0.0,0.0,0.0,0.0,0.0,202.7,3.0,1466.0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,1.0,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,1.0,0.666667,1,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.6,1466.0,5
4,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.1875,0.0,0.0,100.0,0.0,0.0,0.0,45.0,1241.0,1.0,1.0,58.53,1424.0,97.0,95.0,1560.0,0.0,0.0,0.0,0.0,0.0,0.0,49.75,2.0,1466.0,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.666667,1.0,1.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,1,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.6,1466.0,5
5,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.037,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,1


In [21]:
# Preprocess data
df_preprocessor = DfPreprocessor()
df_train, df_train_meta = df_preprocessor.transform_train(df_train)
df_test, df_test_meta = df_preprocessor.transform_test(df_test)

# Get X and y
y_train = df_train['winPlacePerc'].values
X_train = df_train.drop(columns='winPlacePerc').values
X_test = df_test.values
print(X_train.shape)
print(X_test.shape)

              matchId  matchSize
0      0000a43bce5eec         95
1      0000eb01ea6cdd         98
2      0002912fe5ed71         95
3      0003b92987589e        100
4      0006eb8c17708d         93
5      00077604e50a63         98
6      00086c74bb4efc         98
7      00086e740a5804         98
8      001125344b660c         96
9      001360264d4b5f         91
10     0014d9d1b0aff6         94
11     001616ed5da99b         97
12     0016fe3ee17ce7         97
13     00177a6ce4dfb5         92
14     00188d50e054f5         92
15     001937f739426c         95
16     0019bc34b3c58e         97
17     0019d729577e9c         94
18     001cd8e7e6b737         24
19     001e5e4799a31c         92
20     001e7bc06b1611         59
21     001eeedf57047a         99
22     00200fe5d9aae5         99
23     0022adebf59be6         96
24     0022e660571ee8         97
25     00232f9d9c5421         96
26     0025477f88bbd9         94
27     002771bd25fe0e         92
28     0027a504cd3b0c         99
29     002

In [22]:
# Standardize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

scaler = StandardScaler()
scaler.fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [23]:
# Define method to get k-fold CV MAE
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

def report_mae(model, X, y, K, with_postprocess=False, df_meta=None):
    kf = KFold(n_splits=K)
    maes = []
    for idx_train, idx_valid in kf.split(X):
        print('processing fold...')
        X_train = X[idx_train]
        y_train = y[idx_train]
        X_valid = X[idx_valid]
        y_valid = y[idx_valid]
        df_train = None
        df_valid= None
        if with_postprocess:
            df_train = df_meta.iloc[idx_train]
            df_valid = df_meta.iloc[idx_valid]
        
        model.fit(X_train, y_train)
        y_predicted = model.predict(X_valid)
        if with_postprocess:
            y_predicted = postprocess(df_valid, y_predicted)
        #print(y_predicted)
        mae = mean_absolute_error(y_valid, (y_predicted['winPlacePerc'] if with_postprocess else y_predicted))
        print('current MAE: %.5f' % mae)
        maes.append(mae)
    return np.array(maes).mean()

In [24]:
K = 2

In [25]:
# Compute MAE by 2-fold CV
mae_mean = report_mae(LinearRegression(), X_train_std, y_train, K)
print('Baseline MAE: %.5f' % mae_mean)

processing fold...
current MAE: 0.04537
processing fold...
current MAE: 0.04536
Baseline MAE: 0.04536


In [27]:
# Compute MAE by 2-fold CV
mae_mean = report_mae(LinearRegression(), X_train_std, y_train, K, True, df_train_meta)
print('MAE with postprocessing: %.5f' % mae_mean)

processing fold...
current MAE: 0.04933
processing fold...
current MAE: 0.04922
MAE with postprocessing: 0.04927
