In [5]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
%matplotlib inline

In [20]:
# Load the competition CSVs; the training frame drops every row containing an NA,
# and only the first 50 rows of the test file are read.
df_train = pd.read_csv('train_V2.csv', nrows=None).dropna()
df_test = pd.read_csv('test_V2.csv', nrows=50)

# Show (rows, columns) for both frames, then preview the training rows
print(df_train.shape, df_test.shape, sep='\n')
df_train.head()

(4446965, 29)
(50, 28)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [21]:
# Define class to preprocess raw data
class DfTransformer:
    """Aggregate raw per-player rows into one feature row per (matchId, groupId).

    Intended call order: ``transform_train`` (learns the imputation means),
    then ``transform_test`` (reuses them), then ``transform_prediction`` to map
    group-level predictions back to individual ``Id`` rows.

    Note: ``transform_train`` / ``transform_test`` modify the DataFrame they are
    given in place (point columns are imputed, four ratio columns are added).
    Pass a copy if the raw frame must stay untouched.
    """

    # (column, attribute holding its train mean). Values below _MISSING_THRESHOLD
    # in these columns are treated as missing and replaced by that mean.
    _POINT_COLUMNS = (
        ('rankPoints', 'mean_rankpoints'),
        ('killPoints', 'mean_killpoints'),
        ('winPoints', 'mean_winpoints'),
    )
    _MISSING_THRESHOLD = 1e-4
    _GROUP_KEYS = ['matchId', 'groupId']
    # Identifier / categorical columns that are never aggregated as features
    _NON_FEATURE_COLUMNS = ('Id', 'groupId', 'matchId', 'matchType')
    # Added to walkDistance before dividing (presumably to keep the denominator non-zero)
    _WALK_DISTANCE_OFFSET = 5
    # (pandas aggregation name, label used in the generated column suffixes)
    _GROUP_AGGREGATIONS = (('mean', 'Mean'), ('max', 'Max'), ('min', 'Min'))

    def __init__(self):
        # Means of the valid (> threshold) point values, learned by transform_train
        self.mean_rankpoints = None
        self.mean_killpoints = None
        self.mean_winpoints = None
        # Id/matchId/groupId of the last frame passed through transform_test
        self.df_test_meta = None

    def _impute_points(self, df):
        """Replace below-threshold point values in ``df`` (in place) with the stored means."""
        for column, attr in self._POINT_COLUMNS:
            missing = df[column] < self._MISSING_THRESHOLD
            if missing.any():
                # The fill value is a float mean; cast explicitly so an integer column
                # is not implicitly upcast by the .loc assignment (deprecated in recent pandas).
                if df[column].dtype.kind != 'f':
                    df[column] = df[column].astype('float64')
                df.loc[missing, column] = getattr(self, attr)

    def transform_train(self, df):
        """Learn the imputation means from ``df``, impute, and build training features.

        Args:
            df: raw training rows; must contain the point columns, the feature
                columns used by ``feature_engineering`` and ``winPlacePerc``.
                Modified in place.

        Returns:
            Tuple ``(features, metadata, weights)`` as described in
            ``feature_engineering``; ``features`` includes ``winPlacePerc``.
        """
        for column, attr in self._POINT_COLUMNS:
            values = df[column]
            setattr(self, attr, values.loc[values > self._MISSING_THRESHOLD].mean())
        self._impute_points(df)
        return self.feature_engineering(df, True)

    def transform_test(self, df):
        """Impute ``df`` with the means learned on train and build test features.

        Args:
            df: raw test rows (no ``winPlacePerc`` expected). Modified in place.

        Returns:
            Tuple ``(features, metadata, weights)`` as described in
            ``feature_engineering``.

        Raises:
            RuntimeError: if ``transform_train`` has not been called yet, since
                there would be no means to impute with.
        """
        if any(getattr(self, attr) is None for _, attr in self._POINT_COLUMNS):
            raise RuntimeError('transform_train must be called before transform_test')
        self._impute_points(df)
        return self.feature_engineering(df, False)

    def transform_prediction(self, df_meta, y_predicted):
        """Expand group-level predictions to one row per test ``Id``.

        Args:
            df_meta: frame with ``matchId`` and ``groupId`` columns, one row per group.
            y_predicted: predictions aligned row-for-row with ``df_meta``.

        Returns:
            DataFrame with columns ``Id`` and ``winPlacePerc``, in the row order
            of the frame last passed to ``transform_test``.
        """
        df_y = df_meta.copy()
        df_y['winPlacePerc'] = y_predicted
        merged = self.df_test_meta.merge(df_y, on=self._GROUP_KEYS, how='left')
        return merged[['Id', 'winPlacePerc']]

    def feature_engineering(self, df, is_train=True):
        """Build group-level features from player-level rows.

        Output columns, in order: for each of mean / max / min, the per-group
        aggregate of every feature followed by its percentile rank among the
        groups of the same match; then ``groupSize``; then ``winPlacePerc``
        (train only); then the per-match mean of every feature (under the
        original column name); then ``matchSize`` (number of groups in the match).

        Args:
            df: player-level rows. Four ``*_over_dist`` ratio columns are added in place.
            is_train: when True, ``winPlacePerc`` is excluded from the features and
                carried through as the target; when False, Id/matchId/groupId of
                ``df`` are stored in ``self.df_test_meta``.

        Returns:
            Tuple of (feature DataFrame without the key columns,
            DataFrame with ``matchId``/``groupId``, numpy array of group sizes).
        """
        keys = self._GROUP_KEYS

        # Hand-engineered ratios of activity to (offset) distance walked
        walk_distance = df['walkDistance'] + self._WALK_DISTANCE_OFFSET
        for source in ('heals', 'boosts', 'kills', 'damageDealt'):
            df[source + '_over_dist'] = df[source] / walk_distance
        del walk_distance  # release the temporary Series before the heavy aggregations

        excluded = set(self._NON_FEATURE_COLUMNS)
        if is_train:
            excluded.add('winPlacePerc')
        features = [column for column in df.columns if column not in excluded]

        group_by = df.groupby(keys)
        group_by_features = group_by[features]

        # Per-group aggregate plus its percentile rank within the match
        df_out = None
        for agg_name, label in self._GROUP_AGGREGATIONS:
            df_agg = group_by_features.agg(agg_name)
            df_agg_rank = df_agg.groupby('matchId')[features].rank(pct=True).reset_index()
            df_agg = df_agg.reset_index().merge(
                df_agg_rank, on=keys, how='left',
                suffixes=('_group' + label, '_group' + label + 'Rank'))
            df_out = df_agg if df_out is None else df_out.merge(df_agg, on=keys, how='left')
        del group_by_features

        # Group size (also returned separately as the sample weights)
        df_agg = group_by.size().to_frame('groupSize').reset_index()
        df_out = df_out.merge(df_agg, on=keys, how='left')

        # Target: the first winPlacePerc seen in each group
        if is_train:
            df_agg = group_by[['winPlacePerc']].first().reset_index()
            df_out = df_out.merge(df_agg, on=keys, how='left')

        # Match mean; the original feature names now denote the match-level mean
        match_group_by = df.groupby('matchId')
        df_agg = match_group_by[features].agg('mean').reset_index()
        df_out = df_out.merge(df_agg, on=['matchId'], how='left')

        # Match size = number of distinct groups in the match.
        # (The dict-renaming form agg({'matchSize': 'nunique'}) was deprecated and
        # later removed from pandas, so the column is named explicitly instead.)
        df_agg = match_group_by['groupId'].nunique().to_frame('matchSize').reset_index()
        df_out = df_out.merge(df_agg[['matchId', 'matchSize']], on=['matchId'], how='left')

        # Encoding matchType (one-hot or label codes) was tried at this point and
        # gave no improvement, so matchType is not part of the output.

        # Keep metadata of the test frame for restoring per-player predictions later
        if not is_train:
            self.df_test_meta = df[['Id', 'matchId', 'groupId']]

        # Return: features, metadata, weights
        return df_out.drop(columns=keys), df_out[keys], df_out['groupSize'].values

# Smoke test for DfTransformer on a hand-built 6-row sample:
# rows 0-4 share one match, rows 0-2 additionally share one group (and its target).
df_bbb = df_train.head(6).copy()
df_bbb.iloc[0, df_bbb.columns.get_loc('weaponsAcquired')] = 0
df_bbb.iloc[1:5, df_bbb.columns.get_loc('matchId')] = df_bbb['matchId'].iloc[0]
df_bbb.iloc[1:3, df_bbb.columns.get_loc('groupId')] = df_bbb['groupId'].iloc[0]
df_bbb.iloc[1:3, df_bbb.columns.get_loc('winPlacePerc')] = df_bbb['winPlacePerc'].iloc[0]
df_bbb, df_bbb_meta, bbb_weights = DfTransformer().transform_train(df_bbb)
df_bbb

is deprecated and will be removed in a future version


Unnamed: 0,assists_groupMean,boosts_groupMean,damageDealt_groupMean,DBNOs_groupMean,headshotKills_groupMean,heals_groupMean,killPlace_groupMean,killPoints_groupMean,kills_groupMean,killStreaks_groupMean,longestKill_groupMean,matchDuration_groupMean,maxPlace_groupMean,numGroups_groupMean,rankPoints_groupMean,revives_groupMean,rideDistance_groupMean,roadKills_groupMean,swimDistance_groupMean,teamKills_groupMean,vehicleDestroys_groupMean,walkDistance_groupMean,weaponsAcquired_groupMean,winPoints_groupMean,heals_over_dist_groupMean,boosts_over_dist_groupMean,kills_over_dist_groupMean,damageDealt_over_dist_groupMean,assists_groupMeanRank,boosts_groupMeanRank,damageDealt_groupMeanRank,DBNOs_groupMeanRank,headshotKills_groupMeanRank,heals_groupMeanRank,killPlace_groupMeanRank,killPoints_groupMeanRank,kills_groupMeanRank,killStreaks_groupMeanRank,longestKill_groupMeanRank,matchDuration_groupMeanRank,maxPlace_groupMeanRank,numGroups_groupMeanRank,rankPoints_groupMeanRank,revives_groupMeanRank,rideDistance_groupMeanRank,roadKills_groupMeanRank,swimDistance_groupMeanRank,teamKills_groupMeanRank,vehicleDestroys_groupMeanRank,walkDistance_groupMeanRank,weaponsAcquired_groupMeanRank,winPoints_groupMeanRank,heals_over_dist_groupMeanRank,boosts_over_dist_groupMeanRank,kills_over_dist_groupMeanRank,damageDealt_over_dist_groupMeanRank,assists_groupMax,boosts_groupMax,damageDealt_groupMax,DBNOs_groupMax,headshotKills_groupMax,heals_groupMax,killPlace_groupMax,killPoints_groupMax,kills_groupMax,killStreaks_groupMax,longestKill_groupMax,matchDuration_groupMax,maxPlace_groupMax,numGroups_groupMax,rankPoints_groupMax,revives_groupMax,rideDistance_groupMax,roadKills_groupMax,swimDistance_groupMax,teamKills_groupMax,vehicleDestroys_groupMax,walkDistance_groupMax,weaponsAcquired_groupMax,winPoints_groupMax,heals_over_dist_groupMax,boosts_over_dist_groupMax,kills_over_dist_groupMax,damageDealt_over_dist_groupMax,assists_groupMaxRank,boosts_groupMaxRank,damageDealt_groupMaxRank,DBNOs_groupMaxR
ank,headshotKills_groupMaxRank,heals_groupMaxRank,killPlace_groupMaxRank,killPoints_groupMaxRank,kills_groupMaxRank,killStreaks_groupMaxRank,longestKill_groupMaxRank,matchDuration_groupMaxRank,maxPlace_groupMaxRank,numGroups_groupMaxRank,rankPoints_groupMaxRank,revives_groupMaxRank,rideDistance_groupMaxRank,roadKills_groupMaxRank,swimDistance_groupMaxRank,teamKills_groupMaxRank,vehicleDestroys_groupMaxRank,walkDistance_groupMaxRank,weaponsAcquired_groupMaxRank,winPoints_groupMaxRank,heals_over_dist_groupMaxRank,boosts_over_dist_groupMaxRank,kills_over_dist_groupMaxRank,damageDealt_over_dist_groupMaxRank,assists_groupMin,boosts_groupMin,damageDealt_groupMin,DBNOs_groupMin,headshotKills_groupMin,heals_groupMin,killPlace_groupMin,killPoints_groupMin,kills_groupMin,killStreaks_groupMin,longestKill_groupMin,matchDuration_groupMin,maxPlace_groupMin,numGroups_groupMin,rankPoints_groupMin,revives_groupMin,rideDistance_groupMin,roadKills_groupMin,swimDistance_groupMin,teamKills_groupMin,vehicleDestroys_groupMin,walkDistance_groupMin,weaponsAcquired_groupMin,winPoints_groupMin,heals_over_dist_groupMin,boosts_over_dist_groupMin,kills_over_dist_groupMin,damageDealt_over_dist_groupMin,assists_groupMinRank,boosts_groupMinRank,damageDealt_groupMinRank,DBNOs_groupMinRank,headshotKills_groupMinRank,heals_groupMinRank,killPlace_groupMinRank,killPoints_groupMinRank,kills_groupMinRank,killStreaks_groupMinRank,longestKill_groupMinRank,matchDuration_groupMinRank,maxPlace_groupMinRank,numGroups_groupMinRank,rankPoints_groupMinRank,revives_groupMinRank,rideDistance_groupMinRank,roadKills_groupMinRank,swimDistance_groupMinRank,teamKills_groupMinRank,vehicleDestroys_groupMinRank,walkDistance_groupMinRank,weaponsAcquired_groupMinRank,winPoints_groupMinRank,heals_over_dist_groupMinRank,boosts_over_dist_groupMinRank,kills_over_dist_groupMinRank,damageDealt_over_dist_groupMinRank,groupSize,winPlacePerc,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,lo
ngestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,heals_over_dist,boosts_over_dist,kills_over_dist,damageDealt_over_dist,matchSize
0,0.333333,0.0,53.156667,0.0,0.0,0.0,54.666667,1241.0,0.0,0.0,0.0,1467.0,34.666667,32.666667,1482.4,0.0,0.0015,0.0,3.68,0.0,0.0,613.533333,2.333333,1466.0,0.0,0.0,0.0,0.153051,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.666667,0.5,0.333333,1,0,91.47,0,0,0,60,1241.0,0,0,0.0,1777,50,47,1491.0,0,0.0045,0,11.04,0,0,1434.0,5,1466.0,0.0,0.0,0.0,0.395809,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,1.0,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0,0,0.0,0,0,0,47,1241.0,0,0,0.0,1306,26,25,1472.2,0,0.0,0,0.0,0,0,161.8,0,1466.0,0.0,0.0,0.0,0.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.5,0.5,0.333333,0.333333,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.5,0.333333,3,0.4444,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003347,0.457494,3
1,0.0,0.0,32.9,0.0,0.0,0.0,75.0,1241.0,0.0,0.0,0.0,1436.0,31.0,30.0,1408.0,0.0,0.0,0.0,0.0,0.0,0.0,202.7,3.0,1466.0,0.0,0.0,0.0,0.154678,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.5,0.666667,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.154678,0.5,0.666667,0.333333,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,0.666667,0.333333,0.333333,0.333333,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.5,0.333333,0,0,32.9,0,0,0,75,1241.0,0,0,0.0,1436,31,30,1408.0,0,0.0,0,0.0,0,0,202.7,3,1466.0,0.0,0.0,0.0,0.154678,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,0.666667,0.5,0.5,0.5,1.0,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,1.0,1.0,0.666667,0.666667,0.666667,0.5,0.666667,1,0.1667,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003347,0.457494,3
2,0.0,0.0,100.0,0.0,0.0,0.0,45.0,1241.0,1.0,1.0,58.53,1424.0,97.0,95.0,1560.0,0.0,0.0,0.0,0.0,0.0,0.0,49.75,2.0,1466.0,0.0,0.0,0.016736,1.67364,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,1.0,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.016736,1.67364,0.5,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,1.0,0.666667,0.5,0.666667,0.5,0.666667,0.666667,0.333333,0.333333,0.666667,0.666667,0.666667,1.0,1.0,0,0,100.0,0,0,0,45,1241.0,1,1,58.53,1424,97,95,1560.0,0,0.0,0,0.0,0,0,49.75,2,1466.0,0.0,0.0,0.016736,1.67364,0.666667,0.666667,1.0,0.666667,0.666667,0.666667,0.333333,0.666667,1.0,1.0,1.0,0.666667,1.0,1.0,1.0,0.666667,0.666667,0.666667,0.666667,0.666667,0.666667,0.333333,0.666667,0.666667,0.666667,0.666667,1.0,1.0,1,0.1875,0.2,0.0,58.474,0.0,0.0,0.0,56.8,1241.0,0.2,0.2,11.706,1452.2,46.4,44.6,1483.04,0.0,0.0009,0.0,2.208,0.0,0.0,418.61,2.4,1466.0,0.0,0.0,0.003347,0.457494,3
3,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.022371,2.237136,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.022371,2.237136,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,0,100.0,1,1,0,44,1241.0,1,1,18.44,1395,28,28,1418.0,0,0.0,0,0.0,0,0,34.7,1,1466.0,0.0,0.0,0.022371,2.237136,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0.037,0.0,0.0,100.0,1.0,1.0,0.0,44.0,1241.0,1.0,1.0,18.44,1395.0,28.0,28.0,1418.0,0.0,0.0,0.0,0.0,0.0,0.0,34.7,1.0,1466.0,0.0,0.0,0.022371,2.237136,1


In [22]:
# Build group-level features: the transformer learns its imputation means on
# the training frame and reuses them for the test frame.
df_transformer = DfTransformer()
df_train, df_train_meta, weight_train = df_transformer.transform_train(df_train)
df_test, df_test_meta, weight_test = df_transformer.transform_test(df_test)

# Split into numpy design matrices and the target vector
X_train = df_train.drop('winPlacePerc', axis=1).values
y_train = df_train['winPlacePerc'].values
X_test = df_test.values
print(X_train.shape, X_test.shape, sep='\n')

is deprecated and will be removed in a future version


(2026744, 198)
(50, 198)


In [23]:
# Standardize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Learn the per-column scaling on the training matrix only, then apply the
# same fitted scaler to the test matrix.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [24]:
# Define method to get k-fold CV MAE
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

def report_mae(model, X, y, weight, K=2):
    """Return the mean weighted validation MAE of ``model`` over K folds.

    For every fold the model is refit (on the same ``model`` object) using the
    fold's training rows and their sample weights, then scored on the held-out
    rows with a sample-weighted mean absolute error. Progress and the per-fold
    MAE are printed.

    Args:
        model: estimator exposing ``fit(X, y, sample_weight=...)`` and ``predict(X)``.
        X: feature matrix indexable by integer index arrays.
        y: target values aligned with the rows of ``X``.
        weight: per-row sample weights aligned with the rows of ``X``.
        K: number of folds; KFold is constructed with ``n_splits=K`` only, so
            its other options stay at the library defaults.

    Returns:
        The arithmetic mean of the per-fold MAE values.
    """
    fold_maes = []
    splitter = KFold(n_splits=K)
    for fit_idx, holdout_idx in splitter.split(X):
        print('processing fold...')
        model.fit(X[fit_idx], y[fit_idx], sample_weight=weight[fit_idx])
        fold_mae = mean_absolute_error(
            y[holdout_idx],
            model.predict(X[holdout_idx]),
            sample_weight=weight[holdout_idx],
        )
        print('current MAE: %.5f' % fold_mae)
        fold_maes.append(fold_mae)
    return np.mean(fold_maes)

In [25]:
# Number of cross-validation folds; passed to report_mae as its K argument
K = 2

In [26]:
# Estimate the linear-regression baseline MAE with K-fold cross-validation
mae_mean = report_mae(
    model=LinearRegression(),
    X=X_train_std,
    y=y_train,
    weight=weight_train,
    K=K,
)
print('Linear Regression MAE: %.5f' % mae_mean)

processing fold...
current MAE: 0.04182
processing fold...
current MAE: 0.04167
Linear Regression MAE: 0.04174
