In [1]:
# Import and setup
import numpy as np
import pandas as pd
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
%matplotlib inline

In [101]:
# Load the raw CSVs: the whole training file (rows containing any NA are
# discarded straight away) and only the first 50 rows of the test file.
df_train = pd.read_csv('train_V2.csv', nrows=None).dropna()
df_test = pd.read_csv('test_V2.csv', nrows=50)

# Show both shapes, then preview the training frame as the cell output.
print(df_train.shape, df_test.shape, sep='\n')
df_train.head()

(4446965, 29)
(50, 28)


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


In [102]:
# Define class to preprocess raw data
class DfPreprocessor:
    """Impute placeholder point values and split off identifier columns.

    In rankPoints / killPoints / winPoints every value below
    ``MISSING_THRESHOLD`` is treated as "no value" and replaced by the mean of
    the remaining values of that column.  The means are learned by
    ``transform_train`` and reused unchanged by ``transform_test``.

    Neither method modifies the frame it is given; both return new frames.
    """

    # Values strictly below this are considered missing; values at or above it
    # are considered valid (and contribute to the training mean).
    MISSING_THRESHOLD = 1e-4
    # column name -> attribute that stores its training mean
    _POINT_COLUMNS = (
        ('rankPoints', 'mean_rankpoints'),
        ('killPoints', 'mean_killpoints'),
        ('winPoints', 'mean_winpoints'),
    )
    # Columns returned separately as metadata (maxPlace / numGroups also stay
    # in the feature frame because they are not in _DROP_COLUMNS).
    _META_COLUMNS = ['Id', 'groupId', 'matchId', 'maxPlace', 'numGroups']
    # Columns removed from the feature frame.
    _DROP_COLUMNS = ['Id', 'groupId', 'matchId', 'matchType']

    def __init__(self):
        self.mean_rankpoints = None
        self.mean_killpoints = None
        self.mean_winpoints = None

    def _split(self, df):
        """Return (feature frame, metadata frame); both are new objects."""
        df_meta = df[self._META_COLUMNS]
        features = df.drop(columns=self._DROP_COLUMNS)
        return features, df_meta

    def _impute(self, features):
        """Replace missing point values in ``features`` with the stored means."""
        for column, attr in self._POINT_COLUMNS:
            # Cast first so a float mean is never written into an integer
            # column through an implicit upcast.
            values = features[column].astype('float64')
            features[column] = values.mask(
                values < self.MISSING_THRESHOLD, getattr(self, attr)
            )
        return features

    def transform_train(self, df):
        """Learn the per-column means from ``df`` and return the imputed features.

        Args:
            df: training frame containing the point columns, the identifier
                columns and ``matchType``.

        Returns:
            ``(features, df_meta)`` where ``features`` is ``df`` without
            Id / groupId / matchId / matchType and with imputed point columns,
            and ``df_meta`` holds Id, groupId, matchId, maxPlace, numGroups.

        Note:
            If a point column has no valid value at all its mean is NaN and
            the missing entries become NaN.
        """
        features, df_meta = self._split(df)
        for column, attr in self._POINT_COLUMNS:
            values = features[column]
            setattr(self, attr, values[values >= self.MISSING_THRESHOLD].mean())
        return self._impute(features), df_meta

    def transform_test(self, df):
        """Apply the means learned by ``transform_train`` to another frame.

        Args:
            df: frame with the same point / identifier columns as the
                training frame.

        Returns:
            ``(features, df_meta)`` with the same layout as ``transform_train``.

        Raises:
            RuntimeError: if ``transform_train`` has not been called yet, so no
                means are available.
        """
        if any(getattr(self, attr) is None for _, attr in self._POINT_COLUMNS):
            raise RuntimeError('transform_train must be called before transform_test')
        features, df_meta = self._split(df)
        return self._impute(features), df_meta

In [103]:
# Learn the imputation means on the training frame, then reuse them for test
df_preprocessor = DfPreprocessor()
df_train, df_train_meta = df_preprocessor.transform_train(df_train)
df_test, df_test_meta = df_preprocessor.transform_test(df_test)

# Separate the target column from the features and switch to plain arrays
X_train = df_train.drop('winPlacePerc', axis=1).values
y_train = df_train['winPlacePerc'].values
X_test = df_test.values
print(X_train.shape, X_test.shape, sep='\n')

(4446965, 24)
(50, 24)


In [104]:
# Standardize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Scaling statistics come from the training matrix only; the same fitted
# scaler is then applied to both train and test.
scaler = StandardScaler().fit(X_train)
X_train_std, X_test_std = scaler.transform(X_train), scaler.transform(X_test)

In [105]:
# Define method to get k-fold CV MAE
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def report_mae(model, X, y, K, with_postprocess=False, df_meta=None):
    """Return the mean validation MAE of ``model`` over K consecutive folds.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refitted on every fold.
        X: feature array, indexed by row with integer arrays.
        y: target array aligned with ``X``.
        K: number of folds passed to ``KFold`` (no shuffling).
        with_postprocess: when True, each fold's predictions are passed
            through ``postprocess`` before scoring.
        df_meta: frame positionally aligned with ``X`` that ``postprocess``
            needs; required when ``with_postprocess`` is True.

    Returns:
        The mean of the per-fold mean absolute errors.

    Raises:
        ValueError: if ``with_postprocess`` is True but ``df_meta`` is None.
    """
    if with_postprocess and df_meta is None:
        raise ValueError('df_meta is required when with_postprocess=True')

    # NOTE(review): plain KFold splits by row, so the rows of one match can end
    # up in different folds and postprocess then ranks a partial match.
    # Splitting by matchId (e.g. GroupKFold) may be more appropriate - confirm.
    kf = KFold(n_splits=K)
    maes = []
    for idx_train, idx_valid in kf.split(X):
        print('processing fold...')
        model.fit(X[idx_train], y[idx_train])
        y_predicted = model.predict(X[idx_valid])
        if with_postprocess:
            # Only the validation rows' metadata is needed; postprocess keeps
            # the row order, so the result stays aligned with y[idx_valid].
            y_predicted = postprocess(df_meta.iloc[idx_valid], y_predicted)['winPlacePerc']
        mae = mean_absolute_error(y[idx_valid], y_predicted)
        print('current MAE: %.5f' % mae)
        maes.append(mae)
    return np.array(maes).mean()

In [92]:
# Compute baseline MAE by K-fold CV (K is set to 2 below)
from sklearn.linear_model import LinearRegression

# Number of CV folds; reused by later cells
K = 2
# Plain linear regression on the standardized features, no postprocessing
mae_mean = report_mae(model=LinearRegression(), X=X_train_std, y=y_train, K=K)
print('Baseline MAE: %.5f' % mae_mean)

processing fold...
current MAE: 0.09151
processing fold...
current MAE: 0.09292
Baseline MAE: 0.09221


In [119]:
# Define method to do postprocessing
def postprocess(df_meta, y_predicted):
    """Turn raw per-player predictions into per-match group placements.

    Each (matchId, groupId) group is represented by the maximum prediction of
    its members.  The groups of a match are ranked by that value using pandas'
    default ranking (ascending, ties share the average rank) and every member
    of a group receives ``(rank - 1) / (n_groups - 1)``, where ``n_groups`` is
    the number of groups of that match present in ``df_meta``.  A match with a
    single group gets 0.0.

    Args:
        df_meta: frame with at least 'Id', 'groupId' and 'matchId'; it is
            copied, not modified.
        y_predicted: predictions positionally aligned with the rows of
            ``df_meta``.

    Returns:
        A frame with columns 'Id' and 'winPlacePerc', in the row order of
        ``df_meta`` and with a fresh 0..n-1 index.
    """
    keys = ['matchId', 'groupId']
    df = df_meta.copy()
    df['winPlacePerc'] = y_predicted

    # One row per group, carrying the group's best prediction.
    df_group = df.groupby(keys)['winPlacePerc'].max().to_frame('maxPerc').reset_index()

    by_match = df_group.groupby('matchId')
    rank = by_match['maxPerc'].rank()
    # Count the groups directly; the largest rank is not the group count when
    # the top groups tie under average ranking.
    n_groups = by_match['groupId'].transform('count')

    # Vectorised at group level instead of a row-wise apply over every player.
    single = n_groups == 1
    denominator = (n_groups - 1).mask(single, 1)  # placeholder avoids 0-division
    df_group['finalPrec'] = ((rank - 1) / denominator).mask(single, 0.0)

    # Left merge keeps the row order of the player-level frame.
    df = df.merge(df_group[keys + ['finalPrec']], on=keys, how='left')
    df['winPlacePerc'] = df['finalPrec']
    return df[['Id', 'winPlacePerc']]

# Unit test for postprocess: match m01 has two groups (g01 with three players,
# g02 with one), match m02 has a single group.
df_aaa = pd.DataFrame(
    [['m01', 'g01', 'id1', 3, 2],
     ['m01', 'g01', 'id2', 3, 2],
     ['m01', 'g01', 'id3', 3, 2],
     ['m01', 'g02', 'id4', 3, 2],
     ['m02', 'g03', 'id5', 2, 1]],
    index=[0, 1, 2, 3, 4],
    columns=['matchId', 'groupId', 'Id', 'maxPlace', 'numGroups'],
)
df_aaa_result = postprocess(df_aaa, np.array([0.553207, 0.218437, 0.716247, 0.371352, 0.468933]))

# g01's best prediction (0.716247) beats g02's (0.371352), so g01 -> 1.0 and
# g02 -> 0.0; the lone group of m02 -> 0.0.
assert df_aaa_result['Id'].tolist() == ['id1', 'id2', 'id3', 'id4', 'id5']
assert np.allclose(df_aaa_result['winPlacePerc'], [1.0, 1.0, 1.0, 0.0, 0.0])
df_aaa_result

  matchId groupId   Id  maxPlace  numGroups  winPlacePerc   maxPerc  rank  \
0     m01     g01  id1         3          2           1.0  0.716247   2.0   
1     m01     g01  id2         3          2           1.0  0.716247   2.0   
2     m01     g01  id3         3          2           1.0  0.716247   2.0   
3     m01     g02  id4         3          2           0.0  0.371352   1.0   
4     m02     g03  id5         2          1           0.0  0.468933   1.0   

   count  finalPrec  
0    2.0        1.0  
1    2.0        1.0  
2    2.0        1.0  
3    2.0        0.0  
4    1.0        0.0  


Unnamed: 0,Id,winPlacePerc
0,id1,1.0
1,id2,1.0
2,id3,1.0
3,id4,0.0
4,id5,0.0


In [120]:
# Compute MAE by K-fold CV (K = 2, set above) with postprocessing applied to the predictions
# Same linear model and folds as the baseline, but each fold's predictions
# go through postprocess before the MAE is computed.
mae_mean = report_mae(
    LinearRegression(),
    X_train_std,
    y_train,
    K,
    with_postprocess=True,
    df_meta=df_train_meta,
)
print(mae_mean)

processing fold...
                     Id         groupId         matchId  maxPlace  numGroups  \
0        7f96b2f878858a  4d4b580de459be  a10357fd1a4a91        28         26   
1        eef90569b9d03c  684d5656442f9e  aeb375fc57110c        26         25   
2        1eaf90ac73de72  6a4a42c3245a74  110163d8bb94ae        50         47   
3        4616d365dd2853  a930a9c79cd721  f1f1f4ef412d7e        31         30   
4        315c96c26c9aac  de04010b3458dd  6dc8ff871e21e6        97         95   
5        ff79c12f326506  289a6836a88d27  bac52627a12114        28         28   
6        95959be0e21ca3  2c485a1ad3d0f1  a8274e903927a2        28         28   
7        311b84c6ff4390  eaba5fcb7fc1ae  292611730ca862        96         92   
8        1a68204ccf9891  47cfbb04e1b1a2  df014fbee741c6        28         27   
9        e5bb5a43587253  759bb6f7514fd2  3d3031c795305b        29         27   
10       2b574d43972813  c549efede67ad3  2dd6ddb8320fc1        29         29   
11       8de328a74658

current MAE: 0.08358
processing fold...
                     Id         groupId         matchId  maxPlace  numGroups  \
0        a1c71501d530da  f71d60ccb258c9  6fb1a0d8a5f6a3        47         45   
1        9c852b19e819e0  1ef60c22aa3fe5  18f2e81426b5ad        28         25   
2        8db60c669fb0e6  b65a5019540045  b08dc455962c5d        99         98   
3        e7fa5ef567b3da  89dcc68c0abc7a  28e508058b132a        30         29   
4        6458f59647dc40  cf6628d8898425  5c02588dd90fa3        29         28   
5        f9de3ba518138b  f53d3d7115e78a  a7c4f9e42e77b0        48         47   
6        b50cca2048764e  6ae997f5e3604b  fbdcebfa50b20e        26         24   
7        ac07550116df45  0e3bc8ef3f7462  2ca31b42a4cfe5        29         27   
8        71456133c14740  0064ff17029f8b  c3c8170a76df50        27         25   
9        3a31356c000a76  8b0cee9e151dbc  b9850522cb8e29        45         43   
10       43b1e912a27a09  3ca0a5492fe933  da577c471e612d        46         45   


current MAE: 0.08320
0.08338727295356253
