In [1]:
# Import and setup
import numpy as np
import pandas as pd
# Show every column when a wide DataFrame is displayed (None disables truncation)
pd.options.display.max_columns = None
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

### Read data from file

In [2]:
# Define file path
# Path of the training CSV; it is relative, so it is resolved against the
# kernel's current working directory when the file is opened.
file_train = 'train_V2.csv'

In [15]:
# Define method to read data, providing abilities to subsample and read by chunks
def read_data(file_path, n_rows=None, chunk_size=None, smpl_rate=1, random_state=None):
    """Read a CSV file into a DataFrame, optionally by chunks and/or subsampled.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; forwarded to ``pd.read_csv``.
    n_rows : int, optional
        Maximum number of rows to read (``nrows`` of ``pd.read_csv``).
        ``None`` reads the whole file.
    chunk_size : int, optional
        When given, the file is read ``chunk_size`` rows at a time and the
        chunks are concatenated into one DataFrame.
    smpl_rate : float, default 1
        Fraction of rows to keep, in ``[0, 1]``. ``1`` returns every row in
        file order; any other value draws ``int(n_rows_read * smpl_rate)``
        rows uniformly at random without replacement.
    random_state : int, optional
        Seed for the row sampling. ``None`` (default) draws from NumPy's
        global random state, so ``np.random.seed`` still applies.

    Returns
    -------
    pandas.DataFrame
        The rows that were read, subsampled when ``smpl_rate < 1``. Sampled
        rows keep their original index labels.

    Raises
    ------
    ValueError
        If ``smpl_rate`` lies outside ``[0, 1]``.
    """
    if not 0 <= smpl_rate <= 1:
        raise ValueError(
            'smpl_rate must be between 0 and 1, got {!r}'.format(smpl_rate))

    data = pd.read_csv(file_path, nrows=n_rows, chunksize=chunk_size)
    if chunk_size is not None:
        # With chunksize set, read_csv returns an iterator of DataFrames.
        data = pd.concat(data)

    # Nothing to subsample: skip the costly full permutation and keep file order.
    if smpl_rate == 1:
        return data

    n_total = data.shape[0]
    n_samples = int(n_total * smpl_rate)
    # A dedicated generator is used only when a seed is supplied.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    picked = rng.choice(n_total, n_samples, replace=False)
    return data.iloc[picked]

In [18]:
# Load the training file using the default value of every optional read_data argument
data = read_data(file_path=file_train)
data.shape

(4500, 29)

In [51]:
data.head()

Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
0,7f96b2f878858a,4d4b580de459be,a10357fd1a4a91,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444
1,eef90569b9d03c,684d5656442f9e,aeb375fc57110c,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64
2,1eaf90ac73de72,6a4a42c3245a74,110163d8bb94ae,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755
3,4616d365dd2853,a930a9c79cd721,f1f1f4ef412d7e,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667
4,315c96c26c9aac,de04010b3458dd,6dc8ff871e21e6,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875


### Clean missing values

In [45]:
# Count the missing (NaN) entries in each column
data.isna().sum()

Id                 0
groupId            0
matchId            0
assists            0
boosts             0
damageDealt        0
DBNOs              0
headshotKills      0
heals              0
killPlace          0
killPoints         0
kills              0
killStreaks        0
longestKill        0
matchDuration      0
matchType          0
maxPlace           0
numGroups          0
rankPoints         0
revives            0
rideDistance       0
roadKills          0
swimDistance       0
teamKills          0
vehicleDestroys    0
walkDistance       0
weaponsAcquired    0
winPoints          0
winPlacePerc       1
dtype: int64

In [47]:
# Remove every row holding at least one missing value
# (in this data set that is the single row without winPlacePerc)
data = data.dropna(axis=0, how='any')
data.shape

(4446965, 29)

### Show data description

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0,4446965.0
mean,0.233815,1.106908,130.7172,0.6578757,0.2268196,1.370148,47.59936,505.0062,0.9247835,0.5439553,22.9976,1579.507,44.50468,43.0076,892.0103,0.164659,606.1158,0.003496092,4.509323,0.02386841,0.007918209,1154.218,3.660488,606.4603,0.4728216
std,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46293,627.5049,1.558445,0.7109721,50.97262,258.7388,23.8281,23.28949,736.6478,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261158,1183.497,2.456543,739.7005,0.307405
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,133.0,2.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,1367.0,28.0,27.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,0.0,1438.0,30.0,30.0,1443.0,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,21.32,1851.0,49.0,47.0,1500.0,0.0,0.191,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,1094.0,2237.0,100.0,100.0,5910.0,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


There are four features (not shown in the description) with string type: Id, groupId, matchId, matchType. These four features will be handled separately.

All features in the description have values in a valid range. Only killPlace has a maximum ($101$) that is slightly off: it exceeds the largest maxPlace value ($100$).

A last thing to notice:
* rankPoints (external ranking of players) has the value $-1$, which means "None", in at least 25% but no more than 50% of the samples.
* killPoints and winPoints (external kill/win based ranking of players): if there is a value other than $-1$ in rankPoints, then any 0 in killPoints and winPoints should be treated as "None".

In [None]:
# Show killPoints/winPoints for the rows where rankPoints and killPoints are both -1.
# Each comparison must be parenthesised: `&` binds tighter than `==`, so without
# the parentheses Python builds a chained comparison and raises
# "ValueError: The truth value of a Series is ambiguous".
# NOTE(review): describe() reports a minimum killPoints of 0, so this filter
# probably selects no rows; the intended test may be killPoints == 0 — confirm.
data.loc[(data['rankPoints'] == -1) & (data['killPoints'] == -1), ['killPoints', 'winPoints']]