In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import gc
import os
import sys

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from timeit import default_timer as timer

import lightgbm as lgb

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# Load and show data

In [3]:
def state(message,start = True, time = 0):
    if(start):
        print(f'Working on {message} ... ')
    else :
        print(f'Working on {message} took ({round(time , 3)}) Sec \n')

In [4]:
# Import dataset
df_train = pd.read_csv('../input/train_V2.csv')
df_test = pd.read_csv('../input/test_V2.csv')

# Reduce memory use
df_train=reduce_mem_usage(df_train)
df_test=reduce_mem_usage(df_test)

# Show some data
df_train.head()
df_train.describe()

Memory usage of dataframe is 983.90 MB --> 339.28 MB (Decreased by 65.5%)
Memory usage of dataframe is 413.18 MB --> 140.19 MB (Decreased by 66.1%)


Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
mean,0.2338149,1.106908,130.7172,0.6578755,0.2268196,1.370147,47.59935,505.006,0.9247833,0.5439551,...,0.164659,606.116,0.003496091,4.509323,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728218
std,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,627.5049,1.558445,0.7109721,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,...,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


# Clean the data

In [5]:
# Reference link: https://www.kaggle.com/melonaded/a-beginner-guide-to-top-35-lasso-rf-lgbm

# Drop features
df_train = df_train.drop(['longestKill', 'numGroups'], axis=1)
df_test = df_test.drop(['longestKill', 'numGroups'], axis=1)

# Check row with NaN value
df_train[df_train['winPlacePerc'].isnull()]
# Drop row with NaN 'winPlacePerc' value
df_train.drop(2744604, inplace=True)

df_train['kills'].value_counts()
df_train['DBNOs'].value_counts()
df_train['weaponsAcquired'].value_counts()

2      768836
3      768347
4      689622
1      580951
5      540721
        ...  
77          1
75          1
74          1
71          1
236         1
Name: weaponsAcquired, Length: 97, dtype: int64

# III. Feature engineering

In [6]:
def feature_engineering(df,is_train=True):
    if is_train: 
        df = df[df['maxPlace'] > 1]

    state('totalDistance')
    s = timer()
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    e = timer()
    state('totalDistance', False, e - s)
          
    state('killPlace_over_maxPlace')
    s = timer()
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    e = timer()                                  
    state('killPlace_over_maxPlace', False, e - s)
    
    state('healsandboosts')
    s = timer()
    df['healsandboosts'] = df['heals'] + df['boosts']
    e = timer()                                  
    state('healsandboosts', False, e - s)
    
    target = 'winPlacePerc'
    features = list(df.columns)
    
    # Remove some features from the features list :
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchDuration")
    features.remove("matchType")
    
    y = None
    if is_train: 
        y = np.array(df.groupby(['matchId', 'groupId'])[target].agg('mean'), dtype=np.float64)
        # Remove the target from the features list :
        features.remove(target)
    
    # Make new features indicating the mean of the features ( grouped by match and group ) :
    print("get group mean feature")
    agg = df.groupby(['matchId', 'groupId'])[features].agg('mean')
    agg_rank = agg.groupby(['matchId'])[features].rank(pct=True).reset_index()
    
    
    # If we are processing the training data let df_out = the grouped  'matchId' and 'groupId'
    if is_train: 
        df_out = agg.reset_index()[['matchId', 'groupId']]
    else: 
        df_out = df[['matchId', 'groupId']]
    
    # Merge agg and agg_rank (that we got before) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the median value of the features for each group ( grouped by match )
    print("get group median feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('median')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_median", "_median_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the max value of the features for each group ( grouped by match )
    print("get group max feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('max')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the minimum value of the features for each group ( grouped by match )
    print("get group min feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('min')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    
     # Make new features indicating the sum value of the features for each group ( grouped by match )
    print("get group max feature")
    agg = df.groupby(['matchId','groupId'])[features].agg('sum')
    agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()
    
    # Merge the new (agg and agg_rank) with df_out :
     print("get group sum feature")
    df_out = df_out.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    df_out = df_out.merge(agg_rank, suffixes=["_sum", "_sum_rank"], how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the number of players in each group ( grouped by match )
    print("get group size feature")
    agg = df.groupby(['matchId','groupId']).size().reset_index(name='group_size')
     
    # Merge the group_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['matchId', 'groupId'])
    
    # Make new features indicating the mean value of each features for each match :
    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    # Merge the new agg with df_out :
    df_out = df_out.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    # Make new features indicating the number of groups in each match :
    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    # Merge the match_size feature with df_out :
    df_out = df_out.merge(agg, how='left', on=['matchId'])
    
    # Drop matchId and groupId
    df_out.drop(["matchId", "groupId"], axis=1, inplace=True)
    df_out = reduce_mem_usage(df_out)
    
    X = np.array(df_out, dtype=np.float64)
    
    del df, df_out, agg, agg_rank
    gc.collect()

    return X, y


In [7]:
x_train, y_train = feature_engineering(df_train,True)
x_test, _ = feature_engineering(df_test,False)

Working on totalDistance ... 
Working on totalDistance took (0.016) Sec 

Working on killPlace_over_maxPlace ... 
Working on killPlace_over_maxPlace took (0.022) Sec 

Working on healsandboosts ... 
Working on healsandboosts took (0.005) Sec 

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 2017.90 MB --> 1124.92 MB (Decreased by 44.3%)
Working on totalDistance ... 
Working on totalDistance took (0.011) Sec 

Working on killPlace_over_maxPlace ... 
Working on killPlace_over_maxPlace took (0.013) Sec 

Working on healsandboosts ... 
Working on healsandboosts took (0.003) Sec 

get group mean feature
get group max feature
get group min feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 1925.73 MB --> 1071.70 MB (Decreased by 44.3%)


# Create model for train

In [8]:
# Split the train and the validation set for the fitting
random_seed=1
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.05, random_state=random_seed)

## Random Forest

In [9]:
# Random Forest
RF = RandomForestRegressor(n_estimators=10, min_samples_leaf=3, max_features=0.5, n_jobs=-1)

In [10]:
%%time
RF.fit(x_train, y_train)

Wall time: 5min 41s


RandomForestRegressor(max_features=0.5, min_samples_leaf=3, n_estimators=10,
                      n_jobs=-1)

In [11]:
mae_train_RF = mean_absolute_error(RF.predict(x_train), y_train)
mae_val_RF = mean_absolute_error(RF.predict(x_val), y_val)
print('mae train RF: ', mae_train_RF)
print('mae val RF: ', mae_val_RF)

mae train RF:  0.015622325231198733
mae val RF:  0.03291754968842933


## LightGBM

In [12]:
# Reference link: https://www.kaggle.com/chocozzz/lightgbm-baseline
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    params = {"objective" : "regression", 
              "metric" : "mae", 
              'n_estimators':20000, 
              'early_stopping_rounds':200,
              "num_leaves" : 31, 
              "learning_rate" : 0.05, 
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0, 
              "num_threads" : 4,
              "colsample_bytree" : 0.7
             }
    
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, valid_sets=[lgtrain, lgval], early_stopping_rounds=200, verbose_eval=1000)
    
    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

In [None]:
%%time
# Training the model #
pred_test_lgb, model = run_lgb(x_train, y_train, x_val, y_val, x_test)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32702
[LightGBM] [Info] Number of data points in the train set: 1925406, number of used features: 170
[LightGBM] [Info] Start training from score 0.499780
Training until validation scores don't improve for 200 rounds
[1000]	training's l1: 0.0294591	valid_1's l1: 0.0298832
[2000]	training's l1: 0.0285622	valid_1's l1: 0.0293086
[3000]	training's l1: 0.0279197	valid_1's l1: 0.0289643
[4000]	training's l1: 0.0273968	valid_1's l1: 0.0287243
[5000]	training's l1: 0.0269249	valid_1's l1: 0.0285189
[6000]	training's l1: 0.0264933	valid_1's l1: 0.0283476
[7000]	training's l1: 0.0260874	valid_1's l1: 0.0282013


In [None]:
mae_train_lgb = mean_absolute_error(model.predict(x_train, num_iteration=model.best_iteration), y_train)
mae_val_lgb = mean_absolute_error(model.predict(x_val, num_iteration=model.best_iteration), y_val)

print('mae train lgb: ', mae_train_lgb)
print('mae val lgb: ', mae_val_lgb)

## DNN

In [None]:
# Reference link: https://www.kaggle.com/qingyuanwu/deep-neural-network
def run_DNN(x_train, y_train, x_val, y_val, x_test):
    NN_model = Sequential()
    NN_model.add(Dense(x_train.shape[1],  input_dim = x_train.shape[1], activation='relu'))
    NN_model.add(Dense(136, activation='relu'))
    NN_model.add(Dense(136, activation='relu'))
    NN_model.add(Dense(136, activation='relu'))
    NN_model.add(Dense(136, activation='relu'))

    # output Layer
    NN_model.add(Dense(1, activation='linear'))

    # Compile the network :
    NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    NN_model.summary()
    
    checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5' 
    checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
    callbacks_list = [checkpoint]
    
    NN_model.fit(x=x_train, 
                 y=y_train, 
                 batch_size=1000,
                 epochs=30, 
                 verbose=1, 
                 callbacks=callbacks_list,
                 validation_split=0.15, 
                 validation_data=None, 
                 shuffle=True,
                 class_weight=None, 
                 sample_weight=None, 
                 initial_epoch=0,
                 steps_per_epoch=None, 
                 validation_steps=None)

    pred_test_y = NN_model.predict(x_test)
    pred_test_y = pred_test_y.reshape(-1)
    return pred_test_y, NN_model

In [None]:
%%time
# Training the model #
pred_test_DNN, model = run_DNN(x_train, y_train, x_val, y_val, x_test)

In [None]:
mae_train_DNN = mean_absolute_error(model.predict(x_train), y_train)
mae_val_DNN = mean_absolute_error(model.predict(x_val), y_val)
print('mae train dnn: ', mae_train_DNN)
print('mae val dnn: ', mae_val_DNN)

#  Use the model for prediction

## Random Forest

In [None]:
pred_test_RF = RF.predict(x_test)
df_test['winPlacePerc_RF'] = pred_test_RF
submission = df_test[['Id', 'winPlacePerc_RF']]
submission.to_csv('../output/submission_RF.csv', index=False)

##  LightGBM

In [None]:
df_test['winPlacePerc_lgb'] = pred_test_lgb
submission = df_test[['Id', 'winPlacePerc_lgb']]
submission.to_csv('../output/submission_lgb.csv', index=False)

## DNN

In [None]:
df_test['winPlacePerc_DNN'] = pred_test_DNN
submission = df_test[['Id', 'winPlacePerc_DNN']]
submission.to_csv('../output/submission_DNN.csv', index=False)

## Model ensembling(RF + DNN)

In [None]:
weight_DNN = (1 - mae_val_DNN) / (3 - mae_val_DNN - mae_val_RF - mae_val_lgb)
weight_RF = (1 - mae_val_RF) / (3 - mae_val_DNN - mae_val_RF - mae_val_lgb)
weight_lgb = (1 - mae_val_lgb) / (3 - mae_val_DNN - mae_val_RF - mae_val_lgb)

df_test['winPlacePerc'] = df_test.apply(lambda x: x['winPlacePerc_RF'] * weight_RF + x['winPlacePerc_DNN'] * weight_DNN + x['winPlacePerc_lgb'] * weight_lgb, axis=1)
submission = df_test[['Id', 'winPlacePerc']]
submission.to_csv('../output/submission.csv', index=False)