In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import gc
import os
import sys

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from timeit import default_timer as timer

import lightgbm as lgb

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [2]:
# 通过类型转换节省内存空间
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes to save memory.

    The frame is modified in place and also returned, so both
    ``reduce_mem_usage(df)`` and ``df = reduce_mem_usage(df)`` work.

    * Signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness whose range contains the column's min and max
      (bounds are inclusive).
    * Float columns are cast to float32 when their range fits, otherwise to
      float64. Note that float32 keeps only about 7 significant digits.
    * Every other dtype (object, bool, datetime, categorical, pandas extension
      dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy integer/float columns are safe to downcast; anything
        # else would either gain memory (bool -> float32) or fail on min()/max().
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 still fits int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # An empty column has NaN min/max, matches nothing and keeps its dtype.
        else:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard against a zero-sized frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, decrease))
    return df

# 加载数据集

In [3]:
def state(message,start = True, time = 0):
    """Print a progress line for the step called ``message``.

    With a truthy ``start`` the "working on" banner is printed; otherwise the
    elapsed ``time`` (seconds, rounded to 3 decimals) is reported, followed by
    a blank line.
    """
    if not start:
        elapsed = round(time, 3)
        print(f'Working on {message} took ({elapsed}) Sec \n')
        return
    print(f'Working on {message} ... ')

In [4]:
# Import dataset
# Read the train and test CSVs; paths are relative to the notebook's working directory
df_train = pd.read_csv('../input/train_V2.csv')
df_test = pd.read_csv('../input/test_V2.csv')

# Reduce memory use by downcasting numeric columns (the frames are rebound to the returned objects)
df_train=reduce_mem_usage(df_train)
df_test=reduce_mem_usage(df_test)

# Show some data
# NOTE(review): only the last expression of a cell is rendered, so the head() call below
# produces no visible output; only the describe() table is shown.
df_train.head()
df_train.describe()

Memory usage of dataframe is 983.90 MB --> 339.28 MB (Decreased by 65.5%)
Memory usage of dataframe is 413.18 MB --> 140.19 MB (Decreased by 66.1%)


Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc
count,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,...,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446966.0,4446965.0
mean,0.2338149,1.106908,130.7172,0.6578755,0.2268196,1.370147,47.59935,505.006,0.9247833,0.5439551,...,0.164659,606.116,0.003496091,4.509323,0.02386841,0.007918208,1154.218,3.660488,606.4601,0.4728218
std,0.5885731,1.715794,170.7806,1.145743,0.6021553,2.679982,27.46294,627.5049,1.558445,0.7109721,...,0.4721671,1498.344,0.07337297,30.5022,0.1673935,0.09261157,1183.497,2.456544,739.7004,0.307405
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,155.1,2.0,0.0,0.2
50%,0.0,0.0,84.24,0.0,0.0,0.0,47.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,685.6,3.0,0.0,0.4583
75%,0.0,2.0,186.0,1.0,0.0,2.0,71.0,1172.0,1.0,1.0,...,0.0,0.190975,0.0,0.0,0.0,0.0,1976.0,5.0,1495.0,0.7407
max,22.0,33.0,6616.0,53.0,64.0,80.0,101.0,2170.0,72.0,20.0,...,39.0,40710.0,18.0,3823.0,12.0,5.0,25780.0,236.0,2013.0,1.0


# 对数据进行简单清洗

In [5]:
# The target percentile is computed from the worst placement in the match, not from the
# number of groups, and the two are redundant with each other, so numGroups is dropped.
# longestKill is not measured accurately and rankPoints is officially flagged as
# "use with caution", so both are dropped as well.
# errors='ignore' keeps this cell re-runnable after the columns are already gone.
unused_columns = ['longestKill', 'numGroups', 'rankPoints']
df_train = df_train.drop(columns=unused_columns, errors='ignore')
df_test = df_test.drop(columns=unused_columns, errors='ignore')

# Drop training rows whose target is missing. Selecting by condition instead of a
# hard-coded row label keeps this correct if the data changes and safe to re-run.
df_train = df_train.dropna(subset=['winPlacePerc'])

2      768836
3      768347
4      689622
1      580951
5      540721
        ...  
77          1
75          1
74          1
71          1
236         1
Name: weaponsAcquired, Length: 97, dtype: int64

# 特征工程

In [7]:
def feature_engineering(df,is_train=True):
    """Turn per-player rows into group-level and match-level model features.

    For every numeric feature the function computes, per (matchId, groupId),
    the mean / median / max / min / sum and the percentile rank of that
    statistic among the groups of the same match. It then appends the group
    size, the per-match mean of every feature and the number of rows in the
    match.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-player rows. The code reads ``Id``, ``matchId``, ``groupId``,
        ``matchDuration``, ``matchType``, ``maxPlace``, ``killPlace``,
        ``rideDistance``, ``walkDistance``, ``swimDistance``, ``heals`` and
        ``boosts``, plus ``winPlacePerc`` when ``is_train`` is true.
        The caller's frame is not modified.
    is_train : bool, default True
        When true, rows with ``maxPlace <= 1`` are discarded, the output has
        one row per (matchId, groupId) and the target is returned. When false,
        the output has one row per input row, in input order.

    Returns
    -------
    X : numpy.ndarray of float64
        Feature matrix.
    y : numpy.ndarray of float64 or None
        Mean ``winPlacePerc`` per group, ordered like the rows of ``X``;
        ``None`` when ``is_train`` is false.
    """
    group_keys = ['matchId', 'groupId']
    target = 'winPlacePerc'

    # Work on a private copy: the derived columns below must neither leak into
    # the caller's frame nor be written onto a filtered slice.
    if is_train:
        df = df.loc[df['maxPlace'] > 1].copy()
    else:
        df = df.copy()

    state('totalDistance')
    s = timer()
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    e = timer()
    state('totalDistance', False, e - s)

    state('killPlace_over_maxPlace')
    s = timer()
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    e = timer()
    state('killPlace_over_maxPlace', False, e - s)

    state('healsandboosts')
    s = timer()
    df['healsandboosts'] = df['heals'] + df['boosts']
    e = timer()
    state('healsandboosts', False, e - s)

    features = list(df.columns)

    # Identifier / nominal columns are not used as features
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchDuration")
    features.remove("matchType")

    y = None
    if is_train:
        # One target value per group; groupby sorts by key, which is the same
        # order used for the rows of the training output below.
        y = np.array(df.groupby(group_keys)[target].agg('mean'), dtype=np.float64)
        # The prediction target must not be part of the features
        features.remove(target)

    # Number of players in every group of every match (merged in further down)
    group_size = df.groupby(group_keys).size().reset_index(name='group_size')

    # Skeleton of the output: one row per group for training, one row per
    # player (original order) for prediction.
    if is_train:
        df_out = group_size[group_keys]
    else:
        df_out = df[group_keys]

    # Per-group statistic of every feature plus its percentile rank within the match.
    # The statistic columns are merged first (still unsuffixed) so that merging
    # the rank columns renames both sides to <feature>_<stat> / <feature>_<stat>_rank.
    grouped = df.groupby(group_keys)[features]
    for stat in ('mean', 'median', 'max', 'min', 'sum'):
        print(f"get group {stat} feature")
        agg = grouped.agg(stat)
        agg_rank = agg.groupby('matchId')[features].rank(pct=True).reset_index()

        df_out = df_out.merge(agg.reset_index(), how='left', on=group_keys)
        df_out = df_out.merge(agg_rank, suffixes=(f"_{stat}", f"_{stat}_rank"),
                              how='left', on=group_keys)

    # Group size
    print("get group size feature")
    df_out = df_out.merge(group_size, how='left', on=group_keys)

    # Per-match mean of every feature. No column of df_out carries a bare feature
    # name at this point, so these columns keep the unsuffixed names.
    print("get match mean feature")
    agg = df.groupby(['matchId'])[features].agg('mean').reset_index()
    df_out = df_out.merge(agg, suffixes=("", "_match_mean"), how='left', on=['matchId'])

    # Number of rows (players) in the match
    print("get match size feature")
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    df_out = df_out.merge(agg, how='left', on=['matchId'])

    # The keys were only needed for joining
    df_out.drop(group_keys, axis=1, inplace=True)
    df_out = reduce_mem_usage(df_out)

    X = np.array(df_out, dtype=np.float64)

    # Release the large intermediates before returning
    del df, df_out, agg, agg_rank, grouped, group_size
    gc.collect()

    return X, y


In [8]:
# Build the model matrices; the prediction-time call returns no target
x_train, y_train = feature_engineering(df_train, is_train=True)
x_test, _ = feature_engineering(df_test, is_train=False)

Working on totalDistance ... 
Working on totalDistance took (0.019) Sec 

Working on killPlace_over_maxPlace ... 
Working on killPlace_over_maxPlace took (0.032) Sec 

Working on healsandboosts ... 
Working on healsandboosts took (0.009) Sec 

get group mean feature
get group median feature
get group max feature
get group min feature
get group max feature
get group sum feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe is 3425.02 MB --> 1867.14 MB (Decreased by 45.5%)
Working on totalDistance ... 
Working on totalDistance took (0.012) Sec 

Working on killPlace_over_maxPlace ... 
Working on killPlace_over_maxPlace took (0.016) Sec 

Working on healsandboosts ... 
Working on healsandboosts took (0.005) Sec 

get group mean feature
get group median feature
get group max feature
get group min feature
get group max feature
get group sum feature
get group size feature
get match mean feature
get match size feature
Memory usage of dataframe 

# 建立模型

In [9]:
# Split the data into a training set and a validation set (5% held out).
# NOTE: x_train / y_train are rebound here, so re-running this cell on its own
# splits the already reduced training set again.
random_seed = 1
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.05,
    random_state=random_seed,
)

## Random Forest

In [10]:
# Seed the forest with the same seed as the train/validation split so that the fit
# (bootstrap samples and per-split feature sub-sampling) is repeatable across runs.
RF = RandomForestRegressor(n_estimators=10, min_samples_leaf=3, max_features=0.5, n_jobs=-1,
                           random_state=random_seed)

In [11]:
%%time
# Fit the random forest on the training split
RF.fit(x_train, y_train)

Wall time: 8min 26s


RandomForestRegressor(max_features=0.5, min_samples_leaf=3, n_estimators=10,
                      n_jobs=-1)

In [12]:
# Mean absolute error of the random forest on the training and validation splits
rf_train_pred = RF.predict(x_train)
rf_val_pred = RF.predict(x_val)
mae_train_RF = mean_absolute_error(y_train, rf_train_pred)
mae_val_RF = mean_absolute_error(y_val, rf_val_pred)
print('mae train RF: ', mae_train_RF)
print('mae val RF: ', mae_val_RF)

mae train RF:  0.015503626984996665
mae val RF:  0.032972121527289365


## LightGBM

In [13]:
def run_lgb(train_X, train_y, val_X, val_y, x_test):
    """Train a LightGBM regressor with early stopping and predict ``x_test``.

    Parameters
    ----------
    train_X, train_y : array-like
        Training features and target.
    val_X, val_y : array-like
        Validation features and target, used for early stopping.
    x_test : array-like
        Features to predict after training.

    Returns
    -------
    pred_test_y : numpy.ndarray
        Predictions for ``x_test`` at the best iteration.
    model : lightgbm.Booster
        The trained booster.
    """
    params = {"objective" : "regression",
              "metric" : "mae",
              "num_leaves" : 31,
              "learning_rate" : 0.05,
              # NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
              # effect when bagging_freq is non-zero; it is not set here, so no row
              # bagging happens. Left as is to keep the trained model unchanged.
              "bagging_fraction" : 0.7,
              "bagging_seed" : 0,
              "num_threads" : 4,
              "colsample_bytree" : 0.7
             }

    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)

    # The round budget and early stopping are given once, through the API arguments,
    # instead of both in `params` and as keyword arguments. Callbacks replace the
    # `early_stopping_rounds` / `verbose_eval` keywords, which LightGBM 4 removed.
    model = lgb.train(
        params,
        lgtrain,
        num_boost_round=20000,
        valid_sets=[lgtrain, lgval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=1000),
        ],
    )

    pred_test_y = model.predict(x_test, num_iteration=model.best_iteration)
    return pred_test_y, model

In [14]:
%%time
# Train the LightGBM model and predict the test set
pred_test_lgb, model = run_lgb(x_train, y_train, x_val, y_val, x_test)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50691
[LightGBM] [Info] Number of data points in the train set: 1925406, number of used features: 266
[LightGBM] [Info] Start training from score 0.499780
Training until validation scores don't improve for 200 rounds
[1000]	training's l1: 0.0293458	valid_1's l1: 0.0297825
[2000]	training's l1: 0.0284614	valid_1's l1: 0.0292488
[3000]	training's l1: 0.0278106	valid_1's l1: 0.0289249
[4000]	training's l1: 0.0272724	valid_1's l1: 0.02869
[5000]	training's l1: 0.0267899	valid_1's l1: 0.0284865
[6000]	training's l1: 0.0263591	valid_1's l1: 0.0283217
[7000]	training's l1: 0.0259541	valid_1's l1: 0.0281813
[8000]	training's l1: 0.0255732	valid_1's l1: 0.0280472
[9000]	training's l1: 0.0252231	valid_1's l1: 0.0279395
[10000]	training's l1: 0.0248689	valid_1's l1: 0.0278219
[11000]	training's l1: 0.0245357	valid_1's l1: 0.0277238
[12000]	training's l1: 0.0242178	valid_1's l1: 0.0276358
[13000]	training's l1: 

In [15]:
# Mean absolute error of the LightGBM model (at its best iteration) on both splits
lgb_train_pred = model.predict(x_train, num_iteration=model.best_iteration)
lgb_val_pred = model.predict(x_val, num_iteration=model.best_iteration)
mae_train_lgb = mean_absolute_error(y_train, lgb_train_pred)
mae_val_lgb = mean_absolute_error(y_val, lgb_val_pred)

print('mae train lgb: ', mae_train_lgb)
print('mae val lgb: ', mae_val_lgb)

mae train lgb:  0.022016762514129016
mae val lgb:  0.0270388973536143


## DNN

In [16]:
def run_DNN(x_train, y_train, x_val, y_val, x_test):
    """Train a fully connected regression network and predict ``x_test``.

    The network has one ReLU layer as wide as the input, four ReLU layers of
    136 units and a single linear output. It is trained with Adam on mean
    absolute error for 30 epochs. Weights are written to
    ``Weights-<epoch>--<val_loss>.hdf5`` whenever the validation loss improves.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and target.
    x_val, y_val : array-like or None
        Validation data monitored during training. If either is ``None``,
        the last 15% of the training data is held out instead.
    x_test : array-like
        Features to predict after training.

    Returns
    -------
    pred_test_y : numpy.ndarray
        1-D predictions for ``x_test`` from the model after the final epoch.
    NN_model : keras.models.Sequential
        The trained model (final-epoch weights, not the best checkpoint).
    """
    NN_model = Sequential()
    NN_model.add(Dense(x_train.shape[1],  input_dim = x_train.shape[1], activation='relu'))
    for _ in range(4):
        NN_model.add(Dense(136, activation='relu'))

    NN_model.add(Dense(1, activation='linear'))

    NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
    NN_model.summary()

    checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
    checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
    callbacks_list = [checkpoint]

    # Validate on the hold-out set the caller passed in, so no training rows are
    # sacrificed and `val_loss` refers to the same data as the other models.
    if x_val is not None and y_val is not None:
        validation_kwargs = {'validation_data': (x_val, y_val)}
    else:
        validation_kwargs = {'validation_split': 0.15}

    NN_model.fit(x=x_train,
                 y=y_train,
                 batch_size=1000,
                 epochs=30,
                 verbose=1,
                 callbacks=callbacks_list,
                 shuffle=True,
                 **validation_kwargs)

    pred_test_y = NN_model.predict(x_test)
    # Keras returns shape (n, 1); flatten to (n,)
    pred_test_y = pred_test_y.reshape(-1)
    return pred_test_y, NN_model

In [17]:
%%time
# Train the DNN and predict the test set.
# NOTE(review): this rebinds `model`, which held the LightGBM booster until now.
pred_test_DNN, model = run_DNN(x_train, y_train, x_val, y_val, x_test)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 266)               71022     
_________________________________________________________________
dense_2 (Dense)              (None, 136)               36312     
_________________________________________________________________
dense_3 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_4 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_5 (Dense)              (None, 136)               18632     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 137       
Total params: 163,367
Trainable params:

In [18]:
# Mean absolute error of the DNN on the training and validation splits
dnn_train_pred = model.predict(x_train)
dnn_val_pred = model.predict(x_val)
mae_train_DNN = mean_absolute_error(y_train, dnn_train_pred)
mae_val_DNN = mean_absolute_error(y_val, dnn_val_pred)
print('mae train dnn: ', mae_train_DNN)
print('mae val dnn: ', mae_val_DNN)

mae train dnn:  0.04090054122667123
mae val dnn:  0.040935060316103625


# 使用训练好的模型进行预测

## Random Forest

In [19]:
# Predict the test set with the random forest and write its submission file
pred_test_RF = RF.predict(x_test)
df_test['winPlacePerc_RF'] = pred_test_RF
submission = df_test[['Id', 'winPlacePerc_RF']]
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/submission_RF.csv', index=False)

## LightGBM

In [20]:
# Attach the LightGBM test predictions and write their submission file
df_test['winPlacePerc_lgb'] = pred_test_lgb
submission = df_test[['Id', 'winPlacePerc_lgb']]
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/submission_lgb.csv', index=False)

## DNN

In [21]:
# Attach the DNN test predictions and write their submission file
df_test['winPlacePerc_DNN'] = pred_test_DNN
submission = df_test[['Id', 'winPlacePerc_DNN']]
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/submission_DNN.csv', index=False)

## 根据验证集上的MAE值为模型划分权重进行集成(RF + DNN + LightGBM)

In [22]:
# Weight every model by (1 - validation MAE), normalised so the three weights sum to 1
weight_total = 3 - mae_val_DNN - mae_val_RF - mae_val_lgb
weight_DNN = (1 - mae_val_DNN) / weight_total
weight_RF = (1 - mae_val_RF) / weight_total
weight_lgb = (1 - mae_val_lgb) / weight_total

# Blend with column arithmetic instead of a Python-level row-by-row apply.
# Columns are widened to float64 first so the blend is computed in double precision.
df_test['winPlacePerc'] = (
    df_test['winPlacePerc_RF'].astype(np.float64) * weight_RF
    + df_test['winPlacePerc_DNN'].astype(np.float64) * weight_DNN
    + df_test['winPlacePerc_lgb'].astype(np.float64) * weight_lgb
)
submission = df_test[['Id', 'winPlacePerc']]
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs('../output', exist_ok=True)
submission.to_csv('../output/submission.csv', index=False)