In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error

# 数据提取

In [2]:
# Read the cleaned PUBG training table from disk, then preview its first five rows.
train_data = pd.read_csv("../data/pubg/train_V2_clear.csv")
train_data.head(n=5)

Unnamed: 0,Id,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,...,matchType_flaretpp,matchType_normal-duo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp,groupId_cat,matchId_cat
0,7f96b2f878858a,0,0,0.0,0,0,0,60,1241,0,...,0,0,0,0,0,0,0,1,606565,29087
1,eef90569b9d03c,0,0,91.47,0,0,0,57,0,0,...,0,0,0,0,0,0,0,1,818060,31658
2,1eaf90ac73de72,1,0,68.0,0,0,0,47,0,0,...,0,0,0,0,0,0,0,0,833566,3036
3,4616d365dd2853,0,0,32.9,0,0,0,75,0,0,...,0,0,0,0,0,0,0,1,1324636,43738
4,315c96c26c9aac,0,0,100.0,0,0,0,45,0,1,...,0,0,0,0,0,1,0,0,1737142,19848


## 使用 sample 函数随机抽取部分数据（不使用顺序截取或欠采样）

In [3]:
# Draw a fixed-size random subset of rows to keep training time manageable.
# The seed is pinned so that a fresh kernel (Restart & Run All) selects the
# same rows; without it every run samples differently and the split, metrics
# and tuned hyperparameters reported further down cannot be reproduced.
train_data_sample = train_data.sample(n=100000, random_state=0)
train_data_sample.shape

(100000, 50)

In [4]:
# Print the per-column dtype and non-null count of the sampled frame.
train_data_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 3527205 to 2374065
Data columns (total 50 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Id                          100000 non-null  object 
 1   assists                     100000 non-null  int64  
 2   boosts                      100000 non-null  int64  
 3   damageDealt                 100000 non-null  float64
 4   DBNOs                       100000 non-null  int64  
 5   headshotKills               100000 non-null  int64  
 6   heals                       100000 non-null  int64  
 7   killPlace                   100000 non-null  int64  
 8   killPoints                  100000 non-null  int64  
 9   kills                       100000 non-null  int64  
 10  killStreaks                 100000 non-null  int64  
 11  longestKill                 100000 non-null  float64
 12  matchDuration               100000 non-null  int64  
 13  maxPlac

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sampled frame.
train_data_sample.describe()

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,...,matchType_flaretpp,matchType_normal-duo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp,groupId_cat,matchId_cat
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.22926,1.10618,128.461777,0.65092,0.22312,1.36109,47.96919,502.79394,0.9093,0.54223,...,0.00048,4e-05,2e-05,0.00018,0.039,0.12012,0.14296,0.39939,1000792.0,23146.24783
std,0.570669,1.715184,164.487405,1.115749,0.578481,2.682998,27.418679,628.142687,1.497436,0.709818,...,0.021904,0.006324,0.004472,0.013415,0.193596,0.325103,0.350034,0.489775,577614.3,13391.975864
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,502156.5,11486.75
50%,0.0,0.0,82.89,0.0,0.0,0.0,48.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1002043.0,23101.5
75%,0.0,2.0,184.4,1.0,0.0,2.0,72.0,1174.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1499619.0,34755.25
max,9.0,17.0,2336.0,22.0,18.0,51.0,100.0,2061.0,21.0,16.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2002871.0,46349.0


## 获取特征值和目标值

In [6]:
# Target: the "winPlacePerc" column. Features: every remaining column except
# the "Id" string identifier.
y = train_data_sample["winPlacePerc"]
x = train_data_sample.drop(columns=["winPlacePerc", "Id"])
x.shape, y.shape

((100000, 48), (100000,))

## 划分训练集和验证集

In [7]:
# Hold out 20% of the sampled rows for validation; the fixed random_state
# makes the partition repeatable for a given input frame.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (x_train, x_val, y_train, y_val))

((80000, 48), (20000, 48), (80000,), (20000,))

# 使用 LightGBM 训练模型

## 模型初步尝试

In [8]:
# Baseline LightGBM regressor: 20 boosting rounds with a 0.05 learning rate;
# all other hyperparameters are left at the library defaults.
lgbmr1 = LGBMRegressor(n_estimators=20, learning_rate=0.05)

In [9]:
# Fit the baseline on the training split and evaluate on the validation split
# after every boosting round. The "l1" metric is requested explicitly; the
# log below reports it alongside "l2".
lgbmr1.fit(x_train, y_train, eval_set=[(x_val, y_val)], eval_metric="l1")

[1]	valid_0's l1: 0.256872	valid_0's l2: 0.0869639
[2]	valid_0's l1: 0.245599	valid_0's l2: 0.0797256
[3]	valid_0's l1: 0.234936	valid_0's l2: 0.0731633
[4]	valid_0's l1: 0.224854	valid_0's l2: 0.0672303
[5]	valid_0's l1: 0.215329	valid_0's l2: 0.0618625
[6]	valid_0's l1: 0.206386	valid_0's l2: 0.0570041
[7]	valid_0's l1: 0.197962	valid_0's l2: 0.0526291
[8]	valid_0's l1: 0.189959	valid_0's l2: 0.0486253
[9]	valid_0's l1: 0.182435	valid_0's l2: 0.0450197
[10]	valid_0's l1: 0.175241	valid_0's l2: 0.0417273
[11]	valid_0's l1: 0.168544	valid_0's l2: 0.0387766
[12]	valid_0's l1: 0.16215	valid_0's l2: 0.0360603
[13]	valid_0's l1: 0.156228	valid_0's l2: 0.0336334
[14]	valid_0's l1: 0.150565	valid_0's l2: 0.0313942
[15]	valid_0's l1: 0.145293	valid_0's l2: 0.0293748
[16]	valid_0's l1: 0.140334	valid_0's l2: 0.027556
[17]	valid_0's l1: 0.135647	valid_0's l2: 0.0258842
[18]	valid_0's l1: 0.131247	valid_0's l2: 0.0243696
[19]	valid_0's l1: 0.127009	valid_0's l2: 0.0229608
[20]	valid_0's l1: 0.123076	valid_0's l2: 0.021693

## 得分

In [10]:
# Baseline score on the validation split (the regressor's default `score`,
# i.e. the coefficient of determination R^2 under the scikit-learn convention).
lgbmr1.score(x_val, y_val)

0.7715809591728603

In [11]:
# Baseline predictions for the validation split, reused by the error metrics below.
lgbmr1_y_pred = lgbmr1.predict(x_val)

In [12]:
# Mean absolute error of the baseline model on the validation split.
mean_absolute_error(y_val, lgbmr1_y_pred)

0.12307606308536258

In [13]:
# Mean squared error of the baseline model on the validation split.
mean_squared_error(y_val, lgbmr1_y_pred)

0.021693027754634324

# 使用 GridSearchCV 对 LightGBM 进行超参数调优

In [14]:
# Base estimator with library-default hyperparameters; it is passed to the
# grid search, which supplies the candidate values.
lgbmr2 = LGBMRegressor()

In [21]:
# Candidate hyperparameter values for the grid search:
# 4 depths x 3 learning rates x 4 ensemble sizes = 48 combinations.
param_grid = dict(
    max_depth=[5, 7, 9, 11],
    learning_rate=[0.001, 0.01, 0.1],
    n_estimators=[200, 300, 400, 500],
)

In [22]:
# Exhaustive search over param_grid around the default LightGBM regressor,
# with candidate fits dispatched across all available cores (n_jobs=-1).
gs = GridSearchCV(
    estimator=lgbmr2,
    param_grid=param_grid,
    n_jobs=-1,
)

In [23]:
# Run the grid search on the training split only; the validation split is
# not passed here and is used afterwards for scoring.
# NOTE(review): no `cv` or `scoring` is given, so scikit-learn's defaults
# apply — confirm they match the intended evaluation protocol.
gs.fit(x_train, y_train)

In [24]:
# Hyperparameter combination selected by the grid search.
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}

## 得分

In [25]:
# Score of the tuned model on the validation split, using the same scorer
# the search used (none was specified, so presumably the estimator's default
# R^2 — verify against the installed scikit-learn version).
gs.score(x_val, y_val)

0.9363240551815714

In [26]:
# Tuned-model predictions for the validation split, reused by the error metrics below.
lgbmr2_y_pred = gs.predict(x_val)

In [27]:
# Mean absolute error of the tuned model on the validation split.
mean_absolute_error(y_val, lgbmr2_y_pred)

0.056219857926803785

In [28]:
# Mean squared error of the tuned model on the validation split.
mean_squared_error(y_val, lgbmr2_y_pred)

0.00604732439662978