In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**基于随机森林模型预测共享单车投放量**
- 加载并整理数据集
- 特征分析
- 打乱数据集，划分测试集与训练集
- 基于训练集数据训练随机森林回归模型
- 基于测试集数据验证模型优劣
- 输出特征重要性并分析

In [2]:
import sklearn.linear_model as lm  # 线性回归
import sklearn.tree as st  # 决策树
import sklearn.ensemble as se  # 集合算法
import sklearn.model_selection as ms  # 模型选择；一般涉及调参与模型选择等
import sklearn.metrics as sm  # 评估

In [3]:
# Load the daily bike-sharing dataset from the local data directory
# and preview its first three rows.
data = pd.read_csv(filepath_or_buffer='./data/bike_day.csv')
data.head(n=3)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349


In [4]:
# Remove columns that are not used as predictors:
#   - 'instant' (running row number) and 'dteday' (date string);
#   - 'casual' and 'registered': in the preview rows they add up to 'cnt',
#     so keeping them would leak the target into the features.
#
# The result is rebound to `data` instead of using inplace=True, and
# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
data = data.drop(
    columns=['instant', 'dteday', 'casual', 'registered'],
    errors='ignore',
)
data

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,985
1,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,801
2,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,1349
3,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,1562
4,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,1600
...,...,...,...,...,...,...,...,...,...,...,...,...
726,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,2114
727,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,3095
728,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,1341
729,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,1796


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
data.describe()

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
count,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0,731.0
mean,2.49658,0.500684,6.519836,0.028728,2.997264,0.683995,1.395349,0.495385,0.474354,0.627894,0.190486,4504.348837
std,1.110807,0.500342,3.451913,0.167155,2.004787,0.465233,0.544894,0.183051,0.162961,0.142429,0.077498,1937.211452
min,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,22.0
25%,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.337083,0.337842,0.52,0.13495,3152.0
50%,3.0,1.0,7.0,0.0,3.0,1.0,1.0,0.498333,0.486733,0.626667,0.180975,4548.0
75%,3.0,1.0,10.0,0.0,5.0,1.0,2.0,0.655417,0.608602,0.730209,0.233214,5956.0
max,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,8714.0


In [6]:
# Print each column's dtype and non-null count, plus the frame's memory usage,
# to check for missing values before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    int64  
 1   yr          731 non-null    int64  
 2   mnth        731 non-null    int64  
 3   holiday     731 non-null    int64  
 4   weekday     731 non-null    int64  
 5   workingday  731 non-null    int64  
 6   weathersit  731 non-null    int64  
 7   temp        731 non-null    float64
 8   atemp       731 non-null    float64
 9   hum         731 non-null    float64
 10  windspeed   731 non-null    float64
 11  cnt         731 non-null    int64  
dtypes: float64(4), int64(8)
memory usage: 68.7 KB


In [7]:
# Cross-tabulate rentals: one row per month, one column per weekday; each cell
# is aggregated with pivot_table's default aggregation function (the mean).
data.pivot_table(
    values=['cnt'],
    index=['mnth'],
    columns=['weekday'],
)

Unnamed: 0_level_0,cnt,cnt,cnt,cnt,cnt,cnt,cnt
weekday,0,1,2,3,4,5,6
mnth,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,1815.9,1926.8,2567.777778,2138.625,2513.25,2446.5,1956.555556
2,2247.5,2604.375,2824.0,2813.333333,2878.5,2933.125,2266.5
3,3301.0,3545.625,3574.0,3670.0,3817.0,3925.777778,3938.777778
4,4416.888889,4515.777778,4555.875,4330.875,4764.0,4387.0,4445.666667
5,5320.222222,4512.333333,5025.0,5119.222222,5892.666667,5751.0,5978.5
6,5940.375,5478.25,5680.875,5701.0,5621.777778,5616.333333,6343.777778
7,5298.1,5791.555556,5844.222222,5814.0,5623.625,5405.888889,5232.333333
8,4703.0,5518.333333,5930.444444,6076.9,6038.333333,5958.444444,5223.875
9,6159.555556,5637.125,5184.5,5668.25,5485.555556,5747.0,6393.666667
10,4734.888889,4632.4,5064.555556,5504.777778,5537.25,5623.25,5445.111111


In [8]:
# Split the frame into features and target. The target is selected by name, so
# the split stays correct even if 'cnt' is no longer the last column.
x, y = data.drop(columns='cnt'), data['cnt']
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.2, random_state=22)

# Train the random-forest regressor. random_state is fixed so that the
# bootstrap samples and feature sub-sampling, and therefore the metrics printed
# below, are reproducible across runs; the split above was already seeded.
model = se.RandomForestRegressor(
    max_depth=10,
    n_estimators=100,
    min_samples_split=5,
    random_state=22,
)
model.fit(train_x, train_y)

# Evaluate on the held-out 20%: R^2 score first, then mean absolute error.
pred_rfg_y = model.predict(test_x)
print(sm.r2_score(test_y, pred_rfg_y))
print(sm.mean_absolute_error(test_y, pred_rfg_y))

0.8834493897225517
440.3428880290351
