# 相关设置

In [121]:
from IPython.core.interactiveshell import InteractiveShell

import calendar
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

InteractiveShell.ast_node_interactivity = "all"  # 一个cell显示多输出

pd.set_option('display.max_rows', 100)  # 设置最大显示100行
pd.set_option('display.max_columns', 100)  # 设置最大显示100列

# 数据读取

In [122]:
raw_train_data = pd.read_csv('data/train.csv')
raw_train_data.drop(['casual', 'registered'], inplace=True, axis=1)
raw_test_data = pd.read_csv('data/test.csv')

In [123]:
raw_train_data.head()
raw_test_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [124]:
raw_data = raw_train_data.append(raw_test_data)

In [125]:
raw_data.reset_index(inplace=True)

In [126]:
raw_data.drop('index', inplace=True, axis=1)

In [127]:
raw_data.head()
raw_data.info()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    17379 non-null  object 
 1   season      17379 non-null  int64  
 2   holiday     17379 non-null  int64  
 3   workingday  17379 non-null  int64  
 4   weather     17379 non-null  int64  
 5   temp        17379 non-null  float64
 6   atemp       17379 non-null  float64
 7   humidity    17379 non-null  int64  
 8   windspeed   17379 non-null  float64
 9   count       10886 non-null  float64
dtypes: float64(4), int64(5), object(1)
memory usage: 1.3+ MB


# 特征工程

## 时间特征

In [128]:
raw_data["date"] = raw_data.datetime.apply(lambda x: x.split()[0])
raw_data["year"] = raw_data.datetime.apply(lambda x: x.split()[0].split("-")[0])
raw_data['month'] = pd.DatetimeIndex(raw_data.datetime).month
raw_data['weekday'] = pd.DatetimeIndex(raw_data.datetime).dayofweek
raw_data['hour'] = pd.DatetimeIndex(raw_data.datetime).hour

In [129]:
raw_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,date,year,month,weekday,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16.0,2011-01-01,2011,1,5,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40.0,2011-01-01,2011,1,5,1
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32.0,2011-01-01,2011,1,5,2
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13.0,2011-01-01,2011,1,5,3
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1.0,2011-01-01,2011,1,5,4


## windspeed 特征的填充（使用随机森林）

In [130]:
wind_zero_data = raw_data[raw_data["windspeed"] == 0]
wind_not_zero_data = raw_data[raw_data["windspeed"] != 0]

wind_columns = ["season", "weather", "temp", "atemp", 
                "humidity", "year", "month"]

# 训练模型
rf_model_wind = RandomForestRegressor()
rf_model_wind.fit(wind_not_zero_data[wind_columns], wind_not_zero_data["windspeed"])

# 预测结果
wind_zero_values = rf_model_wind.predict(X=wind_zero_data[wind_columns])
wind_zero_data["windspeed"] = wind_zero_values

raw_data = wind_not_zero_data.append(wind_zero_data)

raw_data.reset_index(inplace=True)
raw_data.drop('index', inplace=True, axis=1)

RandomForestRegressor()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [131]:
raw_data.head(-10)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,date,year,month,weekday,hour
0,2011-01-01 05:00:00,1,0,0,2,9.84,12.880,75,6.003200,1.0,2011-01-01,2011,1,5,5
1,2011-01-01 10:00:00,1,0,0,1,15.58,19.695,76,16.997900,36.0,2011-01-01,2011,1,5,10
2,2011-01-01 11:00:00,1,0,0,1,14.76,16.665,81,19.001200,56.0,2011-01-01,2011,1,5,11
3,2011-01-01 12:00:00,1,0,0,1,17.22,21.210,77,19.001200,84.0,2011-01-01,2011,1,5,12
4,2011-01-01 13:00:00,1,0,0,2,18.86,22.725,72,19.999500,94.0,2011-01-01,2011,1,5,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17364,2012-12-24 23:00:00,1,0,1,3,9.84,14.395,93,7.265159,,2012-12-24,2012,12,0,23
17365,2012-12-25 08:00:00,1,1,0,2,9.84,14.395,87,9.372161,,2012-12-25,2012,12,1,8
17366,2012-12-25 09:00:00,1,1,0,2,9.84,12.880,87,6.742612,,2012-12-25,2012,12,1,9
17367,2012-12-25 10:00:00,1,1,0,1,11.48,15.910,81,7.096412,,2012-12-25,2012,12,1,10


## 划分数据集和标签

In [145]:
train_data = raw_data[pd.notnull(raw_data['count'])].sort_values(by=["datetime"])
test_data = raw_data[~pd.notnull(raw_data['count'])].sort_values(by=["datetime"])

train_data.reset_index(inplace=True)
test_data.reset_index(inplace=True)
train_data.drop('index', inplace=True, axis=1)
test_data.drop('index', inplace=True, axis=1)

train_labels = train_data["count"]

In [147]:
print(train_labels)

0         16.0
1         40.0
2         32.0
3         13.0
4          1.0
         ...  
10881    336.0
10882    241.0
10883    168.0
10884    129.0
10885     88.0
Name: count, Length: 10886, dtype: float64


In [148]:
features = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 
            'humidity', 'windspeed', 'year', 'month', 'weekday', 'hour']
train_data = train_data[features]
test_data = test_data[features]
train_data.head()
test_data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,weekday,hour
0,1,0,0,1,9.84,14.395,81,6.923769,2011,1,5,0
1,1,0,0,1,9.02,13.635,80,6.760246,2011,1,5,1
2,1,0,0,1,9.02,13.635,80,6.760246,2011,1,5,2
3,1,0,0,1,9.84,14.395,75,6.707135,2011,1,5,3
4,1,0,0,1,9.84,14.395,75,6.707135,2011,1,5,4


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,weekday,hour
0,1,0,1,1,10.66,11.365,56,26.0027,2011,1,3,0
1,1,0,1,1,10.66,13.635,56,8.746528,2011,1,3,1
2,1,0,1,1,10.66,13.635,56,8.746528,2011,1,3,2
3,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,3
4,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,4
