## 在不对数据进行独热编码的情况下，使用随机森林以及GBM算法进行学习

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb



In [28]:
# Load the training and test splits from the local data directory.
df_train, df_test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [29]:
# Keep the test timestamps: the column is dropped from the features later,
# but it is needed again when the submission file is written.
datetimecol = df_test.loc[:, 'datetime']

In [30]:
# Following the EDA results, drop rows whose `casual` value is an outlier:
# keep only rows within 0.8 standard deviations of the mean.
# NOTE(review): 0.8 * std is a tight band (3 * std is the usual outlier
# cut-off) -- confirm the threshold is intentional.
casual_deviation = (df_train["casual"] - df_train["casual"].mean()).abs()
df_train = df_train[casual_deviation <= 0.8 * df_train["casual"].std()]

In [31]:
# Set the three target columns aside before they are removed from the features.
df_target = df_train.loc[:, ['casual','count','registered']]

In [32]:
# Remove the target columns from the feature frame.
# df_train is a boolean-filtered selection at this point, so an in-place drop
# can trigger pandas' SettingWithCopyWarning; rebinding the name to the frame
# returned by drop() gives the same result without mutating a derived frame.
df_train = df_train.drop(columns=['casual','count','registered'])

In [33]:
# Number of training rows; used later to split the combined frame again.
df_train_size = len(df_train)

In [34]:
# Display the (rows, columns) shape of the test frame.
df_test.shape

(6493, 9)

In [35]:
# Stack the training rows on top of the test rows with a fresh 0..n-1 index,
# so the test rows start at position `df_train_size`.
# (An earlier comment said row 10739; the captured output below shows
# 15820 combined rows with 6493 test rows, so that figure was stale.)
df_data = pd.concat(objs=[df_train, df_test], ignore_index=True)

In [36]:
# Derive calendar features from the timestamp column. The column is parsed
# once and each feature is read off the resulting DatetimeIndex.
timestamps = pd.DatetimeIndex(df_data.datetime)
for feature_name, index_attr in (('hour', 'hour'),
                                 ('weekday', 'dayofweek'),
                                 ('month', 'month'),
                                 ('year', 'year')):
    df_data[feature_name] = getattr(timestamps, index_attr)

In [37]:
# Print column dtypes and non-null counts for the combined frame.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15820 entries, 0 to 15819
Data columns (total 13 columns):
datetime      15820 non-null object
season        15820 non-null int64
holiday       15820 non-null int64
workingday    15820 non-null int64
weather       15820 non-null int64
temp          15820 non-null float64
atemp         15820 non-null float64
humidity      15820 non-null int64
windspeed     15820 non-null float64
hour          15820 non-null int64
weekday       15820 non-null int64
month         15820 non-null int64
year          15820 non-null int64
dtypes: float64(3), int64(9), object(1)
memory usage: 1.6+ MB


In [38]:
# Preview the first rows of the combined frame, including the new time features.
df_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour,weekday,month,year
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,0,5,1,2011
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,1,5,1,2011
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,2,5,1,2011
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,5,1,2011
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,4,5,1,2011


In [39]:
# Handle each kind of feature separately: the columns listed here are cast to
# pandas' `category` dtype, all other columns keep their numeric dtype.
categoricalFeatureNames = ['season','weather','weekday','month','year','hour']
df_data = df_data.astype({name: 'category' for name in categoricalFeatureNames})

In [40]:
# The raw timestamp is no longer needed once the time features exist.
df_data = df_data.drop(columns=['datetime'])

In [41]:
# Remove the `atemp` and `month` columns from the feature set.
df_data = df_data.drop(columns=['atemp','month'])

# `casual` and `registered` are predicted separately, so each gets its own
# independent copy of the feature frame.
df_data_casual, df_data_reg = df_data.copy(), df_data.copy()

In [42]:
m = df_data.shape[0]

# Split the per-target feature frames back into training and test parts.
# The boundary is `df_train_size`, the number of training rows recorded before
# the frames were concatenated. A hard-coded 7498 was used here before; it does
# not match the actual training row count, so training rows ended up in the
# "test" frames.
df_train_casual = df_data_casual.iloc[:df_train_size,:]
df_test_casual = df_data_casual.iloc[df_train_size:,:].reset_index(drop=True)
df_train_reg = df_data_reg.iloc[:df_train_size,:]
df_test_reg = df_data_reg.iloc[df_train_size:,:].reset_index(drop=True)

In [43]:
# Undo the concatenation: the first `df_train_size` rows are the training
# features, the remainder (re-indexed from 0) are the test features.
df_train, df_test = (
    df_data.iloc[:df_train_size],
    df_data.iloc[df_train_size:].reset_index(drop=True),
)

## GBM

# Find the best GBM hyper-parameters with a cross-validated grid search.
#
# Two fixes compared with the earlier version of this cell:
#   * the search is actually fitted -- `best_params_` only exists after fit(),
#     so reading it on an unfitted GridSearchCV raises an error;
#   * the grid tunes `learning_rate` instead of `alpha`. In
#     GradientBoostingRegressor, `alpha` is the quantile used by the huber and
#     quantile losses and is ignored by the default loss, so searching over it
#     doubled the work without changing any model.
est = GradientBoostingRegressor()
param_grid = [{'n_estimators':[1000,2000,3000],
              'learning_rate':[0.1,0.01],
              'max_depth':[2,3],
              'min_samples_leaf':[10,20]}]
est = GridSearchCV(estimator=est,param_grid=param_grid,scoring='neg_mean_squared_error',cv=5,n_jobs=2)
# Fit on log(count + 1), the same target transform used for the final model.
# NOTE: 24 parameter combinations x 5 folds with up to 3000 trees each -- slow.
est.fit(df_train, np.log(df_target['count']+1))

est.best_params_

In [61]:
# GBM configuration used for the final models.
# An earlier configuration used n_estimators=2000 together with random_state=1.
# NOTE(review): per the scikit-learn docs `alpha` only affects the huber and
# quantile losses -- confirm whether `learning_rate` was intended here.
gbm_settings = dict(alpha=0.01, max_depth=3, min_samples_leaf=20, n_estimators=1000)
est = GradientBoostingRegressor(**gbm_settings)

## 因为casual和registered、count的分布不一样，所以需要不同参数的模型

# Train one independent estimator per target on log(target + 1).
# fit() returns the estimator itself, so calling est.fit(...) repeatedly and
# storing the results left est_reg, est_casual and est_count all pointing at
# the same object, holding only the last fit. Each model here is a fresh
# estimator built from est's parameters, so the fits no longer overwrite
# each other and `est` itself stays unfitted at this point.
est_reg = GradientBoostingRegressor(**est.get_params()).fit(df_train,np.log(df_target['registered']+1))

est_casual = GradientBoostingRegressor(**est.get_params()).fit(df_train, np.log(df_target['casual']+1))

In [62]:
# Fit `est` on log(count + 1). fit() returns the estimator itself, so
# est_count is the same object as est.
log_count_target = np.log(df_target['count'] + 1)
est.fit(df_train, log_count_target)
est_count = est

In [63]:
# Predict with est_count; the predictions are on the log(count + 1) scale.
y_pred = est_count.predict(X=df_test)

In [64]:
# Invert the log(count + 1) transform, round to whole rentals, and write the
# GBM submission with the test timestamps saved earlier.
result = (np.exp(y_pred) - 1).round()
df_result = pd.DataFrame({'datetime': datetimecol, 'count': result})
df_result.to_csv(path_or_buf='data/submission-gbm.csv', index=False)

## 随机森林与gbm融合

In [65]:
# Blend the random-forest and GBM submissions by averaging their counts.
# Both files are assumed to list the same rows in the same order, because the
# addition below aligns on the row index. 'data/submission-rf.csv' must already
# exist; it is not written by the cells shown here.
df_gbm = pd.read_csv('data/submission-gbm.csv')
df_rf = pd.read_csv('data/submission-rf.csv')
# Use an explicit copy: pd.DataFrame(df_rf) does not copy the data, so writing
# the blended column could also overwrite df_rf's own counts.
df_avg = df_rf.copy()
# NOTE: astype(int) truncates toward zero rather than rounding (3.5 -> 3).
df_avg['count'] = ((df_rf['count'] + df_gbm['count']) * 0.5).astype(int)
df_avg.to_csv('data/submission.csv', index = False)