# 随机森林

In [2]:
import pandas as pd
import numpy as np
import calendar
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [3]:
# Read the training and test CSVs from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Keep the test-set timestamps aside: the 'datetime' column is dropped from the
# feature frame later but is needed again when the submission file is written.
datetimecol = df_test.loc[:, 'datetime']

In [5]:
# Copy the three label columns out of the training frame before they are removed
# from the features.
df_target = df_train.loc[:, ['casual', 'count', 'registered']]

In [6]:
# Remove the label columns so the training frame has the same columns as the
# test frame.
df_train = df_train.drop(['casual', 'count', 'registered'], axis=1)

In [7]:
# Remember how many training rows there are; used later to split the combined
# frame back into train and test parts.
df_train_size = len(df_train)

In [8]:
# Sanity check: display the (rows, columns) of the test frame.
df_test.shape

(6493, 9)

In [9]:
# Stack train on top of test (fresh 0..n-1 index) so that feature engineering
# is applied to both in one pass.
df_data = pd.concat(objs=[df_train, df_test], axis=0, ignore_index=True)

In [10]:
# Parse the datetime column once and derive every calendar feature from that
# single index, instead of re-parsing the whole column for each feature.
timestamps = pd.DatetimeIndex(df_data['datetime'])
df_data['hour'] = timestamps.hour
df_data['weekday'] = timestamps.dayofweek  # pandas convention: Monday=0 ... Sunday=6
df_data['month'] = timestamps.month
df_data['year'] = timestamps.year

## 对season、weather、hour、month、weekday进行独热编码处理

In [11]:
# One-hot encode 'season' with pandas' get_dummies.
# get_dummies already inserts '_' between prefix and value, so the prefix is
# given without a trailing underscore; this yields 'season_1', ... in the same
# style as the hour/month/weekday columns (rather than 'season__1').
season_dummies = pd.get_dummies(df_data['season'], prefix='season')
df_data = df_data.drop('season', axis=1).join(season_dummies)
df_data.shape

(17379, 16)

In [12]:
# One-hot encode 'weather'.
# get_dummies already inserts '_' between prefix and value, so the prefix is
# given without a trailing underscore; this yields 'weather_1', ... in the same
# style as the hour/month/weekday columns (rather than 'weather__1').
weather_dummies = pd.get_dummies(df_data['weather'], prefix='weather')
df_data = df_data.drop('weather', axis=1).join(weather_dummies)
df_data.shape

(17379, 19)

In [13]:
# One-hot encode 'hour': swap the integer column for one indicator column per
# distinct value, appended at the end of the frame.
hour_dummies = pd.get_dummies(df_data['hour'], prefix='hour')
df_data = df_data.drop('hour', axis=1).join(hour_dummies)
df_data.shape

(17379, 42)

In [14]:
# One-hot encode 'month': swap the integer column for one indicator column per
# distinct value, appended at the end of the frame.
month_dummies = pd.get_dummies(df_data['month'], prefix='month')
df_data = df_data.drop('month', axis=1).join(month_dummies)
df_data.shape

(17379, 53)

In [15]:
# One-hot encode 'weekday': swap the integer column for one indicator column
# per distinct value, appended at the end of the frame.
weekday_dummies = pd.get_dummies(df_data['weekday'], prefix='weekday')
df_data = df_data.drop('weekday', axis=1).join(weekday_dummies)
df_data.shape

(17379, 59)

In [16]:
# The raw timestamp column has been expanded into calendar features, so it is
# removed from the feature frame.
df_data = df_data.drop('datetime', axis=1)

In [17]:
# Undo the train/test stacking: the first df_train_size rows are the training
# rows, the remainder are the test rows (re-indexed from 0).
df_train = df_data.iloc[:df_train_size]
df_test = df_data.iloc[df_train_size:].reset_index(drop=True)

# 5-fold grid search over the number of trees, scored by negated MSE and run on
# all available cores.
# NOTE(review): this only constructs the search object; no `rf.fit(...)` call
# appears in the visible cells.
param_grid = [{'n_estimators': [100, 200, 500]}]
rf = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [18]:
# Scores noted by the author for each forest size (the metric is not stated
# here; presumably a leaderboard error where lower is better -- TODO confirm):
#   n_estimators=200 -> 0.40396
#   n_estimators=500 -> 0.40428
#   n_estimators=100 -> 0.40596
# One forest per rider type, using the best-scoring size from the list above.
# A fixed random_state makes the fitted forests, and therefore the submission
# file, reproducible across kernel restarts (the scores above were recorded
# without a seed, so they may not be matched exactly).
RF_SEED = 42
rf_casual = RandomForestRegressor(n_estimators=200, random_state=RF_SEED)
rf_reg = RandomForestRegressor(n_estimators=200, random_state=RF_SEED)

In [19]:
# Fit one forest per rider type on log(1 + count) targets. np.log1p is the
# numerically accurate form of log(x + 1).
rf_casual = rf_casual.fit(df_train, np.log1p(df_target['casual']))
rf_reg = rf_reg.fit(df_train, np.log1p(df_target['registered']))

# Both models are plain RandomForestRegressor instances, which have no
# `best_params_` attribute (that exists only on a fitted GridSearchCV), so the
# original prints raised AttributeError. Report the configured forest size
# instead. print(...) with a single argument works on Python 2 and Python 3.
print({'n_estimators': rf_casual.n_estimators})
print({'n_estimators': rf_reg.n_estimators})

In [20]:
# Predict the (log-scale) casual and registered counts for the test rows.
y_pred_casual, y_pred_reg = (
    fitted_model.predict(df_test)
    for fitted_model in (rf_casual, rf_reg)
)

In [21]:
# Undo the log(1 + x) target transform on each model's predictions (np.expm1 is
# the accurate form of exp(x) - 1) and add the two rider types together to get
# the total count, rounded to the nearest whole number.
output = np.round(np.expm1(y_pred_reg) + np.expm1(y_pred_casual))

# Pass `columns` explicitly: on older pandas / Python versions a dict-built
# DataFrame sorts its keys, which would write the CSV as count,datetime
# instead of datetime,count.
df_output = pd.DataFrame(
    {'datetime': datetimecol, 'count': output},
    columns=['datetime', 'count'],
)
df_output['count'] = df_output['count'].astype(int)
df_output.to_csv('data/submission-rf.csv', index=False)