In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the three input files in one pass: the training rows, the test rows,
# and the sample submission that is later filled with predictions.
train, test, submission = map(
    pd.read_csv,
    ('train.csv', 'test.csv', 'sampleSubmission.csv'),
)

In [None]:
# Keep only the training rows whose weather code is not 4.
train = train[train['weather'].ne(4)]

In [4]:
# Stack train and test into one frame with a fresh 0..n-1 index so the
# feature engineering below is applied to both sets at once.
all_data = pd.concat((train, test), axis=0, ignore_index=True)
all_data

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17373,2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17374,2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,,,
17375,2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,,,
17376,2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,,,


In [5]:
from datetime import datetime

# Parse the timestamp column once and derive the calendar features through the
# vectorised .dt accessor. This replaces per-row string splitting/strptime and
# yields integer year/month/hour for train and test rows alike, so no frame is
# left holding string-typed features.
parsed_datetime = pd.to_datetime(all_data['datetime'])

# date feature (kept as a 'YYYY-MM-DD' string)
all_data['date'] = parsed_datetime.dt.strftime('%Y-%m-%d')
# year feature
all_data['year'] = parsed_datetime.dt.year
# month feature
all_data['month'] = parsed_datetime.dt.month
# hour feature
all_data['hour'] = parsed_datetime.dt.hour
# weekday feature (Monday=0 ... Sunday=6, the same convention as datetime.weekday())
all_data['weekday'] = parsed_datetime.dt.weekday

In [6]:
# Columns removed from the combined frame before modelling.
drop_features = ['casual', 'registered', 'datetime', 'date', 'month', 'windspeed']

all_data = all_data.drop(columns=drop_features)

In [7]:
# Rows that came from train have a 'count' value; rows from test have NaN there.
has_label = all_data['count'].notna()

# Split the combined frame back apart and remove the target column from both.
X_train = all_data[has_label].drop(columns=['count'])
X_test = all_data[~has_label].drop(columns=['count'])

# Target values, taken from the (already filtered) training frame.
y = train['count']

In [8]:
import numpy as np

def rmsle(y_true, y_pred, convertExp=True):
    """Root mean squared logarithmic error between two sets of values.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Any array-like is accepted (list, tuple,
        ndarray, Series); both are converted to float arrays and compared
        element by element in positional order.
    convertExp : bool, default True
        When True the inputs are taken to be on the natural-log scale and are
        exponentiated before the error is computed. Pass False when the inputs
        are already on the original scale.

    Returns
    -------
    float
        sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2)).
    """
    # Coerce up front so plain Python sequences support the arithmetic below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Undo the log transform when the caller passes log-scale values.
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) == log(1 + x) but stays accurate for small x. nan_to_num keeps
    # the original behaviour of replacing NaN/inf results (e.g. from values
    # below -1) with finite numbers instead of propagating them.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    # calculate RMSLE
    return np.sqrt(np.mean((log_true - log_pred) ** 2))

In [9]:
# Model the natural log of the target instead of the raw count, because the
# log transform has a normalizing effect on it.
y_train = train.loc[:, 'count']
log_y_train = np.log(y_train)

In [10]:
# 'year' and 'hour' may still be object (string) columns at this point, and
# XGBoost can't use object dtype, so cast them to numbers. The cast is applied
# to X_test as well as X_train: the fitted models predict on X_test later, and
# it needs the same dtypes as the frame they were trained on.
# assign() builds new frames instead of writing into filtered frames in place.
numeric_feature_names = ['year', 'hour']
X_train = X_train.assign(**{name: pd.to_numeric(X_train[name]) for name in numeric_feature_names})
X_test = X_test.assign(**{name: pd.to_numeric(X_test[name]) for name in numeric_feature_names})
print(X_train.dtypes)

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
atemp         float64
humidity        int64
year            int64
hour            int64
weekday         int64
dtype: object


In [11]:
# import XGBRegressor
from xgboost import XGBRegressor

# Build an XGBRegressor with its default settings and fit it on the
# log-scale target.
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, log_y_train)

# Predictions for the same rows the model was trained on (log scale).
pred_X_train = xgb_reg.predict(X_train)

# RMSLE on the training rows; convertExp=True maps both inputs back from the
# log scale before the error is computed.
xgb_train_rmsle = rmsle(log_y_train, pred_X_train, True)
print(f'XGBRegressor RMSLE value: {xgb_train_rmsle:.5f}')

XGBRegressor RMSLE value: 0.19874


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# The seed is fixed on the estimator rather than searched. random_state is not
# a hyperparameter: picking the "best" seed only selects on noise, and it
# multiplied the number of fits by the number of seeds tried.
randomforest_model = RandomForestRegressor(random_state=42)

# Score with RMSLE. greater_is_better=False makes the scorer return the negated
# error, so GridSearchCV's "maximise the score" picks the lowest RMSLE.
# rmsle's default convertExp=True matches the log-scale target used below.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)

# Grid search over the number of trees only.
rf_params = {'n_estimators': [100, 150, 200, 250, 300]}
gridsearch_random_forest_model = GridSearchCV(estimator=randomforest_model,
                                              param_grid=rf_params,
                                              scoring=rmsle_scorer,
                                              cv=5)

log_y = np.log(y)
gridsearch_random_forest_model.fit(X_train, log_y)

print('Hyper-Parameter optimizaton :', gridsearch_random_forest_model.best_params_)

# Predict on the training rows with the refitted best estimator.
preds = gridsearch_random_forest_model.best_estimator_.predict(X_train)

# NOTE: this is an in-sample score and therefore optimistic; the
# cross-validated score is available as -gridsearch_random_forest_model.best_score_.
print(f'RF regression RMSLE : {rmsle(log_y, preds, True):.4f}')

In [None]:
# Random forest: predict X_test with the best estimator from the grid search,
# then map the log-scale predictions back with exp() for the submission.
best_random_forest = gridsearch_random_forest_model.best_estimator_
randomforest_preds = best_random_forest.predict(X_test)
submission['count'] = np.exp(randomforest_preds)
# submission.to_csv('submission_RF.csv', index=False)

# XGBoost: same steps. This assignment replaces the random-forest values
# currently held in submission['count'].
# XGBoost has many tunable parameters; grid-searching some of them
# should improve the result.
pred_X_test = xgb_reg.predict(X_test)
submission['count'] = np.exp(pred_X_test)
# submission.to_csv('submission_XGBoost.csv', index=False)

# Print both log-scale prediction arrays, followed by an empty line.
print(pred_X_test, randomforest_preds, '', sep='\n')

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# X_test holds the rows whose 'count' is missing, so there are no labels to
# score against. Hold out part of the labelled data for evaluation instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

reg = linear_model.LinearRegression()

# train the model on the fitting part of the labelled data
reg.fit(X_fit, y_fit)

# regression coefficients
print('Coefficients: ', reg.coef_)

# variance score (R^2) on the held-out rows: 1 means perfect prediction
print('Variance score: {}'.format(reg.score(X_val, y_val)))