# 1. Загрузка данных

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
import seaborn as sn

In [2]:
train = pd.read_csv('Data/train.csv', low_memory=False)
train.index.name = 'Index'
test = pd.read_csv('Data/test.csv', low_memory=False)
test.index.name = 'Index'

In [3]:
rawData = pd.concat([train, test], sort=False)

In [4]:
categoricalFeatures = ["hour","weekday","month","year","season","weather","holiday","workingday"]
numericalFeatures = ["atemp", "humidity", "windspeed", "count"]
featuresToBeDropped = ["temp", "registered", "casual", "datetime", "count"]

In [5]:
#Create year, month, day, time columns
rawData['year'] = pd.to_datetime(rawData['datetime']).apply(lambda x : x.year)
rawData['month'] = pd.to_datetime(rawData['datetime']).apply(lambda x : x.month)
rawData['weekday'] = pd.to_datetime(rawData['datetime']).apply(lambda x : x.isoweekday())
rawData['hour'] = pd.to_datetime(rawData['datetime']).apply(lambda x : x.hour)

In [6]:
for var in categoricalFeatures:
    rawData[var] = rawData[var].astype("category")

In [7]:
dataTrain = rawData[pd.notnull(rawData['count'])].sort_values(by=["datetime"])
dataTest = rawData[~pd.notnull(rawData['count'])].sort_values(by=["datetime"])
datetimecol = dataTest["datetime"]
yLabels = dataTrain["count"]

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_validate, y_train, y_validate = train_test_split( dataTrain, yLabels, test_size=0.3, random_state=42)
dateTimeColValidate = X_validate["datetime"]

In [9]:
X_train.head()

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,weekday,hour
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
613,2011-02-08 17:00:00,1,0,1,1,9.02,9.09,32,39.0007,4.0,172.0,176.0,2011,2,2,17
4030,2011-09-18 23:00:00,3,0,0,1,22.14,25.76,68,12.998,10.0,44.0,54.0,2011,9,7,23
3582,2011-08-19 04:00:00,3,0,1,1,26.24,28.79,83,0.0,1.0,7.0,8.0,2011,8,5,4
10101,2012-11-06 06:00:00,4,0,1,1,9.02,11.365,69,8.9981,6.0,143.0,149.0,2012,11,2,6
1430,2011-04-05 11:00:00,2,0,1,3,13.12,14.395,81,30.0026,1.0,18.0,19.0,2011,4,2,11


In [10]:
dataTrain  = dataTrain.drop(featuresToBeDropped,axis=1)
dataTest  = dataTest.drop(featuresToBeDropped,axis=1)
X_train = X_train.drop(featuresToBeDropped,axis=1)
X_validate = X_validate.drop(featuresToBeDropped,axis=1)

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rfModel = RandomForestRegressor(n_estimators=100) ### Test 0.44
rfModel.fit(X = X_train,y = np.log1p(y_train))
preds = rfModel.predict(X= X_validate)
print ("RMSLE Value For Random Forest: ",rmsle(np.exp(np.log1p(y_validate)),np.exp(preds),False))

RMSLE Value For Random Forest:  0.2842968627886013


In [26]:
preds = rfModel.predict(X= dataTest)

In [27]:
submission = pd.DataFrame({
        "datetime": datetimecol,
        "count": [max(0, x) for x in np.exp(preds)]
    })
submission.to_csv('bike_prediction_output.csv', index=False)

In [28]:
submission

Unnamed: 0_level_0,datetime,count
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2011-01-20 00:00:00,12.484257
1,2011-01-20 01:00:00,5.853460
2,2011-01-20 02:00:00,3.902586
3,2011-01-20 03:00:00,3.912879
4,2011-01-20 04:00:00,3.202039
5,2011-01-20 05:00:00,5.953628
6,2011-01-20 06:00:00,35.056023
7,2011-01-20 07:00:00,89.054224
8,2011-01-20 08:00:00,209.645549
9,2011-01-20 09:00:00,124.986217
