In [352]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, make_scorer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [67]:
def check_rmsle_gb(x, y):
    x_t, x_v, y_t, y_v = train_test_split(x, y, test_size=0.2)
    params={'n_estimators':[700],
            'max_depth' : [5],
            'loss' : ['huber'],
            'learning_rate' : [0.1],
            'alpha' : [0.99],
            'max_features':["auto"],
            'min_samples_split' : [6]}
    grid_gb = GridSearchCV(GradientBoostingRegressor(), param_grid=params, cv=5)
    grid_gb.fit(x_t, np.log1p(y_t))
    y_pred = grid_gb.best_estimator_.predict(x_v)
    
    error = np.sqrt(mean_squared_log_error(y_v, np.exp(np.where(y_pred<=0, 0., y_pred))))
    
    print(grid_gb.best_estimator_)
    return error

In [244]:
def date_update(df):
    df['datetime'] = pd.to_datetime(df['datetime']) # 
    df['day'] = df.datetime.dt.day
    df['month'] = df.datetime.dt.month
    df['year'] = df.datetime.dt.year
    df['hour'] = df.datetime.dt.hour
    df['day_of_week'] = df.datetime.dt.dayofweek
    df['week_of_year'] = df.datetime.dt.weekofyear
    df = df.drop('datetime', axis=1)
    
    return df

In [154]:
def onehotencoding(df, categories):
    for categor in categories:
        df = pd.concat([df, pd.get_dummies(df[categor], prefix=categor)], axis=1)
        
    return df.drop(categories, axis=1)

In [589]:
def make_submission(x, y, x_pred):
    gbr = GradientBoostingRegressor(alpha=0.99, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=5,
                          max_features='auto', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=6,
                          min_weight_fraction_leaf=0.0, n_estimators=800,
                          n_iter_no_change=None, presort='auto',
                          subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    gbr.fit(x, np.log1p(y))
    predict = gbr.predict(x_pred)
    
    submission = pd.DataFrame({
        "datetime": pd.date_range('2012-12-20', '2013-3-19 23', freq='h'),
        "count": [max(0, x) for x in np.round(np.exp(predict))]
    })
    submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)
    
    return pd.Series([max(0, x) for x in np.round(np.exp(predict))])

In [249]:
def make_submission_reg_plus_cas(x, y1, y2, x_pred):
    gbr = GradientBoostingRegressor(alpha=0.99, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=5,
                          max_features='auto', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=6,
                          min_weight_fraction_leaf=0.0, n_estimators=800,
                          n_iter_no_change=None, presort='auto',
                          subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
    
    gbr.fit(x, np.log1p(y1))
    predict1 = gbr.predict(x_pred)
    
    gbr.fit(x, np.log1p(y2))
    predict2 = gbr.predict(x_pred)
    
    predict = np.where(np.round(np.exp(predict1))<0, 0, np.round(np.exp(predict1))) + np.where(np.round(np.exp(predict2))<0, 0, np.round(np.exp(predict2)))
    
    submission = pd.DataFrame({
        "datetime": datetime_test,
        "count": [max(0, x) for x in predict]
    })
    submission.to_csv('bike_predictions_gbm_separate_without_fe.csv', index=False)
    
    return pd.Series([max(0, x) for x in predict])

# Выбор модели

In [553]:
x_train = pd.read_csv('train.csv')

In [554]:
datetime_test = x_test['datetime']

x_train = date_update(x_train)
x_test = date_update(x_test)

In [555]:
y_train_reg = x_train['registered']
y_train_cas = x_train['casual']
y_train = x_train['count']
x_train = x_train.drop(['casual', 'registered', 'count'], axis=1)

In [68]:
# подбор параметров модели
print("Root Mean Square Logaritm Error GB:", check_rmsle_gb(x_train, y_train))

GradientBoostingRegressor(alpha=0.99, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='huber', max_depth=5,
                          max_features='auto', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=6,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='auto',
                          random_state=20, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Root Mean Square Logaritm Error GB: 0.2833662633307364


# Трансформация датасета

In [174]:
# для содели с count
# features_for_onehotencoding = ['season', 'weather', 'day_of_week', 'holiday', 'workingday', 'year', 'month', 'hour']
# make_submission(onehotencoding(x_train, features_for_onehotencoding), y_train, onehotencoding(x_test.sort_values(by=['year', 'month', 'day', 'hour']), features_for_onehotencoding))

In [558]:
# для расдлельного предсказания reg и casual
# features_for_onehotencoding = ['season', 'weather', 'day_of_week', 'month', 'hour']
y_test = make_submission_reg_plus_cas(x_train, y_train_reg, y_train_cas, x_test.sort_values(by=['year', 'month', 'day', 'hour']))

In [567]:
y_test = pd.Series(y_test)

x_full = pd.concat([x_train, x_test])
y_full = pd.concat([y_train, y_test])

# Создаем данные для прогноза

In [526]:
# range datetime period
x_predict = pd.DataFrame({"datetime": pd.date_range('2012-12-20', '2013-3-19 23', freq='h')})

In [527]:
# holidays

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
cal.holidays('2012-12-20', '2013-3-19 23')
x_predict['holiday'] = x_predict['datetime'].isin(cal.holidays('2012-12-20', '2013-3-19 23')).map({False : 0, True : 1})

In [528]:
# transform datetime
x_predict = date_update(x_predict)

In [529]:
# season
x_predict['season'] = (x_predict['month'] % 12 + 3) // 3

In [530]:
x_predict['workingday'] = (x_predict['day_of_week']<5).map({False : 0, True : 1}) # 0 - monday

In [536]:
def weather(x, y):
    
#     x = x.reset_index(drop=True, inplace=True)
#     y = y.reset_index(drop=True, inplace=True)
    
    a, b, c, d, e = [], [], [], [], []
    
    for i in x.index:
        day = x[x.index == i]['day'].values[0]
        month = x[x.index == i]['month'].values[0]
        hour = x[x.index == i]['hour'].values[0]

        weather = np.round(y[(y.day==day) & (y.month==month) & (y.hour==hour)]['weather'].values.mean())
        temp = y[(y.day==day) & (y.month==month) & (y.hour==hour)]['temp'].values.mean()
        atemp = y[(y.day==day) & (y.month==month) & (y.hour==hour)]['atemp'].values.mean()
        humidity = np.round(y[(y.day==day) & (y.month==month) & (y.hour==hour)]['humidity'].values.mean())
        wind = y[(y.day==day) & (y.month==month) & (y.hour==hour)]['windspeed'].values.mean()
    
        a.append(weather)
        b.append(temp)
        c.append(atemp)
        d.append(humidity)
        e.append(wind)

    
    
    return a, b, c, d, e

In [539]:
# weather, temp and etc.

a, b, c, d, e = weather(x_predict, x_full)

x_predict['weather'] = a
x_predict['temp'] = b
x_predict['atemp'] = c
x_predict['humidity'] = d
x_predict['windspeed'] = e

In [614]:
x_full

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,day,month,year,hour,day_of_week,week_of_year
0,1,0,0,1,9.84,14.395,81,0.0000,1,1,2011,0,5,52
1,1,0,0,1,9.02,13.635,80,0.0000,1,1,2011,1,5,52
2,1,0,0,1,9.02,13.635,80,0.0000,1,1,2011,2,5,52
3,1,0,0,1,9.84,14.395,75,0.0000,1,1,2011,3,5,52
4,1,0,0,1,9.84,14.395,75,0.0000,1,1,2011,4,5,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,0,1,2,10.66,12.880,60,11.0014,31,12,2012,19,0,1
17375,1,0,1,2,10.66,12.880,60,11.0014,31,12,2012,20,0,1
17376,1,0,1,1,10.66,12.880,60,11.0014,31,12,2012,21,0,1
17377,1,0,1,1,10.66,13.635,56,8.9981,31,12,2012,22,0,1


In [615]:
x_predict

Unnamed: 0,holiday,day,month,year,hour,day_of_week,week_of_year,season,workingday,weather,temp,atemp,humidity,windspeed,count
0,0,20,12,2012,0,3,51,1,1,1.0,13.94,17.4225,59.0,7.99980,76.0
1,0,20,12,2012,1,3,51,1,1,1.0,13.94,18.1825,60.0,0.00000,50.0
2,0,20,12,2012,2,3,51,1,1,2.0,13.53,17.8025,64.0,0.00000,53.0
3,0,20,12,2012,3,3,51,1,1,2.0,13.12,17.0450,66.0,3.00160,83.0
4,0,20,12,2012,4,3,51,1,1,2.0,13.53,17.0450,62.0,6.50235,79.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,0,19,3,2013,19,1,12,2,1,1.0,21.73,26.1375,47.0,17.00125,14.0
2156,0,19,3,2013,20,1,12,2,1,1.0,20.91,25.7575,50.0,15.00130,14.0
2157,0,19,3,2013,21,1,12,2,1,1.0,20.50,25.3800,50.0,15.50045,17.0
2158,0,19,3,2013,22,1,12,2,1,2.0,19.68,23.4850,55.0,16.50210,22.0


In [631]:
# features_for_onehotencoding = ['day_of_week', 'hour']
x_full.reset_index(inplace=True, drop=True)
y_predict = make_submission(x_train.drop(['year', 'count'], axis=1), y_train, x_predict.drop(['year', 'count'], axis=1))

In [580]:
import seaborn as sns

In [633]:
y_test.mean(), y_train.mean(), y_full.mean(),  y_predict.mean()

(185.16879716617896, 191.57413191254824, 189.18102307382472, 85.0337962962963)

In [629]:
x_train['count'] = y_train
x_predict['count'] = y_predict
x_test['count'] = y_test


In [626]:
x_train.groupby(['year', 'month'])['count'].sum(), \
x_test.groupby(['year', 'month'])['count'].sum()


(year  month
 2011  1         23552
       2         32844
       3         38735
       4         50517
       5         79713
       6         89776
       7         92848
       8         83296
       9         79104
       10        79522
       11        70889
       12        61183
 2012  1         56332
       2         66269
       3         94766
       4        116885
       5        120434
       6        130957
       7        121769
       8        130220
       9        133425
       10       127912
       11       105551
       12        98977
 Name: count, dtype: int64, year  month
 2011  1        13316.0
       2        14565.0
       3        26546.0
       4        34006.0
       5        58864.0
       6        51524.0
       7        54617.0
       8        49481.0
       9        39351.0
       10       43616.0
       11       37475.0
       12       30478.0
 2012  1        37611.0
       2        37367.0
       3        67036.0
       4        58618.0
       5   

In [632]:
x_predict.groupby(['year', 'month'])['count'].sum()

year  month
2012  12       15941.0
2013  1        64090.0
      2        63442.0
      3        36407.0
Name: count, dtype: float64

In [627]:
x_train

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,day,month,year,hour,day_of_week,week_of_year,count
0,1,0,0,1,9.84,14.395,81,0.0000,1,1,2011,0,5,52,16
1,1,0,0,1,9.02,13.635,80,0.0000,1,1,2011,1,5,52,40
2,1,0,0,1,9.02,13.635,80,0.0000,1,1,2011,2,5,52,32
3,1,0,0,1,9.84,14.395,75,0.0000,1,1,2011,3,5,52,13
4,1,0,0,1,9.84,14.395,75,0.0000,1,1,2011,4,5,52,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,19,12,2012,19,2,51,336
10882,4,0,1,1,14.76,17.425,57,15.0013,19,12,2012,20,2,51,241
10883,4,0,1,1,13.94,15.910,61,15.0013,19,12,2012,21,2,51,168
10884,4,0,1,1,13.94,17.425,61,6.0032,19,12,2012,22,2,51,129
