In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel

import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 30)

#sns.set_style("whitegrid")
#plt.style.use('bmh')
plt.style.use('seaborn-whitegrid')

# this allows plots to appear directly in the notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
train_df = pd.read_csv('data/train.csv')
train_df['data_set'] = 'train'
test_df = pd.read_csv('data/test.csv')
test_df['data_set'] = 'test'
# combine train and test data into one df
test_df['registered'] = 0
test_df['casual'] = 0
test_df['count'] = 0

all_df = pd.concat([train_df, test_df])
# parse datetime colum & add new time related columns
dt = pd.DatetimeIndex(all_df['datetime'])
all_df.set_index(dt, inplace=True)

# logarithmic transformation of dependent cols
# (adding 1 first so that 0 values don't become -inf)
for col in ['casual', 'registered', 'count']:
    all_df[f'{col}_log'] = np.log(all_df[col] + 1)

all_df['date'] = dt.date # yyyymmdd
all_df['day'] = dt.day # dd
all_df['month'] = dt.month # mm
all_df['year'] = dt.year # yyyy
all_df['hour'] = dt.hour # hh
all_df['dow'] = dt.dayofweek #曜日 Mon:0 Tue:1 Wed:2 Thu:3 Fri:4 Sat:5 Sun:6
all_df['woy'] = dt.isocalendar().week #その日の週が年間で見ると何番目の週かを表す数字 [dt.weekofyear]は deprecated

# add a count_season column using join
by_season = all_df[all_df['data_set'] == 'train'].copy().groupby(['season'])[['count']].agg(sum)
by_season.columns = ['count_season']
all_df = all_df.join(by_season, on='season')


# feature engineer a new column whether its a peak hour or not
all_df['peak'] = all_df[['hour', 'workingday']]\
    .apply(lambda df: 3 if ((df['workingday'] == 1 and (df['hour'] == 8 or 17 <= df['hour'] <= 18)) \
                            or (df['workingday'] == 0 and 11 <= df['hour'] <= 17)) else \
                            ( 2 if ((df['workingday'] == 1 and (df['hour'] == 7 or df['hour'] == 9 or df['hour'] == 16 or 19 <= df['hour'] <= 20)) \
                            or (df['workingday'] == 0 and (df['hour'] == 10 or 18 <= df['hour'] <= 19))) else \
                            ( 1 if ((df['workingday'] == 1 and (10 <= df['hour'] <= 15 or 21 <= df['hour'] <= 22)) \
                            or (df['workingday'] == 0 and (8 <= df['hour'] <= 9 or 20 <= df['hour'] <= 23))) else 0)), axis = 1)

#ここの修正の仕方は、間違っているので要修正！
# sandy
#all_df['holiday'] = all_df[['month', 'day', 'holiday', 'year']]\
#    .apply(lambda df: 1 if (df['year'] == 2012 and df['month'] == 10 and df['day'] == 30) else 0, axis = 1)
# 修正後↓
all_df['holiday'] = all_df[['month', 'day', 'holiday', 'year']]\
    .apply(lambda df: 1 if (df['year'] == 2012 and df['month'] == 10 and df['day'] == 30) else df['holiday'], axis = 1)


# christmas and others
all_df['holiday'] = all_df[['month', 'day', 'holiday']]\
    .apply(lambda df: 1 if (df['month'] == 12 and df['day'] in [24, 26, 31]) else df['holiday'], axis = 1)
all_df['workingday'] = all_df[['month', 'day', 'workingday']]\
    .apply(lambda df: 0 if df['month'] == 12 and df['day'] in [24, 31] else df['workingday'], axis = 1)
# これは流石に気づかない気がする。。。気づけない気がする。。。
def get_day(day_start):
    day_end = day_start + pd.offsets.DateOffset(hours=23)
    return pd.date_range(day_start, day_end, freq="H")

# tax day
all_df.loc[get_day(datetime(2011, 4, 15)), "workingday"] = 1
all_df.loc[get_day(datetime(2012, 4, 16)), "workingday"] = 1

# thanksgiving friday
all_df.loc[get_day(datetime(2011, 11, 25)), "workingday"] = 0
all_df.loc[get_day(datetime(2012, 11, 23)), "workingday"] = 0

# tax day
all_df.loc[get_day(datetime(2011, 4, 15)), "holiday"] = 0
all_df.loc[get_day(datetime(2012, 4, 16)), "holiday"] = 0

# thanksgiving friday
all_df.loc[get_day(datetime(2011, 11, 25)), "holiday"] = 1
all_df.loc[get_day(datetime(2012, 11, 23)), "holiday"] = 1

#storms
all_df.loc[get_day(datetime(2012, 5, 21)), "holiday"] = 1

#tornado
all_df.loc[get_day(datetime(2012, 6, 1)), "holiday"] = 1
# from histogram
all_df['ideal'] = all_df[['temp', 'windspeed']]\
    .apply(lambda df: 1 if (df['temp'] > 27 and df['windspeed'] < 30) else 0, axis = 1)
    
all_df['sticky'] = all_df[['humidity', 'workingday']]\
    .apply(lambda df: 1 if (df['workingday'] == 1 and df['humidity'] >= 60) else 0, axis = 1)

# One-hot-Encoding for season
#season_map = {1:'Spring', 2:'Summer', 3:'Fall', 4:'Winter'}
#all_df['season'] = all_df['season'].map(lambda d : season_map[d])
#temporary = pd.get_dummies(all_df['season'])
#all_df['season_Fall'] = temporary['Fall']
#all_df['season_Spring'] = temporary['Spring']
#all_df['season_Summer'] = temporary['Summer']
#all_df['season_Winter'] = temporary['Winter']

# temperature
all_df.loc[all_df['temp'] < 10,'temp_cat'] = 1
all_df.loc[(all_df['temp'] >= 10) & (all_df['temp'] < 15),'temp_cat'] = 2
all_df.loc[(all_df['temp'] >= 15) & (all_df['temp'] < 20),'temp_cat'] = 3
all_df.loc[(all_df['temp'] >= 20) & (all_df['temp'] < 25),'temp_cat'] = 4
all_df.loc[(all_df['temp'] >= 25) & (all_df['temp'] < 30),'temp_cat'] = 5
all_df.loc[(all_df['temp'] >= 30) & (all_df['temp'] < 35),'temp_cat'] = 6
all_df.loc[(all_df['temp'] >= 35),'temp_cat'] = 7

# humidity many category
all_df.loc[all_df['humidity'] < 10,'humidity_cat_many'] = 0
all_df.loc[(all_df['humidity'] >= 10) & (all_df['humidity'] < 20),'humidity_cat_many'] = 1
all_df.loc[(all_df['humidity'] >= 20) & (all_df['humidity'] < 30),'humidity_cat_many'] = 2
all_df.loc[(all_df['humidity'] >= 30) & (all_df['humidity'] < 40),'humidity_cat_many'] = 3
all_df.loc[(all_df['humidity'] >= 40) & (all_df['humidity'] < 50),'humidity_cat_many'] = 4
all_df.loc[(all_df['humidity'] >= 50) & (all_df['humidity'] < 60),'humidity_cat_many'] = 5
all_df.loc[(all_df['humidity'] >= 60) & (all_df['humidity'] < 70),'humidity_cat_many'] = 6
all_df.loc[(all_df['humidity'] >= 70) & (all_df['humidity'] < 80),'humidity_cat_many'] = 7
all_df.loc[(all_df['humidity'] >= 80) & (all_df['humidity'] < 90),'humidity_cat_many'] = 8
all_df.loc[(all_df['humidity'] >= 90),'humidity_cat_many'] = 9

# humidity not many category
all_df.loc[all_df['humidity'] < 20,'humidity_cat_less'] = 0
all_df.loc[(all_df['humidity'] >= 20) & (all_df['humidity'] < 40),'humidity_cat_less'] = 1
all_df.loc[(all_df['humidity'] >= 40) & (all_df['humidity'] < 60),'humidity_cat_less'] = 2
all_df.loc[(all_df['humidity'] >= 60) & (all_df['humidity'] < 80),'humidity_cat_less'] = 3
all_df.loc[(all_df['humidity'] >= 80),'humidity_cat_less'] = 4

# windspeed
all_df.loc[all_df['windspeed'] < 5,'wind_cat'] = 0
all_df.loc[(all_df['windspeed'] >= 5) & (all_df['windspeed'] < 10),'wind_cat'] = 1
all_df.loc[(all_df['windspeed'] >= 10) & (all_df['windspeed'] < 15),'wind_cat'] = 2
all_df.loc[(all_df['windspeed'] >= 15) & (all_df['windspeed'] < 20),'wind_cat'] = 3
all_df.loc[(all_df['windspeed'] >= 20) & (all_df['windspeed'] < 25),'wind_cat'] = 4
all_df.loc[(all_df['windspeed'] >= 25) & (all_df['windspeed'] < 30),'wind_cat'] = 5
all_df.loc[(all_df['windspeed'] >= 30) & (all_df['windspeed'] < 35),'wind_cat'] = 6
all_df.loc[(all_df['windspeed'] >= 35) & (all_df['windspeed'] < 40),'wind_cat'] = 7
all_df.loc[(all_df['windspeed'] >= 40) & (all_df['windspeed'] < 45),'wind_cat'] = 8
all_df.loc[(all_df['windspeed'] >= 45),'wind_cat'] = 9


In [3]:
all_df.head(3)

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,data_set,casual_log,registered_log,count_log,date,day,month,year,hour,dow,woy,count_season,peak,ideal,sticky,temp_cat,humidity_cat_many,humidity_cat_less,wind_cat
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,train,1.386294,2.639057,2.833213,2011-01-01,1,1,2011,0,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,train,2.197225,3.496508,3.713572,2011-01-01,1,1,2011,1,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,train,1.791759,3.332205,3.496508,2011-01-01,1,1,2011,2,5,52,312498,0,0,0,1.0,8.0,4.0,0.0


In [4]:
all_df.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
       'data_set', 'casual_log', 'registered_log', 'count_log', 'date', 'day',
       'month', 'year', 'hour', 'dow', 'woy', 'count_season', 'peak', 'ideal',
       'sticky', 'temp_cat', 'humidity_cat_many', 'humidity_cat_less',
       'wind_cat'],
      dtype='object')

In [5]:
req_cols = ['season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 
        'day','month', 'year', 'hour', 'dow', 'woy', 'peak', 'ideal',
       'sticky', 'temp_cat', 'humidity_cat_many', 'humidity_cat_less','wind_cat'
       #'datetime', 'casual', 'registered', 'count','data_set', 'casual_log', 'registered_log', 'count_log', 'date','count_season', 
       ]


In [6]:

""" X_train, y_train_r, y_train_c = prep_train_data(train_df, req_cols)

selector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 0.3)

selector.fit(X_train,y_train_r)
X_train_f_select_r = selector.transform(X_train) """

' X_train, y_train_r, y_train_c = prep_train_data(train_df, req_cols)\n\nselector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 0.3)\n\nselector.fit(X_train,y_train_r)\nX_train_f_select_r = selector.transform(X_train) '

In [7]:
""" feature_idx = selector.get_support()
X_train.columns[feature_idx] """

' feature_idx = selector.get_support()\nX_train.columns[feature_idx] '

In [6]:
# instead of randomly splitting our training data 
# for cross validation, let's construct a framework that's more
# in line with how the data is divvied up for this competition
# (given first 19 days of each month, what is demand for remaining days)
# so, let's split our training data into 2 time contiguous datasets
# for fitting and validating our model (days 1-14 vs. days 15-19).

# also, since submissions are evaluated based on the
# root mean squared logarithmic error (RMSLE), let's replicate
# that computation as we test and tune our model.

train_df = all_df[all_df['data_set'] == 'train']
test_df = all_df[all_df['data_set'] == 'test']

def get_rmsle(y_pred, y_actual):
    diff = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

def custom_train_valid_split(data, cutoff_day=15):
    train = data[data['day'] <= cutoff_day]
    valid = data[data['day'] > cutoff_day]

    return train, valid

def prep_train_data(data, input_cols):
    X = data[input_cols]#.values
    y_r = data['registered_log']#.values
    y_c = data['casual_log']#.values

    return X, y_r, y_c

# predict on validation set & transform output back from log scale
def predict_on_validation_set(model, input_cols):
    
    train, valid = custom_train_valid_split(train_df)
    y_pred_comb_l = []
    y_actual_comb_l = []

    for year_val in [2011,2012]:
        for month_val in range(1,13):

            print(f'Now,{year_val} {month_val} training and validating...')
            # split data for kaggle data rules
            train_tmp = train.query('year <= @year_val and month <= @month_val')
            valid_tmp = valid.query('year == @year_val and month == @month_val')
            
            # split data for feature selection and training, validating
            X_train, y_train_r, y_train_c = prep_train_data(train_tmp, req_cols)
            X_valid, y_valid_r, y_valid_c = prep_train_data(valid_tmp, req_cols)

            # feature selection and training and validating
            selector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 0.1)#'median')

            selector.fit(X_train,y_train_r)
            X_train_fs_r = selector.transform(X_train)
            print(X_train.columns[selector.get_support()].tolist())
            model_r = model.fit(X_train_fs_r, y_train_r)
            X_valid_fs_r = selector.transform(X_valid)
            y_pred_r = np.exp(model_r.predict(X_valid_fs_r)) - 1
            
            selector.fit(X_train,y_train_c)
            X_train_fs_c = selector.transform(X_train)
            print(X_train.columns[selector.get_support()].tolist())
            model_c = model.fit(X_train_fs_c, y_train_c)
            X_valid_fs_c = selector.transform(X_valid)
            y_pred_c = np.exp(model_c.predict(X_valid_fs_c)) - 1

            y_pred_comb = np.round(y_pred_r + y_pred_c)
            y_pred_comb[y_pred_comb < 0] = 0
            y_pred_comb_l.extend(y_pred_comb)

            y_actual_comb = np.exp(y_valid_r) + np.exp(y_valid_c) - 2
            y_actual_comb_l.extend(y_actual_comb)

            #rmsle = get_rmsle(y_pred_comb, y_actual_comb)
            #rmsle_l.append(rmsle)
    
    rmsle = get_rmsle(np.array(y_pred_comb_l),np.array(y_actual_comb_l))
    
    return (np.array(y_pred_comb_l), np.array(y_actual_comb_l), rmsle)


# predict on test set & transform output back from log scale
""" def predict_on_test_set(model, input_cols):
    
    y_pred_comb_l = []
    for year_val in [2011,2012]:
        for month_val in range(1,13):
            
            # prepare training set
            print(f'Now,{year_val} {month_val} testing...')
            train_df_tmp = train_df.query('year <= @year_val and month <= @month_val')
            test_df_tmp = test_df.query('year == @year_val and month == @month_val')

            X_train, y_train_r, y_train_c = prep_train_data(train_df_tmp, req_cols)

            # prepare testing set
            X_test = test_df_tmp[req_cols]

            # feature selection and testing
            selector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 'median')

            selector.fit(X_train,y_train_r)
            X_train_fs_r = selector.transform(X_train)
            print(X_train.columns[selector.get_support()])
            model_r = model.fit(X_train_fs_r, y_train_r)
            X_test_fs_r = selector.transform(X_test)
            y_pred_r = np.exp(model_r.predict(X_test_fs_r)) - 1

            selector.fit(X_train,y_train_c)
            X_train_fs_c = selector.transform(X_train)
            print(X_train.columns[selector.get_support()])
            model_c = model.fit(X_train_fs_c, y_train_c)
            X_test_fs_c = selector.transform(X_test)
            y_pred_c = np.exp(model_c.predict(X_test_fs_c)) - 1
            
            # add casual & registered predictions together
            y_pred_comb = np.round(y_pred_r + y_pred_c)
            y_pred_comb[y_pred_comb < 0] = 0
            y_pred_comb_l.extend(y_pred_comb)

    
    return np.array(y_pred_comb_l) """

In [7]:
# Random forest for training and validation
params = {'n_estimators': 1000, 'max_depth': 15, 'random_state': 0, 'min_samples_split' : 5, 'n_jobs': -1}
rf_model = RandomForestRegressor(**params)
rf_cols = [
    #'weather', 'temp', 'atemp', 'windspeed',
    'weather', 'temp_cat', 'wind_cat',
    #'workingday', 'season', 'holiday', 'sticky',
    'workingday', 'season', 'holiday', 'humidity_cat',
    #'workingday', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'holiday', 'sticky',
    'hour', 'dow', 'woy', 'peak'
    ]

(rf_pred, rf_actual, rf_rmsle) = predict_on_validation_set(rf_model, rf_cols)

Now,2011 1 training and validating...
['peak']
['temp', 'hour', 'peak']
Now,2011 2 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 3 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 4 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 5 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 6 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 7 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 8 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 9 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 10 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 11 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 12 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour',

In [8]:
print(f'rf_pred.shape: {rf_pred.shape}   rf_actual.shape: {rf_actual.shape}   rf_rmsle: {rf_rmsle}')
#all_df[rf_cols].corr()

rf_pred.shape: (2286,)   rf_actual.shape: (2286,)   rf_rmsle: 0.6963769073763493


In [9]:
# Gradient Boosting for training and validation
params = {'n_estimators': 150, 'max_depth': 5, 'random_state': 0, 'min_samples_leaf' : 10, 'learning_rate': 0.1, 'subsample': 0.7, 'loss': 'ls'}
gbm_model = GradientBoostingRegressor(**params)
gbm_cols = [
    'weather', 'temp_cat', 'humidity', 'wind_cat',
    #'weather', 'temp', 'atemp', 'humidity', 'windspeed',
    #'weather', 'temp', 'atemp', 'humidity', 'wind_cat',
    'holiday', 'workingday', 'season',
    #'holiday', 'workingday', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter',
    'hour', 'dow', 'year', #'ideal', #'count_season',
]

(gbm_pred, gbm_actual, gbm_rmsle) = predict_on_validation_set(gbm_model, gbm_cols)

Now,2011 1 training and validating...
['peak']
['temp', 'hour', 'peak']
Now,2011 2 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 3 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 4 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 5 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 6 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 7 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 8 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 9 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 10 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 11 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 12 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour',

In [10]:
print(f'gbm_pred.shape: {gbm_pred.shape}   gbm_actual.shape: {gbm_actual.shape}   gbm_rmsle: {gbm_rmsle}')
#all_df[gbm_cols].corr()

gbm_pred.shape: (2286,)   gbm_actual.shape: (2286,)   gbm_rmsle: 0.6958926884786664


In [11]:
# the blend gives a better score on the leaderboard, even though it does not on the validation set
y_pred = np.round(.4*rf_pred + .6*gbm_pred)
#y_pred = np.round(.4*rf_pred + .6*lgb_pred)
print(get_rmsle(y_pred, rf_actual))

0.6958149015323377


In [16]:
""" rf_pred = predict_on_test_set(rf_model, rf_cols)
gbm_pred = predict_on_test_set(gbm_model, gbm_cols)

y_pred = np.round(.4*rf_pred + .6*gbm_pred)
#y_pred = np.round(.4*rf_pred + .6*lgb_pred) """

Now,2011 1 testing...
Index(['workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',
       'day', 'hour', 'dow', 'peak', 'wind_cat'],
      dtype='object')
Index(['workingday', 'temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour',
       'dow', 'peak', 'temp_cat', 'wind_cat'],
      dtype='object')
Now,2011 2 testing...
Index(['workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',
       'day', 'hour', 'dow', 'woy', 'peak'],
      dtype='object')
Index(['temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour', 'dow', 'woy',
       'peak', 'temp_cat', 'wind_cat'],
      dtype='object')
Now,2011 3 testing...
Index(['workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',
       'day', 'hour', 'dow', 'woy', 'peak'],
      dtype='object')
Index(['weather', 'temp', 'atemp', 'humidity', 'windspeed', 'day', 'hour',
       'dow', 'woy', 'peak', 'temp_cat'],
      dtype='object')
Now,2011 4 testing...
Index(['workingday', 'weather', 'temp', 'atemp', 'humidity

In [17]:
""" # output predictions for submission
submit_manual_blend_df = test_df[['datetime', 'count']].copy()
submit_manual_blend_df['count'] = y_pred
submit_manual_blend_df.to_csv('output/submit_manual_blend_20211008_3.csv', index=False) """

In [22]:
all_df.head(3)

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,data_set,casual_log,registered_log,count_log,date,day,month,year,hour,dow,woy,count_season,peak,ideal,sticky,temp_cat,humidity_cat_many,humidity_cat_less,wind_cat
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,train,1.386294,2.639057,2.833213,2011-01-01,1,1,2011,0,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,train,2.197225,3.496508,3.713572,2011-01-01,1,1,2011,1,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,train,1.791759,3.332205,3.496508,2011-01-01,1,1,2011,2,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
