In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, Ridge, LassoCV,LinearRegression

import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 30)

#sns.set_style("whitegrid")
#plt.style.use('bmh')
plt.style.use('seaborn-whitegrid')

# this allows plots to appear directly in the notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
train_df = pd.read_csv('data/train.csv')
train_df['data_set'] = 'train'
test_df = pd.read_csv('data/test.csv')
test_df['data_set'] = 'test'
# combine train and test data into one df
test_df['registered'] = 0
test_df['casual'] = 0
test_df['count'] = 0

all_df = pd.concat([train_df, test_df])
# parse datetime colum & add new time related columns
dt = pd.DatetimeIndex(all_df['datetime'])
all_df.set_index(dt, inplace=True)

# logarithmic transformation of dependent cols
# (adding 1 first so that 0 values don't become -inf)
for col in ['casual', 'registered', 'count']:
    all_df[f'{col}_log'] = np.log(all_df[col] + 1)

all_df['date'] = dt.date # yyyymmdd
all_df['day'] = dt.day # dd
all_df['month'] = dt.month # mm
all_df['year'] = dt.year # yyyy
all_df['hour'] = dt.hour # hh
all_df['dow'] = dt.dayofweek #曜日 Mon:0 Tue:1 Wed:2 Thu:3 Fri:4 Sat:5 Sun:6
all_df['woy'] = dt.isocalendar().week #その日の週が年間で見ると何番目の週かを表す数字 [dt.weekofyear]は deprecated

# add a count_season column using join
by_season = all_df[all_df['data_set'] == 'train'].copy().groupby(['season'])[['count']].agg(sum)
by_season.columns = ['count_season']
all_df = all_df.join(by_season, on='season')


# feature engineer a new column whether its a peak hour or not
all_df['peak'] = all_df[['hour', 'workingday']]\
    .apply(lambda df: 3 if ((df['workingday'] == 1 and (df['hour'] == 8 or 17 <= df['hour'] <= 18)) \
                            or (df['workingday'] == 0 and 11 <= df['hour'] <= 17)) else \
                            ( 2 if ((df['workingday'] == 1 and (df['hour'] == 7 or df['hour'] == 9 or df['hour'] == 16 or 19 <= df['hour'] <= 20)) \
                            or (df['workingday'] == 0 and (df['hour'] == 10 or 18 <= df['hour'] <= 19))) else \
                            ( 1 if ((df['workingday'] == 1 and (10 <= df['hour'] <= 15 or 21 <= df['hour'] <= 22)) \
                            or (df['workingday'] == 0 and (8 <= df['hour'] <= 9 or 20 <= df['hour'] <= 23))) else 0)), axis = 1)

#ここの修正の仕方は、間違っているので要修正！
# sandy
#all_df['holiday'] = all_df[['month', 'day', 'holiday', 'year']]\
#    .apply(lambda df: 1 if (df['year'] == 2012 and df['month'] == 10 and df['day'] == 30) else 0, axis = 1)
# 修正後↓
all_df['holiday'] = all_df[['month', 'day', 'holiday', 'year']]\
    .apply(lambda df: 1 if (df['year'] == 2012 and df['month'] == 10 and df['day'] == 30) else df['holiday'], axis = 1)


# christmas and others
all_df['holiday'] = all_df[['month', 'day', 'holiday']]\
    .apply(lambda df: 1 if (df['month'] == 12 and df['day'] in [24, 26, 31]) else df['holiday'], axis = 1)
all_df['workingday'] = all_df[['month', 'day', 'workingday']]\
    .apply(lambda df: 0 if df['month'] == 12 and df['day'] in [24, 31] else df['workingday'], axis = 1)
# これは流石に気づかない気がする。。。気づけない気がする。。。
def get_day(day_start):
    day_end = day_start + pd.offsets.DateOffset(hours=23)
    return pd.date_range(day_start, day_end, freq="H")

# tax day
all_df.loc[get_day(datetime(2011, 4, 15)), "workingday"] = 1
all_df.loc[get_day(datetime(2012, 4, 16)), "workingday"] = 1

# thanksgiving friday
all_df.loc[get_day(datetime(2011, 11, 25)), "workingday"] = 0
all_df.loc[get_day(datetime(2012, 11, 23)), "workingday"] = 0

# tax day
all_df.loc[get_day(datetime(2011, 4, 15)), "holiday"] = 0
all_df.loc[get_day(datetime(2012, 4, 16)), "holiday"] = 0

# thanksgiving friday
all_df.loc[get_day(datetime(2011, 11, 25)), "holiday"] = 1
all_df.loc[get_day(datetime(2012, 11, 23)), "holiday"] = 1

#storms
all_df.loc[get_day(datetime(2012, 5, 21)), "holiday"] = 1

#tornado
all_df.loc[get_day(datetime(2012, 6, 1)), "holiday"] = 1
# from histogram
all_df['ideal'] = all_df[['temp', 'windspeed']]\
    .apply(lambda df: 1 if (df['temp'] > 27 and df['windspeed'] < 30) else 0, axis = 1)
    
all_df['sticky'] = all_df[['humidity', 'workingday']]\
    .apply(lambda df: 1 if (df['workingday'] == 1 and df['humidity'] >= 60) else 0, axis = 1)

# temperature
all_df.loc[all_df['temp'] < 10,'temp_cat'] = 1
all_df.loc[(all_df['temp'] >= 10) & (all_df['temp'] < 15),'temp_cat'] = 2
all_df.loc[(all_df['temp'] >= 15) & (all_df['temp'] < 20),'temp_cat'] = 3
all_df.loc[(all_df['temp'] >= 20) & (all_df['temp'] < 25),'temp_cat'] = 4
all_df.loc[(all_df['temp'] >= 25) & (all_df['temp'] < 30),'temp_cat'] = 5
all_df.loc[(all_df['temp'] >= 30) & (all_df['temp'] < 35),'temp_cat'] = 6
all_df.loc[(all_df['temp'] >= 35),'temp_cat'] = 7

# humidity many category
all_df.loc[all_df['humidity'] < 10,'humidity_cat_many'] = 0
all_df.loc[(all_df['humidity'] >= 10) & (all_df['humidity'] < 20),'humidity_cat_many'] = 1
all_df.loc[(all_df['humidity'] >= 20) & (all_df['humidity'] < 30),'humidity_cat_many'] = 2
all_df.loc[(all_df['humidity'] >= 30) & (all_df['humidity'] < 40),'humidity_cat_many'] = 3
all_df.loc[(all_df['humidity'] >= 40) & (all_df['humidity'] < 50),'humidity_cat_many'] = 4
all_df.loc[(all_df['humidity'] >= 50) & (all_df['humidity'] < 60),'humidity_cat_many'] = 5
all_df.loc[(all_df['humidity'] >= 60) & (all_df['humidity'] < 70),'humidity_cat_many'] = 6
all_df.loc[(all_df['humidity'] >= 70) & (all_df['humidity'] < 80),'humidity_cat_many'] = 7
all_df.loc[(all_df['humidity'] >= 80) & (all_df['humidity'] < 90),'humidity_cat_many'] = 8
all_df.loc[(all_df['humidity'] >= 90),'humidity_cat_many'] = 9

# humidity not many category
all_df.loc[all_df['humidity'] < 20,'humidity_cat_less'] = 0
all_df.loc[(all_df['humidity'] >= 20) & (all_df['humidity'] < 40),'humidity_cat_less'] = 1
all_df.loc[(all_df['humidity'] >= 40) & (all_df['humidity'] < 60),'humidity_cat_less'] = 2
all_df.loc[(all_df['humidity'] >= 60) & (all_df['humidity'] < 80),'humidity_cat_less'] = 3
all_df.loc[(all_df['humidity'] >= 80),'humidity_cat_less'] = 4

# windspeed
all_df.loc[all_df['windspeed'] < 5,'wind_cat'] = 0
all_df.loc[(all_df['windspeed'] >= 5) & (all_df['windspeed'] < 10),'wind_cat'] = 1
all_df.loc[(all_df['windspeed'] >= 10) & (all_df['windspeed'] < 15),'wind_cat'] = 2
all_df.loc[(all_df['windspeed'] >= 15) & (all_df['windspeed'] < 20),'wind_cat'] = 3
all_df.loc[(all_df['windspeed'] >= 20) & (all_df['windspeed'] < 25),'wind_cat'] = 4
all_df.loc[(all_df['windspeed'] >= 25) & (all_df['windspeed'] < 30),'wind_cat'] = 5
all_df.loc[(all_df['windspeed'] >= 30) & (all_df['windspeed'] < 35),'wind_cat'] = 6
all_df.loc[(all_df['windspeed'] >= 35) & (all_df['windspeed'] < 40),'wind_cat'] = 7
all_df.loc[(all_df['windspeed'] >= 40) & (all_df['windspeed'] < 45),'wind_cat'] = 8
all_df.loc[(all_df['windspeed'] >= 45),'wind_cat'] = 9


In [3]:
# One-hot-Encoding for season
season_map = {1:'Spring', 2:'Summer', 3:'Fall', 4:'Winter'}
all_df['season_name'] = all_df['season'].map(lambda d : season_map[d])
temporary = pd.get_dummies(all_df['season_name'])
all_df['season_Fall'] = temporary['Fall']
all_df['season_Spring'] = temporary['Spring']
all_df['season_Summer'] = temporary['Summer']
all_df['season_Winter'] = temporary['Winter']

# One-hot-Encoding for weather
weather_map = {1:'Good', 2:'Normal', 3:'Bad', 4:'Worse'}
all_df['weather_name'] = all_df['weather'].map(lambda d : weather_map[d])
temporary = pd.get_dummies(all_df['weather_name'])
all_df['weather_Good'] = temporary['Good']
all_df['weather_Normal'] = temporary['Normal']
all_df['weather_Bad'] = temporary['Bad']
all_df['weather_Worse'] = temporary['Worse']

# One-hot-Encoding for other columns
all_df['day_copy'] = all_df['day']
all_df['month_copy'] = all_df['month']
all_df['year_copy'] = all_df['year']

In [4]:
all_df.head(3)

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,data_set,casual_log,registered_log,count_log,date,day,month,year,hour,dow,woy,count_season,peak,ideal,sticky,temp_cat,humidity_cat_many,humidity_cat_less,wind_cat,season_name,season_Fall,season_Spring,season_Summer,season_Winter,weather_name,weather_Good,weather_Normal,weather_Bad,weather_Worse,day_copy,month_copy,year_copy
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,train,1.386294,2.639057,2.833213,2011-01-01,1,1,2011,0,5,52,312498,0,0,0,1.0,8.0,4.0,0.0,Spring,0,1,0,0,Good,1,0,0,0,1,1,2011
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,train,2.197225,3.496508,3.713572,2011-01-01,1,1,2011,1,5,52,312498,0,0,0,1.0,8.0,4.0,0.0,Spring,0,1,0,0,Good,1,0,0,0,1,1,2011
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,train,1.791759,3.332205,3.496508,2011-01-01,1,1,2011,2,5,52,312498,0,0,0,1.0,8.0,4.0,0.0,Spring,0,1,0,0,Good,1,0,0,0,1,1,2011


In [5]:
all_df = pd.get_dummies(all_df,columns=['day_copy','month_copy','year_copy','hour','dow','woy','peak','temp_cat','humidity_cat_many','wind_cat'])

In [6]:
print(all_df.columns.tolist())

['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count', 'data_set', 'casual_log', 'registered_log', 'count_log', 'date', 'day', 'month', 'year', 'count_season', 'ideal', 'sticky', 'humidity_cat_less', 'season_name', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'weather_name', 'weather_Good', 'weather_Normal', 'weather_Bad', 'weather_Worse', 'day_copy_1', 'day_copy_2', 'day_copy_3', 'day_copy_4', 'day_copy_5', 'day_copy_6', 'day_copy_7', 'day_copy_8', 'day_copy_9', 'day_copy_10', 'day_copy_11', 'day_copy_12', 'day_copy_13', 'day_copy_14', 'day_copy_15', 'day_copy_16', 'day_copy_17', 'day_copy_18', 'day_copy_19', 'day_copy_20', 'day_copy_21', 'day_copy_22', 'day_copy_23', 'day_copy_24', 'day_copy_25', 'day_copy_26', 'day_copy_27', 'day_copy_28', 'day_copy_29', 'day_copy_30', 'day_copy_31', 'month_copy_1', 'month_copy_2', 'month_copy_3', 'month_copy_4', 'month_copy_5', 'month_copy_6', 'month

In [7]:
# instead of randomly splitting our training data 
# for cross validation, let's construct a framework that's more
# in line with how the data is divvied up for this competition
# (given first 19 days of each month, what is demand for remaining days)
# so, let's split our training data into 2 time contiguous datasets
# for fitting and validating our model (days 1-14 vs. days 15-19).

# also, since submissions are evaluated based on the
# root mean squared logarithmic error (RMSLE), let's replicate
# that computation as we test and tune our model.

train_df = all_df[all_df['data_set'] == 'train']
test_df = all_df[all_df['data_set'] == 'test']

def get_rmsle(y_pred, y_actual):
    diff = np.log(y_pred + 1) - np.log(y_actual + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

def custom_train_valid_split(data, cutoff_day=15):
    train = data[data['day'] <= cutoff_day]
    valid = data[data['day'] > cutoff_day]

    return train, valid

def prep_train_data(data, input_cols):
    X = data[input_cols]#.values
    y_r = data['registered_log']#.values
    y_c = data['casual_log']#.values

    return X, y_r, y_c


# predict on validation set & transform output back from log scale
def predict_on_validation_set(model, input_cols):
    
    train, valid = custom_train_valid_split(train_df)
    y_pred_comb_l = []
    y_actual_comb_l = []

    for year_val in [2011,2012]:
        for month_val in range(1,13):

            #########print(f'Now,{year_val} {month_val} training and validating...')
            # prepare training & validation set
            train_tmp = train.query('year <= @year_val and month <= @month_val')
            valid_tmp = valid.query('year == @year_val and month == @month_val')

            X_train, y_train_r, y_train_c = prep_train_data(train_tmp, input_cols)
            X_valid, y_valid_r, y_valid_c = prep_train_data(valid_tmp, input_cols)

            # training and validating
            model_r = model.fit(X_train, y_train_r)
            y_pred_r = np.exp(model_r.predict(X_valid)) - 1

            model_c = model.fit(X_train, y_train_c)
            y_pred_c = np.exp(model_c.predict(X_valid)) - 1

            y_pred_comb = np.round(y_pred_r + y_pred_c)
            y_pred_comb[y_pred_comb < 0] = 0
            y_pred_comb_l.extend(y_pred_comb)

            y_actual_comb = np.exp(y_valid_r) + np.exp(y_valid_c) - 2
            y_actual_comb_l.extend(y_actual_comb)

            #rmsle = get_rmsle(y_pred_comb, y_actual_comb)
            #rmsle_l.append(rmsle)
    
    rmsle = get_rmsle(np.array(y_pred_comb_l),np.array(y_actual_comb_l))
    
    return (np.array(y_pred_comb_l), np.array(y_actual_comb_l), rmsle)



# FeatureSelection
def predict_on_validation_set_for_FeatureSelection(model, input_cols, threshold_val):
    
    train, valid = custom_train_valid_split(train_df)
    y_pred_comb_l = []
    y_actual_comb_l = []

    for year_val in [2011,2012]:
        for month_val in range(1,13):

            ###print(f'Now,{year_val} {month_val} training and validating...')
            # split data for kaggle data rules
            train_tmp = train.query('year <= @year_val and month <= @month_val')
            valid_tmp = valid.query('year == @year_val and month == @month_val')
            
            # split data for feature selection and training, validating
            X_train, y_train_r, y_train_c = prep_train_data(train_tmp, input_cols)
            X_valid, y_valid_r, y_valid_c = prep_train_data(valid_tmp, input_cols)

            # feature selection and training and validating
            selector = SelectFromModel(model,threshold = threshold_val)#'median')

            selector.fit(X_train,y_train_r)
            X_train_fs_r = selector.transform(X_train)
            print(f'{year_val} {month_val} registered columns: {X_train.columns[selector.get_support()].tolist()}')
            #print(f'{year_val} {month_val} registered columns: {X_train_fs_r.columns.tolist()}')
            model_r = model.fit(X_train_fs_r, y_train_r)
            X_valid_fs_r = selector.transform(X_valid)
            y_pred_r = np.exp(model_r.predict(X_valid_fs_r)) - 1
            
            selector.fit(X_train,y_train_c)
            X_train_fs_c = selector.transform(X_train)
            print(f'{year_val} {month_val} casual columns: {X_train.columns[selector.get_support()].tolist()}')
            model_c = model.fit(X_train_fs_c, y_train_c)
            X_valid_fs_c = selector.transform(X_valid)
            y_pred_c = np.exp(model_c.predict(X_valid_fs_c)) - 1

            y_pred_comb = np.round(y_pred_r + y_pred_c)
            y_pred_comb[y_pred_comb < 0] = 0
            y_pred_comb_l.extend(y_pred_comb)

            y_actual_comb = np.exp(y_valid_r) + np.exp(y_valid_c) - 2
            y_actual_comb_l.extend(y_actual_comb)

            #rmsle = get_rmsle(y_pred_comb, y_actual_comb)
            #rmsle_l.append(rmsle)
    
    rmsle = get_rmsle(np.array(y_pred_comb_l),np.array(y_actual_comb_l))
    
    return (np.array(y_pred_comb_l), np.array(y_actual_comb_l), rmsle)


# predict on test set & transform output back from log scale
""" def predict_on_test_set(model, input_cols):
    
    y_pred_comb_l = []
    for year_val in [2011,2012]:
        for month_val in range(1,13):
            
            # prepare training set
            print(f'Now,{year_val} {month_val} testing...')
            train_df_tmp = train_df.query('year <= @year_val and month <= @month_val')
            test_df_tmp = test_df.query('year == @year_val and month == @month_val')

            X_train, y_train_r, y_train_c = prep_train_data(train_df_tmp, req_cols)

            # prepare testing set
            X_test = test_df_tmp[req_cols]

            # feature selection and testing
            selector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 'median')

            selector.fit(X_train,y_train_r)
            X_train_fs_r = selector.transform(X_train)
            print(X_train.columns[selector.get_support()])
            model_r = model.fit(X_train_fs_r, y_train_r)
            X_test_fs_r = selector.transform(X_test)
            y_pred_r = np.exp(model_r.predict(X_test_fs_r)) - 1

            selector.fit(X_train,y_train_c)
            X_train_fs_c = selector.transform(X_train)
            print(X_train.columns[selector.get_support()])
            model_c = model.fit(X_train_fs_c, y_train_c)
            X_test_fs_c = selector.transform(X_test)
            y_pred_c = np.exp(model_c.predict(X_test_fs_c)) - 1
            
            # add casual & registered predictions together
            y_pred_comb = np.round(y_pred_r + y_pred_c)
            y_pred_comb[y_pred_comb < 0] = 0
            y_pred_comb_l.extend(y_pred_comb)

    
    return np.array(y_pred_comb_l) """

" def predict_on_test_set(model, input_cols):\n    \n    y_pred_comb_l = []\n    for year_val in [2011,2012]:\n        for month_val in range(1,13):\n            \n            # prepare training set\n            print(f'Now,{year_val} {month_val} testing...')\n            train_df_tmp = train_df.query('year <= @year_val and month <= @month_val')\n            test_df_tmp = test_df.query('year == @year_val and month == @month_val')\n\n            X_train, y_train_r, y_train_c = prep_train_data(train_df_tmp, req_cols)\n\n            # prepare testing set\n            X_test = test_df_tmp[req_cols]\n\n            # feature selection and testing\n            selector = SelectFromModel(RandomForestRegressor(n_estimators = 100,random_state = 42),threshold = 'median')\n\n            selector.fit(X_train,y_train_r)\n            X_train_fs_r = selector.transform(X_train)\n            print(X_train.columns[selector.get_support()])\n            model_r = model.fit(X_train_fs_r, y_train_r)\n       

In [8]:
params = np.arange(0.0010,0.0025,0.0001)
best_score_lasso = 10
best_alpha = 0
lasso_cols = ['holiday', 'workingday',  'temp', 'atemp', 'humidity', 'windspeed',  'ideal', 'sticky', 'season_Fall', 
            'season_Spring', 'season_Summer', 'season_Winter', 'weather_Good', 'weather_Normal', 'weather_Bad',
            'weather_Worse', 'day_copy_1', 'day_copy_2', 'day_copy_3', 'day_copy_4', 'day_copy_5', 'day_copy_6', 'day_copy_7', 'day_copy_8', 
            'day_copy_9', 'day_copy_10', 'day_copy_11', 'day_copy_12', 'day_copy_13', 'day_copy_14', 'day_copy_15', 'day_copy_16', 'day_copy_17', 
            'day_copy_18', 'day_copy_19', 'day_copy_20', 'day_copy_21', 'day_copy_22', 'day_copy_23', 'day_copy_24', 'day_copy_25', 'day_copy_26', 
            'day_copy_27', 'day_copy_28', 'day_copy_29', 'day_copy_30', 'day_copy_31', 'month_copy_1', 'month_copy_2', 'month_copy_3', 'month_copy_4', 
            'month_copy_5', 'month_copy_6', 'month_copy_7', 'month_copy_8', 'month_copy_9', 'month_copy_10', 'month_copy_11', 'month_copy_12', 
            'year_copy_2011', 'year_copy_2012',
            'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 
            'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 
            'dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6', 
            'woy_1', 'woy_2', 'woy_3', 'woy_4', 'woy_5', 'woy_6', 'woy_7', 'woy_8', 'woy_9', 'woy_10', 'woy_11', 'woy_12', 'woy_13', 'woy_14', 
            'woy_15', 'woy_16', 'woy_17', 'woy_18', 'woy_19', 'woy_20', 'woy_21', 'woy_22', 'woy_23', 'woy_24', 'woy_25', 'woy_26', 'woy_27', 
            'woy_28', 'woy_29', 'woy_30', 'woy_31', 'woy_32', 'woy_33', 'woy_34', 'woy_35', 'woy_36', 'woy_37', 'woy_38', 'woy_39', 'woy_40', 
            'woy_41', 'woy_42', 'woy_43', 'woy_44', 'woy_45', 'woy_46', 'woy_47', 'woy_48', 'woy_49', 'woy_50', 'woy_51', 'woy_52', 
            'peak_0', 'peak_1', 'peak_2', 'peak_3', 
            'temp_cat_1.0', 'temp_cat_2.0', 'temp_cat_3.0', 'temp_cat_4.0', 'temp_cat_5.0', 'temp_cat_6.0', 'temp_cat_7.0', 
            'humidity_cat_many_0.0', 'humidity_cat_many_1.0', 'humidity_cat_many_2.0', 'humidity_cat_many_3.0', 'humidity_cat_many_4.0', 
            'humidity_cat_many_5.0', 'humidity_cat_many_6.0', 'humidity_cat_many_7.0', 'humidity_cat_many_8.0', 'humidity_cat_many_9.0', 
            'wind_cat_0.0', 'wind_cat_1.0', 'wind_cat_2.0', 'wind_cat_3.0', 'wind_cat_4.0', 'wind_cat_5.0', 'wind_cat_6.0', 'wind_cat_7.0', 'wind_cat_8.0', 'wind_cat_9.0'
            ]

for param in params:
    print(f'alpha: {param}')
    lasso_model = Lasso(alpha=param)
    (lasso_pred, lasso_actual, lasso_rmsle) = predict_on_validation_set(lasso_model, lasso_cols)
    print(lasso_rmsle)
    if lasso_rmsle < best_score_lasso:
        best_score_lasso = lasso_rmsle
        best_alpha = param

print(f'lasso_pred.shape: {lasso_pred.shape}   lasso_actual.shape: {lasso_actual.shape}   lasso_rmsle: {best_score_lasso}  best_alpha: {best_alpha}')



alpha: 0.001
0.5205293020277159
alpha: 0.0011
0.520231327343991
alpha: 0.0012000000000000001
0.5198799320725254
alpha: 0.0013000000000000002
0.5199972468650843
alpha: 0.0014000000000000002
0.5200478548047553
alpha: 0.0015000000000000002
0.5197853559510637
alpha: 0.0016000000000000003
0.520287422488824
alpha: 0.0017000000000000003
0.5200296094813253
alpha: 0.0018000000000000004
0.5201637343232212
alpha: 0.0019000000000000004
0.5198612168077418
alpha: 0.0020000000000000005
0.5200329415142628
alpha: 0.0021000000000000003
0.5205322368634931
alpha: 0.0022000000000000006
0.5208817825926578
alpha: 0.002300000000000001
0.5212129873712805
alpha: 0.0024000000000000007
0.5214598295345273
lasso_pred.shape: (2286,)   lasso_actual.shape: (2286,)   lasso_rmsle: 0.5197853559510637  best_alpha: 0.0015000000000000002


In [10]:
lasso_model = Lasso(alpha=best_alpha)
params = np.arange(0,0.01,0.001)
best_score_lasso = 10
best_threshold = 0

for threshold_val in params:
    print(f'threshold: {threshold_val}')
    (lasso_pred, lasso_actual, lasso_rmsle) = predict_on_validation_set_for_FeatureSelection(lasso_model, lasso_cols, threshold_val)
    print(f'lasso_pred.shape: {lasso_pred.shape}   lasso_actual.shape: {lasso_actual.shape}   lasso_rmsle: {lasso_rmsle}')
    if lasso_rmsle < best_score_lasso:
        best_score_lasso = lasso_rmsle
        best_threshold = threshold_val

print(f'lasso_pred.shape: {lasso_pred.shape}   lasso_actual.shape: {lasso_actual.shape}   lasso_rmsle: {best_score_lasso}  best_threshold: {best_threshold}')


threshold: 0.0
2011 1 registered columns: ['holiday', 'workingday', 'temp', 'atemp', 'humidity', 'windspeed', 'ideal', 'sticky', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'weather_Good', 'weather_Normal', 'weather_Bad', 'weather_Worse', 'day_copy_1', 'day_copy_2', 'day_copy_3', 'day_copy_4', 'day_copy_5', 'day_copy_6', 'day_copy_7', 'day_copy_8', 'day_copy_9', 'day_copy_10', 'day_copy_11', 'day_copy_12', 'day_copy_13', 'day_copy_14', 'day_copy_15', 'day_copy_16', 'day_copy_17', 'day_copy_18', 'day_copy_19', 'day_copy_20', 'day_copy_21', 'day_copy_22', 'day_copy_23', 'day_copy_24', 'day_copy_25', 'day_copy_26', 'day_copy_27', 'day_copy_28', 'day_copy_29', 'day_copy_30', 'day_copy_31', 'month_copy_1', 'month_copy_2', 'month_copy_3', 'month_copy_4', 'month_copy_5', 'month_copy_6', 'month_copy_7', 'month_copy_8', 'month_copy_9', 'month_copy_10', 'month_copy_11', 'month_copy_12', 'year_copy_2011', 'year_copy_2012', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'h

In [13]:
print(f'lasso_pred.shape: {lasso_pred.shape}   lasso_actual.shape: {lasso_actual.shape}   lasso_rmsle: {lasso_rmsle}')

lasso_pred.shape: (2286,)   lasso_actual.shape: (2286,)   lasso_rmsle: 0.519785439361491


In [None]:
temp_trn_df,temp_val_df = custom_train_valid_split(train_df)
submit_ridge_df = temp_val_df[['datetime', 'count']].copy()
submit_ridge_df['count'] = ridge_pred
submit_ridge_df.to_csv('output2/submit_ridge_val_20211022_1.csv', index=False)

ridge_actual_df = temp_val_df[['datetime', 'count']].copy()
ridge_actual_df['count'] = ridge_actual
ridge_actual_df.to_csv('output2/ridge_actual.csv', index=False)

print(ridge_model.coef_.toslit())

In [7]:
# Random forest for training and validation
params = {'n_estimators': 1000, 'max_depth': 15, 'random_state': 0, 'min_samples_split' : 5, 'n_jobs': -1}
rf_model = RandomForestRegressor(**params)
rf_cols = [
    #'weather', 'temp', 'atemp', 'windspeed',
    'weather', 'temp_cat', 'wind_cat',
    #'workingday', 'season', 'holiday', 'sticky',
    'workingday', 'season', 'holiday', 'humidity_cat',
    #'workingday', 'season_Fall', 'season_Spring', 'season_Summer', 'season_Winter', 'holiday', 'sticky',
    'hour', 'dow', 'woy', 'peak'
    ]

(rf_pred, rf_actual, rf_rmsle) = predict_on_validation_set(rf_model, rf_cols)

Now,2011 1 training and validating...
['peak']
['temp', 'hour', 'peak']
Now,2011 2 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 3 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 4 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 5 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 6 training and validating...
['hour', 'peak']
['temp', 'hour', 'peak']
Now,2011 7 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 8 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 9 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 10 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 11 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour', 'peak']
Now,2011 12 training and validating...
['hour', 'peak']
['temp', 'atemp', 'hour',

In [22]:
all_df.head(3)

Unnamed: 0_level_0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,data_set,casual_log,registered_log,count_log,date,day,month,year,hour,dow,woy,count_season,peak,ideal,sticky,temp_cat,humidity_cat_many,humidity_cat_less,wind_cat
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2011-01-01 00:00:00,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,train,1.386294,2.639057,2.833213,2011-01-01,1,1,2011,0,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 01:00:00,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,train,2.197225,3.496508,3.713572,2011-01-01,1,1,2011,1,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
2011-01-01 02:00:00,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,train,1.791759,3.332205,3.496508,2011-01-01,1,1,2011,2,5,52,312498,0,0,0,1.0,8.0,4.0,0.0
