In [75]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RepeatedKFold, cross_val_score
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.utils import shuffle
import xgboost as xgb
import catboost as cb

### Imports

In [76]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.utils import shuffle
import xgboost as xgb
import catboost as cb

In [78]:
#---------- configs
pd.set_option('max_rows',100)
pd.set_option('max_columns',40)

### loading data

In [79]:
train_path = r"train.csv"
test_path = r"test.csv"

In [80]:
df = pd.read_csv(train_path,index_col=0,parse_dates=['Date'],dayfirst=True)
df_test = pd.read_csv(test_path,parse_dates=['Date'],dayfirst=True)

df = df.rename(columns={'Temperature(�C)':'Temperature(C)','Dew point temperature(�C)'  : 'Dew point temperature(C)'})
df_test = df_test.rename(columns={'Temperature(�C)':'Temperature(C)','Dew point temperature(�C)'  : 'Dew point temperature(C)'})

## Feature Engineering

##### based on the previous notebook (eda) we will consider the following 

- A) The Hourly cycle has a good correlation. [7am-18pm has the bigger share]
- B) The Temperature has good corr. [high is good except > ~ 27]
- C) The Functional Day has good corr. [functional is good for target]
- D) The Seasons has good corr. [hot season is better]

-------------

- E) The Visibility may prove to be good. [if encoded to very_low visibility vs normal visibiliy].
- F) The RainFall/SnowFall may prove to be good. [if encoded to Snowing / Raining or not].
- G) The Humidity may have something to investigate. [if encoded as 0-80 % normal rentals, 80-100% show sharp decay in rentals]

---------------
- H) the Dew point temperature is strongly corr to Temperature will drop it.
- I) the Solar Radiation may be discarded .. i didn't see good relation.
- J) the holidays didn't prove decisive saying on our target. maybe discarded too.

In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5760 entries, 0 to 5759
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Date                      5760 non-null   datetime64[ns]
 1   y                         5760 non-null   int64         
 2   Hour                      5760 non-null   int64         
 3   Temperature(C)            5760 non-null   float64       
 4   Humidity(%)               5760 non-null   int64         
 5   Wind speed (m/s)          5760 non-null   float64       
 6   Visibility (10m)          5760 non-null   int64         
 7   Dew point temperature(C)  5760 non-null   float64       
 8   Solar Radiation (MJ/m2)   5760 non-null   float64       
 9   Rainfall(mm)              5760 non-null   float64       
 10  Snowfall (cm)             5760 non-null   float64       
 11  Seasons                   5760 non-null   object        
 12  Holiday             

In [82]:
df.describe()

Unnamed: 0,y,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0,5760.0
mean,703.483854,11.5,12.803559,58.361806,1.723333,1473.711458,4.058264,0.572325,0.135122,0.046302
std,655.249491,6.922788,11.902206,19.840167,1.047898,585.87988,13.003582,0.869943,1.07532,0.323728
min,0.0,0.0,-15.1,0.0,0.0,27.0,-26.9,0.0,0.0,0.0
25%,188.0,5.75,3.2,43.0,0.9,1025.75,-5.4,0.0,0.0,0.0
50%,485.5,11.5,13.7,57.0,1.5,1732.0,5.2,0.01,0.0,0.0
75%,1066.0,17.25,22.225,73.0,2.3,2000.0,14.7,0.94,0.0,0.0
max,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,5.1


In [83]:
len(df[(df['Snowfall (cm)'])>=1.5])

84

In [84]:
df['Seasons'].value_counts()

Autumn    1440
Summer    1440
Spring    1440
Winter    1440
Name: Seasons, dtype: int64

In [85]:
df_prep = df.copy()

In [86]:
df_prep['Year'] = df['Date'].dt.year.astype('int')
df_prep['Week'] = df['Date'].dt.isocalendar().week.astype('int')
df_prep['Month'] = df['Date'].dt.month.astype('int')
df_prep['WeekDay'] = df['Date'].dt.weekday.astype('int')

In [87]:
def restore_data_series(dataframe,dforiginal,col_name):
    dataframe[col_name] = dforiginal[col_name]

In [88]:
def map_data_series(dataframe,col_name,from_vals,to_vals):
    dataframe[col_name].replace(from_vals,to_vals,inplace=True)

In [89]:
def new_feature_data_series(dataframe,col_name,function,new_col_name):
       dataframe[new_col_name]  = dataframe[col_name].apply(function,axis=1)

In [90]:
map_data_series(df_prep,'Functioning Day',['Yes','No'],[1,0])

In [91]:
map_data_series(df_prep,'Holiday',['Holiday','No Holiday'],[1,0])

In [92]:
map_data_series(df_prep,'Seasons',['Winter','Spring','Summer','Autumn'],[0,1,2,3])

In [93]:
## lambda df : 0 if ( (abs(df['Hour'] - 8) <=rush_range_1) or (abs(df['Hour'] - 18) <= rush_range_2) ) else 1
def rush_hour(df):
    #rush_range_1 = 2 # around the rush hour by +- 2 hrs
    #rush_range_2 = 4 # around the rush hour by +- 4 hrs
    if (df['Hour'] in [6,7,8,9,10,16,17,18,19,20]):
        return 1
    else:
        return 0
    
def rush_hour_grad_2(df):
    rush_range_1 = 2 # around the rush hour by +- 2 hrs
    rush_range_2 = 5 # around the rush hour by +- 4 hrs
    is_functional = df['Functioning Day']

    return (np.exp(-abs(df['Hour'] - 8)) +  np.exp(-abs(df['Hour'] - 18))) * is_functional

def rush_hour_grad(df):
    #from 8 am , 6 pm
    rush_range_1 = 2 # around the rush hour by +- 2 hrs
    rush_range_2 = 5 # around the rush hour by +- 4 hrs
    is_functional = df['Functioning Day'] 
    
    if (abs(df['Hour'] - 8) <= rush_range_1): 
        return np.exp(-abs(df['Hour'] - 8)) * is_functional
    
    elif (abs(df['Hour'] - 18) <= rush_range_2):
        return np.exp(-abs(df['Hour'] - 18)) * is_functional
    else:
        return 0
    
def dead_hour_grad(df):
    dead_range_1 = 3 # around the dead hour by +- 3 hrs from 4 am
    is_functional = df['Functioning Day'] 
    
    if (abs(df['Hour'] - 4) <= dead_range_1): 
        return np.exp(-abs(df['Hour'] - 4)) * is_functional
    else:
        return 0
    
def dead_hour_grad_2(df):
    #dead_range_1 = 5 # around the dead hour by +- 3 hrs from 4 am
    is_functional = df['Functioning Day'] 
    
    if(df['Hour'] in [22,23,0,1,2,3,4,5]):
        return np.exp(-abs(df['Hour'] - 4)) * is_functional
    else:
        return 0   
new_feature_data_series(df_prep,['Hour','Functioning Day'],rush_hour_grad,'Rush_hour')
new_feature_data_series(df_prep,['Hour','Functioning Day'],dead_hour_grad_2,'Dead_hour')

In [94]:
## lambda df : 0 if ( (abs(df['Hour'] - 8) <=rush_range_1) or (abs(df['Hour'] - 18) <= rush_range_2) ) else 1
def day_night(df):
    #rush_range_1 = 2 # around the rush hour by +- 2 hrs
    #rush_range_2 = 4 # around the rush hour by +- 4 hrs
    if (df['Hour'] >=6 and df['Hour'] <=18):
        return 1
    else:
        return 0

new_feature_data_series(df_prep,['Hour'],day_night,'day_night')

In [95]:
new_feature_data_series(df_prep,['Humidity(%)'],lambda df : 0 if (df['Humidity(%)'] <= 70) else 1,'Humidity_high')

In [96]:
new_feature_data_series(df_prep,['Temperature(C)'],lambda df : 0 if (df['Temperature(C)'] <= 35) else 1,'Temperature_high')

In [97]:
new_feature_data_series(df_prep,['Humidity(%)','Temperature(C)','Solar Radiation (MJ/m2)','Wind speed (m/s)'],lambda df : 0 if (df['Temperature(C)'] < 35) and (df['Humidity(%)'] < 70) and (df['Solar Radiation (MJ/m2)'] < 1.4) else 1,'Weather_bad')

In [98]:
def humidex(df):
    ### humidex
    ## eq H = Tair + 5/9 * [(6.11 * e^5417.7530 * (a-b)) - 10 ]
    Tair = df['Temperature(C)']
    Tdew = df['Dew point temperature(C)']
    is_functionalDay = df['Functioning Day']
    a = 1/273.16
    b = 1/(273.15 + Tdew)
    Humidex = Tair + (5/9) * (6.11 * np.exp(5417.7530 * (a-b)) -10)
    return Humidex  
new_feature_data_series(df_prep,['Temperature(C)','Dew point temperature(C)','Functioning Day'],humidex,'Humidex')

In [99]:
df_prep['Ideal'] = df[['Temperature(C)', 'Wind speed (m/s)']] \
    .apply(lambda df: 1 if (df['Temperature(C)'] < 30 and df['Wind speed (m/s)'] < 2.8) else 0, axis = 1)

In [100]:
df_prep["Ideal"].value_counts()

1    4548
0    1212
Name: Ideal, dtype: int64

In [101]:
def visibility(df):
    #0:250 , 250:500 : 500:inf
    
    if df['Visibility (10m)'] <=250 :
        return 0
    elif df['Visibility (10m)'] <=350 :
        return 1
    elif df['Visibility (10m)'] <=450 :
        return 1
    elif df['Visibility (10m)'] <=550 :
        return 2
    elif df['Visibility (10m)'] <=650 :
        return 2
    elif df['Visibility (10m)'] <=850 :
        return 3
    elif df['Visibility (10m)'] <=1150 :
        return 3
    elif df['Visibility (10m)'] <=1350 :
        return 4
    else:
        return 5
    
new_feature_data_series(df_prep,['Visibility (10m)'],visibility,'Visibility_cat')

In [102]:
def visibility_dist(df):
    #0:250 , 250:500 : 500:inf
    
    if df['Visibility (10m)'] <100 :     #can't see
        return 1
    elif df['Visibility (10m)'] <400 :   # foggy
        return 2
    elif df['Visibility (10m)'] <1000 :  # low fog
        return 3
    elif df['Visibility (10m)'] <2000 :  # no fog (good vision)
        return 4
    elif df['Visibility (10m)'] <4000 :  # no fog (excellent vision)
        return 5
    else:
        return 5
    
new_feature_data_series(df_prep,['Visibility (10m)'],visibility_dist,'Visibility_dist')

In [103]:
# for rainfall

def rain_fall(df):
    # 0:2.5 , 
    
    if df['Rainfall(mm)'] < 0.5 :
        return 0
    else:
        return 1

new_feature_data_series(df_prep,['Rainfall(mm)'],rain_fall,'Rain_cat')

In [104]:
def snow_fall(df):
    # 0:2.5 , 
    
    if df['Snowfall (cm)'] == 0 :
        return 0
    else:
        return 1

new_feature_data_series(df_prep,['Snowfall (cm)'],snow_fall,'Snow_cat')

In [105]:
def Sunny_Hot (df):
    if ((df['Temperature(C)']>25)&((df['Solar Radiation (MJ/m2)'] >2)&(df['day_night']==1))):
        return True
    else:
        return False

def Clear_hot (df):
    if ((df['Temperature(C)']>25)&(((df['Solar Radiation (MJ/m2)'] <=2)&(df['Solar Radiation (MJ/m2)'] >1))\
                                   &(df['day_night']==1))&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def PCloudy_hot(df):  
    if((df['Temperature(C)']>25)&(((df['Solar Radiation (MJ/m2)'] <=1)&(df['Solar Radiation (MJ/m2)'] >0.5))\
                                                       &(df['day_night']==1))&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Cloudy_hot(df): 
    if((df['Temperature(C)']>25)&(((df['Solar Radiation (MJ/m2)'] <=0.5))&(df['day_night']==1))\
                      &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Rain_hot(df):  
    if((df['Temperature(C)']>25)&(df['Rainfall(mm)'] >0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def Night_hot(df): 
    if((df['Temperature(C)']>25)&(df['day_night']==0)
                      &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

#len(sunny_hot)+len(Clear_hot)+len(PCloudy_hot)+len(Cloudy_hot)+len(Rain_hot)+len(Night_hot)

In [106]:
def Sunny_Warm (df):
    if (((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))&((df['Solar Radiation (MJ/m2)'] >2)&(df['day_night']==1))):
        return True
    else:
        return False

def Clear_Warm (df):
    if (((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))\
        &(((df['Solar Radiation (MJ/m2)'] <=2)&(df['Solar Radiation (MJ/m2)'] >1))&(df['day_night']==1))\
        &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def PCloudy_Warm(df):  
    if(((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))\
       &(((df['Solar Radiation (MJ/m2)'] <=1)&(df['Solar Radiation (MJ/m2)'] >0.5))&(df['day_night']==1))\
       &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Cloudy_Warm(df): 
    if(((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))\
       &(((df['Solar Radiation (MJ/m2)'] <=0.5))&(df['day_night']==1))\
       &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Rain_Warm(df):  
    if(((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))&(df['Rainfall(mm)'] >0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def Night_Warm(df): 
    if(((df['Temperature(C)']<=25)&(df['Temperature(C)']>10))\
       &(df['day_night']==0)&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

In [107]:
def Sunny_Cold (df):
    if (((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))&((df['Solar Radiation (MJ/m2)'] >2)&(df['day_night']==1))):
        return True
    else:
        return False

def Clear_Cold (df):
    if (((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))\
        &(((df['Solar Radiation (MJ/m2)'] <=2)&(df['Solar Radiation (MJ/m2)'] >1))&(df['day_night']==1))\
        &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def PCloudy_Cold(df):  
    if(((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))\
       &(((df['Solar Radiation (MJ/m2)'] <=1)&(df['Solar Radiation (MJ/m2)'] >0.5))&(df['day_night']==1))\
       &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Cloudy_Cold(df): 
    if(((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))\
       &(((df['Solar Radiation (MJ/m2)'] <=0.5))&(df['day_night']==1))\
       &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Rain_Cold(df):  
    if(((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))&(df['Rainfall(mm)'] >0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Snow_Cold(df):  
    if(((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))&(df['Rainfall(mm)'] ==0)&(df['Snowfall (cm)'] >0)):
        return True
    else:
        return False
def Night_Cold(df): 
    if(((df['Temperature(C)']<=10)&(df['Temperature(C)']>0))\
       &(df['day_night']==0)&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

In [108]:
def Sunny_Freezing (df):
    if ((df['Temperature(C)']<=0)&((df['Solar Radiation (MJ/m2)'] >2)&(df['day_night']==1))):
        return True
    else:
        return False

def Clear_Freezing (df):
    if ((df['Temperature(C)']<=0)&(((df['Solar Radiation (MJ/m2)'] <=2)&(df['Solar Radiation (MJ/m2)'] >1))\
                                   &(df['day_night']==1))&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False

def PCloudy_Freezing(df):  
    if((df['Temperature(C)']<=0)&(((df['Solar Radiation (MJ/m2)'] <=1)&(df['Solar Radiation (MJ/m2)'] >0.5))\
                                                       &(df['day_night']==1))&(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Cloudy_Freezing(df): 
    if((df['Temperature(C)']<=0)&(((df['Solar Radiation (MJ/m2)'] <=0.5))&(df['day_night']==1))\
                      &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Rain_Freezing(df):  
    if((df['Temperature(C)']<=0)&(df['Rainfall(mm)'] >0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Snow_Freezing(df):  
    if((df['Temperature(C)']<=0)&(df['Rainfall(mm)'] ==0)&(df['Snowfall (cm)'] >0)):
        return True
    else:
        return False

def Night_Freezing(df): 
    if((df['Temperature(C)']<=0)&(df['day_night']==0)
                      &(df['Rainfall(mm)'] == 0)&(df['Snowfall (cm)'] == 0)):
        return True
    else:
        return False
def Rain_Snow(df):
    if (df['Rainfall(mm)'] >0)&(df['Snowfall (cm)'] >0):
        return True
    else:
        return False

#len(sunny_hot)+len(Clear_hot)+len(PCloudy_hot)+len(Cloudy_hot)+len(Rain_hot)+len(Night_hot)

In [109]:
df_prep['Weather_State'] = df_prep[['Temperature(C)', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 1 if Sunny_Hot(df)\
           else 1 if Clear_hot(df)\
           else 1 if PCloudy_hot(df)\
           else 1 if Cloudy_hot(df)\
           else 1 if Rain_hot(df)\
           else 1 if Night_hot(df)\
           else 2 if Sunny_Warm(df)\
           else 2 if Clear_Warm(df)\
           else 2 if PCloudy_Warm(df)\
           else 2 if Cloudy_Warm(df)\
           else 2 if Rain_Warm(df)\
           else 2 if Night_Warm(df)\
           else 3 if Sunny_Cold(df)\
           else 3 if Clear_Cold(df)\
           else 3 if PCloudy_Cold(df)\
           else 3 if Cloudy_Cold(df)\
           else 3 if Rain_Cold(df)\
           else 3 if Snow_Cold(df)\
           else 3 if Night_Cold(df)\
           else 4 if Sunny_Freezing(df)\
           else 4 if Clear_Freezing(df)\
           else 4 if PCloudy_Freezing(df)\
           else 4 if Cloudy_Freezing(df)\
           else 4 if Rain_Freezing(df)\
           else 4 if Snow_Freezing(df)\
           else 4 if Night_Freezing(df)\
           else 5 if Rain_Snow(df)\
           else 6, axis = 1)

In [110]:
Weather_Description_list = ['Night_Freezing', 'Cloudy_Freezing', 'PCloudy_Freezing',
                            'Clear_Cold', 'PCloudy_Cold', 'Cloudy_Cold', 'Night_Cold',
                            'Rain_Cold', 'Clear_Freezing', 'Rain&Snow', 'Snow_Freezing',
                            'Snow_Cold', 'Sunny_Freezing', 'Sunny_Cold', 'Sunny_Warm',
                            'Clear_Warm', 'PCloudy_Warm', 'Cloudy_Warm', 'Night_Warm',
                            'Rain_Warm', 'Sunny_hot', 'Clear_hot', 'Cloudy_hot', 'Night_hot',
                            'PCloudy_hot', 'Rain_hot']

length = len(Weather_Description_list)+1
Weather_Description_neumeric = [*range(1, length, 1)]

In [111]:
# map_data_series(df_prep,'Weather_State',Weather_Description_list,Weather_Description_neumeric)

In [112]:
new_feature_data_series(df_prep,['Weather_State','Visibility_dist'],lambda df :df['Weather_State']*(df['Visibility_dist']),'Weather_stat&visb_dist')

In [113]:
new_feature_data_series(df_prep,['Weather_State','Visibility (10m)'],lambda df :df['Weather_State']*np.log(df['Visibility (10m)']),'Weather&visb')

In [114]:
df_prep

Unnamed: 0_level_0,Date,y,Hour,Temperature(C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day,Year,Week,Month,WeekDay,Rush_hour,Dead_hour,day_night,Humidity_high,Temperature_high,Weather_bad,Humidex,Ideal,Visibility_cat,Visibility_dist,Rain_cat,Snow_cat,Weather_State,Weather_stat&visb_dist,Weather&visb
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,2017-12-01,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,0,0,1,2017,48,12,4,0.000000,1.831564e-02,0,0,0,0,-9.890186,1,5,5,0,0,4,20,30.403610
1,2017-12-01,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,0,0,1,2017,48,12,4,0.000000,4.978707e-02,0,0,0,0,-10.190186,1,5,5,0,0,4,20,30.403610
2,2017-12-01,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,0,0,1,2017,48,12,4,0.000000,1.353353e-01,0,0,0,0,-10.697338,1,5,5,0,0,4,20,30.403610
3,2017-12-01,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,0,0,1,2017,48,12,4,0.000000,3.678794e-01,0,0,0,0,-10.890186,1,5,5,0,0,4,20,30.403610
4,2017-12-01,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,0,0,1,2017,48,12,4,0.000000,1.000000e+00,0,0,0,0,-10.759339,1,5,5,0,0,4,20,30.403610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5755,2018-11-20,1317,19,8.7,34,2.2,1857,-6.3,0.0,0.0,0.0,3,0,1,2018,47,11,1,0.367879,0.000000e+00,0,0,0,0,5.268122,1,5,4,0,0,3,12,22.580153
5756,2018-11-20,1078,20,8.5,32,2.6,1789,-7.2,0.0,0.0,0.0,3,0,1,2018,47,11,1,0.135335,0.000000e+00,0,0,0,0,4.927112,1,5,4,0,0,3,12,22.468236
5757,2018-11-20,1020,21,8.8,37,3.8,1750,-5.1,0.0,0.0,0.0,3,0,1,2018,47,11,1,0.049787,0.000000e+00,0,0,0,0,5.570188,0,5,4,0,0,3,12,22.402113
5758,2018-11-20,922,22,8.9,40,3.4,1810,-3.9,0.0,0.0,0.0,3,0,1,2018,47,11,1,0.018316,1.522998e-08,0,0,0,0,5.889417,0,5,4,0,0,3,12,22.503246


In [115]:
def Clear(df):
    if (df['Solar Radiation (MJ/m2)'] >2) & (df['day_night'] ==1):
        return 1
    else:
        return 0
    
def Few_Clouds(df):
    if (((df['Solar Radiation (MJ/m2)'] <=2)&(df['Solar Radiation (MJ/m2)'] >1.5)) & (df['day_night'] ==1)):
        return 1  
    else:
        return 0
    
def Scatterd_Clouds(df):
    if (((df['Solar Radiation (MJ/m2)'] <=1.5)&(df['Solar Radiation (MJ/m2)'] >1)) & (df['day_night'] ==1)):
        return 1  
    else:
        return 0
    
def Partly_Cloudy(df):
    if (((df['Solar Radiation (MJ/m2)'] <=1)&(df['Solar Radiation (MJ/m2)'] >0.5)) & (df['day_night'] ==1)):
        return 1  
    else:
        return 0
    
def Cloudy(df):
    if ((df['Solar Radiation (MJ/m2)'] <= 0.5) & (df['day_night'] ==1)):
        return 1
    else:
        return 0
    
def Night(df):
    if ((df['Solar Radiation (MJ/m2)'] <= 0.8) & (df['day_night'] ==0)):
        return 1
    else:
        return 0
    

In [116]:
def Mist(df):
    if df['Visibility_dist'] == 3:
        return 1
    else:
        return 0
    
def Fog(df):
    if ((df['Visibility_dist'] == 1) | (df['Visibility_dist'] == 2)) :
        return 1
    else:
        return 0

In [117]:
def Light_Rain(df):
    if ((df['Rainfall(mm)']) >0 & (df['Rainfall(mm)'] < 2.5))  :
        return 1
    else:
        return 0
    
def Heavy_Rain(df):
    if (df['Rainfall(mm)'] >= 2.5)  :
        return 1
    else:
        return 0
def Ice_pallets(df):
        if ((df['Rainfall(mm)'] > 0 )& (df['Snowfall (cm)'] > 0 ))  :
            return 1
        else:
            return 0


In [118]:
def Light_Snow(df):
    if ((df['Snowfall (cm)']) >0 & (df['Snowfall (cm)'] < 1))  :
        return 1
    else:
        return 0
    
def Snow(df):
    if (df['Snowfall (cm)'] >= 1)  :
        return 1
    else:
        return 0

In [119]:
def Cond1(df):
    if Clear(df) or Few_Clouds(df) or Partly_Cloudy(df) or Cloudy(df):
        return 1
    else:
        return 0
def Cond2(df):
    if (Mist(df) and Cloudy(df)) or (Mist(df) and Partly_Cloudy(df)) or (Mist(df) and Few_Clouds(df)) or Mist(df):
        return 1
    else:
        return 0
def Cond3(df):
    if Light_Snow(df) or (Light_Rain(df) and Scatterd_Clouds(df)) or Light_Rain(df):
        return 1
    else:
        return 0
def Cond4(df):
    if (Ice_pallets(df) and Mist(df)) or (Snow(df) and Fog(df)) or Snow(df) :
        return 1
    else:
        return 0
def Cond5(df):
    if Night(df):
        return 1
    else:
        return 0

In [120]:
df_prep['Sky_State'] =df_prep[['Temperature(C)','Visibility_dist', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 4 if Cond4(df)\
                 else 2 if Cond2(df)\
                 else 3 if Cond3(df)\
                 else 1 if Cond1(df)\
                 
                 else 5, axis = 1)

In [121]:
def Hot(df):
    if (df['Temperature(C)']>25):
        return 1
    else:
        return 0
def Warm(df):
    if ((df['Temperature(C)']<=25)&(df['Temperature(C)']>10)):
        return 1
    else:
        return 0
def Cold(df):
    if ((df['Temperature(C)']<=10)&(df['Temperature(C)']>0)):
        return 1
    else:
        return 0
def Freezing(df):
    if (df['Temperature(C)']<=0):
        return 1
    else:
        return 0

In [122]:
df_prep['Temp_State'] =df_prep[['Temperature(C)','Visibility_dist', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 1 if Warm(df)\
                 else 2 if Cold(df)\
                 else 3 if Hot(df)\
                 else 4 if Freezing(df)\
                 
                 else 5, axis = 1)

In [123]:
df_prep[["Temp_State"]].value_counts()

Temp_State
1             2396
2             1358
3             1004
4             1002
dtype: int64

In [124]:
new_feature_data_series(df_prep,['Sky_State','Temp_State'],lambda df :df['Sky_State']*(df['Temp_State']),'GBL_Forecast')

In [126]:
def value(df):
    return np.log((np.abs(df['Humidity(%)']*(df['Temperature(C)'])*df['Dew point temperature(C)']))+1)

In [128]:
def F_state(df1):
    if df1['X_Temp']-df1['Temperature(C)']>= 1:
        return 1
    else:
        return 0
def Fogs (df):
    if ((df['Fog_State']==1) & (df['Visibility (10m)'] <= 1000)):
        return 1
    else :
        return 0

In [129]:
#added from the Abdo nabil
new_feature_data_series(df_prep,['Temperature(C)','Humidity(%)','Dew point temperature(C)']
                        ,lambda df :value(df) if value(df) != float('nan') else 0,'Abdo_try')
new_feature_data_series(df_prep,['Temperature(C)','Humidity(%)']
                        ,lambda  df :df['Humidity(%)']*(df['Temperature(C)'])/100  ,'X_Temp')
df_prep['Fog_State'] =df_prep[['Temperature(C)','X_Temp']]  \
    .apply(lambda df: 1 if F_state(df)\
                 else 0, axis = 1)
df_prep['Fog'] =df_prep[['Fog_State','Visibility (10m)']]  \
    .apply(lambda df: 2 if Fogs(df)\
                 else 1, axis = 1)

In [130]:
# df_prep["Date_tmp"] = df_prep['Date'].dt.to_period('M')
# new_feature_df = pd.DataFrame(df_prep.groupby(['Date_tmp', 'Year'])['y'].mean()).sort_values(["Year"])
# new_feature_df = pd.DataFrame(new_feature_df['y'].shift(+1))
# new_feature_df.reset_index(inplace=True)
# new_feature_df.rename(columns={'y':'Prev Month y'}, inplace=True)
# new_feature_df.drop(['Year'], axis = 1, inplace =True)
#new_feature_df.fillna(225, inplace = True)

In [131]:
# df_prep = pd.merge(df_prep, new_feature_df, on='Date_tmp', how='left')

In [132]:
df_prep.columns

Index(['Date', 'y', 'Hour', 'Temperature(C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day', 'Year', 'Week', 'Month', 'WeekDay',
       'Rush_hour', 'Dead_hour', 'day_night', 'Humidity_high',
       'Temperature_high', 'Weather_bad', 'Humidex', 'Ideal', 'Visibility_cat',
       'Visibility_dist', 'Rain_cat', 'Snow_cat', 'Weather_State',
       'Weather_stat&visb_dist', 'Weather&visb', 'Sky_State', 'Temp_State',
       'GBL_Forecast', 'Abdo_try', 'X_Temp', 'Fog_State', 'Fog'],
      dtype='object')

----
# Training The Model

####  Our Scoring Metric (RMSLE)

In [133]:
def rmsle(y_true, y_pred, convertExp=True):
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)
        
    log_true = np.nan_to_num(np.array([np.log(y+1.0) for y in y_true]))
    log_pred = np.nan_to_num(np.array([np.log(y+1.0) for y in y_pred]))
    
    output = np.sqrt(np.mean((log_true - log_pred)**2))
    return output

def NumPyRMSLE(y_true:list, y_pred:list) -> float:
    """
        The Root Mean Squared Log Error (RMSLE) metric using only NumPy
        N.B. This function is a lot slower than sklearn's implementation
        
        :param y_true: The ground truth labels given in the dataset
        :param y_pred: Our predictions
        :return: The RMSLE score
    """
    n = len(y_true)
    msle = np.mean([(np.log(y_pred[i] + 1) - np.log(y_true[i] + 1)) ** 2.0 for i in range(n)])
    return np.sqrt(msle)

rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False) 

##### Selecting features and splitting (do this everytime you change the features! and want to do training)

In [134]:

selected_features = ['Hour','Week', 'Month','Year', 'Solar Radiation (MJ/m2)','Rainfall(mm)',
                     'Snowfall (cm)','Wind speed (m/s)','Visibility (10m)','WeekDay',
                     'Seasons','Holiday' ,'Temperature(C)', 'Humidity(%)',
                    'Functioning Day','Rush_hour','Dead_hour','Weather_State',
                    'Weather&visb','Weather_stat&visb_dist','Sky_State','Temp_State','GBL_Forecast'] 

train = df_prep.copy()
train["y"] = np.log(df_prep["y"]+0.00001) ##--- someone explain why this works better than +1

#train["y"] = df_prep["y"]
X1 = train[selected_features]
y1 = train["y"]




# tss = TimeSeriesSplit(n_splits=2)
# train_ind,test_ind  = tss.split(X1,groups=[20,1])

X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size = 0.2, random_state=42,shuffle=True,)

In [135]:
len(selected_features)

23

In [136]:

selected_features = ['Hour', 'Temperature(C)', 'Humidity(%)', 'Wind speed (m/s)',
       'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day', 'Year', 'Week', 'Month', 'WeekDay',
       'day_night', 'Humidex', 'Visibility_dist', 'Rain_cat', 'Snow_cat',
       'Rush_hour', 'Dead_hour', 'Humidity_high', 'Weather_bad', 'Ideal',
       'Weather_State', 'Weather_stat&visb_dist', 'Weather&visb', 'Sky_State',
       'Temp_State', 'GBL_Forecast', 'X_Temp', 'Abdo_try', 'Fog_State', 'Fog']

# No bagging 
#2500 

# define dataset
train = df_prep.copy()
train["y"] = np.log(df_prep["y"]+0.00001) ##--- someone explain why this works better than +1
#train["y"] = df_prep["y"]
X1 = train[selected_features]
y1 = train["y"]
X_train, X_test, y_train, y_test =\
train_test_split(X1, y1, test_size = 0.2, random_state=42,shuffle=True,)
# define the model
GBR = GradientBoostingRegressor(learning_rate=0.04, max_features=0.9,
                                          min_samples_leaf=0.0001,
                                          n_estimators=2500, random_state=0,
                                          subsample=0.8)
GBR.fit(X_train , y_train)
# evaluate the model
y_pred_train_GBR = GBR.predict(X_train)  
y_pred_test_GBR = GBR.predict(X_test)  

rmsle_train_GBR =  rmsle(y_train, y_pred_train_GBR, True)
rmsle_test_GBR  = rmsle(y_test, y_pred_test_GBR, True)

prev_train_GBR = 0
prev_test_GBR = 0

print('(Train)  BaggingRegressor  RMSLE:',rmsle_train_GBR )
print('(Test) BaggingRegressor RMSLE:',rmsle_test_GBR  )
print('---------------------------------' )
print('(Train) Prev RMSLE:', prev_train_GBR)
print('(Test) Prev RMSLE:', prev_test_GBR)
prev_train_GBR = rmsle_train_GBR
prev_test_GBR = rmsle_test_GBR 
print('---------------------------------' )
print("Training Score:", r2_score(y_train, y_pred_train_GBR) * 100,' %')
print("Test Score:", r2_score(y_test, y_pred_test_GBR) * 100,' %')

(Train)  BaggingRegressor  RMSLE: 0.13600967003042333
(Test) BaggingRegressor RMSLE: 0.2986905783339536
---------------------------------
(Train) Prev RMSLE: 0
(Test) Prev RMSLE: 0
---------------------------------
Training Score: 99.8580943640935  %
Test Score: 99.42271765350864  %


---

### Submission Preparation

In [11]:
full_test_path = r"SeoulBikeDataTestFull.csv"

In [12]:
full_test = pd.read_csv(full_test_path)

In [139]:
test = df_test.copy()

In [140]:
test['Year'] = df_test['Date'].dt.year.astype('int')
test['Week'] = df_test['Date'].dt.isocalendar().week.astype('int')
test['Month'] = df_test['Date'].dt.month.astype('int') 
test['WeekDay'] = df_test['Date'].dt.weekday.astype('int')

In [141]:
#test.y.value_counts()

In [142]:
test.columns

Index(['ID', 'Date', 'Hour', 'Temperature(C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day', 'Year', 'Week', 'Month', 'WeekDay'],
      dtype='object')

## Test Data Procc

In [143]:
map_data_series(test,'Functioning Day',['Yes','No'],[1,0])

In [144]:
map_data_series(test,'Holiday',['Holiday','No Holiday'],[1,0])

In [145]:
map_data_series(test,'Seasons',['Winter','Spring','Summer','Autumn'],[0,1,2,3])

In [146]:
new_feature_data_series(test,['Hour','Functioning Day'],dead_hour_grad_2,'Dead_hour')
new_feature_data_series(test,['Hour','Functioning Day'],rush_hour_grad,'Rush_hour')

In [147]:
new_feature_data_series(test,['Hour'],day_night,'day_night')
new_feature_data_series(test,['Humidity(%)'],lambda df : 0 if (df['Humidity(%)'] <= 70) else 1,'Humidity_high')
new_feature_data_series(test,['Temperature(C)'],lambda df : 0 if (df['Temperature(C)'] <= 35) else 1,'Temperature_high')

In [148]:
new_feature_data_series(test,['Humidity(%)','Temperature(C)','Solar Radiation (MJ/m2)'],lambda df : 0 if (df['Temperature(C)'] < 35) and (df['Humidity(%)'] < 70) and (df['Solar Radiation (MJ/m2)'] < 1.4) else 1,'Weather_bad')

In [149]:
test['Ideal'] = df_test[['Temperature(C)', 'Wind speed (m/s)']] \
    .apply(lambda df: 1 if (df['Temperature(C)'] < 30 and df['Wind speed (m/s)'] < 2.8) else 0, axis = 1)

In [150]:
new_feature_data_series(test,['Temperature(C)','Dew point temperature(C)','Functioning Day'],humidex,'Humidex')

In [151]:
new_feature_data_series(test,['Visibility (10m)'],visibility,'Visibility_cat')

In [152]:
new_feature_data_series(test,['Rainfall(mm)'],rain_fall,'Rain_cat')

In [153]:
new_feature_data_series(test,['Snowfall (cm)'],snow_fall,'Snow_cat')

In [154]:
test['Weather_State'] =test[['Temperature(C)', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 1 if Sunny_Hot(df)\
           else 1 if Clear_hot(df)\
           else 1 if PCloudy_hot(df)\
           else 1 if Cloudy_hot(df)\
           else 1 if Rain_hot(df)\
           else 1 if Night_hot(df)\
           else 2 if Sunny_Warm(df)\
           else 2 if Clear_Warm(df)\
           else 2 if PCloudy_Warm(df)\
           else 2 if Cloudy_Warm(df)\
           else 2 if Rain_Warm(df)\
           else 2 if Night_Warm(df)\
           else 3 if Sunny_Cold(df)\
           else 3 if Clear_Cold(df)\
           else 3 if PCloudy_Cold(df)\
           else 3 if Cloudy_Cold(df)\
           else 3 if Rain_Cold(df)\
           else 3 if Snow_Cold(df)\
           else 3 if Night_Cold(df)\
           else 4 if Sunny_Freezing(df)\
           else 4 if Clear_Freezing(df)\
           else 4 if PCloudy_Freezing(df)\
           else 4 if Cloudy_Freezing(df)\
           else 4 if Rain_Freezing(df)\
           else 4 if Snow_Freezing(df)\
           else 4 if Night_Freezing(df)\
           else 5 if Rain_Snow(df)\
           else 6, axis = 1)

In [155]:
new_feature_data_series(test,['Visibility (10m)'],visibility_dist,'Visibility_dist')

In [156]:
new_feature_data_series(test,['Weather_State','Visibility_dist'],lambda df :df['Weather_State']*(df['Visibility_dist']),'Weather_stat&visb_dist')

In [157]:
new_feature_data_series(test,['Weather_State','Visibility (10m)'],lambda df :df['Weather_State']*np.log(df['Visibility (10m)']),'Weather&visb')

In [158]:
test['Sky_State'] =test[['Temperature(C)','Visibility_dist', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 4 if Cond4(df)\
                 else 2 if Cond2(df)\
                 else 3 if Cond3(df)\
                 else 1 if Cond1(df)\
                 else 5, axis = 1)

In [159]:
test['Temp_State'] =test[['Temperature(C)','Visibility_dist', 'Solar Radiation (MJ/m2)' , 'day_night','Snowfall (cm)','Rainfall(mm)']]  \
    .apply(lambda df: 1 if Warm(df)\
                 else 2 if Cold(df)\
                 else 3 if Hot(df)\
                 else 4 if Freezing(df)\
                 
                 else 5, axis = 1)

In [160]:
new_feature_data_series(test,['Sky_State','Temp_State'],lambda df :df['Sky_State']*(df['Temp_State']),'GBL_Forecast')

In [161]:
new_feature_data_series(test,['Temperature(C)','Humidity(%)','Dew point temperature(C)'],lambda df :value(df) if value(df) != float('nan') else 0,'Abdo_try')
new_feature_data_series(test,['Temperature(C)','Humidity(%)']
                        ,lambda  df :df['Humidity(%)']*(df['Temperature(C)'])/100  ,'X_Temp')
test['Fog_State'] =test[['Temperature(C)','X_Temp']]  \
    .apply(lambda df: 1 if F_state(df)\
                 else 0, axis = 1)
test['Fog'] =test[['Fog_State','Visibility (10m)']]  \
    .apply(lambda df: 2 if Fogs(df)\
                 else 1, axis = 1)

In [162]:
#test.drop(["Date", "y", "Date_tmp", "Dataset"], axis=1, inplace=True)
Xtest = test[selected_features]
Xtest.columns

Index(['Hour', 'Temperature(C)', 'Humidity(%)', 'Wind speed (m/s)',
       'Visibility (10m)', 'Dew point temperature(C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day', 'Year', 'Week', 'Month', 'WeekDay',
       'day_night', 'Humidex', 'Visibility_dist', 'Rain_cat', 'Snow_cat',
       'Rush_hour', 'Dead_hour', 'Humidity_high', 'Weather_bad', 'Ideal',
       'Weather_State', 'Weather_stat&visb_dist', 'Weather&visb', 'Sky_State',
       'Temp_State', 'GBL_Forecast', 'X_Temp', 'Abdo_try', 'Fog_State', 'Fog'],
      dtype='object')

In [163]:
pred_test = model.predict(Xtest)

NameError: name 'model' is not defined

### model Predictions 

In [164]:
pred_test = GBR.predict(Xtest)

In [165]:
from sklearn.metrics import r2_score

df_test_real = pd.read_csv(r"SeoulBikeDataTestFull.csv")
y_pred_real = df_test_real['y']
print('(Real) Stacking Regression RMSLE:', rmsle(y_pred_real, np.exp(pred_test), False))
print('(Real) Stacking Regression R2:', r2_score(y_pred_real, np.exp(pred_test)))

(Real) Stacking Regression RMSLE: 0.40715602116697847
(Real) Stacking Regression R2: 0.9118720916161493


In [None]:
df_test.info()

In [None]:
original_test = pd.read_csv(r"datasets/test.csv")
df_test["ID"] = original_test["ID"]
df_test["y"] = np.exp(pred_test)
df_test[["ID", "y"]].to_csv('ultima2.csv', index=False)

In [None]:
print(df_test["y"])

### Save Submission

# 