### Seoul Bike Rental 🚴‍♂️ | Regression Model

![](https://raw.githubusercontent.com/MhmdSyd/needed_image/main/iti-kaggle.jpg)

**This notebook explains how we can go about explore and prepare data for model building.**

*The notebook is structured in the following way:*

- About Dataset.
- Data Summary.
- Feature Engineering.
- Model:
    - CatBoost Model.
    - XGBoost Model.
    - Grediant Boost Model.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
import missingno as msno

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import  Lasso
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor 
import xgboost as xgb

In [2]:
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("data/SeoulBikeData.csv", encoding='ISO-8859-1')
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


## EDA

In [4]:
df.describe()

Unnamed: 0,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,704.602055,11.5,12.882922,58.226256,1.724909,1436.825799,4.073813,0.569111,0.148687,0.075068
std,644.997468,6.922582,11.944825,20.362413,1.0363,608.298712,13.060369,0.868746,1.128193,0.436746
min,0.0,0.0,-17.8,0.0,0.0,27.0,-30.6,0.0,0.0,0.0
25%,191.0,5.75,3.5,42.0,0.9,940.0,-4.7,0.0,0.0,0.0
50%,504.5,11.5,13.7,57.0,1.5,1698.0,5.1,0.01,0.0,0.0
75%,1065.25,17.25,22.5,74.0,2.3,2000.0,14.8,0.93,0.0,0.0
max,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,8.8


In [5]:
df.shape

(8760, 14)

In [6]:
df.dtypes

Date                          object
Rented Bike Count              int64
Hour                           int64
Temperature(°C)              float64
Humidity(%)                    int64
Wind speed (m/s)             float64
Visibility (10m)               int64
Dew point temperature(°C)    float64
Solar Radiation (MJ/m2)      float64
Rainfall(mm)                 float64
Snowfall (cm)                float64
Seasons                       object
Holiday                       object
Functioning Day               object
dtype: object

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       8760 non-null   object 
 1   Rented Bike Count          8760 non-null   int64  
 2   Hour                       8760 non-null   int64  
 3   Temperature(°C)            8760 non-null   float64
 4   Humidity(%)                8760 non-null   int64  
 5   Wind speed (m/s)           8760 non-null   float64
 6   Visibility (10m)           8760 non-null   int64  
 7   Dew point temperature(°C)  8760 non-null   float64
 8   Solar Radiation (MJ/m2)    8760 non-null   float64
 9   Rainfall(mm)               8760 non-null   float64
 10  Snowfall (cm)              8760 non-null   float64
 11  Seasons                    8760 non-null   object 
 12  Holiday                    8760 non-null   object 
 13  Functioning Day            8760 non-null   objec

## Preprocessing

In [8]:
df.columns = [i.split("(")[0].strip().title().replace(" ","_") for i in [*df.columns]]
df.columns

Index(['Date', 'Rented_Bike_Count', 'Hour', 'Temperature', 'Humidity',
       'Wind_Speed', 'Visibility', 'Dew_Point_Temperature', 'Solar_Radiation',
       'Rainfall', 'Snowfall', 'Seasons', 'Holiday', 'Functioning_Day'],
      dtype='object')

In [9]:
df.Date = pd.to_datetime(df.Date, format="%d/%m/%Y")
df.Date = pd.to_datetime(df.Date, format="%d/%m/%Y")
df.Date.dtype

dtype('<M8[ns]')

In [10]:
df.insert(1,"Day", df.Date.dt.day)
df.insert(2, "Month", df.Date.dt.month)
df.insert(3, "Year", df.Date.dt.year)

In [11]:
df.insert(3, 'WeekDay',df["Date"].dt.day_name())

In [12]:
df.Holiday.replace(['Holiday','No Holiday'], [0,1], inplace=True)

In [13]:
df.Functioning_Day = df.Functioning_Day.map({"No":0, "Yes":1})

In [14]:
df.Year = df.Year.map({2017:0, 2018:1})

In [15]:
df.insert(2, 'label_day_night', df['Hour'].apply(lambda x : 0 if (x<7) else( 1)))

In [16]:
df["Working_Day"] = 1
df.loc[(df.WeekDay=="Saturday")|(df.WeekDay=="Sunday"), "Working_Day"] = 0

In [17]:
df.Humidity[df.Humidity==0] = df.Humidity.mean()

In [18]:
categoryVariableList = ["WeekDay", "Seasons"]
for var in categoryVariableList:
    df[var] = df[var].astype("category")

In [19]:
for col in categoryVariableList:
    dummies_col = pd.get_dummies(df[col])
    
    df = pd.concat([df, dummies_col],axis=1)
    
    df.drop(col, axis=1, inplace=True)

In [20]:
df.columns

Index(['Date', 'Day', 'label_day_night', 'Month', 'Year', 'Rented_Bike_Count',
       'Hour', 'Temperature', 'Humidity', 'Wind_Speed', 'Visibility',
       'Dew_Point_Temperature', 'Solar_Radiation', 'Rainfall', 'Snowfall',
       'Holiday', 'Functioning_Day', 'Working_Day', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'Autumn',
       'Spring', 'Summer', 'Winter'],
      dtype='object')

In [21]:
columns = df.columns.tolist()
columns.append(columns.pop(5))
df = df[columns]

In [22]:
df.drop(["Date","Dew_Point_Temperature", "Snowfall", "Year"],axis=1, inplace=True)

In [23]:
df.columns

Index(['Day', 'label_day_night', 'Month', 'Hour', 'Temperature', 'Humidity',
       'Wind_Speed', 'Visibility', 'Solar_Radiation', 'Rainfall', 'Holiday',
       'Functioning_Day', 'Working_Day', 'Friday', 'Monday', 'Saturday',
       'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'Autumn', 'Spring',
       'Summer', 'Winter', 'Rented_Bike_Count'],
      dtype='object')

In [24]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Rented_Bike_Count'], axis=1),
                                                  df.Rented_Bike_Count,
                                                  test_size=0.3,
                                                  random_state=42)
y_train_log = np.log1p(y_train)

In [25]:
X_train.columns

Index(['Day', 'label_day_night', 'Month', 'Hour', 'Temperature', 'Humidity',
       'Wind_Speed', 'Visibility', 'Solar_Radiation', 'Rainfall', 'Holiday',
       'Functioning_Day', 'Working_Day', 'Friday', 'Monday', 'Saturday',
       'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'Autumn', 'Spring',
       'Summer', 'Winter'],
      dtype='object')

### Model

In [26]:
# loss function
def RMSLE(y_acual, y_pred):
    return np.sqrt(mean_squared_log_error(y_acual, y_pred))

In [27]:
catBoost = CatBoostRegressor(objective='Poisson',
                             loss_function=RMSLE,
                             n_estimators=1000,
                             random_state=0,
                             learning_rate=0.09,
                             subsample=0.75,
                             max_depth=7,
                             verbose=0)

catBoost.fit(X_train,y_train)

y_pred = [int(i) for i in catBoost.predict(X_train)]

rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
rmsle_test = np.sqrt(mean_squared_log_error(y_test, catBoost.predict(X_test)))

print("RMSLE Trian ====> ",round(rmsle_train,6))
print('='*30)
print("RMSLE Test ====> ",round(rmsle_test,6))

RMSLE Trian ====>  0.156606
RMSLE Test ====>  0.313619


In [28]:
xg_reg = xgb.XGBRegressor(objective='count:poisson',
                          random_state=0,
                          loss_function=RMSLE,
                          colsample_bytree = 0.6, 
                          learning_rate = 0.06,
                          gamma = 5,
                          n_estimators = 1000,
                          max_depth=5,
                          subsample=0.75,
                          min_child_weight=6)

xg_reg = xg_reg.fit(X_train,y_train)

y_pred = [int(i) for i in xg_reg.predict(X_train)]

rmsle_train = np.sqrt(mean_squared_log_error(y_train, y_pred))
rmsle_test = np.sqrt(mean_squared_log_error(y_test, xg_reg.predict(X_test)))

print("RMSLE Train =====> %f" % (round(rmsle_train,6)))
print('='*30)
print("RMSLE Test ====> %f"%(round(rmsle_test,6)))

RMSLE Train =====> 0.213453
RMSLE Test ====> 0.318309


In [29]:
params = {'n_estimators': 1000,
          'max_depth': 5,
          'random_state': 0,
          'min_samples_leaf': 4,
          'learning_rate': 0.07,
          'subsample': 0.75,
          'loss': 'ls'}

reg = GradientBoostingRegressor(**params)
reg.fit(X_train, y_train_log)

train_pred = np.sqrt(mean_squared_log_error(y_train, np.exp(reg.predict(X_train))))

test_pred = np.sqrt(mean_squared_log_error(y_test, np.exp(reg.predict(X_test))))

print("RMSLE Train =====> %f" % (round(train_pred,6)))
print('='*30)
print("RMSLE Test ====> %f"%(round(test_pred,6)))

RMSLE Train =====> 0.153546
RMSLE Test ====> 0.331601


In [30]:
# save models
import pickle

pickle.dump(xg_reg, open('output/Xgb_model.pkl', 'wb'))

### Thanks For Read My NoteBook :)