In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
import pickle
import warnings
%matplotlib inline

Import Data set

In [86]:
df = pd.read_csv("data\SeoulBikeData.csv", encoding="utf-8")

In [87]:
df.columns

Index(['Date', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)',
       'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)',
       'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)', 'Seasons',
       'Holiday', 'Functioning Day'],
      dtype='object')

In [88]:
df.head()

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature(°C),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


Converting Date column to type datetime64 and adding new column weekday.Then
removing Date column.

In [89]:
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
df.insert(
    1, "weekday", df["Date"].dt.strftime("%w")
)  # here weekday starting from sunday as 0 .
df["weekday"] = pd.to_numeric(df["weekday"])
df.drop(["Date"], axis=1, inplace=True)

Preparing X and Y variables

In [90]:
X = df.drop(columns=["Rented Bike Count","Dew point temperature(°C)"], axis=1)

In [91]:
y=df["Rented Bike Count"]

In [92]:
X.head()

Unnamed: 0,weekday,Hour,Temperature(°C),Humidity(%),Wind speed (m/s),Visibility (10m),Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,5,0,-5.2,37,2.2,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
1,5,1,-5.5,38,0.8,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
2,5,2,-6.0,39,1.0,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
3,5,3,-6.2,40,0.9,2000,0.0,0.0,0.0,Winter,No Holiday,Yes
4,5,4,-6.0,36,2.3,2000,0.0,0.0,0.0,Winter,No Holiday,Yes


In [93]:
y.head()

0    254
1    204
2    173
3    107
4     78
Name: Rented Bike Count, dtype: int64

In [94]:
X.shape

(8760, 12)

In [95]:
# define numerical and categorical columns

numerical_features = [feature for feature in df.columns if df[feature].dtype != "O"]
categorical_features = [feature for feature in df.columns if df[feature].dtype == "O"]

print(f"We have {len(numerical_features)} numerical features : {numerical_features} ")
print(
    f"We have {len(categorical_features)} categorical features : {categorical_features} "
)

We have 11 numerical features : ['weekday', 'Rented Bike Count', 'Hour', 'Temperature(°C)', 'Humidity(%)', 'Wind speed (m/s)', 'Visibility (10m)', 'Dew point temperature(°C)', 'Solar Radiation (MJ/m2)', 'Rainfall(mm)', 'Snowfall (cm)'] 
We have 3 categorical features : ['Seasons', 'Holiday', 'Functioning Day'] 


In [96]:
# Create Column Transformer with 3 types of transformers
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numerical_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", numerical_transformer, num_features),
        ("OneHotEncoder", oh_transformer, cat_features),
    ]
)

In [97]:
X=preprocessor.fit_transform(X)

In [98]:
#seperate dataset into train and test
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
X_train.shape,X_test.shape

((7008, 17), (1752, 17))

In [99]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mean_squared_error(true,predicted))
    r2_square=r2_score(true,predicted)
    return mae,rmse,r2_square

In [101]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(
        y_train, y_train_pred
    )

    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model performance for Training set")
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print("----------------------------------")

    print("Model performance for Test set")
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print("=" * 35)
    print("\n")

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 432.2581
- Mean Absolute Error: 323.4091
- R2 Score: 0.5515
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 431.6086
- Mean Absolute Error: 320.0103
- R2 Score: 0.5493


Lasso
Model performance for Training set
- Root Mean Squared Error: 432.3740
- Mean Absolute Error: 323.1399
- R2 Score: 0.5513
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 431.6299
- Mean Absolute Error: 319.4278
- R2 Score: 0.5492


Ridge
Model performance for Training set
- Root Mean Squared Error: 432.2583
- Mean Absolute Error: 323.3913
- R2 Score: 0.5515
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 431.6103
- Mean Absolute Error: 319.9904
- R2 Score: 0.5493


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 236.6231
- Mean Absolute Error: 149.9313
- R2 Score: 0.

Results

In [102]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.931766
6,XGBRegressor,0.928983
5,Random Forest Regressor,0.9164
4,Decision Tree,0.844105
3,K-Neighbors Regressor,0.782678
8,AdaBoost Regressor,0.584246
0,Linear Regression,0.549262
2,Ridge,0.549258
1,Lasso,0.549217


#From this result I am Selecting CatBoosting Regressor
In this model I get an R2_score of 0.9669 for training data and R2 Score of 0.931766 for test data.

In [109]:
cat_model=CatBoostRegressor()
cat_model=cat_model.fit(X_train,y_train)
y_pred=cat_model.predict(X_test)
score=r2_score(y_test,y_pred)*100

Learning rate set to 0.055691
0:	learn: 620.7123339	total: 2.25ms	remaining: 2.25s
1:	learn: 598.9884479	total: 4.26ms	remaining: 2.13s
2:	learn: 580.4465572	total: 6.47ms	remaining: 2.15s
3:	learn: 560.6493745	total: 8.56ms	remaining: 2.13s
4:	learn: 543.0113559	total: 10.6ms	remaining: 2.11s
5:	learn: 525.1574911	total: 12.7ms	remaining: 2.1s
6:	learn: 509.0035743	total: 14.6ms	remaining: 2.08s
7:	learn: 493.4721225	total: 16.6ms	remaining: 2.06s
8:	learn: 480.0629170	total: 18.8ms	remaining: 2.08s
9:	learn: 466.6019896	total: 20.9ms	remaining: 2.07s
10:	learn: 455.0152052	total: 22.8ms	remaining: 2.05s
11:	learn: 443.6664367	total: 24.7ms	remaining: 2.03s
12:	learn: 432.6944521	total: 26.4ms	remaining: 2.01s
13:	learn: 422.8950842	total: 28.5ms	remaining: 2s
14:	learn: 414.5890757	total: 30.6ms	remaining: 2.01s
15:	learn: 406.0407823	total: 32.3ms	remaining: 1.99s
16:	learn: 398.5815981	total: 34.2ms	remaining: 1.98s
17:	learn: 391.1054012	total: 36.1ms	remaining: 1.97s
18:	learn: 3

In [110]:
print("Accuracy of the model is %.2f" % score)

Accuracy of the model is 93.18


In [108]:
pred_df = pd.DataFrame(
    {"Actual Value": y_test, "Predicted Value": y_pred.astype(int), "Difference": y_test.astype(int) - y_pred.astype(int)}
)
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
4136,694,1157,-463
6705,1186,1199,-13
3538,789,690,99
6583,511,749,-238
1993,190,183,7
...,...,...,...
2263,85,289,-204
6255,1196,1272,-76
7617,775,696,79
4306,893,895,-2
