<div style="text-align:center">
    <img src="../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 12: Bike sharing demand project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

#### EDA

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

In [None]:
# Column names, dtypes and non-null counts of the test data.
test.info()

In [None]:
# Summary statistics of the training columns.
train.describe()

In [None]:
# Summary statistics of the test columns.
test.describe()

#### Data Preprocessing

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
def _add_time_features(frame):
    """Return a copy of ``frame`` with ``datetime`` replaced by Hour, Month and Day.

    The ``datetime`` strings are parsed once and the three features are read
    through the vectorised ``.dt`` accessor. ``Day`` is the day of the week
    (Monday=0 ... Sunday=6), not the day of the month.
    """
    timestamps = pd.to_datetime(frame["datetime"])
    return frame.assign(
        Hour=timestamps.dt.hour,
        Month=timestamps.dt.month,
        Day=timestamps.dt.dayofweek,
    ).drop(columns=["datetime"])


# Apply the same transformation to both splits so their columns stay in sync.
train = _add_time_features(train)
test = _add_time_features(test)

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# Training data after the datetime column was replaced by Hour, Month and Day.
train.head()

In [None]:
# Test data after the datetime column was replaced by Hour, Month and Day.
test.head()

#### One-hot encoding

In [None]:
# Columns holding category codes; each is expanded into one indicator column per level.
categorical_columns = ["season", "weather", "Hour", "Month", "Day"]
train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

# The two frames are encoded independently, so a level that occurs in only one of
# them would leave train and test with different indicator columns. Align test to
# the training feature columns (all training columns except the targets): levels
# missing from test become all-zero columns, levels unseen in training are dropped.
target_columns = ["casual", "registered", "count"]
test = test.reindex(columns=train.columns.drop(target_columns), fill_value=0)

In [None]:
# Columns and dtypes of the training data after one-hot encoding.
train.info()

In [None]:
# Columns and dtypes of the test data after one-hot encoding.
test.info()

#### Storytelling - Visualization

In [None]:
corr = train.corr()
corr.shape

In [None]:
plt.figure(figsize=(40, 40))
sns.heatmap(
    corr,
    cbar=True,
    square=True,
    fmt=".1f",
    annot=True,
    annot_kws={"size": 15},
    cmap="Blues",
)

#### Train and test (Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [None]:
X = train.drop(["casual", "registered", "count"], axis=1)
y = train["count"]

In [None]:
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.2, random_state = 0)

##### RandomForest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor()
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_train)
print(y_pred_rf)

In [None]:
sns.histplot(y_train - y_pred_rf)

In [None]:
map = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(y_train, y_pred_rf, c=y_train, cmap=map)
f.colorbar(points)
plt.show()

In [None]:
print("R2 Score:", metrics.r2_score(y_train, y_pred_rf))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_train, y_pred_rf))
print("Mean Squared Error:", metrics.mean_squared_error(y_train, y_pred_rf))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train, y_pred_rf)))

##### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

model_lr = LinearRegression()

# Selecting only one feature (temp column) since the model is linear
X_train_lr = X_train["temp"]
X_train_lr = X_train_lr.values.reshape(-1, 1)

X_validation_lr = X_validation["temp"]
X_validation_lr = X_validation_lr.values.reshape(-1, 1)

y_train_lr = y_train.values.reshape(-1, 1)
y_validation_lr = y_validation.values.reshape(-1, 1)

model_lr.fit(X_train_lr, y_train_lr)
y_pred_lr = model_lr.predict(X_train_lr)
print(y_pred_lr)

In [None]:
sns.histplot(y_train_lr - y_pred_lr)

In [None]:
map = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(y_train, y_pred_lr, c=y_train, cmap=map)
f.colorbar(points)
plt.show()

In [None]:
print("R2 Score:", metrics.r2_score(y_train_lr, y_pred_lr))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_train_lr, y_pred_lr))
print("Mean Squared Error:", metrics.mean_squared_error(y_train_lr, y_pred_lr))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train_lr, y_pred_lr)))

##### Multiple Regression

In [None]:
model_mlr = LinearRegression()
model_mlr.fit(X_train, y_train)
y_pred_mlr = model_mlr.predict(X_train)
print(y_pred_mlr)

In [None]:
sns.histplot(y_train - y_pred_mlr)

In [None]:
map = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(y_train, y_pred_mlr, c=y_train, cmap=map)
f.colorbar(points)
plt.show()

In [None]:
print("R2 Score:", metrics.r2_score(y_train, y_pred_mlr))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_train, y_pred_mlr))
print("Mean Squared Error:", metrics.mean_squared_error(y_train, y_pred_mlr))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train, y_pred_mlr)))

In [None]:
print(model_mlr.intercept_)
print(model_mlr.coef_)

In [None]:
coeffcients = pd.DataFrame([X_train.columns,model_mlr.coef_]).T
coeffcients = coeffcients.rename(columns={0: 'Attribute', 1: 'Coefficients'})
coeffcients

##### KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

model_knn = KNeighborsRegressor(n_neighbors=2)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_train)
print(y_pred_knn)

In [None]:
sns.histplot(y_train - y_pred_knn)

In [None]:
map = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(y_train, y_pred_knn, c=y_train, cmap=map)
f.colorbar(points)
plt.show()

In [None]:
print("R2 Score:", metrics.r2_score(y_train, y_pred_knn))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_train, y_pred_knn))
print("Mean Squared Error:", metrics.mean_squared_error(y_train, y_pred_knn))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train, y_pred_knn)))

##### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

model_dt = DecisionTreeRegressor()
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_train)
print(y_pred_dt)

In [None]:
sns.histplot(y_train - y_pred_dt)

In [None]:
map = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(y_train, y_pred_dt, c=y_train, cmap=map)
f.colorbar(points)
plt.show()

In [None]:
print("R2 Score:", metrics.r2_score(y_train, y_pred_dt))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_train, y_pred_dt))
print("Mean Squared Error:", metrics.mean_squared_error(y_train, y_pred_dt))
print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train, y_pred_dt)))

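> So far, judging by the training-set scores above, Decision Tree looks like the best model for this problem and RandomForest the second best. Keep in mind that scores computed on the data a model was fitted on favor these two models by construction (a fully grown tree can reproduce its training targets almost exactly), so the ranking should be confirmed on the held-out validation split (`X_validation`, `y_validation`) before drawing a final conclusion. The sample test predictions below point the same way: the predicted values of these two models are closer to each other overall.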

### Results evaluation

In [None]:
# Give the models exactly the columns (and column order) they were fitted on.
# Indicator columns missing from the test data are filled with 0.
test_features = test.reindex(columns=X.columns, fill_value=0)

# Each prediction is flattened to a 1-D array so that every cell of the
# comparison table holds a number rather than a one-element list.

# Random Forest Regressor
y_pred_test_rf = model_rf.predict(test_features).ravel()

# if you want to save in the file, use the following code
# output = pd.DataFrame({'count': y_pred_test_rf})
# output.head()
# output.to_csv('output.csv', index=False)

# Linear Regression (fitted on the single `temp` feature, so it only receives that column)
y_pred_test_lr = model_lr.predict(test["temp"].values.reshape(-1, 1)).ravel()

# Multiple Linear Regression
y_pred_test_mlr = model_mlr.predict(test_features).ravel()

# KNN Regression
y_pred_test_knn = model_knn.predict(test_features).ravel()

# Decision Tree Regressor
y_pred_test_dt = model_dt.predict(test_features).ravel()

# Create a dataframe from the predictions, one column per model
preds = pd.DataFrame(
    {
        "Random Forest Regressor": y_pred_test_rf,
        "Linear Regression": y_pred_test_lr,
        "Multiple Linear Regression": y_pred_test_mlr,
        "KNN Regression": y_pred_test_knn,
        "Decision Tree Regressor": y_pred_test_dt,
    }
)

# Show up to 10 distinct rows; the seeded generator makes the sample
# reproducible and replace=False rules out duplicate rows.
rng = np.random.default_rng(0)
random_rows = rng.choice(preds.index, size=min(10, len(preds)), replace=False)
preds.loc[random_rows]