#Dataset details : Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv
	
	- instant: record index
	- dteday : date
	- season : season (1:spring, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided by 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided by 50 (max)
	- hum: Normalized humidity. The values are divided by 100 (max)
	- windspeed: Normalized wind speed. The values are divided by 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered

#Business usecase:
-Prediction of hourly/daily rentals according to the seasons and other climate factors.

## Dataset Exploration

In [106]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the hourly records into a DataFrame. The absolute path below is
# specific to the original author's machine.
hour_csv_path = "C://Users//saxen//OneDrive//Documents//GitHub//Deploy-BikeshareMLModel-GCP//data//hour.csv"
df = pd.read_csv(hour_csv_path)
# Preview the first five rows.
df.head(n=5)

In [None]:
# Check the data: print each column's name, non-null count and dtype,
# plus the DataFrame's memory usage.
df.info()


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Check for duplicates: keep the fully duplicated rows for inspection and
# display how many there are. Summing the duplicated rows themselves would add
# up their column values, which is not a duplicate count.
duplicate_mask = df.duplicated()
duplicateRows = df[duplicate_mask]
duplicate_mask.sum()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Check the shape of the data as a (rows, columns) tuple.
df.shape

In [None]:
# Data type of each column.
df.dtypes

# How many categorical and continuous data
##Categorical Columns: season, weathersit, yr (0-2011, 1-2012), weekday (0-6, Sunday-Saturday), mnth
##Continuous Columns: instant, casual, temp, windspeed, atemp, hum, registered

# Cleaning the data 

In [114]:
# Parse the 'dteday' column into datetime values and store it back in place.
parsed_dates = pd.to_datetime(df['dteday'])
df['dteday'] = parsed_dates

In [None]:
# Preview the first five rows again.
df.head()

In [None]:
# Drop unnecessary columns (time-based and redundant data): the record index,
# the date, and the casual/registered counts that together make up 'cnt'.
col = ['instant', 'dteday', 'casual', 'registered']
df.drop(columns=col, inplace=True)
df.head()

In [None]:
# Check the correlation of the data: pairwise correlations between columns,
# drawn as an annotated heatmap.
import seaborn as sb
# numeric_only=True matches the df.corr call in the following cell and keeps
# this cell working if a non-numeric column is still present in df.
corr = df.corr(numeric_only=True)
sb.heatmap(corr, cmap="Blues", annot=True)

In [None]:
# Show the correlation matrix of the numeric columns as a table.
df.corr(numeric_only=True)

In [119]:
# Features: every column except the target 'cnt'.
X = df.drop(columns=['cnt'])
# Target: 'cnt', kept as a one-column DataFrame.
Y = df.loc[:, ['cnt']]

In [None]:
# Split the dataset into train and test sets.
# NOTE(review): make_blobs is not used in the visible cells; the import is kept
# in case a later cell relies on it.
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing. A fixed random_state makes the split,
# and therefore every score computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Flatten the target once: scikit-learn expects a 1-D y for single-output
# regression (a column vector triggers a DataConversionWarning), and it gives
# kdeplot a single 1-D sample to draw.
y_train_flat = np.ravel(y_train)

# Fit a random forest for several forest sizes and, for each, print the
# training-set R^2 and plot the distribution of predictions against the
# distribution of the true training targets.
for i in (10, 20, 30, 40, 100, 150, 300):
    model = RandomForestRegressor(n_estimators=i, n_jobs=-1, random_state=15)
    model.fit(X_train, y_train_flat)

    # score() returns R^2; here it is measured on the training data itself.
    relation = model.score(X_train, y_train_flat)
    print('relation : ', relation)
    plt.figure(figsize=(20, 10))
    y_p = model.predict(X_train)
    ax1 = sns.kdeplot(y_train_flat, label='y_train', color="red")
    ax2 = sns.kdeplot(y_p, label='y_pred', color="blue")

    # The title is the number of trees used for this figure.
    plt.title(i)
    plt.legend()
    plt.show()

In [None]:
# Fit the final 300-tree random forest on the training split and predict the
# held-out test split.
model = RandomForestRegressor(n_estimators=300, n_jobs=-1, random_state=0)
# np.ravel turns the one-column target into the 1-D array scikit-learn
# expects, avoiding a DataConversionWarning.
model.fit(X_train, np.ravel(y_train))

predictions = model.predict(X_test)

In [None]:
# Display the model's predictions for the test split.
predictions

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the test-set predictions: take the mean squared
# error first, then its square root.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

# Print the RMSE score
print(rmse)

In [125]:

# Import r2_score in this cell: no other visible cell imports it, so the call
# below would otherwise raise NameError.
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test split.
r2_square = r2_score(y_test, predictions)

In [None]:
# Display the R^2 score computed from y_test and predictions.
r2_square

In [127]:
# Persist the fitted model as a pickle file in the current working directory.
import pickle

model_path = 'regression_model.pkl'
with open(model_path, mode='wb') as pkl_file:
    pickle.dump(model, pkl_file)