# <center>Bike Sharing Demand</center>

### <div align='right'>Filip Kowalski</div>

In [None]:
from pandas import read_csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  StandardScaler
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [None]:
# Default size (width, height in inches) for every matplotlib figure drawn below.
plt.rcParams['figure.figsize'] = (10,10)

# Load Data

In [None]:
# Read the training data; the path is relative to the working directory.
filename = 'train.csv'
dataset = pd.read_csv(filename)

# Looking at the data

In [None]:
# Parse the `datetime` column into timestamps and store it back on the frame.
time = pd.to_datetime(dataset['datetime'])
dataset['datetime'] = time

In [None]:
# Preview the first ten rows (the `datetime` column has been parsed above).
dataset.head(10)

### Data Fields
<br>
**datetime** - hourly date + timestamp  
**season** -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
<br>
**holiday** - whether the day is considered a holiday
<br>
**workingday** - whether the day is neither a weekend nor holiday
<br>
**weather**
1. Clear, Few clouds, Partly cloudy, Partly cloudy
2. Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3. Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4. Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog



**temp** - temperature in Celsius
<br>
**atemp** - "feels like" temperature in Celsius
<br>
**humidity** - relative humidity
<br>
**windspeed** - wind speed
<br>
**casual** - number of non-registered user rentals initiated
<br>
**registered** - number of registered user rentals initiated
<br>
**count** - number of total rentals

In [None]:
# Print the (rows, columns) shape of the loaded frame.
print(dataset.shape)

# Descriptions of data

In [None]:
# Print summary statistics for the frame's columns.
print(dataset.describe())

There are no null values. The data look correct (there are no strange values such as negative numbers or implausibly low or high numbers).

In [None]:
# Bucket the rental counts using edges 0, 100, ..., 900, then report how many
# rows fall in each bucket and the total rentals they contribute.
bins = pd.cut(dataset['count'], list(range(0, 1000, 100)))
groups = dataset.groupby(bins)['count'].agg(['count', 'sum'])
print(groups)

In [None]:
# Histogram grid for the frame's columns, to eyeball each distribution.
dataset.hist()
plt.show()

In [None]:
# Pairwise scatter plots between columns, to spot relationships visually.
pd.plotting.scatter_matrix(dataset)
plt.show()

In [None]:
# Show two decimals in printed frames. The fully qualified option key is used
# because the bare 'precision' pattern is no longer accepted by recent pandas.
pd.set_option('display.precision', 2)
# Pearson correlation between the numeric columns only: `datetime` is still a
# timestamp column at this point and cannot take part in the correlation.
print(dataset.select_dtypes(include='number').corr(method='pearson'))

# Prepare data

In [None]:
# Show the first rows again before the columns are transformed below.
dataset.head()

In [None]:
# Keep only the hour of day from the timestamp (stored under the same
# `datetime` column name) and drop the two per-user-type rental columns.
dataset = (
    dataset
    .assign(datetime=dataset['datetime'].dt.hour)
    .drop(columns=['registered', 'casual'])
)

In [None]:
# One density plot per column on a 4x4 grid, each with its own axes.
dataset.plot(
    kind='density',
    subplots=True,
    layout=(4, 4),
    sharex=False,
    sharey=False,
    legend=True,
    fontsize=1,
)
plt.show()

In [None]:
# One box plot per column on a 4x4 grid, each with its own axes.
dataset.plot(
    kind='box',
    subplots=True,
    layout=(4, 4),
    sharex=False,
    sharey=False,
)
plt.show()

# Split-out validation dataset

In [None]:
# Convert the prepared frame to a plain NumPy array for scikit-learn.
array = dataset.to_numpy()

In [None]:
# Display the raw array built from the prepared frame.
array

In [None]:
# Hold out 20% of the rows for validation; the seed fixes the shuffle.
validation_size = 0.2
seed = 7

# The last column is the target; every column before it is a feature.
X, Y = array[:, :-1], array[:, -1]
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training target vector.
Y_train.shape

In [None]:
# Shape of the full feature matrix (before splitting).
X.shape

In [None]:
# Shape of the full target vector (before splitting).
Y.shape

# Test options and evaluation metric

In [None]:
# Number of folds for the KFold cross-validation splits built below.
num_folds=10
# scikit-learn scorer name: negated mean squared error, so scores closer to
# zero are better.
scoring='neg_mean_squared_error'

# Spot-check algorithms

In [None]:
# Candidate regressors, all with default hyper-parameters, as (label, estimator).
models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]

# Evaluate models

In [None]:
# Cross-validate each candidate on the training split and print the mean and
# standard deviation of its fold scores.
results, names = [], []
for name, model in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    message = "%a: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(message)

In [None]:
# Box plot of the per-fold scores, one box per algorithm.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comprasion')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

In [None]:
# Baseline: fit a decision tree with default hyper-parameters on the training
# split. The fixed random_state keeps the fitted tree, and therefore the
# validation MSE computed from it, identical across re-runs.
model = DecisionTreeRegressor(random_state=seed)
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict on the held-out validation split and print the mean squared error.
predictions=model.predict(X_validation)
print(mean_squared_error(Y_validation,predictions))

In [None]:
# Display the predicted values for the validation split.
predictions

In [None]:
# Display the true targets of the validation split for comparison.
Y_validation

These models don't perform well. We should try standardizing the data.

# Standardize the data

In [None]:
# Same candidate regressors as before, each preceded by a StandardScaler step
# so that scaling is fitted inside every cross-validation fold.
base_estimators = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('EN', ElasticNet()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('SVR', SVR()),
]
piplines = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in base_estimators
]

# Cross-validate each pipeline and print the mean and standard deviation of
# its fold scores.
results, names = [], []
for name, model in piplines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    message = "%a: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(message)




In [None]:
# Box plot of the per-fold scores, one box per scaled pipeline.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comprasion')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Tuning ScaledCART

In [None]:

# Search space for tuning a single decision tree.
# (Forest-only settings such as the number of trees or bootstrapping do not
# apply to DecisionTreeRegressor and are deliberately not defined here.)

# Strategy used to choose the split at each node
splitter = ['best', 'random']
# Maximum number of levels in the tree (None = grow until leaves are pure or
# hit the minimum-sample limits)
max_depth = [int(x) for x in np.linspace(2, 150, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 3, 7, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 12]
# Create the random grid
random_grid = {
    'splitter': splitter,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}
print(random_grid)

In [None]:
# Randomized search over the decision-tree grid with 10-fold cross-validation.
# - The estimator is seeded so candidates using splitter='random' give the same
#   score on every run (the search's own random_state only fixes which
#   parameter combinations are sampled).
# - Candidates are ranked with the notebook-wide `scoring` metric rather than
#   the estimator's default score, so tuning optimizes what is reported.
CART = DecisionTreeRegressor(random_state=seed)
CART_random = RandomizedSearchCV(
    estimator=CART,
    param_distributions=random_grid,
    n_iter=100,
    cv=10,
    scoring=scoring,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

CART_random.fit(X_train, Y_train)

In [None]:
# Parameter combination that scored best in the randomized search above.
CART_random.best_params_

Best parameters for this model

# Evaluate tuned model

In [None]:
# Cross-validate the tuned decision tree. The hyper-parameters are taken from
# the search result rather than typed in by hand, so this cell always
# evaluates the configuration the search actually selected.
models = [
    ('CART', DecisionTreeRegressor(random_state=seed, **CART_random.best_params_)),
]
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    message = "%a: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(message)

In [None]:
# Fit the tuned decision tree on the full training split and print its mean
# squared error on the validation split. The hyper-parameters come from the
# search result so this cell and the search cannot drift apart.
model = DecisionTreeRegressor(random_state=seed, **CART_random.best_params_)
model.fit(X=X_train, y=Y_train)
predictions = model.predict(X_validation)
print(mean_squared_error(Y_validation, predictions))

The tuned model gets a much better result than the one with default parameters.

# Ensemble Methods

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Ensemble regressors with default hyper-parameters, each behind a
# StandardScaler step, as (label, pipeline) pairs.
ensemble_estimators = [
    ('AB', AdaBoostRegressor()),
    ('GBM', GradientBoostingRegressor()),
    ('RF', RandomForestRegressor()),
    ('ET', ExtraTreesRegressor()),
]
ensembles = [
    ('Scaled' + label, Pipeline([('Scaler', StandardScaler()), (label, estimator)]))
    for label, estimator in ensemble_estimators
]

# Cross-validate each ensemble and print the mean and standard deviation of
# its fold scores.
results, names = [], []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    message = "%a: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(message)

In [None]:
# Box plot of the per-fold scores, one box per ensemble pipeline.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comprasion')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

# Tuning RF

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; the
# 'auto' spelling is no longer accepted by current scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

for i in random_grid:
    print(i, random_grid[i])


In [None]:
# Randomized search over the random-forest grid: 100 sampled combinations,
# each scored with 3-fold cross-validation, using all available cores.
RF = RandomForestRegressor()
RF_random = RandomizedSearchCV(
    estimator=RF,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
RF_random.fit(X_train, Y_train)

In [None]:
# Parameter combination that scored best in the randomized search above.
RF_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [None]:
# Cross-validate the random forest with the tuned hyper-parameters.
# max_features=1.0 (all features) is the current spelling of what 'auto'
# meant for regressors; 'auto' itself is rejected by recent scikit-learn.
models = []
models.append(('RF', RandomForestRegressor(
    random_state=seed,
    n_estimators=800,
    max_features=1.0,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
)))
results = []
names = []
for name, model in models:
    kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    message = "%a: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(message)

In [None]:
# Fit the tuned random forest on the unscaled training split and print its
# mean squared error on the validation split.
# - random_state makes the forest, and so the printed MSE, reproducible.
# - max_features=1.0 (all features) replaces 'auto', which recent
#   scikit-learn releases no longer accept.
model = RandomForestRegressor(
    random_state=seed,
    n_estimators=800,
    max_features=1.0,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
)
model.fit(X=X_train, y=Y_train)
predictions = model.predict(X_validation)
print(mean_squared_error(Y_validation, predictions))

In [None]:
# Fit the scaler on the training split only, then train the tuned random
# forest on the standardized features.
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
# max_features=1.0 (all features) replaces 'auto', which recent scikit-learn
# releases no longer accept.
model = RandomForestRegressor(
    random_state=seed,
    n_estimators=800,
    max_features=1.0,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
)
model.fit(rescaledX, Y_train)

In [None]:
# Standardize the validation features with the scaler fitted on the training
# split, predict, and print the mean squared error.
rescaledValidationX=scaler.transform(X_validation)
predictions=model.predict(rescaledValidationX)
print(mean_squared_error(Y_validation,predictions))

In [None]:
# Display the final model's predictions for the validation split.
predictions

In [None]:
# Display the true validation targets for comparison.
Y_validation

In [None]:
# Pair the true validation targets with the predictions, keyed by column name.
solution={'Y_validation':Y_validation,'predictions':predictions}

In [None]:
# Tabulate truth vs. prediction, ordered by the true value, with a fresh
# 0..n-1 index so the rows can be plotted left to right.
sorted_solution = (
    pd.DataFrame(solution)
    .sort_values(by='Y_validation')
    .reset_index(drop=True)
)
sorted_solution

In [None]:
# Plot both columns of the sorted frame, then give each drawn line its own
# width from `lws` (first line thick, second line thin).
lws = [6, 1]
ax = sorted_solution.plot(legend=True, figsize=[20, 20], color=['r', 'b'])
for i, l in enumerate(ax.lines):
    l.set_linewidth(lws[i])

The solution isn't perfect, but it's the best of the tested algorithms.