# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

Importing the categorical variables encoded dataset 

In [2]:
data = pd.read_csv('Metro-Interstate-Traffic-Volume-Encoded.csv')

In [3]:
data.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,Year,Month,Day,Hour,weather_main,weather_description,traffic_volume
0,7,288.28,0.0,0.0,2012,10,2,9,1,24,5545
1,7,289.36,0.0,0.0,2012,10,2,10,1,2,4516
2,7,289.58,0.0,0.0,2012,10,2,11,1,19,4767
3,7,290.13,0.0,0.0,2012,10,2,12,1,19,5026
4,7,291.14,0.0,0.0,2012,10,2,13,1,2,4918


In [4]:
X = data.drop('traffic_volume',axis=1)
Y = data['traffic_volume']

### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

The test size is taken as 25%

In [6]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.25,random_state=0)

# Modeling - Decision Tree

Fitting the model without any hyper parameter and checking the metrics

In [7]:
DT = DecisionTreeRegressor()

In [8]:
DT.fit(xtrain,ytrain)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [9]:
pred = DT.predict(xtest)

### Evaluation Metrics

In [10]:
import sklearn.metrics as metrics

In [11]:
print('R Squared : ',metrics.r2_score(ytest,pred))

R Squared :  0.6838977670564941


In [12]:
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))

Mean Absolute Error :  611.2754128288109


In [13]:
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))

Mean Squared Error :  1234052.3612978177


In [14]:
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

Root Mean Squared Error :  1110.879093915183


Decision Tree model without any hyper parameter gives 68 % Score of R squared. Lets try finding the best hyper paramter for the model using Grid search

## Grid Search for Hyper Parameter

In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
Dt = DecisionTreeRegressor()

In [17]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(DT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
GS.best_params_

{'max_depth': 7}

So from the Grid search we have got the best maxmium depth as 7 lets implement it and check the accuarcy.

In [19]:
DT = DecisionTreeRegressor(max_depth=7)

In [20]:
DT.fit(xtrain,ytrain)

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [21]:
pred = DT.predict(xtest)

In [22]:
print('R Squared : ',metrics.r2_score(ytest,pred))

R Squared :  0.7819791791506119


In [23]:
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))

Mean Absolute Error :  622.4893981948587


In [24]:
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))

Mean Squared Error :  851145.8659305343


In [25]:
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

Root Mean Squared Error :  922.5756694876222


In [26]:
print('Bias Error')
print('Actual value :',np.mean(ytest))
print('Predicted value :',np.mean(pred))

Bias Error
Actual value : 3257.4061903576467
Predicted value : 3257.3341148925037


In [27]:
print('Variance Error')
print('Actual value :',np.var(ytest,ddof=1))
print('Predicted value :',np.var(pred,ddof=1))

Variance Error
Actual value : 3904289.9532529605
Predicted value : 3101054.3039220218


## Inference:
    Decision tree has given a better R squared score.
    It has less Bias error.
    The variance error is 0.79 and the model 21 percent chances for overfitting

# Modeling - Random Forest

Fitting the model without any hyper parameter and checking the metrics

In [28]:
RT = RandomForestRegressor()

In [29]:
RT.fit(xtrain,ytrain)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [30]:
pred = RT.predict(xtest)

### Evaluation Metrics

In [31]:
print('R Squared : ',metrics.r2_score(ytest,pred))

R Squared :  0.8257104291523046


In [32]:
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))

Mean Absolute Error :  526.6772301053854


In [33]:
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))

Mean Squared Error :  680420.5539814124


In [34]:
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

Root Mean Squared Error :  824.8760840159039


Random Forest model without any hyper parameter gives 81 % Score of R squared. Lets try finding the best hyper paramter for the model using Grid search

## Grid Search for Hyper Parameter

In [35]:
RT = RandomForestRegressor()

In [36]:
parameter ={'max_depth':np.arange(1,10),'n_estimators':np.arange(1,15)}
GS=GridSearchCV(RT,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'n_estimators': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
GS.best_params_

{'max_depth': 7, 'n_estimators': 10}

So from the Grid search we have got the best maxmium depth as 7 and the estimator is 9 lets implement it and check the accuarcy.

In [38]:
RT = RandomForestRegressor(n_estimators=9,max_depth=7)

In [39]:
RT.fit(xtrain,ytrain)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=9, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [40]:
pred = RT.predict(xtest)

In [41]:
print('R Squared : ',metrics.r2_score(ytest,pred))

R Squared :  0.7870667936812938


In [42]:
print('Mean Absolute Error : ',metrics.mean_absolute_error(ytest,pred))

Mean Absolute Error :  617.5657036615753


In [43]:
print('Mean Squared Error : ',metrics.mean_squared_error(ytest,pred))

Mean Squared Error :  831283.9919206687


In [44]:
print('Root Mean Squared Error : ',np.sqrt(metrics.mean_squared_error(ytest,pred)))

Root Mean Squared Error :  911.7477677080809


In [45]:
print('Bias Error')
print('Actual value :',np.mean(ytest))
print('Predicted value :',np.mean(pred))

Bias Error
Actual value : 3257.4061903576467
Predicted value : 3253.0828561715643


In [46]:
print('Variance Error')
print('Actual value :',np.var(ytest,ddof=1))
print('Predicted value :',np.var(pred,ddof=1))

Variance Error
Actual value : 3904289.9532529605
Predicted value : 3073390.9469802957


## Inference:
    Random forest has given a better R squared score than the Decision tree.
    But both the bias error as well as variance error has increased a little. 

# Overall comparing all the models Decision tree is the better model. All the scores MAE,MSE,RMSE,R^2 scores are better when compared with other model. Since it as less chance of overfitting when compared with other models it can be used for prediction.The variance error can further reduced by using Ensembling Techniques.