# Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## Importing the preloaded dataset

In [2]:
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
data = pd.DataFrame(boston.data,columns=boston.feature_names)

In [4]:
data['medv']= boston.target

In [5]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,medv
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


Checking Null values

In [6]:
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
medv       0
dtype: int64

In [7]:
x = data.drop('medv',axis=1)
y = data.medv

## Using OLS model to check P values of all the columns

In [8]:
import statsmodels.api as sm

In [9]:
model = sm.OLS(y,x).fit()

In [10]:
model.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.959
Model:,OLS,Adj. R-squared:,0.958
Method:,Least Squares,F-statistic:,891.3
Date:,"Sun, 16 Jun 2019",Prob (F-statistic):,0.0
Time:,09:03:51,Log-Likelihood:,-1523.8
No. Observations:,506,AIC:,3074.0
Df Residuals:,493,BIC:,3128.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CRIM,-0.0929,0.034,-2.699,0.007,-0.161,-0.025
ZN,0.0487,0.014,3.382,0.001,0.020,0.077
INDUS,-0.0041,0.064,-0.063,0.950,-0.131,0.123
CHAS,2.8540,0.904,3.157,0.002,1.078,4.630
NOX,-2.8684,3.359,-0.854,0.394,-9.468,3.731
RM,5.9281,0.309,19.178,0.000,5.321,6.535
AGE,-0.0073,0.014,-0.526,0.599,-0.034,0.020
DIS,-0.9685,0.196,-4.951,0.000,-1.353,-0.584
RAD,0.1712,0.067,2.564,0.011,0.040,0.302

0,1,2,3
Omnibus:,204.082,Durbin-Watson:,0.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1374.225
Skew:,1.609,Prob(JB):,3.9e-299
Kurtosis:,10.404,Cond. No.,8500.0


In [11]:
data.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'medv'],
      dtype='object')

### dropping the columns which is not significant

In [12]:
x = data.drop(['INDUS','AGE','medv'],axis=1)
y = data.medv
y = np.array(y).reshape(-1,1)

## Scaling the data set

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
sc= StandardScaler()

In [15]:
#after y predict sc.fit_inverse_tranform
x  = sc.fit_transform(x)
y  = sc.fit_transform(y)

## Train Test split

In [16]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size =0.30,random_state=0)

# Modelling

## Linear Regression

In [17]:
LR = LinearRegression()

In [18]:
LR.fit(xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
metrics.r2_score(ytest,LR.predict(xtest))

0.6751767591417993

In [20]:
np.sqrt(metrics.mean_squared_error(ytest,LR.predict(xtest)))

0.5660236359080063

## KNN 

In [21]:
knn = KNeighborsRegressor()

In [22]:
parameter ={'n_neighbors':np.arange(1,10)}
GS=GridSearchCV(knn,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
GS.best_params_

{'n_neighbors': 4}

In [24]:
knn = KNeighborsRegressor(4)

In [25]:
knn.fit(xtrain,ytrain)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=4, p=2,
          weights='uniform')

In [26]:
metrics.r2_score(ytest,knn.predict(xtest))

0.6931747733475818

In [27]:
print('RMSE:',np.sqrt(metrics.mean_squared_error(ytest,knn.predict(xtest))))

RMSE: 0.5501188811590958


## Decision Tree 

In [28]:
dt = DecisionTreeRegressor()

In [29]:
parameter ={'max_depth':np.arange(1,10)}
GS=GridSearchCV(dt,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
GS.best_params_

{'max_depth': 9}

In [31]:
dt = DecisionTreeRegressor(max_depth=4)
dt.fit(xtrain,ytrain)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [32]:
metrics.r2_score(ytest,dt.predict(xtest))

0.6939953056853774

In [33]:
print('RMSE:',np.sqrt(metrics.mean_squared_error(ytest,dt.predict(xtest))))

RMSE: 0.5493828065454935


## Random Forest ( Bagging by nature)

In [34]:
y=y.reshape(506,)
ytrain=ytrain.reshape(len(ytrain),)
ytest=ytest.reshape(len(ytest),)

In [35]:
rf = RandomForestRegressor(random_state=0)

In [36]:
parameter ={'n_estimators':np.arange(1,10),'max_depth':np.arange(1,10)}
GS=GridSearchCV(rf,parameter,cv=3)
GS.fit(x,y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [37]:
GS.best_params_

{'max_depth': 5, 'n_estimators': 2}

In [38]:
rf = RandomForestRegressor(max_depth=5,n_estimators=2,random_state=0)

In [39]:
rf.fit(xtrain,ytrain)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [40]:
metrics.r2_score(ytest,rf.predict(xtest))

0.7404471170787381

In [41]:
print('RMSE:',np.sqrt(metrics.mean_squared_error(ytest,rf.predict(xtest))))

RMSE: 0.5059690450554135


# Ada Boost Regressor

In [42]:
from sklearn.ensemble import AdaBoostRegressor

In [43]:
bt_LR = AdaBoostRegressor(base_estimator=LR,n_estimators=300,random_state=0)

In [44]:
bt_dt = AdaBoostRegressor(base_estimator=dt,n_estimators=300,random_state=0)

In [45]:
bt_rf = AdaBoostRegressor(base_estimator=rf,n_estimators=50,random_state=0)

In [46]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5,shuffle=True,random_state=2)
for model,name in zip([LR,bt_LR,knn,dt,bt_dt,rf,bt_rf],
                      ['LR','Boost_LR','knn','dt','Boost_dt','rf','Boost_rf']):
    rmse=[]
    for train,test in kf.split(x,y):
        xtrain,xtest=x[train,:],x[test,:]
        ytrain,ytest=y[train],y[test]
        model.fit(xtrain,ytrain)
        y_predict = model.predict(xtest)
        mse=metrics.mean_squared_error(ytest,y_predict)
        rmse.append(np.sqrt(mse))
    print(' B.E: %0.02f  V.E: (+/- %0.5f) [%s]' %(np.mean(rmse),np.var(rmse,ddof=1),name))

 B.E: 0.52  V.E: (+/- 0.00731) [LR]
 B.E: 0.56  V.E: (+/- 0.00489) [Boost_LR]
 B.E: 0.46  V.E: (+/- 0.00760) [knn]
 B.E: 0.50  V.E: (+/- 0.01373) [dt]
 B.E: 0.36  V.E: (+/- 0.00231) [Boost_dt]
 B.E: 0.49  V.E: (+/- 0.01452) [rf]
 B.E: 0.34  V.E: (+/- 0.00082) [Boost_rf]


# Inference :
          The bias error is controlled well for Linear regression because of its nature. Base Linear model has high bias and less variance so it is reduced significantly compared to others.