In [24]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import statsmodels.api as sm 
from sklearn.datasets import load_boston
boston= load_boston()
bos = pd.DataFrame(boston.data)

In [25]:
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [26]:
price = boston.target

In [27]:
boston = pd.DataFrame(boston.data, columns=boston.feature_names)
boston['price'] = price
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [28]:
boston.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
price      0
dtype: int64

In [29]:
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [30]:
# the magnitude of the data differs much hence the data will be standardized before fitting the model
X = boston.drop('price', axis=1)
y= boston['price']

In [31]:
scaler =StandardScaler()

X_scaled = scaler.fit_transform(X)

In [32]:
#lets check out for multi collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
variables = X_scaled

# we create a new data frame which will include all the VIFs
# note that each variable has its own variance inflation factor as this measure is variable specific (not model specific)
# we do not include categorical values for mulitcollinearity as they do not provide much information as numerical ones do
vif = pd.DataFrame()

# here we make use of the variance_inflation_factor, which will basically output the respective VIFs 
vif["VIF"] = [variance_inflation_factor(variables, i) for i in range(variables.shape[1])]
# Finally, I like to include names so it is easier to explore the result
vif["Features"] = X.columns

In [33]:
vif

Unnamed: 0,VIF,Features
0,1.792192,CRIM
1,2.298758,ZN
2,3.991596,INDUS
3,1.073995,CHAS
4,4.39372,NOX
5,1.933744,RM
6,3.100826,AGE
7,3.955945,DIS
8,7.484496,RAD
9,9.008554,TAX



Here, we have the correlation values for all the features. As a thumb rule, a VIF value greater than 5 means a very severe multicollinearity. 

the 'RAD'  and 'TAX' have a high variance inflation factor because they "explain" the same variance within this dataset. We would need to discard one of these variables before moving on to model building or risk building a model with high multicolinearity.

Here we will drop the "TAX" column

let's split our data in train and test.

In [34]:
X =  X.drop(columns=['TAX'])
scaler =StandardScaler()

X_scaled = scaler.fit_transform(X)

In [41]:
# splitting the dataset into train and test sets

x_train,x_test,y_train,y_test = train_test_split(X_scaled,y, test_size= 0.25, random_state=355) 

In [42]:
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
model_dt = dt.fit(x_train,y_train)

In [104]:
# evaluating the model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error

model_dt.score(x_test,y_test)

print('Model accuracy = {}'.format(model_dt.score(x_test,y_test)))

y_pred = model_dt.predict(x_test)

MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)

print('mse = {}\nmae = {}'.format(MSE, MAE))

Model accuracy = 0.7930170712696701
mse = 18.503070866141734
mae = 3.104724409448819


The model accuracy stood at about 79% using one decision tree to predict the price 

In [44]:
#creating an instance of the model
rf = RandomForestRegressor(random_state=42)

In [45]:
# fitting the model

model = rf.fit(x_train,y_train)

In [46]:
# evaluating the model performance
rf.score(x_test,y_test)

0.8553770765153326

In [35]:
# we are tuning the following  hyperparameters right now, we are passing the different values for both parameters
grid_param = {
    "n_estimators" : [10,20,50,80,100,120,150,200],
    'max_depth' : range(5,15,1),
    'min_samples_leaf' : range(1,10,1),
    'min_samples_split': range(2,10,1),
    'max_features' : ['auto','log2']
}

In [36]:
grid_search = GridSearchCV(estimator=rf, param_grid=grid_param, cv=5, n_jobs =-1, verbose = 3)

In [37]:
grid_search.fit(x_train,y_train)

Fitting 5 folds for each of 11520 candidates, totalling 57600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed:   40.7s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2040 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 3864 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 4600 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 5400 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 6264 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | e

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 15),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 10),
                         'min_samples_split': range(2, 10),
                         'n_estimators': [10, 20, 50, 80, 100, 120, 150, 200]},
             verbose=3)

In [38]:
grid_search.best_params_

{'max_depth': 12,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'n_estimators': 50}

In [74]:
rf = RandomForestRegressor(random_state=50,max_depth=12,max_features='log2', min_samples_leaf =1,min_samples_split= 3,n_estimators =50)

In [75]:
model_new= rf.fit(x_train,y_train)

In [103]:
# evaluating the model performance
from sklearn.metrics import mean_squared_error, mean_absolute_error

print('Model accuracy = {}'.format(model_new.score(x_test,y_test)))

y_pred = model_new.predict(x_test)

MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)

print('mse = {}\nmae = {}'.format(MSE, MAE))

Model accuracy = 0.8790201573904806
mse = 10.814894807557124
mae = 2.3587968255786342


Our model accuracy Increased from about 85% to 87.9% after carrying out hyper parameter tunning