In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
import klib

In [2]:
sns.set()

In [3]:
boston_data = datasets.load_boston()

In [4]:
boston_data.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [5]:
print(boston_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [6]:
boston = pd.DataFrame(data=np.c_[boston_data['data'], boston_data['target']], columns=list(boston_data['feature_names']) + ['target'])
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [7]:
boston.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD        float64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
target     float64
dtype: object

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(boston.drop('target', axis=1),
                                                    boston[['target']], 
                                                    test_size=0.20, 
                                                    random_state=30)

In [10]:
categorical_column = ['CHAS', 'RAD']
numerical_column = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# categorical_data = boston[categorical_column]
# numerical_data = boston[numerical_column]

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [12]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()) 
])

full_pipeline = ColumnTransformer([
    ('num_encoder', num_pipeline, numerical_column),
    ('categorical_encoder', OneHotEncoder(), categorical_column)
])

In [13]:
prepared_x = full_pipeline.fit_transform(X_train)
prepared_y = num_pipeline.fit_transform(y_train)

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
rdm_reg = RandomForestRegressor(n_jobs=-1)

In [16]:
rdm_reg.fit(prepared_x, prepared_y.ravel())

RandomForestRegressor(n_jobs=-1)

In [17]:
rdm_reg.score(prepared_x, prepared_y.ravel())

0.9812409313050927

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
cross_val_score(estimator=rdm_reg, X=prepared_x, y=prepared_y.ravel(), cv = 5)

array([0.80675763, 0.8185184 , 0.89299855, 0.82498718, 0.86285874])

In [20]:
from sklearn.model_selection import GridSearchCV

In [21]:
param_grid = dict(n_estimators=[20, 50, 90, 110],
    criterion=['mse'],
    max_depth=range(18,30,2),
    min_samples_split=range(8,20,2),
    max_leaf_nodes=range(20,30,2),
    oob_score=[True],
    verbose=[2])

In [22]:
grid_search = GridSearchCV(estimator=rdm_reg, param_grid=param_grid, cv=5, n_jobs=-1)

In [23]:
grid_search.fit(prepared_x, prepared_y.ravel())

building tree 1 of 90
building tree 2 of 90
building tree 3 of 90
building tree 4 of 90building tree 5 of 90
building tree 6 of 90
building tree 7 of 90
building tree 8 of 90
building tree 9 of 90

building tree 10 of 90
building tree 11 of 90building tree 12 of 90building tree 13 of 90

building tree 14 of 90building tree 15 of 90


building tree 16 of 90building tree 17 of 90
building tree 18 of 90
building tree 19 of 90
building tree 20 of 90

building tree 21 of 90building tree 22 of 90building tree 23 of 90

building tree 24 of 90building tree 25 of 90building tree 26 of 90

building tree 27 of 90

building tree 28 of 90
building tree 29 of 90
building tree 30 of 90
building tree 31 of 90building tree 32 of 90building tree 33 of 90


building tree 34 of 90
building tree 35 of 90
building tree 36 of 90building tree 37 of 90building tree 38 of 90

building tree 39 of 90
building tree 40 of 90building tree 41 of 90building tree 42 of 90building tree 43 of 90



building tree 44 of 90

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.0s finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1), n_jobs=-1,
             param_grid={'criterion': ['mse'], 'max_depth': range(18, 30, 2),
                         'max_leaf_nodes': range(20, 30, 2),
                         'min_samples_split': range(8, 20, 2),
                         'n_estimators': [20, 50, 90, 110], 'oob_score': [True],
                         'verbose': [2]})

In [24]:
best_clf = grid_search.best_estimator_
best_clf

RandomForestRegressor(max_depth=22, max_leaf_nodes=24, min_samples_split=12,
                      n_estimators=90, n_jobs=-1, oob_score=True, verbose=2)

In [25]:
cross_val_score(estimator=best_clf, X=prepared_x, y=prepared_y.ravel(), cv = 5)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  90 out of  90 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  90 out of  90 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n

array([0.79778342, 0.81371299, 0.88251366, 0.7969549 , 0.86269461])

In [30]:
# Scores of the random forest with default parameters - 
    # array([0.80675763, 0.8185184 , 0.89299855, 0.82498718, 0.86285874])
    
# Scores of the random forest with defined parameters
    # array([0.79778342, 0.81371299, 0.88251366, 0.7969549 , 0.86269461])
    
# hence we will use random forest with default parameters for our model

## Checking Our Model on Test Data

In [26]:
prepared_test_x = full_pipeline.transform(X_test)
prepared_test_y = num_pipeline.transform(y_test)

In [29]:
cross_val_score(estimator=rdm_reg, X=prepared_test_x, y=prepared_test_y.ravel(), cv = 5)

array([0.74259188, 0.26178383, 0.86218115, 0.84628173, 0.86468486])

In [28]:
from sklearn.metrics import mean_squared_error

In [31]:
y_pred = rdm_reg.predict(prepared_test_x)
y_pred

array([-0.36956862, -1.35452207, -0.2582637 ,  0.51622419, -0.71911251,
       -0.34528007,  0.12348892,  2.36278751, -0.78289636,  0.17438927,
        0.41907   ,  0.60366297, -0.6611368 ,  0.55476906, -0.75438371,
        0.40312404, -1.17626524,  0.29234713, -0.51065341,  0.04418153,
       -0.2801234 , -0.78817647,  0.37704025, -0.25076593, -1.28661973,
       -0.73072878, -0.31349375, -1.20731234, -0.8292558 , -0.87065194,
       -0.73685371, -0.26565587,  1.20052765,  0.10891579, -0.25530684,
       -0.19226221, -0.90064301, -0.15931427, -0.36386609, -0.26882394,
       -0.27917298, -0.35351706,  2.72584851,  0.1316203 , -0.15825824,
       -0.26069256,  0.03298768, -0.32606044, -0.25520123, -0.52512093,
       -0.64519084, -0.57179719, -0.20873618,  0.24757172,  0.53586624,
        0.0118672 ,  2.79586289,  0.44430897, -0.33640947,  1.41162682,
       -0.20672974, -1.09695785,  0.27640117, -0.28381948, -1.33730888,
       -1.33604165, -0.79324539,  0.04069665,  2.11831798,  1.29

In [34]:
np.sqrt(mean_squared_error(y_pred, prepared_test_y))

0.26604792167376806

In [35]:
import joblib

In [None]:
joblib.dump(value=num_pipeline, 'num_pipeline.pkl')
joblib.dump(value=full_pipeline, 'full_pipeline.pkl')
joblib.dump(value=num_pipeline, 'num_pipeline.pkl')