In [1]:
!pip install numpy
!pip install pandas
!pip install Pyarrow
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install klib
!pip install flask



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
import klib

In [3]:
sns.set()

In [4]:
from sklearn.datasets import fetch_openml
boston_data = fetch_openml(name='boston', version=1, as_frame=True)

In [5]:
boston_data.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [6]:
print(boston_data.DESCR)

**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10

In [7]:
boston = pd.DataFrame(data=np.c_[boston_data['data'], boston_data['target']], columns=list(boston_data['feature_names']) + ['target'])
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [8]:
boston.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'target'],
      dtype='object')

In [9]:
boston.dtypes

CRIM       object
ZN         object
INDUS      object
CHAS       object
NOX        object
RM         object
AGE        object
DIS        object
RAD        object
TAX        object
PTRATIO    object
B          object
LSTAT      object
target     object
dtype: object

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(boston.drop('target', axis=1),
                                                    boston[['target']], 
                                                    test_size=0.20, 
                                                    random_state=30)

In [12]:
categorical_column = ['CHAS', 'RAD']
numerical_column = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# categorical_data = boston[categorical_column]
# numerical_data = boston[numerical_column]

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [14]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()) 
])

full_pipeline = ColumnTransformer([
    ('num_encoder', num_pipeline, numerical_column),
    ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_column)
])

In [15]:
y_train

Unnamed: 0,target
471,19.6
436,9.6
428,11.0
38,24.7
157,41.3
...,...
500,16.8
301,22.0
429,9.5
421,14.2


In [16]:
prepared_x = full_pipeline.fit_transform(X_train)
prepared_y = num_pipeline.fit_transform(y_train)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
rdm_reg = RandomForestRegressor(n_jobs=-1, criterion='squared_error')

In [19]:
rdm_reg.fit(prepared_x, prepared_y.ravel())

In [20]:
rdm_reg.score(prepared_x, prepared_y.ravel())

0.9805760286133585

In [21]:
from sklearn.model_selection import cross_val_score

In [22]:
cross_val_score(estimator=rdm_reg, X=prepared_x, y=prepared_y.ravel(), cv = 5)

array([0.81541659, 0.82247497, 0.89099879, 0.82111794, 0.8588614 ])

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
param_grid = dict(n_estimators=[20, 50, 90, 110],
    criterion=['squared_error'],
    max_depth=range(18,30,2),
    min_samples_split=range(8,20,2),
    max_leaf_nodes=range(20,30,2),
    oob_score=[True],
    verbose=[2])

In [25]:
grid_search = GridSearchCV(estimator=rdm_reg, param_grid=param_grid, cv=5, n_jobs=-1)

In [26]:
grid_search.fit(prepared_x, prepared_y.ravel())

building tree 1 of 20building tree 2 of 20

building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished


In [27]:
best_clf = grid_search.best_estimator_
best_clf

In [28]:
cross_val_score(estimator=best_clf, X=prepared_x, y=prepared_y.ravel(), cv = 5)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 co

building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building t

[Parallel(n_jobs=16)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  11 out of  20 | elapsed:    0.0s re

array([0.78461505, 0.84584012, 0.88419543, 0.79660985, 0.87937621])

In [29]:
# Scores of the random forest with default parameters - 
    # array([0.80675763, 0.8185184 , 0.89299855, 0.82498718, 0.86285874])
    
# Scores of the random forest with defined parameters
    # array([0.79778342, 0.81371299, 0.88251366, 0.7969549 , 0.86269461])
    
# hence we will use random forest with default parameters for our model

## Checking Our Model on Test Data

In [30]:
prepared_test_x = full_pipeline.transform(X_test)
prepared_test_y = num_pipeline.transform(y_test)

In [31]:
cross_val_score(estimator=rdm_reg, X=prepared_test_x, y=prepared_test_y.ravel(), cv = 5)

array([0.76043423, 0.24807467, 0.87801382, 0.83668737, 0.85279745])

In [32]:
from sklearn.metrics import mean_squared_error

In [33]:
y_pred = rdm_reg.predict(prepared_test_x)
y_pred

array([-3.65766932e-01, -1.39401736e+00, -2.77905747e-01,  6.03662967e-01,
       -6.94823965e-01, -3.50454586e-01,  5.09400814e-02,  2.39151136e+00,
       -7.88070872e-01,  1.52635179e-01,  3.80525127e-01,  6.87511262e-01,
       -6.44134820e-01,  5.08198411e-01, -7.31679198e-01,  4.08509759e-01,
       -1.13592513e+00,  1.64462646e-01, -5.34836351e-01,  3.33044830e-02,
       -2.31546300e-01, -8.70757540e-01,  3.78835489e-01, -2.95646948e-01,
       -1.33139515e+00, -8.32107067e-01, -2.35664793e-01, -1.14205007e+00,
       -8.08346530e-01, -8.99903798e-01, -7.70540876e-01, -2.77272133e-01,
        1.07813449e+00,  9.80387454e-02, -2.17184376e-01, -1.92684622e-01,
       -8.67800673e-01, -1.38827405e-01, -3.35353445e-01, -2.75160085e-01,
       -3.46652900e-01, -3.22469954e-01,  2.61295956e+00,  1.14618320e-01,
       -8.28581411e-02, -2.87832371e-01, -2.36509612e-01, -2.94590924e-01,
       -2.59742137e-01, -5.13504669e-01, -6.31462534e-01, -5.05267683e-01,
       -2.08736185e-01,  

In [34]:
np.sqrt(mean_squared_error(y_pred, prepared_test_y))

0.27491752633270145

In [35]:
import joblib
import os

In [36]:
DIR = 'model'

if not os.path.isdir(DIR):
    os.mkdir(DIR)

In [37]:
joblib.dump(num_pipeline, os.path.join(DIR, 'num_pipeline.pkl'))
joblib.dump(full_pipeline, os.path.join(DIR, 'full_pipeline.pkl'))
joblib.dump(best_clf, os.path.join(DIR, 'random_forest.pkl'))

['model\\random_forest.pkl']