In [1]:
!pip install numpy
!pip install pandas
!pip install Pyarrow
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install klib
!pip install flask

Collecting flask
  Downloading flask-3.0.1-py3-none-any.whl.metadata (3.6 kB)
Collecting Werkzeug>=3.0.0 (from flask)
  Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting itsdangerous>=2.1.2 (from flask)
  Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)
Collecting click>=8.1.3 (from flask)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting blinker>=1.6.2 (from flask)
  Downloading blinker-1.7.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.0.1-py3-none-any.whl (101 kB)
   ---------------------------------------- 0.0/101.2 kB ? eta -:--:--
   ------------ --------------------------- 30.7/101.2 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 101.2/101.2 kB 1.9 MB/s eta 0:00:00
Downloading blinker-1.7.0-py3-none-any.whl (13 kB)
Downloading click-8.1.7-py3-none-any.whl (97 kB)
   ---------------------------------------- 0.0/97.9 kB ? eta -:--:--
   ---------------------------------------- 97.9/97.9 kB 5.8 MB/s 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn import datasets 
import klib

In [3]:
sns.set()

In [4]:
from sklearn.datasets import fetch_openml
boston_data = fetch_openml(name='boston', version=1, as_frame=True)

In [5]:
boston_data.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [6]:
print(boston_data.DESCR)

**Author**:   
**Source**: Unknown - Date unknown  
**Please cite**:   

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.
Variables in order:
CRIM     per capita crime rate by town
ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS    proportion of non-retail business acres per town
CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
NOX      nitric oxides concentration (parts per 10 million)
RM       average number of rooms per dwelling
AGE      proportion of owner-occupied units built prior to 1940
DIS      weighted distances to five Boston employment centres
RAD      index of accessibility to radial highways
TAX      full-value property-tax rate per $10

In [7]:
boston = pd.DataFrame(data=np.c_[boston_data['data'], boston_data['target']], columns=list(boston_data['feature_names']) + ['target'])
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [8]:
boston.dtypes

CRIM       object
ZN         object
INDUS      object
CHAS       object
NOX        object
RM         object
AGE        object
DIS        object
RAD        object
TAX        object
PTRATIO    object
B          object
LSTAT      object
target     object
dtype: object

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(boston.drop('target', axis=1),
                                                    boston[['target']], 
                                                    test_size=0.20, 
                                                    random_state=30)

In [11]:
categorical_column = ['CHAS', 'RAD']
numerical_column = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT']

# categorical_data = boston[categorical_column]
# numerical_data = boston[numerical_column]

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [13]:
num_pipeline = Pipeline([
    ('scaler', StandardScaler()) 
])

full_pipeline = ColumnTransformer([
    ('num_encoder', num_pipeline, numerical_column),
    ('categorical_encoder', OneHotEncoder(), categorical_column)
])

In [14]:
y_train

Unnamed: 0,target
471,19.6
436,9.6
428,11.0
38,24.7
157,41.3
...,...
500,16.8
301,22.0
429,9.5
421,14.2


In [15]:
prepared_x = full_pipeline.fit_transform(X_train)
prepared_y = num_pipeline.fit_transform(y_train)

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
rdm_reg = RandomForestRegressor(n_jobs=-1, criterion='squared_error')

In [18]:
rdm_reg.fit(prepared_x, prepared_y.ravel())

In [19]:
rdm_reg.score(prepared_x, prepared_y.ravel())

0.9816480785664385

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
cross_val_score(estimator=rdm_reg, X=prepared_x, y=prepared_y.ravel(), cv = 5)

array([0.82530418, 0.83160519, 0.88713105, 0.82668606, 0.87713139])

In [22]:
from sklearn.model_selection import GridSearchCV

In [23]:
param_grid = dict(n_estimators=[20, 50, 90, 110],
    criterion=['squared_error'],
    max_depth=range(18,30,2),
    min_samples_split=range(8,20,2),
    max_leaf_nodes=range(20,30,2),
    oob_score=[True],
    verbose=[2])

In [24]:
grid_search = GridSearchCV(estimator=rdm_reg, param_grid=param_grid, cv=5, n_jobs=-1)

In [25]:
grid_search.fit(prepared_x, prepared_y.ravel())

building tree 1 of 50building tree 2 of 50

building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished


In [26]:
best_clf = grid_search.best_estimator_
best_clf

In [27]:
cross_val_score(estimator=best_clf, X=prepared_x, y=prepared_y.ravel(), cv = 5)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s


building tree 1 of 50building tree 2 of 50
building tree 3 of 50
building tree 4 of 50

building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  50

building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   9 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done  45 out of  50 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=16)]: Done  50 out of  50 | elapsed:    0.0s finished


array([0.80338345, 0.84253654, 0.88106769, 0.76075591, 0.8546235 ])

In [28]:
# Scores of the random forest with default parameters - 
    # array([0.80675763, 0.8185184 , 0.89299855, 0.82498718, 0.86285874])
    
# Scores of the random forest with defined parameters
    # array([0.79778342, 0.81371299, 0.88251366, 0.7969549 , 0.86269461])
    
# hence we will use random forest with default parameters for our model

## Checking Our Model on Test Data

In [29]:
prepared_test_x = full_pipeline.transform(X_test)
prepared_test_y = num_pipeline.transform(y_test)

In [30]:
cross_val_score(estimator=rdm_reg, X=prepared_test_x, y=prepared_test_y.ravel(), cv = 5)

array([0.73235929, 0.30045929, 0.86729096, 0.85434267, 0.86273066])

In [31]:
from sklearn.metrics import mean_squared_error

In [32]:
y_pred = rdm_reg.predict(prepared_test_x)
y_pred

array([-0.37125826, -1.36275905, -0.26660629,  0.57800159, -0.73864896,
       -0.32943971,  0.12211609,  2.42424809, -0.79176696,  0.19741059,
        0.45708686,  0.59986128, -0.67233066,  0.54368081, -0.84646899,
        0.41980921, -1.14669657,  0.28865105, -0.49766431, -0.01970791,
       -0.22499895, -0.74688594,  0.43881764, -0.3066296 , -1.30689539,
       -0.83949923, -0.30578478, -1.2321289 , -0.75586214, -0.9311621 ,
       -0.68785421, -0.29987104,  1.1586035 ,  0.0829376 , -0.26586708,
       -0.19965438, -0.95904113, -0.13460331, -0.35679073, -0.25931973,
       -0.32669405, -0.30589038,  2.74960904,  0.1403853 , -0.10366181,
       -0.25171636, -0.15255572, -0.32183634, -0.26618388, -0.52892262,
       -0.62502079, -0.5194184 , -0.19733113,  0.26309527,  0.5388231 ,
        0.00922714,  2.73852079,  0.51749142, -0.34612489,  1.40529068,
       -0.21802919, -1.14954784,  0.23986275, -0.2136995 , -1.29327268,
       -1.2808116 , -0.82735496,  0.09307543,  2.22318115,  1.37

In [33]:
np.sqrt(mean_squared_error(y_pred, prepared_test_y))

0.26654974624810623

In [34]:
import joblib
import os

In [35]:
DIR = 'model'

if not os.path.isdir(DIR):
    os.mkdir(DIR)

In [36]:
joblib.dump(num_pipeline, os.path.join(DIR, 'num_pipeline.pkl'))
joblib.dump(full_pipeline, os.path.join(DIR, 'full_pipeline.pkl'))
joblib.dump(best_clf, os.path.join(DIR, 'random_forest.pkl'))

['model\\random_forest.pkl']