In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor,StackingRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.datasets import load_boston, load_wine


In [2]:
boston = load_boston()
x = pd.DataFrame(boston.data, columns=boston.feature_names)
y = pd.Series(boston.target)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [3]:
x

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [4]:
x1=[['ZN','INDUS','NOX','RM','AGE']]
y1=x['LSTAT']

In [5]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x,y, test_size=0.25, random_state=1)

## Model Building

### Decision Tree

In [6]:
dt_reg = DecisionTreeRegressor(random_state=20)
dt_reg.fit(x1_train, y1_train)

In [7]:
# Testing Data Evaluation

y1_pred = dt_reg.predict(x1_test)

mse = mean_squared_error(y1_test, y1_pred)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_test, y1_pred)
print('MAE :', mae)

accuracy = r2_score(y1_test, y1_pred)
print('R2_Score :', accuracy)


MSE : 27.017007874015743
RMSE : 5.197788748498321
MAE : 3.407874015748031
R2_Score : 0.7272620210679492


In [8]:
# Training Data Evaluation

y1_pred_train = dt_reg.predict(x1_train)

mse = mean_squared_error(y1_train, y1_pred_train)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_train, y1_pred_train)
print('MAE :', mae)

accuracy = r2_score(y1_train, y1_pred_train)
print('R2_Score :', accuracy)


MSE : 0.0
RMSE : 0.0
MAE : 0.0
R2_Score : 1.0


## Random Forest 

In [9]:
rf_reg = RandomForestRegressor(random_state=20)
rf_reg.fit(x1_train, y1_train)

In [10]:
# Testing Data Evaluation

y1_pred = rf_reg.predict(x1_test)

mse = mean_squared_error(y1_test, y1_pred)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_test, y1_pred)
print('MAE :', mae)

accuracy = r2_score(y1_test, y1_pred)
print('R2_Score :', accuracy)


MSE : 8.29527103937007
RMSE : 2.8801512181429065
MAE : 2.219937007874015
R2_Score : 0.9162588444833925


In [11]:
# Training Data Evaluation

y1_pred_train = rf_reg.predict(x1_train)

mse = mean_squared_error(y1_train, y1_pred_train)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_train, y1_pred_train)
print('MAE :', mae)

accuracy = r2_score(y1_train, y1_pred_train)
print('R2_Score :', accuracy)


MSE : 1.514805952506595
RMSE : 1.230774533578996
MAE : 0.8330343007915559
R2_Score : 0.9809153556173695


## Hyperparameter Tuning

In [12]:
rf_reg = RandomForestRegressor(random_state=30)

hyp = { 'n_estimators':np.arange(10,150),
    'criterion':["squared_error", "absolute_error"],
    'max_depth':np.arange(5, 15) ,
    'min_samples_split':np.arange(5,20) ,
    'min_samples_leaf':np.arange(3,15),
    'max_features':['sqrt', 'log2'],}
                                 
rscv_rf_reg = RandomizedSearchCV(rf_reg, hyp, cv=5)
rscv_rf_reg.fit(x1_train, y1_train)
rscv_rf_reg.best_params_

{'n_estimators': 47,
 'min_samples_split': 5,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'max_depth': 6,
 'criterion': 'squared_error'}

In [13]:
rscv_rf_reg.best_estimator_

In [14]:
rf_reg = RandomForestRegressor(criterion='absolute_error', max_depth=7,
                      max_features='log2', min_samples_leaf=3,
                      min_samples_split=17, n_estimators=136, random_state=30)
rf_reg.fit(x1_train, y1_train)

In [15]:
# Testing Data Evaluation

y_pred = rf_reg.predict(x1_test)

mse = mean_squared_error(y1_test, y1_pred)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_test, y1_pred)
print('MAE :', mae)

accuracy = r2_score(y1_test, y1_pred)
print('R2_Score :', accuracy)


MSE : 8.29527103937007
RMSE : 2.8801512181429065
MAE : 2.219937007874015
R2_Score : 0.9162588444833925


In [16]:
# Training Data Evaluation

y1_pred_train = rf_reg.predict(x1_train)

mse = mean_squared_error(y1_train, y1_pred_train)
print('MSE :', mse)

rmse = np.sqrt(mse)
print('RMSE :', rmse)

mae = mean_absolute_error(y1_train, y1_pred_train)
print('MAE :', mae)

accuracy = r2_score(y1_train, y1_pred_train)
print('R2_Score :', accuracy)


MSE : 10.560538985538344
RMSE : 3.2496982914631234
MAE : 2.0376416265714736
R2_Score : 0.8669505287496371


In [20]:
import pickle

In [21]:
file=open('rf_model.pkl','wb')
pickle.dump(rf_reg,file)
file.close()