## Decision tree regression

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('Real_Estate.csv', index_col='No')
dataset.head()

Unnamed: 0_level_0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
2,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
3,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
4,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
5,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [3]:
X = dataset.iloc[:, : -1].values
y = dataset.iloc[:, -1].values

In [4]:
corr = dataset.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
X1 transaction date,1.0,0.0175488,0.06088,0.00963544,0.0350578,-0.0410818,0.0874906
X2 house age,0.0175488,1.0,0.025622,0.0495925,0.0544199,-0.0485201,-0.210567
X3 distance to the nearest MRT station,0.06088,0.025622,1.0,-0.602519,-0.591067,-0.806317,-0.673613
X4 number of convenience stores,0.00963544,0.0495925,-0.602519,1.0,0.444143,0.449099,0.571005
X5 latitude,0.0350578,0.0544199,-0.591067,0.444143,1.0,0.412924,0.546307
X6 longitude,-0.0410818,-0.0485201,-0.806317,0.449099,0.412924,1.0,0.523287
Y house price of unit area,0.0874906,-0.210567,-0.673613,0.571005,0.546307,0.523287,1.0


### Splitting the dataset

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Feature scaling

We don't have to apply feature scaling because indeed the decision to regression
more works with splits. It splits the features in different successive ranges which is 
a technique that absolutely doesn't need feature scaling.
It doesn't have to do with any coefficients or whatsoever. It is just because you split your dataset
through different node in order to collect and different final ranges your final predictions for different
ranges of values of the features.

So you clearly don't need to apply feature scaling.

In [114]:
from sklearn.preprocessing import StandardScaler

train_feature_scaler = StandardScaler()
test_feature_scaler = StandardScaler()

X_train = train_feature_scaler.fit_transform(X_train)
X_test = test_feature_scaler.fit_transform(X_test)

In [112]:
train_predictor_scaler = StandardScaler()
test_predictor_scaler = StandardScaler()

y_train_scaled = train_predictor_scaler.fit_transform(np.expand_dims(y_train, axis=1))
y_test_scaled = test_predictor_scaler.fit_transform(np.expand_dims(y_test, axis=1))

y_train = y_train_scaled.reshape(len(y_train), )
y_test = y_test_scaled.reshape(len(y_test), )

array([-1.08e+00, -6.34e-01,  3.09e-02, -3.48e-02, -2.69e-01,  4.98e-01,
        1.20e+00, -5.58e-03, -2.22e+00, -6.19e-01, -9.26e-01, -1.77e+00,
       -5.83e-01, -5.24e-01,  1.92e-01, -1.26e+00,  2.06e-01,  1.27e+00,
        1.84e-01,  2.06e-01,  1.70e-01, -3.56e-01, -8.60e-01,  2.87e-01,
       -1.40e+00,  2.92e+00, -1.63e+00,  1.04e-01,  2.21e-01, -1.06e+00,
       -1.74e-01,  6.66e-01, -1.74e-01,  1.24e+00,  2.57e-01, -4.21e-02,
       -1.18e+00, -4.88e-01, -8.59e-02, -8.31e-01,  1.48e-01,  8.78e-01,
        9.03e-03,  1.57e+00,  1.37e+00, -9.26e-01, -4.07e-01, -1.29e-02,
        8.64e-01,  1.08e+00, -1.18e+00, -6.70e-01, -6.34e-01, -4.21e-02,
       -4.21e-02,  4.40e-01,  9.29e-01,  3.16e-01, -1.08e-01,  9.80e-01,
       -3.71e-01,  4.55e-02, -1.21e+00,  9.07e-01, -4.21e-02,  1.04e+00,
        1.52e+00,  4.25e-01, -1.16e+00,  1.12e+00, -1.50e+00,  5.64e-01,
       -1.01e-01,  1.26e-01, -1.65e+00, -1.88e+00,  6.30e-01, -4.88e-01,
       -1.15e+00, -1.09e+00, -1.50e+00,  6.44e-01, 

### Training the model

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html"> DecisionTreeRegressor </a>

In [6]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(max_depth=2)
tree_regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [7]:
y_predicted = tree_regressor.predict(X_test)

np.set_printoptions(precision=2)
y_predicted = np.expand_dims(y_predicted, axis=1)
y_test = np.reshape(y_test, newshape=(len(y_test), 1))

### Visualizing the results

In [8]:
print(y_test.shape)
print(y_predicted.shape)

np.concatenate((y_predicted, y_test), axis=1);

(83, 1)
(83, 1)


### Score calculation

In [9]:
score = tree_regressor.score(X_test, y_test)
score

0.6480639711400162

In [10]:
from sklearn.metrics import r2_score
r2Score = r2_score(y_test, y_predicted)
r2Score

0.6480639711400162

### With bagging

 - The Approach is to use the same training algorithm for every <br>
 predictor, but to train them on different random subsets of the training set. 

 
 - When sampling is performed with replacement, this method is called bagging (short for <br>
 bootstrap aggregating). When sampling is performed without replacement, it is called <br>
 pasting.
 

 - Once all predictors are trained, the ensemble can make a prediction for a new <br>
 instance by simply aggregating the predictions of all predictors. The aggregation <br>
 function is typically the statistical mode.

<a href="https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html"> BaggingRegressor </a>

In [43]:
from sklearn.ensemble import BaggingRegressor
bagging_regressor = BaggingRegressor(base_estimator = DecisionTreeRegressor(),
                 n_estimators = 100,
                 bootstrap = True,
                 oob_score = True)

In [44]:
bagging_regressor.fit(X_train, y_train)

BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=None, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [45]:
y_predicted = bagging_regressor.predict(X_test)

y_predicted = np.expand_dims(y_predicted, axis=1)
y_test = np.reshape(y_test, newshape=(len(y_test), 1))

np.concatenate((y_predicted, y_test), axis=1);
score = bagging_regressor.score(X_test, y_test)
score

0.7224933451573536