# Decision Tree 

In [87]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import norm, skew, probplot
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV


In [88]:
from clean_data import *

In [89]:
tree_reg = DecisionTreeRegressor()
tree = tree_reg.fit(X1, Y)

In [90]:
Y_pred = tree_reg.predict(X1)

RMSE = np.sqrt(mean_squared_error(Y,Y_pred))
print(RMSE)
R2 = tree_reg.score(X1,Y)
print(R2)

0.0
1.0


- Obviously this does not make a lot of sense since we just extremely overfitted our training data by not restricting the Decision Tree

### Try a crossvalidation 

In [91]:
scores = cross_validate(tree_reg, X1, Y, cv = 20, scoring = ('r2','neg_mean_squared_error'))
scores

{'fit_time': array([0.03857064, 0.03140903, 0.02549076, 0.02503181, 0.02785468,
        0.10432005, 0.02499509, 0.02508068, 0.02479625, 0.024786  ,
        0.02525687, 0.02722597, 0.03211498, 0.03443813, 0.03135395,
        0.02676892, 0.02446198, 0.02449894, 0.02495193, 0.02453113]),
 'score_time': array([0.00485396, 0.00278497, 0.00262809, 0.00278497, 0.00332737,
        0.00251818, 0.00247765, 0.00242114, 0.00242901, 0.00251913,
        0.00268507, 0.00253201, 0.00363612, 0.00465178, 0.00493503,
        0.00270391, 0.00234294, 0.0023191 , 0.00238705, 0.00255919]),
 'test_r2': array([0.32175351, 0.49200578, 0.27699769, 0.48447761, 0.48772857,
        0.5059318 , 0.37076304, 0.22344322, 0.37997346, 0.46735688,
        0.26305996, 0.17547023, 0.50022331, 0.43789462, 0.29616092,
        0.01091559, 0.37123098, 0.31111048, 0.36926235, 0.47013149]),
 'test_neg_mean_squared_error': array([-134.71932076, -122.10376435, -173.13433992, -113.13126326,
        -133.79801853, -102.9108567 , -164

In [92]:
print('Mean R^2: ', scores['test_r2'].mean())
print('Mean RMSE: ', np.sqrt(-scores['test_neg_mean_squared_error']).mean())


print('Mean price/SF is: ', Y.mean())
(np.sqrt(-scores['test_neg_mean_squared_error']).mean()/Y.mean())*100

Mean R^2:  0.3607945742066839
Mean RMSE:  11.64137048047734
Mean price/SF is:  70.479999108507


16.51726820052161

- The mean R squared is only 0.37 and the RMSE is 11.6 Dollars when the mean price per SF is 70.47 dollars

## Try something similar with GridSearch

In [93]:
from sklearn.model_selection import GridSearchCV
grid_para_tree = [{
    "splitter":["best","random"],
    "min_samples_leaf": range(1,10),
    "min_samples_split": np.linspace(start=2, stop=30, num=15, dtype=int),
    "max_depth": range(1,10),
    "max_features" : ["auto", "sqrt", "log2"],
}]
tree_reg.set_params(random_state = None)
grid_search_tree = GridSearchCV(tree_reg, grid_para_tree, 
                                cv = 10, 
                                scoring='neg_mean_squared_error',
                                n_jobs=-1)

grid_search_tree.fit(X1, Y)

GridSearchCV(cv=10, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid=[{'max_depth': range(1, 10),
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'min_samples_leaf': range(1, 10),
                          'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]),
                          'splitter': ['best', 'random']}],
             scoring='neg_mean_squared_error')

In [94]:
print(grid_search_tree.best_estimator_)
print(grid_search_tree.best_estimator_.score(X1,Y))


DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=9,
                      min_samples_split=6)
0.7209391276100383


In [95]:
mse = grid_search_tree.best_score_
rmse = np.sqrt(-mse)
rmse

9.514266883906059

- This is not bad. 0.72 R2 and an RMSE of 9.514 dollars

In [96]:
grid_search_tree.best_params_
feature_list = list(X1.columns) # Saving feature names for later use
# Get numerical feature importances
importances = list(grid_search_tree.best_estimator_.feature_importances_)

## List of tuples with variable and importance
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_list, importances)]

## Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

## Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: OverallQual          Importance: 0.40306
Variable: TotalBsmtSF          Importance: 0.1588
Variable: OverallCond          Importance: 0.09249
Variable: Bsmt_ratio           Importance: 0.09201
Variable: YearRemodAdd         Importance: 0.07114
Variable: GarageArea           Importance: 0.03816
Variable: MSZoning_RM          Importance: 0.03028
Variable: bsmt_above_ratio     Importance: 0.02863
Variable: BldgType_Duplex      Importance: 0.0282
Variable: HouseStyle_2Story    Importance: 0.01076
Variable: KitchenQual          Importance: 0.00884
Variable: patioSF              Importance: 0.00772
Variable: income               Importance: 0.00551
Variable: dist                 Importance: 0.0054
Variable: LotArea              Importance: 0.00524
Variable: Artery               Importance: 0.00517
Variable: Bedr_ratio           Importance: 0.00461
Variable: MSZoning_RL          Importance: 0.00274
Variable: bathrm_cnt           Importance: 0.00123
Variable: LotFrontage          Imp

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [25]:
# RETURNS a DATA FRAME BUT DOES NOT FIT BEST ESTIMATE BACK TO DATA. 
# CANT BE RUN WITH refit = True because there are two scoring methods, and it does not know which one to use.
Results = pd.DataFrame(grid_search_tree.cv_results_)

In [28]:
Results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_max_features', 'param_min_samples_leaf',
       'param_min_samples_split', 'params', 'split0_test_score',
       'split1_test_score', 'split2_test_score', 'split3_test_score',
       'split4_test_score', 'split5_test_score', 'split6_test_score',
       'split7_test_score', 'split8_test_score', 'split9_test_score',
       'mean_test_score', 'std_test_score', 'rank_test_score'],
      dtype='object')

## Fit model again with the best parameters. 

In [97]:
grid_search_tree.best_estimator_

DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=9,
                      min_samples_split=6)

In [98]:
tree_reg = DecisionTreeRegressor()
tree_reg.set_params(max_depth=8, max_features='auto', min_samples_leaf=9,
                      min_samples_split=4)
tree = tree_reg.fit(X1, Y)



In [99]:
Y_pred = tree_reg.predict(X1)

RMSE = np.sqrt(mean_squared_error(Y,Y_pred))
print(RMSE)
R2 = tree_reg.score(X1,Y)
print(R2)

7.800554947347207
0.7209391276100383


In [100]:
feature_list = list(X1.columns) # Saving feature names for later use
# Get numerical feature importances
importances = list(tree.feature_importances_)

## List of tuples with variable and importance
feature_importances = [(feature, round(importance, 5)) for feature, importance in zip(feature_list, importances)]

## Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

## Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: OverallQual          Importance: 0.40306
Variable: TotalBsmtSF          Importance: 0.15456
Variable: OverallCond          Importance: 0.09249
Variable: Bsmt_ratio           Importance: 0.09201
Variable: YearRemodAdd         Importance: 0.07114
Variable: GarageArea           Importance: 0.03816
Variable: bsmt_above_ratio     Importance: 0.03286
Variable: MSZoning_RM          Importance: 0.03028
Variable: BldgType_Duplex      Importance: 0.0282
Variable: HouseStyle_2Story    Importance: 0.01076
Variable: KitchenQual          Importance: 0.00884
Variable: patioSF              Importance: 0.00772
Variable: income               Importance: 0.00551
Variable: dist                 Importance: 0.0054
Variable: LotArea              Importance: 0.00524
Variable: Artery               Importance: 0.00517
Variable: Bedr_ratio           Importance: 0.00461
Variable: MSZoning_RL          Importance: 0.00274
Variable: bathrm_cnt           Importance: 0.00123
Variable: LotFrontage          Im

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

## Refit with only variables that are important above

In [101]:
feature_list = X1.columns # Saving feature names for later use
importances = tree.feature_importances_[tree.feature_importances_ > 0]
importances

feature_list = feature_list[tree.feature_importances_ > 0]



In [102]:
tree_reg = DecisionTreeRegressor()
tree_reg.set_params(max_depth=8, max_features='auto', min_samples_leaf=9,
                      min_samples_split=4)
tree = tree_reg.fit(X1[feature_list], Y)



In [103]:
Y_pred = tree_reg.predict(X1[feature_list])

RMSE = np.sqrt(mean_squared_error(Y,Y_pred))
print(RMSE)
R2 = tree_reg.score(X1[feature_list],Y)
print(R2)

7.800554947347207
0.7209391276100383


- Shows that this step is not necessary

# Run this on test data.

In [104]:
from clean_data_test import X1_test, Y_test

In [108]:
train_error = (1 - tree_reg.score(X1[feature_list], Y))
test_error = (1 - tree_reg.score(X1_test[feature_list], Y_test))
print("The training error is: %.5f" %train_error)
print("The test     error is: %.5f" %test_error)

The training error is: 0.27906
The test     error is: 0.45713


In [109]:
Y_pred = tree_reg.predict(X1_test[feature_list])

RMSE = np.sqrt(mean_squared_error(Y_test,Y_pred))
print(RMSE)
R2 = tree_reg.score(X1_test[feature_list],Y_test)
print(R2)

9.69536103384011
0.5428709092442197
