In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('./data/housing.csv')

In [5]:
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    int64  
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  PRICE    506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


***
Overview of machine learning techniques / approaches

http://www.jonathanbech.tel/ml-explorer/

# Main models in Unit 3:

* Tree based Ensembles
    * Gradient Boosting Regressor

* Neural Networks

# Tree Based Ensemble 

* Will focus on **Gradient Boosting Regressor** as part of classes since this is one of the most effective general purpose approaches.

But first... conversation about decision trees!

## Decision Trees:

* Look for values to split your data on
* Keep splitting your data until there is nothing less



**So models...**
* Given an objective function, and it focuses on creating a split at each step to maximise the value it gives you

### Why are decision trees good?

Example - predicting the price of something

The decision tree will try to find the factors that recursively provide the best values based off the splits.

In [8]:
df['PRICE'] - df['PRICE'].mean() # Our error if we were predicting every single column

0       1.467194
1      -0.932806
2      12.167194
3      10.867194
4      13.667194
         ...    
501    -0.132806
502    -1.932806
503     1.367194
504    -0.532806
505   -10.632806
Name: PRICE, Length: 506, dtype: float64

In [17]:
df.shape[0]

506

In [10]:
np.mean((df['PRICE'] - df['PRICE'].mean())**2) # Sum of squared error

84.4195561561656

So the tree would start with this number - 84.4; and look to see if it can do better by using every unique value.

In [11]:
left= df[df.CHAS < 1]
right= df[df.CHAS >= 1]

In [14]:
left.shape[0]

471

In [16]:
right.shape[0]

35

In [24]:
(471/506)*np.mean((left['PRICE'] - left['PRICE'].mean())**2) 
# So get a bit of an improvement on the left

72.44405562129178

In [23]:
(35/506)*np.mean((right['PRICE'] - right['PRICE'].mean())**2) 

# But didn't improve on the right, until we adjust for the error
# and weight by (items / total_items) so it's less important

9.382458498023714

In [25]:
node_value = np.mean((df['PRICE'] - df['PRICE'].mean())**2) 
left_error = (471/506)*np.mean((left['PRICE'] - left['PRICE'].mean())**2) 
right_error = (35/506)*np.mean((right['PRICE'] - right['PRICE'].mean())**2) 

In [28]:
node_value - left_error - right_error 

# Ideal value in a split is 0, i.e. we are doing no better than the prior estimate
# A positive number, means things have gotten slightly better
# Test until we get negative number, then stop testing

2.593042036850102

In [33]:
left= df[df.LSTAT < 10]
right= df[df.LSTAT >= 10] #Low socio economic status
print(f"left sample: {left.shape[0]}\nright sample: {right.shape[0]}")

left sample: 219
right sample: 287


In [32]:
node_value = np.mean((df['PRICE'] - df['PRICE'].mean())**2) 
left_error = (left.shape[0]/506)*np.mean((left['PRICE'] - left['PRICE'].mean())**2) 
right_error = (right.shape[0]/506)*np.mean((right['PRICE'] - right['PRICE'].mean())**2) 
node_value - left_error - right_error 

36.769206053230754

This means LSTAT gives us a better gain in predictive accuracy than CHAS. Partly because a much cleaner split.


In [34]:
from sklearn.tree import DecisionTreeRegressor

In [35]:
tree = DecisionTreeRegressor(max_depth=4)

In [36]:
X = df.drop('PRICE', axis=1)
y = df['PRICE']

In [37]:
X # Everything except PRICE

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [38]:
y # Price column

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: PRICE, Length: 506, dtype: float64

In [39]:
tree.fit(X,y)

DecisionTreeRegressor(max_depth=4)

In [40]:
tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [43]:
from prep import draw_tree

ModuleNotFoundError: No module named 'graphviz'

In [42]:
draw_tree(tree,X)

NameError: name 'draw_tree' is not defined

draw_tree output gives:

* mean squared error
* samples - number of sample in node
* value - our prediction