# Decision trees: code

In [142]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.tree import DecisionTreeClassifier, export_graphviz

%matplotlib inline

## Build a decision tree and visualize the result EXAMPLE

**The iris dataset**  
Measures of petal and sepal width and height (cm) for three different iris flower species.  

_Can we build a decision tree to predict flower species from petal and sepal dimensions?_

![http://5047-presscdn.pagely.netdna-cdn.com/wp-content/uploads/2015/04/iris_petal_sepal.png](./images/iris_petal_sepal.png)

In [89]:
# Load the iris dataset (object known as a "bunch")
iris = load_iris()

# Review the keys available in the file
print "\nKeys:"
print iris.keys()

# Review the target names, and the target values
print "\nTarget names:"
print iris['target_names']

print "\nTarget values:"
print iris.target

# Review the feature names, and feature values
print "\nFeature values (first 5 rows):"
print iris.data[:5]


Keys:
['target_names', 'data', 'target', 'DESCR', 'feature_names']

Target names:
['setosa' 'versicolor' 'virginica']

Target values:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]

Feature values (first 5 rows):
[[ 5.1  3.5  1.4  0.2]
 [ 4.9  3.   1.4  0.2]
 [ 4.7  3.2  1.3  0.2]
 [ 4.6  3.1  1.5  0.2]
 [ 5.   3.6  1.4  0.2]]


In [199]:
# Instantiate a tree object
tree = DecisionTreeClassifier()

# Fit the model
tree.fit(iris.data, iris.target)

# Check fit accuracy, the long way
print tree.predict(iris.data) == iris.target

print "\nAccuracy:", tree.score(iris.data, iris.target)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]

Accuracy: 1.0


In [42]:
export_graphviz(tree, 
                out_file="./images/iris_tree.dot",
                class_names = iris["target_names"],
                feature_names = iris["feature_names"],
                impurity = False,
                filled = True)

**Anticipating some problems with rendering dot files...**  
Example dot graph:  
```python 
digraph graphname {
     a -> b -> c;
     b -> d;
 }
```
![dot file](https://upload.wikimedia.org/wikipedia/commons/e/ec/DotLanguageDirected.svg)

In [207]:
# Quick fix: shell command to convert dot to png
!dot -Tpng ./images/iris_tree.dot -o ./images/iris_tree.png

![iris_tree](./images/iris_tree.png)

## Predicting with decision trees EXERCISE

In [145]:
# Load the brest cancer dataset
cancer = load_breast_cancer()

# Review the keys available in the file
print "\nKeys:"
print cancer.keys()


Keys:
['target_names', 'data', 'target', 'DESCR', 'feature_names']

Target names:
['malignant' 'benign']

Target values:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 

In [None]:
# Review the target names, and the target values
print "\nTarget names:"
print cancer['target_names']

print "\nTarget values:"
print cancer.target

# Review the feature names, and feature values
print "\nFeature values (first 5 rows):"
print cancer.data[:5]

In [188]:
# Split into train and test sets
print rand_seed
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size = 0.7, 
                                                    random_state = 7)

7


In [194]:
# Instantiate a tree object
tree = DecisionTreeClassifier(max_depth= 3, random_state = 42)

In [195]:
# Fit the data to the training set
tree.fit(X_train, y_train)

# Evaluate performance
print "Training accuracy:", tree.score(X_train, y_train)
print "Testing accuracy:", tree.score(X_test, y_test)

Training accuracy: 0.971428571429
Testing accuracy: 0.733333333333


0.7