# Introduction

This notebook trains a decision tree to predict a species of the Iris plant based on petal length and petal width of Iris flowers.
The target variables describe three varieties of Iris.

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
# what does the data set contain
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [2]:
# how is the set described
list(iris.keys())

['data',
 'target',
 'frame',
 'target_names',
 'DESCR',
 'feature_names',
 'filename']

In [3]:
# what features are included for each data array
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [4]:
# text description of the dataset with references
iris.DESCR



In [5]:
# what are the target variables
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


The target features are used to classify a flower into each of _Iris setosa_, _Iris versicolor_, and _Iris verginica_.

In [12]:
# size of feature matrix
print("feature rows/cols: ", iris.data.shape)
print("target rows/cols: ", iris.target.shape)

feature rows/cols:  (150, 4)
target rows/cols:  (150,)


In [27]:
# convert to dataframe and explore the dataset
import pandas as pd
import numpy as np

df = pd.DataFrame(np.c_[iris['data'], iris['target']],
                 columns=iris['feature_names'] + ['species'])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [31]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [32]:
# what is distribution for each Iris species
df['species'].value_counts()

2.0    50
1.0    50
0.0    50
Name: species, dtype: int64

## Predictive analysis
This section builds a model for Iris varieties based on petal length and width.  

In [34]:
#  petal length and petal width
#X = df[:, 2:3]
y = df[:, -1]
#print(X)
print(y)

TypeError: '(slice(None, None, None), -1)' is an invalid key

In [7]:
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)

DecisionTreeClassifier(max_depth=2)

In [13]:
# create a visual representation of the tree
from sklearn.tree import export_graphviz

f = open('iris_tree.dot', 'w')

export_graphviz(
    tree_clf,
    out_file = f,
    feature_names = iris.feature_names[2:],
    class_names = iris.target_names,
    rounded = True,
    filled = True
)

# use graphviz to convert dot file to png format
from graphviz import Source

dot_path = "/home/daire/MachineLearning/Portfolio/iris_classifier/iris_tree.dot"
output = Source.from_file(dot_path, format= "png")
output.view()

'/home/daire/MachineLearning/Portfolio/iris_classifier/iris_tree.dot.png'

In [26]:
# predict Iris variety from petal width and petal length
predict = tree_clf.predict([[2.4, 2.0]])

#return the predicted iris variety
print("Iris ", iris.target_names[predict])

Iris  ['setosa']


In [None]:
import from matplotlib.pyplot