# Constracting a decision tree classifier

In [24]:
import sklearn.datasets as data
import sklearn.model_selection as model_select
import sklearn.tree as tree
import sklearn.metrics as metrics
DOT_FILE = 'iris-tree.dot'
DEBUGGING = True

In [25]:
# load the built-in iris data set
iris = data.load_iris()
if ( DEBUGGING ):
    print('classes = ', iris.target_names)
    print('attributes = ', iris.feature_names)

classes =  ['setosa' 'versicolor' 'virginica']
attributes =  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [27]:
# split the data into training and test sets
X_train, X_test, y_train, y_test = model_select.train_test_split( iris.data, iris.target, random_state=0 )
M_train = len( X_train )
M_test = len( X_test )
if ( DEBUGGING ):
    print('number of training instances = ' + str( M_train ))
    print('number of test instances = ' + str( M_test ))

number of training instances = 112
number of test instances = 38


In [4]:
# initialise the decision tree
clf = tree.DecisionTreeClassifier( random_state = 0 )

In [5]:
# fit the tree model to the training data
clf.fit( X_train, y_train )

DecisionTreeClassifier(random_state=0)

## A) How good is your model?

#### 1. Count the number of correctly predicted labels

In [6]:
# predict the labels for the test set
y_hat = clf.predict( X_test )
# count the number of correctly predicted labels
count = 0.0
for i in range( M_test ):
    if ( y_hat[i] == y_test[i] ):
        count += 1
score = ( count / M_test )
print('number of correct predictions = {} out of {} = {}'.format( count, M_test, score ))

number of correct predictions = 37.0 out of 38 = 0.9736842105263158


#### 2. Use the scikit-learn classifier score()

In [7]:
print('training score = ', clf.score( X_train, y_train ))
print('test score = ', clf.score( X_test, y_test ))

training score =  1.0
test score =  0.9736842105263158


#### 3. Use scikit-learn metrics package to compute the accuracy()

In [8]:
print('accuracy score = ', metrics.accuracy_score( y_test, y_hat ))

accuracy score =  0.9736842105263158


#### 4. Compute a confusion matrix

In [9]:
cm = metrics.confusion_matrix( y_test, y_hat )
print('confusion matrix =')
#print '%10s\t%s' % ( ' ','predicted-->' )
print('\t predicted-->')
#print '%10s\t' % ( 'actual:' ),
print('actual:', end='')
for i in range( len( iris.target_names )):
    #print '%10s\t' % ( iris.target_names[i] ),
    print( iris.target_names[i], end='' )
#print '\n',
print()
for i in range( len( iris.target_names )):
    #print '%10s\t' % ( iris.target_names[i] ),
    for j in range( len( iris.target_names )):
        #print '%10s\t' % ( cm[i,j] ),
        print(cm[i,j], end='') 
    #print '\n',
    print()
# print '\n',
print()

confusion matrix =
	 predicted-->
actual:setosaversicolorvirginica
1300
0151
009



#### 5. Compute precision

In [10]:
print('precision score = tp / (tp + fp) =')
precision = metrics.precision_score( y_test, y_hat, average=None )
for i in range( len( iris.target_names )):
    print('\t {} = {}'.format( iris.target_names[i], precision[i] ))

precision score = tp / (tp + fp) =
	 setosa = 1.0
	 versicolor = 1.0
	 virginica = 0.9


#### 6. Compute recall

In [11]:
print('recall score = tp / (tp + fn) =')
recall = metrics.recall_score( y_test, y_hat, average=None )
for i in range( len( iris.target_names )):
    print('\t {} = {}'.format( iris.target_names[i], recall[i] ))

recall score = tp / (tp + fn) =
	 setosa = 1.0
	 versicolor = 0.9375
	 virginica = 1.0


#### 7. Compute F1 score

In [12]:
print('f1 score = 2 * (precision * recall) / (precision + recall) =')
f1 = metrics.f1_score( y_test, y_hat, average=None )
for i in range( len( iris.target_names )):
    print('\t {} = {}'.format( iris.target_names[i], f1[i] ))

f1 score = 2 * (precision * recall) / (precision + recall) =
	 setosa = 1.0
	 versicolor = 0.967741935483871
	 virginica = 0.9473684210526316


## B) What does the decision tree look like?

In [13]:
# what does the tree look like?
print('decision path: ')
print(clf.decision_path( iris.data ))                                                                                

decision path: 
  (0, 0)	1
  (0, 1)	1
  (1, 0)	1
  (1, 1)	1
  (2, 0)	1
  (2, 1)	1
  (3, 0)	1
  (3, 1)	1
  (4, 0)	1
  (4, 1)	1
  (5, 0)	1
  (5, 1)	1
  (6, 0)	1
  (6, 1)	1
  (7, 0)	1
  (7, 1)	1
  (8, 0)	1
  (8, 1)	1
  (9, 0)	1
  (9, 1)	1
  (10, 0)	1
  (10, 1)	1
  (11, 0)	1
  (11, 1)	1
  (12, 0)	1
  :	:
  (143, 12)	1
  (144, 0)	1
  (144, 2)	1
  (144, 8)	1
  (144, 12)	1
  (145, 0)	1
  (145, 2)	1
  (145, 8)	1
  (145, 12)	1
  (146, 0)	1
  (146, 2)	1
  (146, 8)	1
  (146, 12)	1
  (147, 0)	1
  (147, 2)	1
  (147, 8)	1
  (147, 12)	1
  (148, 0)	1
  (148, 2)	1
  (148, 8)	1
  (148, 12)	1
  (149, 0)	1
  (149, 2)	1
  (149, 8)	1
  (149, 12)	1


In [14]:
# output the tree to "dot" format for later visualising
tree.export_graphviz( clf, out_file = DOT_FILE, class_names=iris.target_names, impurity=True )
print('output dot file written to: ', DOT_FILE)

output dot file written to:  iris-tree.dot
