# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load Data

In [2]:
# Fetch the iris example dataset through seaborn's dataset loader.
iris = sns.load_dataset(name='iris')

In [3]:
# Preview the first five rows to check the columns that were loaded.
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Data Wrangling


In [4]:
# Separate the target from the predictors by subsetting the DataFrame.
# y is the column being predicted (species); x is every remaining column used to predict it.

y = iris['species']
x = iris.drop(columns='species')

# Train Test Split

In [5]:
# Split the data into training and testing sets, holding out 30% of the rows for testing.
# The training variables are used to build the initial model; the test variables are used to
# judge how well that model fits. Setting random_state to 76 is not required, but it gives
# you the same split as the example so you can follow along.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=76,
)

# Create Initial Decision Tree

In [6]:
# Before moving on to the Random Forest, start with a single decision tree.
# Create the estimator with DecisionTreeClassifier() and then train it with fit().
# random_state is set to 76 again so that everyone stays on the same page.

decisionTree = DecisionTreeClassifier(random_state=76)
decisionTree.fit(X=x_train, y=y_train)

DecisionTreeClassifier(random_state=76)

# Assess the model

In [9]:
# Now that the model is fit, create a set of predictions for the test rows with predict().
# The raw predicted labels are printed here; the same confusion matrix and classification
# report code from last lesson is used afterwards to interpret them.

treePredictions = decisionTree.predict(X=x_test)
print(treePredictions)


['setosa' 'versicolor' 'setosa' 'virginica' 'virginica' 'versicolor'
 'setosa' 'virginica' 'versicolor' 'virginica' 'setosa' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'setosa' 'virginica'
 'versicolor' 'setosa' 'versicolor' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'virginica' 'virginica' 'setosa' 'versicolor' 'virginica'
 'versicolor' 'versicolor' 'virginica' 'versicolor' 'setosa' 'setosa'
 'setosa' 'versicolor' 'virginica' 'virginica' 'virginica' 'virginica'
 'setosa' 'setosa']


In [None]:
# The output above is hard to interpret, so you can use other sklearn tools to summarize it in a more readable form.
# You'll call the functions confusion_matrix() and classification_report(). Start with the confusion matrix:

In [10]:
# Tabulate the true test labels against the tree's predictions.
print(confusion_matrix(y_true=y_test, y_pred=treePredictions))


[[19  0  0]
 [ 0 10  3]
 [ 0  2 11]]


In [None]:

# Rows are the actual species; columns are the predicted species.
# 	setosa (predicted)	versicolor (predicted)	virginica (predicted)
# setosa (actual)	19	0	0
# versicolor (actual)	0	10	3
# virginica (actual)	0	2	11


# How Well Does Your Model Fit?

In [12]:
# Summarize per-species precision, recall and f1-score alongside the overall accuracy.
print(classification_report(y_true=y_test, y_pred=treePredictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       0.83      0.77      0.80        13
   virginica       0.79      0.85      0.81        13

    accuracy                           0.89        45
   macro avg       0.87      0.87      0.87        45
weighted avg       0.89      0.89      0.89        45



In [13]:
# So setosa was predicted with 100% precision, while versicolor was predicted with 83% precision and virginica 
# was predicted with 79% precision! Not too shabby, especially considering that you can predict the species of 
# the flower with 89% overall accuracy. 