# Lecture 11
In this example, we will build a **decision tree** to classify iris flowers into three species (setosa, versicolor, or virginica) based on the length and width of the petals and sepals.

In [1]:
# Install packages using Anaconda:
# (1) run 'conda install -c anaconda graphviz'
# (2) run 'pip install pydot'
# (3) add the Graphviz binary folders to the PATH environment variable (done below)
import os


def add_to_path(directory):
    """Append `directory` to the PATH environment variable if it is not already listed.

    Re-running this cell therefore leaves PATH unchanged instead of appending
    the same entry again on every execution. A missing or empty PATH is handled too.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    os.environ["PATH"] = current + os.pathsep + directory if current else directory


add_to_path('C:\\Users\\user_name\\Anaconda3\\pkgs\\graphviz-2.38.0-4\\Library\\bin') # replace user_name
add_to_path('C:\\Users\\user_name\\Anaconda3\\Library\\bin\\graphviz') # replace user_name

In [2]:
# Load libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
from sklearn import tree
from six import StringIO
from sklearn import metrics
import pydot # must be installed separately

ModuleNotFoundError: No module named 'pydot'

In [None]:
# Fetch the iris dataset through seaborn and preview its first rows
data = sns.load_dataset(name="iris")
data.head(n=5)

In [None]:
# Explore dataset: print the DataFrame summary produced by pandas' `info()`
data.info()

In [None]:
# List the distinct values of the categorical target column
print(data["species"].unique())

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
x_train, x_test, y_train, y_test = train_test_split(
    data[feature_columns],
    data['species'],
    test_size=0.25,
    random_state=0,
)

In [None]:
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Train an entropy-criterion decision tree (seeded for reproducibility)
model = tree.DecisionTreeClassifier(criterion="entropy", random_state=0)
model.fit(X=x_train, y=y_train)

In [None]:
# Inspect the fitted tree's internal state and display its 'nodes' entry
tree_state = model.tree_.__getstate__()
tree_state['nodes']

In [None]:
# Plot decision tree
# NOTE: the model was fitted on the standardized features, so the split thresholds
# shown in the rendered tree are in standardized units, not the original measurements.
feature_names = list(data.columns[0:4])
# With out_file=None, export_graphviz returns the DOT source as a string,
# so no intermediate StringIO buffer is needed.
dot_source = tree.export_graphviz(model, out_file = None, feature_names = feature_names)
# graph_from_dot_data returns a list of graphs; the tree is the first (only) one
graphs = pydot.graph_from_dot_data(dot_source)
graphs[0].write_pdf("tree.pdf")

In [None]:
# Predict class labels for the (already standardized) test features using the fitted decision tree
y_pred = model.predict(x_test)

In [None]:
# Tabulate actual (rows) versus predicted (columns) class counts on the test set
conf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf_matrix)

In [None]:
# Plot confusion matrix
# Label both axes with the species names. confusion_matrix orders its rows/columns by
# the sorted labels that occur in y_test or y_pred, so the same ordering is rebuilt here.
class_labels = sorted(set(y_test) | set(y_pred))
fig, ax = plt.subplots()
# The cells hold integer counts, so annotate them as integers ("d") rather than floats
sns.heatmap(conf_matrix, annot = True, fmt = "d", square = True, cmap = plt.cm.Blues,
            xticklabels = class_labels, yticklabels = class_labels, ax = ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix')
fig.tight_layout();

In [None]:
# Fraction of test samples whose predicted label matches the actual label
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Misclassification rate: the complement of the accuracy
1 - metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Per-class precision (average=None returns one score per class instead of an aggregate)
metrics.precision_score(y_true=y_test, y_pred=y_pred, average=None)

In [None]:
# Per-class recall (average=None returns one score per class instead of an aggregate)
metrics.recall_score(y_true=y_test, y_pred=y_pred, average=None)

In [None]:
# Per-class F1 score (average=None returns one score per class instead of an aggregate)
metrics.f1_score(y_true=y_test, y_pred=y_pred, average=None)