## Decision Tree for the human evolution data set (adapted from an iris example)

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np

from sklearn.tree import export_graphviz
from io import StringIO
from IPython.display import Image
import pydotplus
import matplotlib.pyplot as plt

In [None]:
# Columns kept for this analysis: three feature columns plus the Species label.
selected_features = ['Cranial_Capacity', 'Height', 'Time', 'Species']
# Read the CSV and keep only the selected columns, in the order listed above.
df = pd.read_csv('data/Evolution_DataSets.csv')[selected_features]
# Preview the first rows.
df.head()

#### Explore the Data

In [None]:
# Dimensions of the frame as (rows, columns).
print(df.shape)

In [None]:
# Number of rows per species (the class balance of the target column).
df['Species'].value_counts()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

**The `count` row shows that no numerical data is missing.**

In [None]:
# Pairwise scatter plots of the numeric columns.
scatter_matrix(df)
# Render the figure. The previous `plt.plot()` call had no arguments, so it
# drew nothing and only left an empty list as the cell output.
plt.show()

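**Groupings hopefully correspond to different species. Note: the remaining remarks in this cell were written for the iris data set (one species easy to separate, the other two harder, `petal_length` or `petal_width` the most useful features) and need to be revisited for the `Cranial_Capacity`, `Height` and `Time` features used here.**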

In [None]:
# Cranial capacities of all 'Homo Sapiens' rows.
# After np.array(df) the columns follow selected_features: column 0 is
# Cranial_Capacity and column 3 is Species. The row mask therefore has to be
# built from column 3; comparing column 0 with a species name never matches
# and selects nothing.
d = np.array(df)
print(d[d[:, 3] == 'Homo Sapiens', 0])

In [None]:
# Cranial capacity over time, one scatter series per species.
# The species are taken from the data instead of a hand-copied list of
# plt.scatter calls: that list plotted 'hominino Orrorin tugenencin' twice
# (duplicate legend entry) and drew every series with the same colour and
# marker, so the species could not be told apart.
fig, ax = plt.subplots(figsize=(12, 8))
species_names = sorted(df['Species'].dropna().unique())
# tab20 provides 20 distinct colours; switch marker once they are used up.
markers = ['o', 's', '^', 'v', 'D']
for i, name in enumerate(species_names):
    rows = df[df['Species'] == name]
    ax.scatter(
        rows['Time'], rows['Cranial_Capacity'],
        color=plt.cm.tab20(i % 20),
        marker=markers[(i // 20) % len(markers)],
        edgecolor='black',
        # Strip stray whitespace/newlines so legend entries stay on one line.
        label=str(name).strip(),
    )
ax.set_xlabel('Time')
ax.set_ylabel('Cranial_Capacity')
# With this many species the legend is placed outside the axes so that it
# does not cover the points.
ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0), fontsize='small')
plt.show()

In [None]:
# Height over time, one scatter series per species.
# The axis labels of this figure are 'Time' and 'Height', so those two columns
# are plotted. The previous version plotted Height on x and Cranial_Capacity
# on y under those labels, repeated 'hominino Orrorin tugenencin' twice and
# used one colour/marker for every species.
# NOTE(review): assumes Time-vs-Height was the intended view - confirm.
fig, ax = plt.subplots(figsize=(12, 8))
species_names = sorted(df['Species'].dropna().unique())
# tab20 provides 20 distinct colours; switch marker once they are used up.
markers = ['o', 's', '^', 'v', 'D']
for i, name in enumerate(species_names):
    rows = df[df['Species'] == name]
    ax.scatter(
        rows['Time'], rows['Height'],
        color=plt.cm.tab20(i % 20),
        marker=markers[(i // 20) % len(markers)],
        edgecolor='black',
        # Strip stray whitespace/newlines so legend entries stay on one line.
        label=str(name).strip(),
    )
ax.set_xlabel('Time')
ax.set_ylabel('Height')
# With this many species the legend is placed outside the axes so that it
# does not cover the points.
ax.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0), fontsize='small')
plt.show()

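**The original remark here was written for the iris data set: it said that petal_length and petal_width can separate the classes (species) pretty well, so a very good model was expected. For this data set, judge from the plots above how well the plotted features separate the species.**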

#### Build the Model

In [None]:
# Target: the species label. Features: every remaining column of the frame.
y = df['Species']
X = df.drop(columns=['Species'])

In [None]:
# Hold out 25 rows for testing (an integer test_size is an absolute row count);
# all remaining rows are used for training.
# stratify=y keeps the species proportions the same in both splits and
# random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=25,
    random_state=1,
    stratify=y,
)

In [None]:
# Fit a decision tree without a depth limit on the training split.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so ties between equally good splits could otherwise yield a
# different tree (and different scores) on each run.
# To restrict the tree instead, pass e.g. max_depth=2.
model = DecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
# Depth reached by the fitted tree.
print(model.get_depth())

#### Evaluation

In [None]:
# Mean accuracy on the data the tree was fitted on and on the held-out rows.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("[training accuracy: ", train_score, "]")
print("test accuracy: ", test_score)

**testing accuracy is the important one**

In [None]:
# Predict a species for every row of the test split, then compare the
# predictions with the true labels.
y_hat = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_hat)
print("test accuracy:", test_accuracy)

In [None]:
# Confusion matrix of the test predictions: rows are the true species,
# columns are the predicted species.
cm = confusion_matrix(y_true=y_test, y_pred=y_hat)
print(cm)

In [None]:
#### Draw the tree

In [None]:
# Feature columns, in the order of the columns the model was fitted on.
feature_names = ["Cranial_Capacity", "Height", "Time"]
# export_graphviz expects the class names in ascending order of the classes,
# i.e. the order of model.classes_. A hand-typed list in any other order labels
# the tree nodes with the wrong species, so take the names from the fitted
# model. strip() removes stray whitespace/newlines from the labels.
target_names = [str(name).strip() for name in model.classes_]

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    model,
    out_file=dot_data,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=feature_names,
    class_names=target_names,
)
# Render the DOT text to PNG and show it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# Uncomment to also save the rendering to disk:
# graph.write_png('plots/evo.png')
Image(graph.create_png())