In [160]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [161]:
df = pd.read_csv('drug200.csv')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [162]:
df.shape

(200, 6)

In [163]:
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [164]:
X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [165]:
Y = df[['Drug']].values
Y[0:5]

array([['drugY'],
       ['drugC'],
       ['drugC'],
       ['drugX'],
       ['drugY']], dtype=object)

In [166]:
from sklearn import preprocessing
#changing the F&M to 0&1 for better data processing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F','M'])

X[:,1] = le_sex.transform(X[:,1])

X[0:5]

array([[23, 0, 'HIGH', 'HIGH', 25.355],
       [47, 1, 'LOW', 'HIGH', 13.093],
       [47, 1, 'LOW', 'HIGH', 10.114],
       [28, 0, 'NORMAL', 'HIGH', 7.798],
       [61, 0, 'LOW', 'HIGH', 18.043]], dtype=object)

In [167]:
#changing HIGH,,NORMAL,LOW to 0,1,2 for data processing
le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW','NORMAL','HIGH'])

X[:,2] = le_BP.transform(X[:,2])

X[0:5]

array([[23, 0, 0, 'HIGH', 25.355],
       [47, 1, 1, 'HIGH', 13.093],
       [47, 1, 1, 'HIGH', 10.114],
       [28, 0, 2, 'HIGH', 7.798],
       [61, 0, 1, 'HIGH', 18.043]], dtype=object)

In [168]:
df['Cholesterol'].value_counts()

Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64

In [169]:
#changing HIGH and NORMAL to 0,1 for data processing
le_chol = preprocessing.LabelEncoder()
le_chol.fit(['NORMAL','HIGH'])

X[:,3] =le_chol.transform(X[:,3])

X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [170]:
from sklearn.model_selection import train_test_split

X_trainset , X_testset , Y_trainset , Y_testset = train_test_split(X,Y,test_size = 0.3,random_state=3)

In [171]:
DrugTree = DecisionTreeClassifier(criterion='entropy',max_depth=4)
DrugTree

In [172]:
DrugTree.fit(X_trainset,Y_trainset)

In [173]:
pred_tree = DrugTree.predict(X_testset)

In [174]:
print(pred_tree[0:5])
print(Y_testset[0:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
[['drugY']
 ['drugX']
 ['drugX']
 ['drugX']
 ['drugX']]


Great results

In [175]:
from sklearn import metrics
acc = metrics.accuracy_score(Y_testset,pred_tree)
print(f'the accuracy for this model is {acc}')


the accuracy for this model is 0.9833333333333333


In [176]:
from matplotlib import pyplot as plt
from io import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline

In [177]:
dot_data = StringIO()
filename = "drugtree.png"
featureNames = df.columns[0:5]
out=tree.export_graphviz(DrugTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(Y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

InvocationException: GraphViz's executables not found