In [1]:
# Step 1. Importing the packages

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Step 2. Load the drug dataset from CSV and preview the first rows

DATA_FILE = "drug200.csv"
my_data = pd.read_csv(DATA_FILE)
my_data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [3]:
# Feature matrix (the independent variables).
# `.values` turns the selected columns into a NumPy array.

feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = my_data[feature_columns].values
X[:3]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114]], dtype=object)

As you may have noticed, some features in this dataset, such as Sex and BP, are categorical. Unfortunately, scikit-learn decision trees do not handle categorical variables directly, so these features must first be converted to numerical values. One option is pandas.get_dummies(), which converts a categorical variable into dummy/indicator variables. In this notebook the categories are instead mapped to integer codes with scikit-learn's LabelEncoder.

In [4]:
# Column index 1 of X holds the raw Sex values ('F' / 'M') before encoding.
X[:, 1]

array(['F', 'M', 'M', 'F', 'F', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'M',
       'F', 'F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'F', 'F',
       'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F',
       'M', 'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'F', 'M',
       'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'F',
       'F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'M', 'F', 'F',
       'F', 'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M',
       'M', 'F', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M',
       'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M',
       'F', 'F', 'F', 'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F',
       'F', 'M', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'M',
       'M', 'M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'M',
       'M', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'F',
       'F', 'F', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'M', 'F', 'F

In [5]:
# Label encoding
# Convert the categorical Sex, BP and Cholesterol columns of X into integer codes.
# LabelEncoder sorts its classes alphabetically, so the resulting codes are:
#   Sex         (column index 1): F -> 0, M -> 1
#   BP          (column index 2): HIGH -> 0, LOW -> 1, NORMAL -> 2
#   Cholesterol (column index 3): HIGH -> 0, NORMAL -> 1
from sklearn import preprocessing

le_gender = preprocessing.LabelEncoder().fit(['F', 'M'])
le_BP = preprocessing.LabelEncoder().fit(['LOW', 'NORMAL', 'HIGH'])
le_Chol = preprocessing.LabelEncoder().fit(['NORMAL', 'HIGH'])

# Rebuild X from the raw frame so this cell can be re-run safely: transforming
# an X that was already encoded would raise, because 0/1/2 are not known labels.
X = my_data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values

# Replace each categorical column of X with its integer codes.
for column_index, encoder in ((1, le_gender), (2, le_BP), (3, le_Chol)):
    X[:, column_index] = encoder.transform(X[:, column_index])

X[0:4]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798]], dtype=object)

In [6]:
# Target vector (the dependent variable): the Drug column of the dataset.
y = my_data['Drug']
y.head(3)

0    drugY
1    drugC
2    drugC
Name: Drug, dtype: object

In [7]:
# Train/test split: hold out 30% of the rows for evaluation (fixed seed).

from sklearn.model_selection import train_test_split

X_trainset, X_testset, y_trainset, y_testset = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=300,
)

In [8]:
# Model definition: decision tree using the entropy criterion, depth capped at 4.
# NOTE(review): no random_state is passed here; per the scikit-learn docs,
# ties between equally good splits may then be broken differently from run to
# run -- consider fixing a seed if exact reproducibility matters.

tree_settings = {'criterion': 'entropy', 'max_depth': 4}
drug_tree = DecisionTreeClassifier(**tree_settings)
drug_tree

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [9]:
# Fit the tree on the training split.
drug_tree.fit(X=X_trainset, y=y_trainset)

DecisionTreeClassifier(criterion='entropy', max_depth=4)

In [10]:
# Predict a drug label for every row of the held-out test split.

y_pred = drug_tree.predict(X=X_testset)

In [11]:
# Show the first five predictions, then the first five true labels.
print(y_pred[:5], y_testset[:5], sep="\n")

['drugX' 'drugA' 'drugC' 'drugB' 'drugB']
30     drugX
187    drugA
193    drugC
54     drugB
124    drugB
Name: Drug, dtype: object


In [12]:
# Evaluation: accuracy of the predictions against the true test labels.

from sklearn import metrics

test_accuracy = metrics.accuracy_score(y_testset, y_pred)
print("DecisionTree's Accuracy: ", test_accuracy)

DecisionTree's Accuracy:  0.9833333333333333


In [13]:
# Re-display the first two raw rows as a reminder of the feature layout.
my_data.head(n=2)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC


In [14]:
# Predict the drug for one new patient.
# Values follow the column order of X: Age, Sex, BP, Cholesterol, Na_to_K,
# with the categorical features given as their integer codes.
new_patient = [[22, 1, 0, 1, 25]]
drug_tree.predict(new_patient)

array(['drugY'], dtype=object)