In [1]:
# 1. About the dataset
# 2. Downloading the Data
# 3. Pre-processing
#4. Setting up the Decision Tree
# 5. Modeling
# 6. Prediction
# 7. Evaluation
#8. Visualization

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Problem / about the data: Imagine that you are a medical researcher compiling data for a study.
# You have collected data about a set of patients, all of whom suffered from the same illness.
# During their course of treatment, each patient responded to one of 5 medications: Drug A, Drug B, Drug C, Drug X and Drug Y.

#Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. 
# The features of this dataset are the Age, Sex, Blood Pressure, Cholesterol and sodium-to-potassium ratio (Na_to_K) of the patients, and the target is the drug that each patient responded to.

# It is an example of a multiclass classifier (five possible drugs): you can use the training part of the dataset to build a decision tree,
# and then use it to predict the class of an unknown patient, or to prescribe a drug to a new patient.



In [4]:
# Remote location of the drug200 CSV; read it into a DataFrame and preview the first ten rows.
path = "https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv"
df = pd.read_csv(path)
df.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,drugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,drugY
9,43,M,LOW,NORMAL,19.368,drugY


In [5]:
# df.info
 # explore data: 

In [6]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(200, 6)

In [7]:
# Total number of cells in the frame (rows x columns).
df.size

1200

In [8]:
# Column labels of the loaded frame.
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [9]:
# dtype of the Na_to_K column alone.
df["Na_to_K"].dtype

dtype('float64')

In [10]:
# dtype of every column; the `object` columns are the ones that need encoding before modelling.
df.dtypes

Age              int64
Sex             object
BP              object
Cholesterol     object
Na_to_K        float64
Drug            object
dtype: object

In [11]:
# Convert the data frame to a numpy array
# Pre-processing
# Using df as the drug200.csv data read by pandas, declare the following variables:

# X as the Feature Matrix (data of df)
# y as the response vector (target)
# Remove the column containing the target name since it doesn't contain numeric values.

In [12]:
# Feature matrix: every column except the target, as a numpy object array.
feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = df[feature_columns].values
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [13]:
# As you may have noticed, some features in this dataset are categorical, such as Sex or BP.
# Unfortunately, sklearn decision trees do not handle categorical variables directly,
# but we can still convert these features to numerical values. Here each one is encoded with sklearn's LabelEncoder;
# pandas.get_dummies(), which converts a categorical variable into dummy/indicator variables, is an alternative.

In [14]:
from sklearn import preprocessing

# One LabelEncoder per categorical feature column of X.
le_sex = preprocessing.LabelEncoder()
le_BP = preprocessing.LabelEncoder()
le_Chol = preprocessing.LabelEncoder()

# (column position in X, encoder, categories the encoder is fitted on)
# NOTE: X is overwritten in place, so rebuild X from df before running this cell a second time.
for col_idx, encoder, categories in (
    (1, le_sex, ['F', 'M']),
    (2, le_BP, ['LOW', 'NORMAL', 'HIGH']),
    (3, le_Chol, ['NORMAL', 'HIGH']),
):
    encoder.fit(categories)
    X[:, col_idx] = encoder.transform(X[:, col_idx])

X[:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [15]:
# Response vector: the drug each patient responded to.
y = df["Drug"]
y.head(5)

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

In [16]:
# Setting up the Decision Tree
# We will be using train/test split on our decision tree. Let's import train_test_split from sklearn.model_selection (it lived in sklearn.cross_validation in old scikit-learn releases).
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

# Shape of the training features (70% of the 200 rows); x_test.shape can be checked the same way.
x_train.shape

(140, 5)

In [18]:
# Shape of the held-out target vector; the matching training-side check is left disabled.
#y_train.shape
y_test.shape

(60,)

In [19]:
# Modeling
#We will first create an instance of the DecisionTreeClassifier called drugTree.
# Inside of the classifier, specify criterion="entropy" so we can see the information gain of each node.

In [20]:
# Entropy-based decision tree, limited to a depth of 4.
drugTree = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=4,
)
drugTree  # echo the estimator so its parameters are displayed

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
# Train the tree on the training feature matrix x_train and the training response vector y_train.
drugTree.fit(X=x_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
# Prediction
# Predict a drug label for every row of the test features and keep the result in predtree.

predtree = drugTree.predict(x_test)  # predicted labels (y-hat) for the test set

In [23]:
# First ten labels predicted for the test set.
print(predtree[:10])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX' 'drugC' 'drugY' 'drugA' 'drugB'
 'drugA']


In [24]:
# First ten true labels of the test set, for comparison with the predictions.
print(y_test.iloc[:10])

40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
82     drugC
183    drugY
46     drugA
70     drugB
100    drugA
Name: Drug, dtype: object


In [25]:
# Evaluation
# Next, let's import metrics from sklearn and check the accuracy of our model.
from sklearn import metrics
import matplotlib.pyplot as plt
# Fraction of test samples whose predicted drug equals the true drug.
accuracy = metrics.accuracy_score(y_test, predtree)
print("Decision Tree Accuracy:", accuracy)


Decision Tree Accuracy: 0.9833333333333333


In [26]:
# Accuracy classification score computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.

# In multilabel classification, the function returns the subset accuracy. 
# If the entire set of predicted labels for a sample strictly match with the true set of labels,
# then the subset accuracy is 1.0; otherwise it is 0.0.
# Conclusion: about 98% of the test samples are classified correctly, so it's a good classifier

In [None]:
# Visualization: visualize the tree:
# Notice: the two commands below install the pydotplus and graphviz libraries from conda-forge; skip this cell if they are already installed.
!conda install -c conda-forge pydotplus -y
!conda install -c conda-forge python-graphviz -y


In [None]:
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 

In [None]:
# Render the fitted tree: export it as Graphviz DOT text into an in-memory buffer,
# convert that to a PNG with pydotplus, then display the image with matplotlib.
dot_data = StringIO()
filename = "drugtree.png"
# The first five columns are the features; passed as a plain list of strings.
featureNames = df.columns[0:5].tolist()
# Drug names in order of first appearance -- NOT sorted, so not suitable as class_names.
targetNames = df["Drug"].unique().tolist()
# class_names must follow the classifier's own (ascending) class order, so take it
# from the fitted estimator. The previous code referenced `y_trainset`, which is never
# defined in this notebook (the split produced `y_train`) and raised NameError.
out = tree.export_graphviz(
    drugTree,
    feature_names=featureNames,
    out_file=dot_data,
    class_names=drugTree.classes_.tolist(),
    filled=True,
    special_characters=True,
    rotate=False,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')