In [13]:
import numpy as np
import pandas as pd
import sys
import sklearn.tree as tree
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

In [2]:
df = pd.read_csv('./Drugs_data.csv')
print("The number of rows and columns in the dataframe is ", df.shape)

df.head()

The number of rows and columns in the dataframe is  (200, 6)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [4]:
# Data Preprocessing
df_processed = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
df_processed[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [7]:
# Label Encoder is a class which is used to convert categorical data into numbers
# Like for in the given case, we can convert 'F' to 0 and 'M' to 1 or vice versa
# le stands for label encoder

# Create an object of the LabelEncoder class
le_sex = preprocessing.LabelEncoder()

# Fit the data to the label encoder which will convert the data to numbers
le_sex.fit(['F', 'M'])

# Transform the data to numbers using the label encoder
df_processed[:, 1] = le_sex.transform(df_processed[:, 1])

In [9]:
# Creating another object of the LabelEncoder class for the BP column
le_BP = preprocessing.LabelEncoder()

# Fit the data to the label encoder to convert the data to numbers for the BP column
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])

# Transform the data to numbers using the label encoder for the BP column
df_processed[:, 2] = le_BP.transform(df_processed[:, 2])

In [10]:
# Creating another object of the Label Encoder class for the Cholesterol column  
le_Chol = preprocessing.LabelEncoder()

# Fit the data to the label encoder to convert the data to numbers for the Cholesterol column
le_Chol.fit(['NORMAL', 'HIGH'])

# Transform the data to numbers using the label encoder for the Cholesterol column
df_processed[:, 3] = le_Chol.transform(df_processed[:, 3])

In [11]:
df_processed[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [12]:
# To get the target variable, we will use the Drug Column
y = df['Drug']
y[0 : 5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

In [14]:
# Setting up a Decision Tree
X_trainset, X_testset, Y_trainset, Y_testset = train_test_split(df_processed, y, test_size=0.3, random_state=3)
print(f"The shape of the Training Set of X is :- {X_trainset.shape} and that of Y is :- {Y_trainset.shape}")
print(f"The shape of the Testing Set of X is :- {X_testset.shape} and that of Y is :- {Y_testset.shape}")

The shape of the Training Set of X is :- (140, 5) and that of Y is :- (140,)
The shape of the Testing Set of X is :- (60, 5) and that of Y is :- (60,)


In [16]:
# Modelling the dataset
drug_tree = DecisionTreeClassifier(criterion='entropy', max_depth=4)
drug_tree

In [17]:
drug_tree.fit(X_trainset, Y_trainset)
prediction_tree = drug_tree.predict(X_testset)

In [18]:
print(f"The predicted values of the first 5 elements of the test set are :- {prediction_tree[0:5]}")
print(f"The actual values of the first 5 elements of the test set are :- {Y_testset[0:5]}")

The predicted values of the first 5 elements of the test set are :- ['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
The actual values of the first 5 elements of the test set are :- 40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


In [19]:
# Model Evaluation
print("The decision tree's accuracy is :- ", metrics.accuracy_score(Y_testset, prediction_tree))

The decision tree's accuracy is :-  0.9833333333333333


In [21]:
export_graphviz(drug_tree, out_file='drug_tree.dot', feature_names=['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K'])
# To convert the .dot file to .png, we can use the following command
!dot -Tpng drug_tree.dot -o drug_tree.png

'dot' is not recognized as an internal or external command,
operable program or batch file.
