## Classification – Simple Decision Rules

#### Import libaries

In [388]:
import pandas as pd

#### Load dataset

In [389]:
data = pd.read_csv('/users/jorge/desktop/drug.csv')
data.head()

Unnamed: 0,age,sex,bp,cholesterol,Na_to_K,drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


##### Using our columns needed

In [390]:
x = data[['age', 'sex', 'bp', 'cholesterol', 'Na_to_K']].values

#### Current dataset

In [391]:
x[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

#### ML Preprocessing

In [392]:
# Since we can't use labels
# in ML we need to convert them into numerical values by preprocessing
from sklearn import preprocessing

#sex label
sex = preprocessing.LabelEncoder()
sex.fit(['F','M'])
x[:,1] = sex.transform(x[:,1])

#blood pressure label
bp = preprocessing.LabelEncoder()
bp.fit([ 'LOW', 'NORMAL', 'HIGH'])
x[:,2] = bp.transform(x[:,2])

#cholesterol label
chol = preprocessing.LabelEncoder()
chol.fit([ 'NORMAL', 'HIGH'])
x[:,3] = chol.transform(x[:,3])

#### After preprocessing

In [393]:
#we now have our x-value
x[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [394]:
#making our y-value
y = data["drug"]
y[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: drug, dtype: object

### Train_test_split / Decision tree

In [395]:
#Train_test_split our model
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [396]:
#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
clf = clf.fit(x_train, y_train)

In [397]:
#checking our parameters
clf.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 4,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [398]:
predict = clf.predict(x_test)
predict

array(['drugY', 'drugX', 'drugX', 'drugX', 'drugC', 'drugB', 'drugX',
       'drugY', 'drugY', 'drugX', 'drugY', 'drugY', 'drugX', 'drugC',
       'drugX', 'drugC', 'drugX', 'drugY', 'drugX', 'drugX', 'drugY',
       'drugY', 'drugY', 'drugA', 'drugY', 'drugX', 'drugY', 'drugY',
       'drugC', 'drugC', 'drugY', 'drugX', 'drugY', 'drugY', 'drugX',
       'drugY', 'drugY', 'drugY', 'drugA', 'drugA', 'drugY', 'drugY',
       'drugC', 'drugX', 'drugY', 'drugY', 'drugY', 'drugC', 'drugY',
       'drugX', 'drugB', 'drugA', 'drugX', 'drugX', 'drugC', 'drugX',
       'drugX', 'drugY', 'drugX', 'drugY'], dtype=object)

In [399]:
clf.predict_proba(x_test)

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0.

In [400]:
#Visually comparing the predictions to the actual values
print (predict[0:5])
print (y_test [0:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugC']
87     drugY
182    drugX
5      drugX
72     drugX
195    drugC
Name: drug, dtype: object


#### Prediction accuracy

In [401]:
from sklearn.metrics import accuracy_score
print('Accuracy: ',accuracy_score(y_test, predict))


Accuracy:  0.9833333333333333
