In [1]:
import pandas as pd
random_state = 44233

df = pd.read_csv("drug200.csv")

In [2]:
X = df[["Age", "Sex", "BP", "Cholesterol", "Na_to_K"]]
y = df["Drug"]

In [3]:
sex = pd.get_dummies(X["Sex"], drop_first=False)
X = X.drop("Sex", axis=1)
X = pd.concat([X, sex], axis=1)

In [4]:
print(X.BP.unique())
print(X.Cholesterol.unique())

['HIGH' 'LOW' 'NORMAL']
['HIGH' 'NORMAL']


In [5]:
dic_to_replace = {
    "BP": 
        {"LOW": 0, "NORMAL": 1, "HIGH": 2},
    "Cholesterol": 
        {"HIGH": 1, "NORMAL": 0}
}

X.replace(dic_to_replace, inplace=True)

In [6]:
print(X.BP.unique())
print(X.Cholesterol.unique())
XCOLS = X.columns

[2 0 1]
[1 0]


In [7]:
# By running the standard scaler the accuracy will go from 45% to 50%
# By running the min max scaler the accuracy will go from 45% to 42.5%

def scale_standard(X):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler() 
    X = scaler.fit_transform(X)
    return X

def scale_min_max(X):
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler() 
    X = scaler.fit_transform(X)
    return X

# X = scale_standard(X)
# X = scale_min_max(X)

In [8]:
X = pd.DataFrame(data=X, columns=XCOLS)
X.head()

Unnamed: 0,Age,BP,Cholesterol,Na_to_K,F,M
0,23,2,1,25.355,1,0
1,47,0,1,13.093,0,1
2,47,0,1,10.114,0,1
3,28,1,1,7.798,1,0
4,61,0,1,18.043,1,0


In [9]:
drug = pd.get_dummies(y, drop_first=False)
y = pd.concat([y, drug], axis=1)
y = y.drop("Drug", axis=1)

In [10]:
y.head()

Unnamed: 0,drugA,drugB,drugC,drugX,drugY
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,0,1


In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=random_state)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(160, 6)
(40, 6)
(160, 5)
(40, 5)


In [12]:
from sklearn.tree import DecisionTreeClassifier

drugTree = DecisionTreeClassifier(criterion="entropy")

drugTree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [13]:
y_pred = drugTree.predict(X_test) # Predict y values
y_test = y_test.to_numpy()        # Convert real y values to array

In [14]:
correct = 0
incorrect = 0
for real, pred in zip(y_test, y_pred):
    print(real, pred, end="")
    if list(real) == list(pred):
        print(" Correct!")
        correct+=1
    elif list(real) != list(pred):
        print(" Incorrect!")
        incorrect+=1

print(f"{correct/len(y_test)*100}% accuracy")

[0 0 0 0 1] [0 0 0 0 1] Correct!
[1 0 0 0 0] [1 0 0 0 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 1 0 0] [0 0 1 0 0] Correct!
[0 0 1 0 0] [0 0 1 0 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 1 0 0 0] [0 1 0 0 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 1 0 0] [0 0 1 0 0] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 0 1 0] [0 0 0 0 1] Incorrect!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[1 0 0 0 0] [1 0 0 0 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 1 0 0 0] [0 1 0 0 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[1 0 0 0 0] [1 0 0 0 0] Correct!
[0 0 0 1 0] [0 0 0 1 0] Correct!
[0 0 0 0 1] [0 0 0 0 1] Correct!
[1 0 0 0 0] [1 0 0 0 0] Correct!
[0 1 0 0 0] [0 1 0 0 0] Correct!
[0 0 0 0

In [15]:
from sklearn.metrics import accuracy_score

print(f"{accuracy_score(y_test, y_pred)*100}% accuracy")

97.5% accuracy


# Results without Na_to_K:
## Accuracy without scaled data: 45%
## Accuracy with standard scaled data: 50%
## Accuracy with min max scaled data: 42.5%

# Results with Na_to_K:
## Accuracy without scaled data: 97.5%
## Accuracy with standard scaled data: 97.5%
## Accuracy with min max scaled data: 95%

Min max scaler is normalization

# Conclusions:
Use standard scaled data (standardization) when data has varying scales (age 0-100 and BP 0-1)