

<h1 align="center"><font size="5">Predicting Type Of Drug</font></h1>

In [1]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.ticker as ticker
from sklearn import preprocessing
%matplotlib inline

### Load Data From CSV File  

In [2]:
# Load the drug dataset. A raw string keeps the Windows backslashes literal:
# in a normal string "\D" and "\d" are invalid escape sequences, which Python
# only tolerates with a deprecation warning.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so others can re-run it.
df = pd.read_csv(r'E:\Datasets\drug200.csv')
df.head(100)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
95,36,M,LOW,NORMAL,11.424,drugX
96,58,F,LOW,HIGH,38.247,drugY
97,56,F,HIGH,HIGH,25.395,drugY
98,20,M,HIGH,NORMAL,35.639,drugY


In [3]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isna()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
195,False,False,False,False,False,False
196,False,False,False,False,False,False
197,False,False,False,False,False,False
198,False,False,False,False,False,False


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(200, 6)

# Data visualization and pre-processing

In [6]:
# Feature matrix: the five predictor columns as a NumPy array (the "Drug"
# column is left out; it is used as the target further down).
feature_columns = ['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']
X = df[feature_columns].to_numpy()
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

As you may have noticed, some features in this dataset are categorical, such as __Sex__ or __BP__. Unfortunately, Sklearn Decision Trees do not handle categorical variables, but we can still convert these features to numerical values. One option is __pandas.get_dummies()__, which converts a categorical variable into dummy/indicator variables.
The cell below instead uses sklearn's __LabelEncoder__ to map each category to an integer code.

# Pre-processing:  Feature selection/extraction

In [7]:
from sklearn import preprocessing

# Rebuild the raw feature matrix first so this cell is safe to re-run: the
# encoders below overwrite columns of X in place, and transforming values that
# are already integer codes would raise on the unseen labels.
X = df[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values

# LabelEncoder assigns codes in sorted label order, not in the order passed to
# fit(): Sex F=0, M=1; BP HIGH=0, LOW=1, NORMAL=2; Cholesterol HIGH=0, NORMAL=1.
# NOTE(review): the BP codes therefore do not follow LOW < NORMAL < HIGH, which
# matters for distance-based models — confirm this is acceptable.
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F', 'M'])
X[:, 1] = le_sex.transform(X[:, 1])

le_BP = preprocessing.LabelEncoder()
le_BP.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = le_BP.transform(X[:, 2])

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL', 'HIGH'])
X[:, 3] = le_Chol.transform(X[:, 3])

X[0:5]


array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [8]:
# Target vector: the "Drug" column of the frame.
y = df.loc[:, "Drug"]
y.head()

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

## Normalize Data 

In [9]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fit on the full matrix before the train/test
# split, so test rows influence the scaling statistics — consider fitting on
# the training split only.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X[:5]

array([[-1.29159102, -1.040833  , -1.11016894, -0.97043679,  1.28652212],
       [ 0.16269866,  0.96076892,  0.10979693, -0.97043679, -0.4151454 ],
       [ 0.16269866,  0.96076892,  0.10979693, -0.97043679, -0.82855818],
       [-0.988614  , -1.040833  ,  1.32976279, -0.97043679, -1.14996267],
       [ 1.0110343 , -1.040833  ,  0.10979693, -0.97043679,  0.27179427]])

# Classification 

- K Nearest Neighbor(KNN)
- Decision Tree
- Support Vector Machine
- Logistic Regression
- Naive Bayes
- Random Forest

## Divide the dataset into Training and Test Set

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# partition repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=4)
X_train, X_test, y_train, y_test = split_parts

# K Nearest Neighbor(KNN)

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Fit one KNN classifier per neighbour count 1..Ks-1 and show the first five
# test-set predictions of each.
Ks = 10
for n in range(1, Ks):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat_KNN = neigh.predict(X_test)
    print("The predicted values using K = ", n, "is ", yhat_KNN[:5])

The predicted values using K =  1 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugC']
The predicted values using K =  2 is  ['drugY' 'drugY' 'drugY' 'drugB' 'drugC']
The predicted values using K =  3 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  4 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  5 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  6 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  7 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  8 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The predicted values using K =  9 is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']


# Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Fit one decision tree per split criterion and show its first five test-set
# predictions. random_state is fixed (same value as the train/test split) so
# the fitted tree, and therefore the output, is reproducible between runs.
dtree_criterions = ['entropy', 'gini']
for i in dtree_criterions:
    drugTree = DecisionTreeClassifier(criterion=i, random_state=4)
    drugTree.fit(X_train, y_train)
    predTree = drugTree.predict(X_test)
    print ("The values predicted by Decision Tree using criterion = ",i," is ", predTree [0:5])

The values predicted by Decision Tree using criterion =  entropy  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugC']
The values predicted by Decision Tree using criterion =  gini  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugC']


# Support Vector Machine

In [13]:
from sklearn import svm

# Train an SVC with each kernel and show its first five test-set predictions.
svm_kernels = ['rbf', 'linear', 'poly', 'sigmoid']
for i in svm_kernels:
    clf = svm.SVC(kernel=i)
    clf.fit(X_train, y_train)
    yhat_SVM = clf.predict(X_test)
    first_five = yhat_SVM[:5]
    print("The values predicted by SVM using kernels = ", i, " is ", first_five)

The values predicted by SVM using kernels =  rbf  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by SVM using kernels =  linear  is  ['drugY' 'drugY' 'drugY' 'drugB' 'drugY']
The values predicted by SVM using kernels =  poly  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by SVM using kernels =  sigmoid  is  ['drugY' 'drugA' 'drugY' 'drugB' 'drugC']


# Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

# Try every (solver, C) combination — solvers in the outer position, C values
# in the inner one — and show the first five test-set predictions of each fit.
c_param_range = [0.001, 0.01, 0.1]
lr_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
for i, f in itertools.product(lr_solvers, c_param_range):
    LR = LogisticRegression(C=f, solver=i)
    LR.fit(X_train, y_train)
    yhat_LR = LR.predict(X_test)
    print("The values predicted by Logistic Regression using c = ",f," and solver = ",i, " is ",yhat_LR[:5])

The values predicted by Logistic Regression using c =  0.001  and solver =  liblinear  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugX']
The values predicted by Logistic Regression using c =  0.01  and solver =  liblinear  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugX']
The values predicted by Logistic Regression using c =  0.1  and solver =  liblinear  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by Logistic Regression using c =  0.001  and solver =  newton-cg  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by Logistic Regression using c =  0.01  and solver =  newton-cg  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by Logistic Regression using c =  0.1  and solver =  newton-cg  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by Logistic Regression using c =  0.001  and solver =  lbfgs  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugY']
The values predicted by Logistic Regression using c =  0.01  and solver =  lbfg

# Naive Bayes

In [16]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes model and show its first five test-set predictions.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_NB = gnb.predict(X_test)
print("The values predicted by Naive Bayes is ", y_pred_NB[:5])

The values predicted by Naive Bayes is  ['drugA' 'drugA' 'drugY' 'drugB' 'drugC']


# Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Fit one random forest per split criterion and show its first five test-set
# predictions. random_state is fixed (same value as the train/test split) so
# the forest, and therefore the output, is reproducible between runs.
rf_criterion = ['entropy', 'gini']
for i in rf_criterion:
    clf = RandomForestClassifier(criterion=i, random_state=4)
    clf.fit(X_train, y_train)
    y_pred_RF = clf.predict(X_test)
    print("The values predicted by Random Forest using criterion = ", i," is ",y_pred_RF[0:5])


The values predicted by Random Forest using criterion =  entropy  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugC']
The values predicted by Random Forest using criterion =  gini  is  ['drugY' 'drugY' 'drugY' 'drugY' 'drugC']


# Model Evaluation using Test set

In [18]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

### KNN

In [19]:
# Score KNN on the test set for every neighbour count 1..Ks-1, keeping the
# accuracy and weighted F1 of each run (position 0 holds k=1, and so on).
Ks = 10
accuracy_scor = np.zeros(Ks - 1)
f1_scor = np.zeros(Ks - 1)

for slot, n in enumerate(range(1, Ks)):
    neigh = KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train, y_train)
    yhat_KNN = neigh.predict(X_test)

    accuracy_scor[slot] = accuracy_score(y_test, yhat_KNN)
    f1_scor[slot] = f1_score(y_test, yhat_KNN, average='weighted')
    print("The accuracy score with k = ", n, "is ", accuracy_scor[slot])
    print("The f1 score with k = ", n, "is ", f1_scor[slot])

# argmax() is a 0-based position, so add 1 to recover the neighbour count.
best_accuracy_k = accuracy_scor.argmax() + 1
best_f1_k = f1_scor.argmax() + 1
print("\nThe best accuracy score for KNN was ", accuracy_scor.max(), "with k=", best_accuracy_k)
print("The best f1 score for KNN was ", f1_scor.max(), "with k=", best_f1_k)

The accuracy score with k =  1 is  0.8833333333333333
The f1 score with k =  1 is  0.8844955300127714
The accuracy score with k =  2 is  0.8333333333333334
The f1 score with k =  2 is  0.8335500033346671
The accuracy score with k =  3 is  0.8333333333333334
The f1 score with k =  3 is  0.8335819019689988
The accuracy score with k =  4 is  0.7166666666666667
The f1 score with k =  4 is  0.7164793092212448
The accuracy score with k =  5 is  0.6833333333333333
The f1 score with k =  5 is  0.6881845752813495
The accuracy score with k =  6 is  0.6833333333333333
The f1 score with k =  6 is  0.6892581927876047
The accuracy score with k =  7 is  0.7166666666666667
The f1 score with k =  7 is  0.7207855650097028
The accuracy score with k =  8 is  0.65
The f1 score with k =  8 is  0.6488748137108793
The accuracy score with k =  9 is  0.6666666666666666
The f1 score with k =  9 is  0.6611744687013504

The best accuracy score for KNN was  0.8833333333333333 with k= 1
The best f1 score for KNN was

#### Classification Report of KNN

In [20]:
# Refit KNN with the neighbour count that scored the highest accuracy
# (argmax is 0-based, hence the +1) and print its per-class report.
best_k = accuracy_scor.argmax() + 1
neigh = KNeighborsClassifier(n_neighbors=best_k)
neigh.fit(X_train, y_train)
yhat_KNN = neigh.predict(X_test)
print(classification_report(y_test, yhat_KNN))

              precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       0.67      0.67      0.67         3
       drugC       1.00      1.00      1.00         8
       drugX       0.75      0.92      0.83        13
       drugY       0.96      0.83      0.89        29

    accuracy                           0.88        60
   macro avg       0.85      0.88      0.86        60
weighted avg       0.90      0.88      0.88        60



### Decision tree

In [21]:
# Score a decision tree on the test set for each split criterion, keeping the
# accuracy and weighted F1 of each run. The score arrays are sized from the
# criterion list, and random_state is fixed (same value as the train/test
# split) so the scores are reproducible between runs.
dtree_criterions = ['entropy', 'gini']
ascore_dtree = np.zeros(len(dtree_criterions))
fscore_dtree = np.zeros(len(dtree_criterions))
for f, i in enumerate(dtree_criterions):
    drugTree = DecisionTreeClassifier(criterion=i, random_state=4)
    drugTree.fit(X_train, y_train)
    predTree = drugTree.predict(X_test)
    # Compute each metric once and reuse it for both storage and printing.
    ascore_dtree[f] = accuracy_score(y_test, predTree)
    fscore_dtree[f] = f1_score(y_test, predTree, average = 'weighted')
    print("DecisionTrees's Accuracy with criterion = ", i," using Accuracy Score: ", ascore_dtree[f])
    print("DecisionTrees's Accuracy with criterion = ", i," using F1 Score: ", fscore_dtree[f])
print("\nThe best accuracy score using Decision Tree is ",ascore_dtree.max())
print("The best f1 score using Decision Tree is ",fscore_dtree.max())

DecisionTrees's Accuracy with criterion =  entropy  using Accuracy Score:  0.9666666666666667
DecisionTrees's Accuracy with criterion =  entropy  using F1 Score:  0.9653634651600753
DecisionTrees's Accuracy with criterion =  gini  using Accuracy Score:  0.9666666666666667
DecisionTrees's Accuracy with criterion =  gini  using F1 Score:  0.9653634651600753

The best accuracy score using Decision Tree is  0.9666666666666667
The best f1 score using Decision Tree is  0.9653634651600753


#### Classification Report of Decision Tree

In [22]:
# Print the per-class report of a decision tree for each split criterion.
# random_state is fixed (same value as the train/test split) so the refit tree
# is reproducible rather than a fresh random draw on every run.
k = ['entropy','gini']
for i in k:
    drugTree = DecisionTreeClassifier(criterion=i, random_state=4)
    drugTree.fit(X_train, y_train)
    predTree = drugTree.predict(X_test)
    print("Classification report for criterion = ",i,"\n\n",classification_report(y_test, predTree))


Classification report for criterion =  entropy 

               precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       1.00      0.67      0.80         3
       drugC       1.00      1.00      1.00         8
       drugX       1.00      0.92      0.96        13
       drugY       0.97      1.00      0.98        29

    accuracy                           0.97        60
   macro avg       0.97      0.92      0.94        60
weighted avg       0.97      0.97      0.97        60

Classification report for criterion =  gini 

               precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       1.00      0.67      0.80         3
       drugC       1.00      1.00      1.00         8
       drugX       1.00      0.92      0.96        13
       drugY       0.97      1.00      0.98        29

    accuracy                           0.97        60
   macro avg       0.97      0.92 

### Support Vector Machine

In [23]:
# Score an SVC on the test set for each kernel, keeping the accuracy and
# weighted F1 of each run in the same order as svm_kernels.
svm_kernels = ['rbf', 'linear', 'poly', 'sigmoid']
ascore_svm = np.zeros(4)
fscore_svm = np.zeros(4)

for slot, i in enumerate(svm_kernels):
    clf = svm.SVC(kernel=i)
    clf.fit(X_train, y_train)
    yhat_SVM = clf.predict(X_test)

    kernel_accuracy = accuracy_score(y_test, yhat_SVM)
    kernel_f1 = f1_score(y_test, yhat_SVM, average='weighted')
    ascore_svm[slot] = kernel_accuracy
    fscore_svm[slot] = kernel_f1
    print("The accuracy score for SVM with kernel = ", i, " is : ", kernel_accuracy)
    print("The f1 score for SVM with kernel = ", i, " is: ", kernel_f1)

print("\nThe best accuracy score using SVM is ", ascore_svm.max())
print("The best f1 score using SVM is ", fscore_svm.max())

The accuracy score for SVM with kernel =  rbf  is :  0.8833333333333333
The f1 score for SVM with kernel =  rbf  is:  0.8826190476190476
The accuracy score for SVM with kernel =  linear  is :  0.9333333333333333
The f1 score for SVM with kernel =  linear  is:  0.936466165413534
The accuracy score for SVM with kernel =  poly  is :  0.7666666666666667
The f1 score for SVM with kernel =  poly  is:  0.7256016215220195
The accuracy score for SVM with kernel =  sigmoid  is :  0.85
The f1 score for SVM with kernel =  sigmoid  is:  0.8566363919305096

The best accuracy score using SVM is  0.9333333333333333
The best f1 score using SVM is  0.936466165413534


#### Classification Report of SVM

In [24]:
# Refit an SVC per kernel and print the per-class report of each.
k = ['rbf', 'linear', 'poly', 'sigmoid']
for i in k:
    clf = svm.SVC(kernel=i)
    clf.fit(X_train, y_train)
    yhat_SVM = clf.predict(X_test)
    kernel_report = classification_report(y_test, yhat_SVM)
    print ("Classification report for kernel = ",i," \n\n",kernel_report)

Classification report for kernel =  rbf  

               precision    recall  f1-score   support

       drugA       0.86      0.86      0.86         7
       drugB       0.67      0.67      0.67         3
       drugC       1.00      0.75      0.86         8
       drugX       0.92      0.92      0.92        13
       drugY       0.87      0.93      0.90        29

    accuracy                           0.88        60
   macro avg       0.86      0.83      0.84        60
weighted avg       0.89      0.88      0.88        60

Classification report for kernel =  linear  

               precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       0.50      0.67      0.57         3
       drugC       1.00      0.88      0.93         8
       drugX       1.00      1.00      1.00        13
       drugY       0.96      0.93      0.95        29

    accuracy                           0.93        60
   macro avg       0.87      0.89      0

### Logistic Regression

In [25]:
# Score logistic regression on the test set for every (solver, C) combination,
# keeping the accuracy and weighted F1 of each run. The score arrays are sized
# from the two parameter lists instead of a hard-coded 15.
c_param_range = [0.001, 0.01, 0.1]
lr_solvers = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
n_runs = len(lr_solvers) * len(c_param_range)
ascore_lr = np.zeros(n_runs)
fscore_lr = np.zeros(n_runs)
g = 0
for i in lr_solvers:
    for f in c_param_range:
        LR = LogisticRegression(C=f, solver=i).fit(X_train, y_train)
        yhat_LR = LR.predict(X_test)
        ascore_lr[g] = accuracy_score(y_test, yhat_LR)
        fscore_lr[g] = f1_score(y_test, yhat_LR, average='weighted')
        # f is the regularization strength C and i is the solver; the labels
        # previously called them "solver" and "kernel" respectively.
        print("The accuracy score for Logistic Regression with C = ", f, " and solver = ", i, " is : ", ascore_lr[g])
        print("The f1 score for Logistic Regression with C = ", f, " and solver = ", i, " is : ", fscore_lr[g])
        g += 1
print("\nThe best accuracy score using Logistic Regression is ", ascore_lr.max())
print("The best f1 score using Logistic Regression is ", fscore_lr.max())

The accuracy score for Logistic Regression with solver =  0.001  and kernel =  liblinear  is :  0.7666666666666667
The f1 score for Logistic Regression with solver =  0.001  and kernel =  liblinear  is :  0.7282407407407407
The accuracy score for Logistic Regression with solver =  0.01  and kernel =  liblinear  is :  0.7833333333333333
The f1 score for Logistic Regression with solver =  0.01  and kernel =  liblinear  is :  0.7421320346320345
The accuracy score for Logistic Regression with solver =  0.1  and kernel =  liblinear  is :  0.7833333333333333
The f1 score for Logistic Regression with solver =  0.1  and kernel =  liblinear  is :  0.7386554621848741
The accuracy score for Logistic Regression with solver =  0.001  and kernel =  newton-cg  is :  0.48333333333333334
The f1 score for Logistic Regression with solver =  0.001  and kernel =  newton-cg  is :  0.31498127340823967
The accuracy score for Logistic Regression with solver =  0.01  and kernel =  newton-cg  is :  0.61666666666

#### Classification Report of Logistic Regression

In [26]:
# Print the per-class report of logistic regression (C=0.01) for each solver.
# The last entry is 'saga' — the list previously repeated 'sag', so 'saga' was
# never reported even though it is evaluated in the scoring cell.
k = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
for i in k:
    LR = LogisticRegression(C=0.01, solver=i).fit(X_train,y_train)
    yhat_LR = LR.predict(X_test)
    print("Classification report for solver = ",i," \n\n",classification_report(y_test, yhat_LR))

Classification report for solver =  liblinear  

               precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         7
       drugB       0.60      1.00      0.75         3
       drugC       0.00      0.00      0.00         8
       drugX       0.55      0.92      0.69        13
       drugY       0.96      0.86      0.91        29

    accuracy                           0.78        60
   macro avg       0.62      0.76      0.67        60
weighted avg       0.73      0.78      0.74        60

Classification report for solver =  newton-cg  

               precision    recall  f1-score   support

       drugA       0.00      0.00      0.00         7
       drugB       0.00      0.00      0.00         3
       drugC       0.00      0.00      0.00         8
       drugX       0.89      0.62      0.73        13
       drugY       0.57      1.00      0.72        29

    accuracy                           0.62        60
   macro avg       0.29      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive Bayes

In [27]:
# Fit Gaussian naive Bayes and report its test-set accuracy and weighted F1.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred_NB = gnb.predict(X_test)

nb_accuracy = accuracy_score(y_test, y_pred_NB)
nb_f1 = f1_score(y_test, y_pred_NB, average='weighted')
print("The accuracy score for Naive Bayes is : ", nb_accuracy)
print("The f1 score for Naive Bayes is: ", nb_f1)

The accuracy score for Naive Bayes is :  0.85
The f1 score for Naive Bayes is:  0.8565713088654265


#### Classification report of Naive Bayes

In [28]:
# Per-class precision / recall / F1 for the naive Bayes predictions.
nb_report = classification_report(y_test, y_pred_NB)
print(nb_report)

              precision    recall  f1-score   support

       drugA       0.70      1.00      0.82         7
       drugB       0.40      0.67      0.50         3
       drugC       0.80      1.00      0.89         8
       drugX       1.00      0.92      0.96        13
       drugY       0.96      0.76      0.85        29

    accuracy                           0.85        60
   macro avg       0.77      0.87      0.80        60
weighted avg       0.89      0.85      0.86        60



### Random Forest

In [29]:
# Score a random forest on the test set for each split criterion, keeping the
# accuracy and weighted F1 of each run. The score arrays are sized from the
# criterion list, and random_state is fixed (same value as the train/test
# split) so the scores are reproducible between runs.
rf_criterion = ['entropy', 'gini']
ascore_rf = np.zeros(len(rf_criterion))
fscore_rf = np.zeros(len(rf_criterion))
for f, i in enumerate(rf_criterion):
    clf = RandomForestClassifier(criterion=i, random_state=4)
    clf.fit(X_train, y_train)
    y_pred_RF = clf.predict(X_test)
    # Compute each metric once and reuse it for both storage and printing.
    ascore_rf[f] = accuracy_score(y_test, y_pred_RF)
    fscore_rf[f] = f1_score(y_test, y_pred_RF, average='weighted')
    print("The accuracy score for Random Forest with criterion = ",i," is ", ascore_rf[f])
    print("The f1 score for Random Forest with criterion = ",i," is ", fscore_rf[f])

print("\nThe best accuracy score using Random Forest is ",ascore_rf.max())
print("The best f1 score using Random Forest is ",fscore_rf.max())

The accuracy score for Random Forest with criterion =  entropy  is  0.9666666666666667
The f1 score for Random Forest with criterion =  entropy  is  0.9653634651600753
The accuracy score for Random Forest with criterion =  gini  is  0.9666666666666667
The f1 score for Random Forest with criterion =  gini  is  0.9653634651600753

The best accuracy score using Random Forest is  0.9666666666666667
The best f1 score using Random Forest is  0.9653634651600753


#### Classification report of Random Forest

In [30]:
# Print the per-class report of a random forest for each split criterion.
# random_state is fixed (same value as the train/test split) so the refit
# forest is reproducible rather than a fresh random draw on every run.
k = ['entropy', 'gini']
for i in k:
    clf = RandomForestClassifier(criterion=i, random_state=4)
    clf.fit(X_train, y_train)
    y_pred_RF = clf.predict(X_test)
    print("Classification report for criterion = ",i," \n\n",classification_report(y_test, y_pred_RF))


Classification report for criterion =  entropy  

               precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       1.00      0.67      0.80         3
       drugC       1.00      1.00      1.00         8
       drugX       1.00      0.92      0.96        13
       drugY       0.97      1.00      0.98        29

    accuracy                           0.97        60
   macro avg       0.97      0.92      0.94        60
weighted avg       0.97      0.97      0.97        60

Classification report for criterion =  gini  

               precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       1.00      0.67      0.80         3
       drugC       1.00      1.00      1.00         8
       drugX       1.00      0.92      0.96        13
       drugY       0.97      1.00      0.98        29

    accuracy                           0.97        60
   macro avg       0.97      0.9

# Report on accuracy of different algorithms using F1 score and Accuracy Score

In [31]:
# Naming: "<model>as" = best accuracy score, "<model>fs" = best weighted F1 score.

# K-nearest neighbours: best over the neighbour counts tried
knnas, knnfs = accuracy_scor.max(), f1_scor.max()

# Decision tree: best over the split criteria
dtreeas, dtreefs = ascore_dtree.max(), fscore_dtree.max()

# Support vector machine: best over the kernels
svmas, svmfs = ascore_svm.max(), fscore_svm.max()

# Logistic regression: best over the solver / C combinations
lras, lrfs = ascore_lr.max(), fscore_lr.max()

# Naive Bayes: a single model, so score its predictions directly
nbas = accuracy_score(y_test, y_pred_NB)
nbfs = f1_score(y_test, y_pred_NB, average='weighted')

# Random forest: best over the split criteria
rfas, rffs = ascore_rf.max(), fscore_rf.max()

# One entry per algorithm, in the order KNN, tree, SVM, LR, NB, forest
max_as = [knnas, dtreeas, svmas, lras, nbas, rfas]
max_fs = [knnfs, dtreefs, svmfs, lrfs, nbfs, rffs]

## Final Report

In [32]:
# Summary table: one row per algorithm with its best accuracy and weighted F1.
# The row labels follow the order in which max_as / max_fs were assembled.
# (Fixes the misspelled 'K-Nearest Neighor' label.)
data = {'Algorithm':['K-Nearest Neighbor', 'Decision tree', 'Support Vector Machine', 'Logistic regression', 'Naive Bayes', 'Random Forest'],
        'Accuracy_Score':max_as, 'F1-Score':max_fs}
s = pd.DataFrame(data, index = [1,2,3,4,5,6])
s

Unnamed: 0,Algorithm,Accuracy_Score,F1-Score
1,K-Nearest Neighor,0.883333,0.884496
2,Decision tree,0.966667,0.965363
3,Support Vector Machine,0.933333,0.936466
4,Logistic regression,0.783333,0.742132
5,Naive Bayes,0.85,0.856571
6,Random Forest,0.966667,0.965363
