## Drugs Classification using Logistic Regression And Support Vector Machine (SVM) Algorithm

#### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
# ignoring warnings
import warnings
warnings.filterwarnings('ignore')

Loading the data

In [2]:
# url for dataset
url = 'https://raw.githubusercontent.com/Lakshit11/CDS-Assignment/main/drug.csv'
data = pd.read_csv(url)
# getting first 5 rows of dataframe named data
data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


Checking for any missing values in the data

In [3]:
data.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

Therefore, the data contains no Null Values, so we can proceed ahead

Now checking value counts of all variables

In [4]:
data.Sex.value_counts()

M    104
F     96
Name: Sex, dtype: int64

In [5]:
# Now for BP
data.BP.value_counts()

HIGH      77
LOW       64
NORMAL    59
Name: BP, dtype: int64

In [6]:
# For cholesterol
data.Cholesterol.value_counts()

HIGH      103
NORMAL     97
Name: Cholesterol, dtype: int64

In [7]:
# And finally for Drug column
data.Drug.value_counts()

DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

Therefore we have 5 types of drugs that can be given to respective person based on other variables or columns

### Encoding the data

In [8]:
# Sex Variable
data['Sex'].replace({'F':0,'M':1},inplace = True)
# BP Variable
data['BP'] = data['BP'].astype('category')
data['BP'] = data['BP'].cat.codes
data = pd.get_dummies(data, columns = ['BP'])
# Cholesterol Variable
data['Cholesterol'] = data['Cholesterol'].astype('category')
data['Cholesterol'] = data['Cholesterol'].cat.codes
data = pd.get_dummies(data, columns = ['Cholesterol'])

In [9]:
data.head()

Unnamed: 0,Age,Sex,Na_to_K,Drug,BP_0,BP_1,BP_2,Cholesterol_0,Cholesterol_1
0,23,0,25.355,DrugY,1,0,0,1,0
1,47,1,13.093,drugC,0,1,0,1,0
2,47,1,10.114,drugC,0,1,0,1,0
3,28,0,7.798,drugX,0,0,1,1,0
4,61,0,18.043,DrugY,0,1,0,1,0


In [10]:
# Dropping Redudant columns
data.drop(['BP_0','Cholesterol_0'],axis = 1, inplace = True)

In [11]:
data.head()

Unnamed: 0,Age,Sex,Na_to_K,Drug,BP_1,BP_2,Cholesterol_1
0,23,0,25.355,DrugY,0,0,0
1,47,1,13.093,drugC,1,0,0
2,47,1,10.114,drugC,1,0,0
3,28,0,7.798,drugX,0,1,0
4,61,0,18.043,DrugY,1,0,0


In [12]:
# Defining Target vector and Feature Matrix
# Feature Matrix
X = data.drop('Drug',axis = 1)
# target vector
y = data['Drug']

### Splitting The Data

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [14]:
# Printing shape of each set
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(140, 6)
(140,)
(60, 6)
(60,)


## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
LR = LogisticRegression()
LR.fit(X_train, y_train)

LogisticRegression()

In [17]:
predictions = LR.predict(X_test)
predictions

array(['drugX', 'DrugY', 'drugX', 'drugC', 'DrugY', 'DrugY', 'DrugY',
       'drugX', 'drugA', 'drugX', 'DrugY', 'drugX', 'DrugY', 'DrugY',
       'drugB', 'DrugY', 'drugB', 'drugX', 'drugC', 'DrugY', 'drugB',
       'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'drugC', 'drugX',
       'DrugY', 'drugX', 'DrugY', 'drugC', 'DrugY', 'DrugY', 'drugB',
       'DrugY', 'drugX', 'DrugY', 'DrugY', 'drugA', 'drugX', 'drugX',
       'drugX', 'DrugY', 'DrugY', 'drugC', 'DrugY', 'DrugY', 'drugB',
       'drugX', 'drugX', 'DrugY', 'drugX', 'DrugY', 'drugX', 'DrugY',
       'drugB', 'DrugY', 'DrugY', 'DrugY'], dtype=object)

#### Checking Accuracy using classification_report

In [18]:
from sklearn.metrics import classification_report

In [19]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

       DrugY       0.86      0.96      0.91        26
       drugA       1.00      0.29      0.44         7
       drugB       0.50      1.00      0.67         3
       drugC       1.00      0.83      0.91         6
       drugX       1.00      1.00      1.00        18

    accuracy                           0.88        60
   macro avg       0.87      0.82      0.79        60
weighted avg       0.92      0.88      0.87        60



In [20]:
# Now showing accuracy
from sklearn.metrics import accuracy_score
acc_sc_LR = accuracy_score(y_test, predictions)*100
print('Accuracy of the developed Logistic Regression Model is:', acc_sc_LR,'%')

Accuracy of the developed Logistic Regression Model is: 88.33333333333333 %


## Support Vector Machine(SVM)

In [21]:
# splitting data again for SVM
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, random_state=0)

In [22]:
from sklearn.svm import SVC
sv = SVC()

In [23]:
# fitting the model
sv.fit(X_train1, y_train1)

SVC()

In [24]:
sv_pred = sv.predict(X_test1)

In [25]:
sv_pred

array(['drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'drugX', 'drugX',
       'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY',
       'drugX', 'drugX', 'DrugY', 'DrugY', 'drugX', 'DrugY', 'DrugY',
       'drugX', 'drugX', 'drugX', 'DrugY', 'DrugY', 'DrugY', 'DrugY',
       'DrugY', 'drugX', 'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY',
       'DrugY', 'drugX', 'DrugY', 'drugX', 'drugX', 'DrugY', 'drugX',
       'DrugY', 'drugX', 'DrugY', 'drugX', 'DrugY', 'DrugY', 'drugX',
       'DrugY'], dtype=object)

#### Checking Accuracy of SVM

In [26]:
acc_sc_svm = accuracy_score(y_test1, sv_pred)*100
print('Accuracy of SVM is:', acc_sc_svm,'%')

Accuracy of SVM is: 82.0 %


## Hence Accuracy scores for Logistic Regression And SVM are as follows:

In [27]:
print('Accuracy score for Logistic Regression is: ', acc_sc_LR,'%')
print('Accuracy score for Support Vector Machine(SVM) algorithm is: ',acc_sc_svm,'%')

Accuracy score for Logistic Regression is:  88.33333333333333 %
Accuracy score for Support Vector Machine(SVM) algorithm is:  82.0 %


### As a result we could conclude that Logistic Regression is better as compared to Support Vector Machine Algorithm