# Dataset Cleaning

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive so the dataset file is reachable from this Colab runtime.
# The mount call will prompt for authorization.
from google.colab import drive
drive.mount('/content/drive/')
# Read the drug dataset into a DataFrame and preview its first rows.
data = pd.read_csv('/content/drive/MyDrive/Assigment/drug200.csv')  # Replace with your own path
data.head()

Mounted at /content/drive/


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [None]:
# Dataset dimensions: rows are samples, columns are the features plus the target
num_samples = len(data.index)
num_features = len(data.columns)
print('The number of samples', num_samples)
print('The number of features with target feature', num_features)


The number of samples 200
The number of features with target feature 6


In [None]:
# Summarize the frame: column names (features and target feature),
# non-null count per column, and each column's dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [None]:
# Count the missing (NaN/None) entries in each column
data.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

No null values are present in the dataset.

In [None]:
# Count every cell, across all columns, whose value equals 0
zero_values = (data == 0).sum().sum()

# Report whether at least one zero was found
print(
    "There are zero values in the dataset."
    if zero_values > 0
    else "There are no zero values in the dataset."
)

There are no zero values in the dataset.


No zero values are present in the dataset.

In [None]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence
# and the original index labels, then display the resulting frame
data = data.drop_duplicates(subset=None, keep='first', ignore_index=False)
data


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


No duplicate rows.

In [None]:
# Encode the categorical features as ordinal integers.
#
# The encoded columns are assigned back onto the frame rather than calling
# `data[col].replace(..., inplace=True)`. That form mutates a Series selected
# out of the frame (chained assignment): recent pandas versions warn about it,
# and with Copy-on-Write enabled it leaves `data` itself unchanged.
# `assign` keeps the existing column order and returns a new frame.
data = data.assign(
    Sex=data['Sex'].replace({'F': 2, 'M': 1}),
    BP=data['BP'].replace({'HIGH': 3, 'NORMAL': 2, 'LOW': 1}),
    Cholesterol=data['Cholesterol'].replace({'HIGH': 2, 'NORMAL': 1}),
)
data

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,2,3,2,25.355,DrugY
1,47,1,1,2,13.093,drugC
2,47,1,1,2,10.114,drugC
3,28,2,2,2,7.798,drugX
4,61,2,1,2,18.043,DrugY
...,...,...,...,...,...,...
195,56,2,1,2,11.567,drugC
196,16,1,1,2,12.006,drugC
197,52,1,2,2,9.894,drugX
198,23,1,2,1,14.020,drugX


# Model


In [None]:
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Separate the target column ('Drug') from the predictor columns
y = data['Drug']
X = data.drop(columns='Drug')

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the Gaussian Naive Bayes classifier with its default settings
# (no arguments are passed to the constructor)
NB_classifier = GaussianNB()

In [None]:
# Record the start timestamps (wall-clock, from time.time()) for both intervals.
# Both timers start here, before the classifier is fitted:
#   - "Classification" is stopped right after the fit cell
#   - "Computation" is stopped after the predict cell
# NOTE: the start/stop calls live in separate cells, so the measured intervals
# also include whatever time passes between running those cells.
start_time_Classification = time.time()
start_time_Computation = time.time()

In [None]:
# Train the Naive Bayes classifier on the training split
# (this step falls inside both timed intervals)
NB_classifier.fit(X_train, y_train)

In [None]:
# Record the end timestamp for the "Classification" interval.
# At this point only the fit step has run since the start timestamp.
end_time_Classification = time.time()

In [None]:
# Predict the labels of the held-out test features with the fitted classifier
test_pred= NB_classifier.predict(X_test)

In [None]:
# Record the end timestamp for the "Computation" interval.
# This interval covers both the fit step and the predict step.
end_time_Computation =  time.time()

In [None]:
# Convert the test labels to a NumPy array so they can be compared
# element-wise with the predicted labels
y_test = np.array(y_test)

In [None]:
# Defining the method that calculates the error rate
def Erorr_Rate(y_true=None, y_pred=None):
  """Return the fraction of predictions that differ from the true labels.

  Args:
    y_true: Array-like of ground-truth labels. When omitted, the
      notebook-level `y_test` is used.
    y_pred: Array-like of predicted labels with the same shape as `y_true`.
      When omitted, the notebook-level `test_pred` is used.

  Returns:
    The number of mismatching positions divided by the number of labels.

  Raises:
    ValueError: If the two label arrays do not have the same shape.
  """
  # Fall back to the notebook globals so the existing no-argument call keeps working
  if y_true is None:
    y_true = y_test
  if y_pred is None:
    y_pred = test_pred

  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError(
        f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
    )

  # 1 where the prediction differs from the true label, 0 where they agree
  mismatches = (y_true != y_pred).astype(int)
  # Share of mismatches among all compared labels
  return np.sum(mismatches) / len(mismatches)

# Compute the error rate of the predictions and keep it for the results report
res= Erorr_Rate()

# Evaluation

In [None]:
# Score the predictions against the held-out labels:
# overall accuracy plus the per-class text report
accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)
classification_rep = classification_report(y_true=y_test, y_pred=test_pred)

In [None]:
# Print the headline metrics (rounded to two decimals), then the per-class report
print('Accuracy: ', round(accuracy, 2))
print('Error rate: ', round(res, 2))
print('\nClassification Report:', classification_rep, sep='\n')

Accuracy:  0.92
Error rate:  0.08

Classification Report:
              precision    recall  f1-score   support

       DrugY       1.00      0.80      0.89        15
       drugA       0.86      1.00      0.92         6
       drugB       0.75      1.00      0.86         3
       drugC       0.83      1.00      0.91         5
       drugX       1.00      1.00      1.00        11

    accuracy                           0.93        40
   macro avg       0.89      0.96      0.92        40
weighted avg       0.94      0.93      0.92        40



In [None]:
# Elapsed time in seconds between the recorded start and end timestamps
computational_time = end_time_Computation - start_time_Computation
classification_time = end_time_Classification - start_time_Classification

# Show both durations rounded to four decimal places
print("Classification Time:", round(classification_time, 4), "seconds")
print("Computational Time:", round(computational_time, 4), "seconds")

Classification Time: 0.0261 seconds
Computational Time: 0.0448 seconds
