<a href="https://colab.research.google.com/github/Karan-990/classification-model---build-a-model-that-classification-the-side-effects-of-drug/blob/main/classification_model_build_a_model_that_classification_the_side_effects_of_drug.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
#Accuracy and additional metrics (precision, recall, F1-score) are calculated using the function calculate_additional_metrics
def calculate_additional_metrics(model_name, y_true, y_pred):
    precision = metrics.precision_score(y_true, y_pred,average='weighted')
    recall = metrics.recall_score(y_true, y_pred,average='weighted')
    f1_score = metrics.f1_score(y_true, y_pred,average='weighted')

    print(f"{model_name} - Additional Metrics:")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1_score}\n")

In [None]:
ds=pd.read_csv("/content/drug200.csv")

In [None]:
print("Description:\n",ds.describe())

Description:
               Age     Na_to_K
count  200.000000  200.000000
mean    44.315000   16.084485
std     16.544315    7.223956
min     15.000000    6.269000
25%     31.000000   10.445500
50%     45.000000   13.936500
75%     58.000000   19.380000
max     74.000000   38.247000


In [None]:
print("null values:\n",ds.isnull().sum())

null values:
 Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64


In [None]:
print("data:\n",ds)

data:
      Age Sex      BP Cholesterol  Na_to_K   Drug
0     23   F    HIGH        HIGH   25.355  drugY
1     47   M     LOW        HIGH   13.093  drugC
2     47   M     LOW        HIGH   10.114  drugC
3     28   F  NORMAL        HIGH    7.798  drugX
4     61   F     LOW        HIGH   18.043  drugY
..   ...  ..     ...         ...      ...    ...
195   56   F     LOW        HIGH   11.567  drugC
196   16   M     LOW        HIGH   12.006  drugC
197   52   M  NORMAL        HIGH    9.894  drugX
198   23   M  NORMAL      NORMAL   14.020  drugX
199   40   F     LOW      NORMAL   11.349  drugX

[200 rows x 6 columns]


In [None]:
#Label encoding for categorical variables
from sklearn import preprocessing
sex_encoding=preprocessing.LabelEncoder()
ds['Sex'] = sex_encoding.fit_transform(ds['Sex'])
BP_encoding=preprocessing.LabelEncoder()
ds['BP']=BP_encoding.fit_transform(ds['BP'])
Cholesterol_encoding=preprocessing.LabelEncoder()
ds['Cholesterol']=Cholesterol_encoding.fit_transform(ds['Cholesterol'])
#display the preprocessed dataset
print(ds)

     Age  Sex  BP  Cholesterol  Na_to_K   Drug
0     23    0   0            0   25.355  drugY
1     47    1   1            0   13.093  drugC
2     47    1   1            0   10.114  drugC
3     28    0   2            0    7.798  drugX
4     61    0   1            0   18.043  drugY
..   ...  ...  ..          ...      ...    ...
195   56    0   1            0   11.567  drugC
196   16    1   1            0   12.006  drugC
197   52    1   2            0    9.894  drugX
198   23    1   2            1   14.020  drugX
199   40    0   1            1   11.349  drugX

[200 rows x 6 columns]


In [None]:
#Split the data into features(x) and target variable(y)
xcols=[col for col in ds.columns if col not in ['Drug']]
x=ds[xcols]
y=ds['Drug']

In [None]:
#Split the data into traing data and testing data
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=3)

In [None]:
#Train Logistic Regression Model
from sklearn.linear_model import LogisticRegression
logireg= LogisticRegression()
logireg.fit(x_train, y_train)
lr_prediction = logireg.predict(x_test)
print("LogisticRegression's Accuracy: ", metrics.accuracy_score(y_test, lr_prediction))
calculate_additional_metrics('Logistic Regression', y_test, lr_prediction)

LogisticRegression's Accuracy:  0.8833333333333333
Logistic Regression - Additional Metrics:
Precision: 0.8931222943722943
Recall: 0.8833333333333333
F1-Score: 0.8807004429678849



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Train Decision Tree classifier Model
from sklearn.tree import DecisionTreeClassifier
dectree=DecisionTreeClassifier()
dectree.fit(x_train,y_train)
dr_prediction=dectree.predict(x_test)
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, dr_prediction))
calculate_additional_metrics('Decision Tree', y_test, dr_prediction)

DecisionTrees's Accuracy:  0.9833333333333333
Decision Tree - Additional Metrics:
Precision: 0.9840579710144927
Recall: 0.9833333333333333
F1-Score: 0.9833152664859981



In [None]:
#Training Random Forest Model
from sklearn.ensemble import RandomForestClassifier
ranforest = RandomForestClassifier()
ranforest.fit(x_train, y_train)
rf_prediction= ranforest.predict(x_test)
print("Random Forest's Accuracy:", metrics.accuracy_score(y_test, rf_prediction))
calculate_additional_metrics('Random Forest', y_test, rf_prediction)

Random Forest's Accuracy: 0.9833333333333333
Random Forest - Additional Metrics:
Precision: 0.9840579710144927
Recall: 0.9833333333333333
F1-Score: 0.9833152664859981



In [None]:
#Training KNN Model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn_prediction= knn.predict(x_test)
print("KNN's Accuracy:", metrics.accuracy_score(y_test, knn_prediction))
calculate_additional_metrics('KNN', y_test, knn_prediction)

KNN's Accuracy: 0.6333333333333333
KNN - Additional Metrics:
Precision: 0.7005698005698006
Recall: 0.6333333333333333
F1-Score: 0.6444670280036134



In [None]:
#Training SVM model
from sklearn.svm import SVC
svm= SVC()
svm.fit(x_train, y_train)
svm_prediction= svm.predict(x_test)
print("SVM's Accuracy:", metrics.accuracy_score(y_test, svm_prediction))
calculate_additional_metrics('SVM', y_test, svm_prediction)

SVM's Accuracy: 0.7
SVM - Additional Metrics:
Precision: 0.5226666666666666
Recall: 0.7
F1-Score: 0.5932624113475177



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#Training Naive Bayes Model
from sklearn.naive_bayes import GaussianNB
naivebayes = GaussianNB()
naivebayes.fit(x_train, y_train)
nb_prediction= naivebayes.predict(x_test)
print("Naive Bayes' Accuracy:", metrics.accuracy_score(y_test, nb_prediction))
calculate_additional_metrics('Naive Bayes', y_test, nb_prediction)

Naive Bayes' Accuracy: 0.85
Naive Bayes - Additional Metrics:
Precision: 0.8854960317460318
Recall: 0.85
F1-Score: 0.8485663320311815



In [None]:
#New patient data
new_patient_data=pd.DataFrame({'Age':[30],'Sex':'F','BP':['NORMAL'],'Cholesterol':['HIGH'],'Na_to_K':[15.0]})

In [None]:
new_patient_data['Sex'] = sex_encoding.fit_transform(new_patient_data['Sex'])
new_patient_data['BP'] = BP_encoding.fit_transform(new_patient_data['BP'])
new_patient_data['Cholesterol'] = Cholesterol_encoding.fit_transform(new_patient_data['Cholesterol'])
print("New Patient Data:")
print(new_patient_data)

New Patient Data:
   Age  Sex  BP  Cholesterol  Na_to_K
0   30    0   0            0     15.0


In [None]:
#Use the trained model for predict for the new patient
logreg_prediction = logireg.predict(new_patient_data)
dt_prediction=dectree.predict(new_patient_data)
rf_prediction = ranforest.predict(new_patient_data)
knn_prediction = knn.predict(new_patient_data)
svm_prediction = svm.predict(new_patient_data)
nb_prediction = naivebayes.predict(new_patient_data)
print("\nPredictions for the New Patient:")
print("Logistic Regression Prediction:", logreg_prediction[0])
print("Decision Tree Prediction:",dt_prediction[0])
print("Random Forest Prediction:", rf_prediction[0])
print("KNN Prediction:", knn_prediction[0])
print("SVM Prediction:", svm_prediction[0])
print("Naive Bayes Prediction:", nb_prediction[0])



Predictions for the New Patient:
Logistic Regression Prediction: drugA
Decision Tree Prediction: drugY
Random Forest Prediction: drugY
KNN Prediction: drugC
SVM Prediction: drugY
Naive Bayes Prediction: drugA
