In [None]:
# 1. Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

In [None]:
# 2. Load Dataset
df = pd.read_csv("drug200.csv")

# A notebook cell renders only its LAST expression, so a bare `df.head()` in the
# middle of the cell is evaluated and thrown away. Show it explicitly instead
# (`display` is provided by the IPython/Jupyter kernel).
display(df.head())

# info() prints its own summary (column dtypes and non-null counts).
df.info()

# Missing values per column; as the last expression this becomes the cell output.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


Unnamed: 0,0
Age,0
Sex,0
BP,0
Cholesterol,0
Na_to_K,0
Drug,0


In [None]:
# 3. Feature / Target Split
# "Drug" is the label; every remaining column is a predictor.
y = df["Drug"]
X = df.drop(columns=["Drug"])

In [None]:
# 4. Encoding

# Target: turn the drug labels into integer codes. le_drug is kept so the codes
# can be mapped back to label names (le_drug.classes_) in the reports below.
le_drug = LabelEncoder()
le_drug.fit(y)
y_encoded = le_drug.transform(y)

# Features: one-hot encode the three categorical columns, dropping the first
# level of each; the remaining columns pass through unchanged.
X_encoded = pd.get_dummies(X, columns=["Sex", "BP", "Cholesterol"], drop_first=True)

In [None]:
# 5. Train-Test Split
# 30% hold-out, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded,
    stratify=y_encoded,
    test_size=0.3,
    random_state=3,
)

In [None]:
# 6. Scaling for some models
# Fit the scaler on the training split only, then apply it to both splits.
# NOTE(review): none of the cells visible in this notebook read X_train_scaled /
# X_test_scaled (the pipelines below carry their own scaler steps) - confirm
# whether this cell is still needed.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 7. Naive Bayes Models

# 7.1 Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

# One-step pipeline, so grid keys carry the "gnb__" step prefix
pipe_gnb = Pipeline([("gnb", GaussianNB())])

# Candidate smoothing values to search over
param_grid = {"gnb__var_smoothing": [1e-9, 1e-8, 1e-6, 1e-4]}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_gnb = GridSearchCV(pipe_gnb, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_gnb = grid_gnb.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_gnb = best_gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)

print("Gaussian NB Accuracy:", acc_gnb)
print(classification_report(y_test, y_pred_gnb, target_names=le_drug.classes_))

Gaussian NB Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       0.83      1.00      0.91         5
       drugC       0.71      1.00      0.83         5
       drugX       1.00      1.00      1.00        16
       drugY       1.00      0.85      0.92        27

    accuracy                           0.93        60
   macro avg       0.88      0.97      0.92        60
weighted avg       0.95      0.93      0.93        60



In [None]:
# 7.2 Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# Create pipeline.
# MinMaxScaler is placed in front of the classifier so every feature is
# non-negative when it reaches MultinomialNB.
pipe_mnb = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("mnb", MultinomialNB())
])

# Parameter grid (use step name + __)
param_grid = {
    "mnb__alpha": [0.01, 0.1, 1.0, 10]
}

# GridSearchCV: 5-fold CV on accuracy, all cores
grid_mnb = GridSearchCV(
    estimator=pipe_mnb,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Fit on the training split only
grid_mnb.fit(X_train, y_train)

# Best model (refitted on the whole training split by GridSearchCV)
best_mnb = grid_mnb.best_estimator_

# Predict on the hold-out split
y_pred_mnb = best_mnb.predict(X_test)

# Evaluation
acc_mnb = accuracy_score(y_test, y_pred_mnb)
print("Multinomial NB Accuracy:", acc_mnb)
# This model never predicts some of the classes, so their precision is undefined.
# zero_division=0 reports those entries as 0.00 (the same numbers as before)
# without emitting an UndefinedMetricWarning for each averaged metric.
print(classification_report(
    y_test,
    y_pred_mnb,
    target_names=le_drug.classes_,
    zero_division=0
))

Multinomial NB Accuracy: 0.6
              precision    recall  f1-score   support

       drugA       0.00      0.00      0.00         7
       drugB       0.00      0.00      0.00         5
       drugC       0.00      0.00      0.00         5
       drugX       1.00      0.56      0.72        16
       drugY       0.53      1.00      0.69        27

    accuracy                           0.60        60
   macro avg       0.31      0.31      0.28        60
weighted avg       0.50      0.60      0.50        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 7.3 Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB

# Create pipeline.
# MinMaxScaler rescales every feature to [0, 1]; BernoulliNB then thresholds
# each feature into 0/1 using its `binarize` parameter.
pipe_bnb = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("bnb", BernoulliNB())
])

# Parameter grid.
# With the default binarize=0.0, every scaled value above 0 becomes 1, so the
# continuous columns (Age, Na_to_K) collapse to an almost constant feature and
# carry no information. Search the threshold too; 0.0 stays in the grid so the
# original configuration remains a candidate. The one-hot columns are already
# 0/1 and are unaffected by any threshold below 1.
param_grid = {
    "bnb__alpha": [0.01, 0.1, 1.0],
    "bnb__binarize": [0.0, 0.25, 0.5, 0.75]
}

# GridSearchCV: 5-fold CV on accuracy, all cores
grid_bnb = GridSearchCV(
    estimator=pipe_bnb,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Fit on the training split only
grid_bnb.fit(X_train, y_train)

# Best model (refitted on the whole training split by GridSearchCV)
best_bnb = grid_bnb.best_estimator_

# Predict on the hold-out split
y_pred_bnb = best_bnb.predict(X_test)

# Evaluation
acc_bnb = accuracy_score(y_test, y_pred_bnb)
print("Bernoulli NB Accuracy:", acc_bnb)
# zero_division=0: classes that are never predicted get precision 0.00 without
# an UndefinedMetricWarning being raised.
print(classification_report(
    y_test,
    y_pred_bnb,
    target_names=le_drug.classes_,
    zero_division=0
))

Bernoulli NB Accuracy: 0.43333333333333335
              precision    recall  f1-score   support

       drugA       0.00      0.00      0.00         7
       drugB       0.00      0.00      0.00         5
       drugC       0.45      1.00      0.62         5
       drugX       0.50      0.44      0.47        16
       drugY       0.40      0.52      0.45        27

    accuracy                           0.43        60
   macro avg       0.27      0.39      0.31        60
weighted avg       0.35      0.43      0.38        60



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# 8. Logistic Regression + Feature Importance
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit the classifier (iteration cap raised to 500)
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=500)),
])

# Search over regularisation strength and solver
param_grid = {
    "lr__C": [0.01, 0.1, 1, 10],
    "lr__solver": ["lbfgs", "newton-cg"],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_lr = GridSearchCV(pipe_lr, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_lr = grid_lr.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_lr = best_lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

print("Logistic Regression Accuracy:", acc_lr)
print(classification_report(y_test, y_pred_lr, target_names=le_drug.classes_))

Logistic Regression Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       0.83      1.00      0.91         5
       drugC       1.00      1.00      1.00         5
       drugX       1.00      0.88      0.93        16
       drugY       0.93      0.93      0.93        27

    accuracy                           0.93        60
   macro avg       0.93      0.96      0.94        60
weighted avg       0.94      0.93      0.93        60



In [None]:
# Feature Importance (Logistic Regression)
# coef_ has one row of coefficients per class. Averaging the SIGNED values over
# the classes cancels out: the previous output was ~1e-15 for every feature, so
# the ranking was numerical noise. Average the absolute coefficients instead,
# so a feature that pushes strongly towards or away from any class ranks high.
# The pipeline standardises the features before the classifier, so the
# coefficient magnitudes are on a comparable scale.
importance_lr = pd.DataFrame(
    abs(best_lr.named_steps["lr"].coef_).mean(axis=0),
    index=X_encoded.columns,
    columns=["Importance"]
).sort_values(by="Importance", ascending=False)

importance_lr.head(10)

Unnamed: 0,Importance
Sex_M,1.576517e-15
Cholesterol_NORMAL,4.32987e-16
BP_LOW,8.604228000000001e-17
BP_NORMAL,-8.437695e-16
Age,-8.548717e-16
Na_to_K,-1.065814e-15


In [None]:
# 9. Decision Tree + Feature Importance
from sklearn.tree import DecisionTreeClassifier

# One-step pipeline (no scaler step); the tree is seeded for repeatability
pipe_dt = Pipeline([("dt", DecisionTreeClassifier(random_state=3))])

# Search over split criterion and tree-size limits
param_grid = {
    "dt__criterion": ["gini", "entropy"],
    "dt__max_depth": [None, 3, 5, 7],
    "dt__min_samples_split": [2, 5],
    "dt__min_samples_leaf": [1, 2],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_dt = GridSearchCV(pipe_dt, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_dt = grid_dt.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_dt = best_dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

print("Decision Tree Accuracy:", acc_dt)
print(classification_report(y_test, y_pred_dt, target_names=le_drug.classes_))

Decision Tree Accuracy: 1.0
              precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         7
       drugB       1.00      1.00      1.00         5
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        16
       drugY       1.00      1.00      1.00        27

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



In [None]:
# Feature Importance (Decision Tree)
# One importance value per encoded feature, taken from the fitted tree inside
# the pipeline and ranked from most to least important.
importance_dt = pd.DataFrame(
    {"Importance": best_dt.named_steps["dt"].feature_importances_},
    index=X_encoded.columns,
)
importance_dt = importance_dt.sort_values("Importance", ascending=False)

importance_dt.head(10)


Unnamed: 0,Importance
Na_to_K,0.479234
BP_LOW,0.137262
Age,0.134581
Cholesterol_NORMAL,0.127179
BP_NORMAL,0.121744
Sex_M,0.0


In [None]:
# 10. Random Forest + Feature Importance
from sklearn.ensemble import RandomForestClassifier

# One-step pipeline (no scaler step); the forest is seeded for repeatability
pipe_rf = Pipeline([("rf", RandomForestClassifier(random_state=3))])

# Search over forest size and per-tree size limits
param_grid = {
    "rf__n_estimators": [50, 100],
    "rf__max_depth": [None, 5, 10],
    "rf__min_samples_split": [2, 5],
    "rf__min_samples_leaf": [1, 2],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_rf = GridSearchCV(pipe_rf, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_rf = grid_rf.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_rf = best_rf.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)

print("Random Forest Accuracy:", acc_rf)
print(classification_report(y_test, y_pred_rf, target_names=le_drug.classes_))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         7
       drugB       1.00      1.00      1.00         5
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        16
       drugY       1.00      1.00      1.00        27

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



In [None]:
# Feature Importance (Random Forest)
# One importance value per encoded feature, taken from the fitted forest inside
# the pipeline and ranked from most to least important.
importance_rf = pd.DataFrame(
    {"Importance": best_rf.named_steps["rf"].feature_importances_},
    index=X_encoded.columns,
)
importance_rf = importance_rf.sort_values("Importance", ascending=False)

importance_rf.head(10)


Unnamed: 0,Importance
Na_to_K,0.548753
Age,0.141014
BP_NORMAL,0.121747
BP_LOW,0.105589
Cholesterol_NORMAL,0.066474
Sex_M,0.016423


In [None]:
# 11. KNN (DISTANCE-BASED)
from sklearn.neighbors import KNeighborsClassifier

# Standardise the features before the distance-based classifier
pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier()),
])

# Search over neighbourhood size, vote weighting and distance metric
param_grid = {
    "knn__n_neighbors": [3, 5, 7, 9],
    "knn__weights": ["uniform", "distance"],
    "knn__metric": ["euclidean", "manhattan"],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_knn = GridSearchCV(pipe_knn, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_knn = grid_knn.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_knn = best_knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

print("KNN Accuracy:", acc_knn)
print(classification_report(y_test, y_pred_knn, target_names=le_drug.classes_))

KNN Accuracy: 0.9
              precision    recall  f1-score   support

       drugA       0.60      0.86      0.71         7
       drugB       1.00      0.60      0.75         5
       drugC       1.00      1.00      1.00         5
       drugX       0.94      1.00      0.97        16
       drugY       0.96      0.89      0.92        27

    accuracy                           0.90        60
   macro avg       0.90      0.87      0.87        60
weighted avg       0.92      0.90      0.90        60



In [None]:
# 12. Support Vector Classifier (SVC)
from sklearn.svm import SVC

# Standardise the features, then fit the support vector classifier
pipe_svc = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC()),
])

# Search over penalty strength, kernel and kernel coefficient
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__kernel": ["linear", "rbf"],
    "svc__gamma": ["scale", "auto"],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_svc = GridSearchCV(pipe_svc, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_svc = grid_svc.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_svc = best_svc.predict(X_test)
acc_svc = accuracy_score(y_test, y_pred_svc)

print("Support Vector Accuracy:", acc_svc)
print(classification_report(y_test, y_pred_svc, target_names=le_drug.classes_))

Support Vector Accuracy: 0.95
              precision    recall  f1-score   support

       drugA       0.88      1.00      0.93         7
       drugB       1.00      0.80      0.89         5
       drugC       1.00      1.00      1.00         5
       drugX       1.00      0.94      0.97        16
       drugY       0.93      0.96      0.95        27

    accuracy                           0.95        60
   macro avg       0.96      0.94      0.95        60
weighted avg       0.95      0.95      0.95        60



In [None]:
# 13. XGBoost + Feature Importance
from xgboost import XGBClassifier

# One-step pipeline; the class count is taken from the fitted label encoder
pipe_xgb = Pipeline([
    ("xgb", XGBClassifier(
        objective="multi:softprob",
        num_class=len(le_drug.classes_),
        eval_metric="mlogloss",
        random_state=3,
    )),
])

# Search over number of boosting rounds, tree depth and learning rate
param_grid = {
    "xgb__n_estimators": [50, 100],
    "xgb__max_depth": [3, 5],
    "xgb__learning_rate": [0.05, 0.1],
}

# 5-fold cross-validated search, scored on accuracy, using all cores
grid_xgb = GridSearchCV(pipe_xgb, param_grid, scoring="accuracy", cv=5, n_jobs=-1)

# Run the search on the training split and keep the refitted winner
best_xgb = grid_xgb.fit(X_train, y_train).best_estimator_

# Score the winner on the hold-out split
y_pred_xgb = best_xgb.predict(X_test)
acc_xgb = accuracy_score(y_test, y_pred_xgb)

print("XGBoost Accuracy:", acc_xgb)
print(classification_report(y_test, y_pred_xgb, target_names=le_drug.classes_))

XGBoost Accuracy: 1.0
              precision    recall  f1-score   support

       drugA       1.00      1.00      1.00         7
       drugB       1.00      1.00      1.00         5
       drugC       1.00      1.00      1.00         5
       drugX       1.00      1.00      1.00        16
       drugY       1.00      1.00      1.00        27

    accuracy                           1.00        60
   macro avg       1.00      1.00      1.00        60
weighted avg       1.00      1.00      1.00        60



In [None]:
# Feature Importance (XGBoost)
# One importance value per encoded feature, taken from the fitted booster
# inside the pipeline and ranked from most to least important.
importance_xgb = pd.DataFrame(
    {"Importance": best_xgb.named_steps["xgb"].feature_importances_},
    index=X_encoded.columns,
)
importance_xgb = importance_xgb.sort_values("Importance", ascending=False)

importance_xgb.head(10)

Unnamed: 0,Importance
Na_to_K,0.309984
Cholesterol_NORMAL,0.213886
BP_LOW,0.203112
BP_NORMAL,0.191361
Age,0.080305
Sex_M,0.001353


In [None]:
# 14. Final Model Comparison
# Hold-out accuracy of each tuned model, keyed by display name.
final_results = {
    "Gaussian NB": acc_gnb,
    "Multinomial NB": acc_mnb,
    "Bernoulli NB": acc_bnb,
    "Logistic Regression": acc_lr,
    "Decision Tree": acc_dt,
    "Random Forest": acc_rf,
    "KNN": acc_knn,
    "SVC": acc_svc,
    "XGBoost": acc_xgb,
}

# One row per model, best accuracy first; as the last expression this table
# becomes the cell output.
pd.DataFrame(
    list(final_results.values()),
    index=list(final_results.keys()),
    columns=["Test Accuracy"],
).sort_values("Test Accuracy", ascending=False)

Unnamed: 0,Test Accuracy
Decision Tree,1.0
XGBoost,1.0
Random Forest,1.0
SVC,0.95
Gaussian NB,0.933333
Logistic Regression,0.933333
KNN,0.9
Multinomial NB,0.6
Bernoulli NB,0.433333
