In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Load dataset
# Read the raw CSV and report its size and how the target column is distributed
# before any cleaning is applied.
df = pd.read_csv("/content/drugs_side_effects_drugs_com.csv")
raw_drug_names = df['drug_name']
print(f"Initial dataset shape: {df.shape}")
print(f"Initial unique drug names: {raw_drug_names.nunique()}")
print("Initial class distribution:", raw_drug_names.value_counts(), sep="\n")

# Preprocess dataset
# Keep only the text feature and the target column, then drop rows with missing
# values and rows that repeat an identical (side_effects, drug_name) pair.
df_cleaned = df[['side_effects', 'drug_name']].dropna().drop_duplicates()
cleaned_class_count = df_cleaned['drug_name'].nunique()
print(f"Shape after dropna and drop_duplicates: {df_cleaned.shape}")
print(f"Unique drug names after preprocessing: {cleaned_class_count}")
print("Class distribution after preprocessing:", df_cleaned['drug_name'].value_counts(), sep="\n")

# Fail early: training needs at least one row and two distinct target classes.
if df_cleaned.empty:
    raise ValueError("Dataset is empty after preprocessing.")
if cleaned_class_count < 2:
    raise ValueError(f"Dataset has only {cleaned_class_count} unique class(es). Need at least 2.")

# Encode labels
# Turn each drug name into an integer class id. The fitted encoder is kept at
# module level so ids can be mapped back to names later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df_cleaned['drug_name'])
df_cleaned['label'] = encoded_labels
encoded_preview = df_cleaned[['side_effects', 'drug_name', 'label']].head()
print(f"Number of encoded classes: {len(label_encoder.classes_)}")
print(f"Sample of encoded data:\n{encoded_preview}")

# Split dataset
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
features = df_cleaned['side_effects']
targets = df_cleaned['label']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)
train_class_count = len(np.unique(y_train))
test_class_count = len(np.unique(y_test))
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Unique classes in y_train: {train_class_count}")
print(f"Unique classes in y_test: {test_class_count}")

# Feature extraction with TF-IDF
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),  # unigrams and bigrams
    max_features=5000,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# --- Naive Bayes Model ---
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features and
# evaluate it on the held-out split.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
nb_pred = nb_model.predict(X_test_tfidf)
print(f"Naive Bayes Accuracy: {accuracy_score(y_test, nb_pred) * 100:.2f}%")
# Labels that are never predicted, or that never occur in y_test, have an
# undefined precision/recall. zero_division=0 reports those entries as 0.0
# explicitly instead of emitting an UndefinedMetricWarning for each metric.
print(
    "Naive Bayes Classification Report:\n",
    classification_report(y_test, nb_pred, zero_division=0),
)
print("Sample predictions vs actual (Naive Bayes):")
# y_test keeps the original row index, so use .iloc to make it explicit that the
# first five rows are taken by position, matching nb_pred[:5].
print(pd.DataFrame({'Actual': y_test.iloc[:5], 'Predicted': nb_pred[:5]}))

# --- Logistic Regression Model ---
# Fit a logistic regression classifier on the same TF-IDF training features and
# evaluate it on the held-out split.
lr_model = LogisticRegression(max_iter=1000, solver='lbfgs')
lr_model.fit(X_train_tfidf, y_train)
lr_pred = lr_model.predict(X_test_tfidf)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, lr_pred) * 100:.2f}%")
# Labels that are never predicted, or that never occur in y_test, have an
# undefined precision/recall. zero_division=0 reports those entries as 0.0
# explicitly instead of emitting an UndefinedMetricWarning for each metric.
print(
    "Logistic Regression Classification Report:\n",
    classification_report(y_test, lr_pred, zero_division=0),
)
print("Sample predictions vs actual (Logistic Regression):")
# y_test keeps the original row index, so use .iloc to make it explicit that the
# first five rows are taken by position, matching lr_pred[:5].
print(pd.DataFrame({'Actual': y_test.iloc[:5], 'Predicted': lr_pred[:5]}))

# Prediction function
def predict_drug(side_effect: str):
    """Predict the drug name associated with a side-effect description.

    The text is vectorized with the module-level ``vectorizer``, scored by both
    ``nb_model`` and ``lr_model``, and the two probability vectors are averaged.
    Per-model and ensemble results are printed.

    Args:
        side_effect: Free-text description of side effects.

    Returns:
        The drug name with the highest averaged probability.
    """
    tfidf_input = vectorizer.transform([side_effect])
    nb_probs = nb_model.predict_proba(tfidf_input)[0]
    lr_probs = lr_model.predict_proba(tfidf_input)[0]
    # Averaging element-wise relies on both models having the same classes_;
    # they are fitted on the same y_train in this file.
    ensemble_probs = (nb_probs + lr_probs) / 2

    # predict_proba columns are ordered by each model's classes_, which holds
    # only the labels present in the training split. An argmax position is
    # therefore NOT an encoded label and must be mapped through classes_ before
    # it is handed to the label encoder.
    nb_label = nb_model.classes_[np.argmax(nb_probs)]
    lr_label = lr_model.classes_[np.argmax(lr_probs)]
    final_label = nb_model.classes_[np.argmax(ensemble_probs)]
    final_drug = label_encoder.inverse_transform([final_label])[0]

    # Drug names aligned column-for-column with ensemble_probs.
    ensemble_names = label_encoder.inverse_transform(nb_model.classes_)
    top_three = sorted(zip(ensemble_names, ensemble_probs), key=lambda x: x[1], reverse=True)[:3]

    print(f"Input: '{side_effect}'")
    print(f"Naive Bayes Predicted Drug: {label_encoder.inverse_transform([nb_label])[0]} (Prob: {max(nb_probs):.4f})")
    print(f"Logistic Regression Predicted Drug: {label_encoder.inverse_transform([lr_label])[0]} (Prob: {max(lr_probs):.4f})")
    print(f"Ensemble Top Probability: {max(ensemble_probs):.4f}")
    print(f"Top 3 Ensemble Probabilities: {top_three}")
    return final_drug

# Main loop
def main():
    """Run an interactive prompt that predicts a drug from typed side effects.

    Reads lines from standard input until the user types ``quit`` (any case) or
    the input stream ends. Blank lines are rejected with a reminder; any other
    line is passed to ``predict_drug`` and the result is printed.
    """
    print("\nAdvanced Drug Prediction Tool (Using Naive Bayes and Logistic Regression)")
    print("Enter side effects to predict the associated drug (or 'quit' to exit):")
    while True:
        try:
            side_effect_input = input("Side effects: ").strip()
        except EOFError:
            # stdin was closed or exhausted (Ctrl-D, piped input); treat it like
            # 'quit' instead of crashing with a traceback.
            print("\nExiting program.")
            break
        if side_effect_input.lower() == 'quit':
            print("Exiting program.")
            break
        if not side_effect_input:
            print("Please enter some side effects.")
            continue
        predicted_drug = predict_drug(side_effect_input)
        print(f"Final Predicted Drug: {predicted_drug}\n")

# Start the interactive prompt only when this file is executed as the main
# module, not when it is imported.
if __name__ == "__main__":
    main()

Initial dataset shape: (2966, 135)
Initial unique drug names: 2913
Initial class distribution:
drug_name
re provided authorization to proceed for Phase 3 trial for its COVID-19 Vaccine Candidate    34
triamcinolone                                                                                 3
erythromycin                                                                                  2
cromolyn                                                                                      2
mometasone                                                                                    2
                                                                                             ..
Contac Cold + Flu (Night) Cooling Relief Liquid                                               1
Dallergy                                                                                      1
Dallergy Drops                                                                                1
Decorel Forte Plus             

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression Accuracy: 0.00%
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       1.0
           2       0.00      0.00      0.00       1.0
           7       0.00      0.00      0.00       0.0
          24       0.00      0.00      0.00       1.0
          25       0.00      0.00      0.00       1.0
          27       0.00      0.00      0.00       1.0
          28       0.00      0.00      0.00       1.0
          37       0.00      0.00      0.00       1.0
          39       0.00      0.00      0.00       1.0
          53       0.00      0.00      0.00       1.0
          54       0.00      0.00      0.00       0.0
          55       0.00      0.00      0.00       1.0
          57       0.00      0.00      0.00       1.0
          58       0.00      0.00      0.00       0.0
          61       0.00      0.00      0.00       1.0
 