In [23]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [27]:
def _tag_name(element):
    """Return ``element``'s tag in lower case ('' when the tag is not a string)."""
    tag = element.tag
    return tag.lower() if isinstance(tag, str) else ""


def _clean_text(element):
    """Return ``element.text`` with surrounding whitespace removed ('' if absent)."""
    return (element.text or "").strip()


def _extract_qa_pairs(root):
    """Collect every question/answer pair found under ``root``.

    A pair is any element that has both a ``question`` child and an ``answer``
    child (tag names compared case-insensitively). When no such sibling pair
    exists, fall back to the first question and the first answer found anywhere
    below ``root`` so files with a looser layout still yield one row.
    """
    pairs = []

    for parent in root.iter():
        question = answer = None
        for child in parent:
            tag = _tag_name(child)
            if tag == "question" and question is None:
                question = _clean_text(child)
            elif tag == "answer" and answer is None:
                answer = _clean_text(child)
        # Strip before testing so whitespace-only entries are dropped.
        if question and answer:
            pairs.append({"question": question, "answer": answer})

    if pairs:
        return pairs

    # Fallback: first question / first answer among the descendants of root.
    first_seen = {}
    for element in root.iter():
        if element is root:
            continue
        tag = _tag_name(element)
        if tag in ("question", "answer") and tag not in first_seen:
            first_seen[tag] = _clean_text(element)

    if first_seen.get("question") and first_seen.get("answer"):
        pairs.append(
            {"question": first_seen["question"], "answer": first_seen["answer"]}
        )
    return pairs


def load_medquad(path="MedQuAD/"):
    """Load question/answer pairs from a directory tree of XML files.

    ``path`` is expected to contain sub-folders; every ``*.xml`` file directly
    inside a sub-folder is parsed and all of its question/answer pairs are
    collected. Entries directly under ``path`` that are not directories, and
    files without the ``.xml`` suffix, are ignored.

    Parameters
    ----------
    path : str
        Root directory of the dataset.

    Returns
    -------
    pandas.DataFrame
        Columns ``question`` and ``answer`` (always present, even when no pair
        was found), one row per pair, text stripped of surrounding whitespace.
        Rows are ordered by folder name, then file name, then document order.

    Raises
    ------
    OSError
        If ``path`` itself cannot be listed (e.g. it does not exist).

    Notes
    -----
    Files that cannot be read or parsed are reported with a printed message
    and skipped; loading is deliberately best-effort.
    NOTE(review): the published MedQuAD files appear to use capitalised
    ``<Question>``/``<Answer>`` tags nested in ``<QAPair>`` elements, which is
    why tag matching is case-insensitive and multi-pair aware -- confirm
    against the local copy of the dataset.
    """
    records = []

    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            continue

        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".xml"):
                continue
            file_path = os.path.join(folder_path, filename)

            try:
                root = ET.parse(file_path).getroot()
                records.extend(_extract_qa_pairs(root))
            except Exception as e:  # best-effort: report and keep going
                print(f"Skipping {file_path}: {e}")

    # Explicit columns keep df["question"] valid even for an empty result.
    return pd.DataFrame(records, columns=["question", "answer"])


In [None]:
# Parse the XML corpus, drop incomplete and repeated rows, and persist the
# result so later cells can start from the CSV instead of re-parsing.
df = (
    load_medquad("MedQuAD/")
    .dropna()
    .drop_duplicates()
)
df.to_csv("medquad_qa.csv", index=False)

In [None]:
# Reload the exported Q&A table and clean it again: the CSV round trip can
# introduce missing values, so nulls and duplicates are removed once more and
# the index is renumbered from 0 so positional lookups match row labels.
df = (
    pd.read_csv("medquad_qa.csv")
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
df.head()

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF vectorizer (default settings) on the stored questions and encode
# them in the same call. get_answer() later takes the argmax over similarities
# against X and uses it as a positional index into df, so X must be built from
# df in its current row order.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["question"])

In [None]:
import joblib
import os

# Cache the fitted vectorizer and question matrix on disk.
# - If both artefacts exist they are loaded (replacing the in-memory ones).
# - Otherwise the objects fitted earlier in the notebook are written out, so a
#   fresh "Restart & Run All" no longer fails on missing files.
# SECURITY: joblib.load unpickles arbitrary objects; only load files you
# created yourself or otherwise trust.
_model_dir = "saved_model"
_vectorizer_path = os.path.join(_model_dir, "vectorizer.pkl")
_matrix_path = os.path.join(_model_dir, "X.pkl")

if os.path.exists(_vectorizer_path) and os.path.exists(_matrix_path):
    vectorizer = joblib.load(_vectorizer_path)
    X = joblib.load(_matrix_path)
else:
    os.makedirs(_model_dir, exist_ok=True)
    joblib.dump(vectorizer, _vectorizer_path)
    joblib.dump(X, _matrix_path)

# Rows of X are used as positional indices into df; a stale cache built from a
# different table would silently return wrong answers, so fail loudly instead.
if X.shape[0] != len(df):
    raise ValueError(
        f"Cached matrix has {X.shape[0]} rows but df has {len(df)}; "
        f"delete '{_model_dir}' and re-run to rebuild the cache."
    )

In [None]:
def get_answer(user_question, min_similarity=0.0):
    """Return the stored question/answer most similar to ``user_question``.

    The query is encoded with the module-level ``vectorizer`` and compared
    against every row of the module-level matrix ``X`` using cosine
    similarity; the best-scoring row is looked up positionally in ``df``.

    Parameters
    ----------
    user_question : str
        Free-text question typed by the user.
    min_similarity : float, optional
        The best score must be strictly greater than this value to count as a
        match. The default of 0.0 rejects only queries that share nothing with
        any stored question.

    Returns
    -------
    tuple
        ``(matched_question, answer)`` for a match, or ``(None, message)``
        with an apologetic message when the input is empty / not a string or
        no stored question is similar enough.
    """
    no_match = (
        None,
        "Sorry, I couldn't find a relevant answer. "
        "Please try rephrasing your question.",
    )

    if not isinstance(user_question, str) or not user_question.strip():
        return no_match

    user_vec = vectorizer.transform([user_question])
    similarity = cosine_similarity(user_vec, X).flatten()
    if similarity.size == 0:
        return no_match

    best_match_idx = int(similarity.argmax())
    # When every score is 0, argmax returns index 0; without this guard the
    # first row of df would be presented as the "best match".
    if similarity[best_match_idx] <= min_similarity:
        return no_match

    best_row = df.iloc[best_match_idx]
    return best_row["question"], best_row["answer"]

In [None]:
# Interactive console loop: read a question, print the closest stored Q&A.
# Typing "exit", "quit" or "bye" (any case, surrounding spaces ignored) ends
# the session; so does EOF or interrupting the kernel while waiting for input.
while True:
    try:
        user_q = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        print("\nBot: Thank you! Stay healthy. 👋")
        break

    if not user_q:
        # Nothing typed: prompt again instead of querying the matcher.
        continue

    if user_q.lower() in {"exit", "quit", "bye"}:
        print("Bot: Thank you! Stay healthy. 👋")
        break

    matched_q, response = get_answer(user_q)
    print(f"\nBot (Best Match: {matched_q})\n→ {response}\n")
