In [19]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [21]:
def load_medquad(path="MedQuAD/"):
    all_data = []
    
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            continue
        
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if not filename.endswith(".xml"):
                continue
            
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                
                question = root.findtext(".//question")
                answer = root.findtext(".//answer")
                
                if question and answer:
                    all_data.append({"question": question.strip(), "answer": answer.strip()})
            except Exception as e:
                print(f"Skipping {file_path}: {e}")
    
    return pd.DataFrame(all_data)


In [23]:
df = load_medquad("MedQuAD/")
df = df.dropna().drop_duplicates()
df.to_csv("medquad_qa.csv", index=False)

In [24]:
df = pd.read_csv("medquad_qa.csv")
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,question,answer
0,what is holmes-adie syndrome ?,Holmes-Adie syndrome (HAS) is a neurological d...
1,what is psychogenic movement?,Psychogenic movement is an unwanted muscle mov...
2,what are lipid storage diseases?,Lipid storage diseases are a group of inherite...
3,what is amyotrophic lateral sclerosis (als)?,"Amyotrophic lateral sclerosis (ALS), sometimes..."


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["question"])

In [26]:
import joblib
import os

vectorizer = joblib.load("saved_model/vectorizer.pkl")
X = joblib.load("saved_model/X.pkl")

In [27]:
def get_answer(user_question):
    user_vec = vectorizer.transform([user_question])
    
    similarity = cosine_similarity(user_vec, X).flatten()
    
    best_match_idx = similarity.argmax()
    
    best_question = df.iloc[best_match_idx]["question"]
    best_answer = df.iloc[best_match_idx]["answer"]
    
    return best_question, best_answer

In [None]:
while True:
    user_q = input("You: ")
    if user_q.lower() in ["exit", "quit", "bye"]:
        print("Bot: Thank you! Stay healthy. 👋")
        break
    
    matched_q, response = get_answer(user_q)
    print(f"\nBot (Best Match: {matched_q})\n→ {response}\n")

You:  what is holmes-adie syndrome ?



Bot (Best Match: what is holmes-adie syndrome ?)
→ Holmes-Adie syndrome (HAS) is a neurological disorder affecting the pupil of the eye and the autonomic nervous system.  It is characterized by one eye with a pupil that is larger than normal and constricts slowly in bright light  (tonic pupil), along with the absence of deep tendon reflexes, usually in the Achilles tendon.   HAS is thought to be the result of a neurotrophic (acting on neurons, or nerve cells) viral infection that causes inflammation and damage to neurons in the ciliary ganglion, an area of the brain that controls eye movements, and the dorsal root ganglion, an area of the spinal cord involved in the response of the autonomic nervous system.  HAS begins gradually in one eye, and often progresses to involve the other eye.  At first, it may only cause the loss of deep tendon reflexes on one side of the body, but then progress to the other side.  The eye and reflex symptoms may not appear at the same time.  People with HA