## Work Done By: Mohammad Al-Refaie

## Importing Libraries

In [None]:
import os
import tkinter as tk
from tkinter import scrolledtext, ttk
from typing import Optional, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document as LangChainDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pymongo import MongoClient
from pymongo import ReplaceOne
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Dataset
Link: https://www.kaggle.com/datasets/chaitanyakck/medical-text?resource=download

In [2]:
# Read the whole training file into a single string.
# The encoding is pinned so decoding does not depend on the platform default.
# NOTE(review): assumes the Kaggle file is ASCII/UTF-8 — confirm.
with open(r"C:\Users\Admin\Desktop\Projects\Q&A Chatbot using LangChain & Hugging Face\Medical Text\train.txt", encoding="utf-8") as f:
    data = f.read()

In [3]:
# A snippet of the data (first 100 characters)
data[:100]

'4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardia'

## LangChain

In [4]:
# Wrap the raw text in a single LangChain document so it can be split below.
raw_database = LangChainDocument(page_content=data)

In [5]:
# Separators handed to the recursive character splitter, listed from the
# coarsest (markdown headings, code fences, horizontal rules) down to single
# characters and finally the empty string.
# NOTE(review): several entries look like regex patterns, but the splitter is
# created without is_separator_regex=True — presumably they are matched as
# literal text; confirm against the langchain_text_splitters documentation.
markdown_separators = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

In [6]:
# Chunking policy: chunk_size=1000 with an overlap of 100 between neighbouring
# chunks so that context is not cut off at a chunk boundary.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=markdown_separators,
)

In [7]:
# Split the single raw document into chunks and display the first chunk.
processed_data = splitter.split_documents([raw_database])
processed_data[0]

Document(metadata={}, page_content='4\tCatheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). Cardiogenic shock was present in eight patients with 

In [8]:
# Character length of the first chunk's text:
len(processed_data[0].page_content)

994

In [9]:
# Total number of chunks produced by the splitter:
len(processed_data)

25185

## Vectorizing the dataset

### all-mpnet-base-v2
**It maps sentences and paragraphs to a 768-dimensional dense vector space and can be used for tasks like clustering or semantic search.**

**See the Hugging Face documentation for more information about the model: https://huggingface.co/sentence-transformers/all-mpnet-base-v2**

In [10]:
# Name of the sentence-embedding model passed to HuggingFaceEmbeddings below.
model_name = "all-mpnet-base-v2"

In [11]:
# Embedding model used both for indexing the chunks and for embedding queries.
# The encode option was previously misspelled ('normalize_embedings'), so the
# request for unit-length vectors was never recognised. The retrieval code
# ranks chunks with a plain dot product, which only equals cosine similarity
# when the vectors are normalized.
# NOTE(review): some langchain versions may not forward encode_kwargs when
# multi_process=True — confirm for the installed version.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True},  # unit-length vectors for cosine similarity
)

  embedding_model = HuggingFaceEmbeddings(


In [12]:
# Smoke test: embed a short sentence and display the resulting vector.
embedding_model.embed_query('Hello, this is a test')

[0.00382057623937726,
 -0.07617252320051193,
 -0.03142876923084259,
 0.013323704712092876,
 -0.023790976032614708,
 0.016874266788363457,
 0.06717122346162796,
 0.05855962634086609,
 0.08362171053886414,
 0.023424580693244934,
 0.07058099657297134,
 -0.0348392091691494,
 0.019853297621011734,
 -0.01704786904156208,
 0.042018238455057144,
 -0.0757012739777565,
 0.04179135710000992,
 -0.0007359701558016241,
 -0.047719571739435196,
 0.024783087894320488,
 -0.023707788437604904,
 -0.0005718622705899179,
 -0.023185059428215027,
 0.0031419312581419945,
 0.0008909982861950994,
 0.031143000349402428,
 -0.009855793789029121,
 -0.015642162412405014,
 0.020381219685077667,
 -0.04523126780986786,
 0.013436519540846348,
 -0.0018972683465108275,
 0.006475826259702444,
 -0.041207991540431976,
 1.8592058950162027e-06,
 -0.015370253473520279,
 0.014024169184267521,
 -0.0024404614232480526,
 -0.055930040776729584,
 -0.02611120417714119,
 0.005546136759221554,
 0.044159553945064545,
 -0.01061432808637619

### Connect to MongoDB

In [None]:
# Connection settings are read from the environment so that real credentials
# never have to be pasted into (and saved with) the notebook. When a variable
# is not set, the original placeholder string is used unchanged.
client = MongoClient(os.environ.get("MONGODB_URI", "Your MongoDB URL"))
db = client[os.environ.get("MONGODB_DB", "Your DB name")]
collection = db[os.environ.get("MONGODB_COLLECTION", "Your collection name")]

**25,185 (documents) ÷ 400 (batch size) ≈ 63 total batches**

In [16]:
batch_size = 400  # Adjust depending on GPU memory
# Ceiling division: a trailing partial batch still counts as one batch.
total_batches = -(-len(processed_data) // batch_size)

# Embed the chunks batch by batch and upsert each one under a stable "vec<i>"
# id, so re-running the cell replaces documents instead of duplicating them.
with tqdm(total=total_batches, ncols=100, unit="batch") as pbar:
    batch_starts = range(0, len(processed_data), batch_size)
    for batch_idx, start_idx in enumerate(batch_starts):
        batch = processed_data[start_idx:start_idx + batch_size]

        # One embedding call per batch
        texts = [entry.page_content for entry in batch]
        vectors = embedding_model.embed_documents(texts)

        # One upsert per chunk; the global chunk index forms the document id.
        operations = [
            ReplaceOne(
                {"_id": f"vec{start_idx + offset}"},
                {
                    "_id": f"vec{start_idx + offset}",
                    "values": vector,
                    "metadata": {"text": batch[offset].page_content},  # original text
                },
                upsert=True,
            )
            for offset, vector in enumerate(vectors)
        ]

        if operations:
            collection.bulk_write(operations)

        # Advance the bar and show the completed percentage next to it
        pbar.update(1)
        pbar.set_postfix_str(f"{(batch_idx + 1) / total_batches * 100:.2f}% completed")

100%|█████████████████████████████████████████| 63/63 [11:21<00:00, 10.81s/batch, 100.00% completed]


**Note ⚠️: To clear your MongoDB collection, run `collection.delete_many({})`. ⚠️ This deletes every document in the collection. ⚠️**

In [14]:
# Count how many documents are stored
print(collection.count_documents({}))  # should equal len(processed_data) if every chunk was stored

25185


In [15]:
# Print the first 3 documents
for doc in collection.find().limit(3):
    print(doc, '\n')

{'_id': 'vec0', 'values': [0.0014613830717280507, -0.07627732306718826, 0.015294569544494152, 0.014604976400732994, -0.05865821987390518, 0.03732427582144737, -0.01966121606528759, 0.01141977496445179, 0.004187275655567646, 0.014595599845051765, -0.0019418258452787995, 0.0013819243758916855, 0.04894949868321419, 0.003551694331690669, -0.011567838490009308, -0.019318467006087303, -0.03900465741753578, -0.020103055983781815, 0.014752398245036602, -0.03328833729028702, -0.019458811730146408, 0.02032020315527916, -0.0036714966408908367, 0.00833289884030819, 0.03545732423663139, -0.04004736244678497, 0.016626935452222824, 0.01409732736647129, 0.008214615285396576, -0.006772086955606937, -0.030154375359416008, 0.03509107232093811, -0.016685323789715767, -0.07667471468448639, 3.178029828632134e-06, 0.013768774457275867, 0.022011062130331993, -0.0054788896813988686, -0.0018797559896484017, -0.032587677240371704, -0.04771125316619873, -0.002290475182235241, 0.058861829340457916, 0.0230133160948

In [16]:
# Print the first document text
for tex in collection.find().limit(1):
    print(tex["metadata"]["text"], "\n")

4	Catheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries. The groups of patients were similar for age (left anterior descending coronary artery, 59 years; right coronary artery, 58 years; circumflex coronary artery, 62 years), patients with multivessel disease (left anterior descending coronary artery, 55%; right coronary artery, 55%; circumflex coronary artery, 64%), and patients with initial grade 0/1 antegrade flow (left anterior descending coronary artery, 79%; right coronary artery, 84%; circumflex coronary artery, 90%). Cardiogenic shock was present in eight patients with infarction of the left anterior 



## Loading The LLM

**Zephyr is a series of language models that are trained to act as helpful assistants.**

**Model description:**

- Model type: A 7B-parameter GPT-like model fine-tuned on a mix of publicly available, synthetic datasets.

- Language (NLP): Primarily English

- License: MIT

- Fine-tuned from model: mistralai/Mistral-7B-v0.1

**For more info, visit: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta**

In [17]:
# Hugging Face Hub id used to load both the causal LM and its tokenizer.
hf_model_name = 'HuggingFaceH4/zephyr-7b-beta'

In [18]:
# Quantization settings: 4-bit loading, NF4 quant type, double quantization
# enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized causal LM, then the tokenizer from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    hf_model_name,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [19]:
# Text-generation pipeline around the quantized model: sampling is enabled at
# temperature 0.2 and each call may generate at most 500 new tokens.
llm_model = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    do_sample=True,
    temperature=0.2,
)

Device set to use cuda:0


In [20]:
# Test if everything works
# Smoke test: call the pipeline with a bare string (no chat template).
llm_model('Hi, how are you')

[{'generated_text': 'Hi, how are you?\n\nI’m good, thanks. I’m excited to share some news with you today.\n\nI’ve been working on a new project for a while now, and I’m thrilled to finally be able to tell you about it. It’s called [Project Name], and it’s a [brief description of the project].\n\nI’m really passionate about this project because [explain why you’re passionate about it]. I believe that [state the problem or need that your project addresses]. By [explain how your project solves the problem or meets the need], we can make a real difference in [state the impact your project will have].\n\nHere’s what we’ve accomplished so far:\n\n- [List some of the major milestones or achievements of the project]\n- [List any notable partnerships or collaborations]\n- [List any awards or recognition the project has received]\n\nBut we still have a long way to go. Here’s where you come in:\n\n[Explain how the recipient can get involved or support the project. This could be through volunteeri

**Without a prompt template, the model's responses can be quite random, as the output above shows.**

## Prompting the Model

In [21]:
# Prompt template in Zephyr's chat format. It has two positional placeholders:
# the retrieved context first, then the user's question.
# The system and user turns are each terminated with the end-of-sequence
# marker "</s>", as the Zephyr chat format expects; the template previously
# left both turns unterminated.
prompt = """
<|system|>
You are a helpful assistant that answers on medical questions based on the real information provided from different sources and in the context.
Give the rational and well written response. If you don't have proper info in the context, answer exactly like the delimited by triple single quotes '''I don't have the information needed to answer that accurately.'''
And Respond only to the question asked.</s>
<|user|>
Context:
{}
---
Here is the question you need to answer.

Question: {}</s>
<|assistant|>
"""

## RAG

In [24]:
user_input = input("User: ")

# Embed user input
vectorized_input = embedding_model.embed_query(user_input)

# Retrieve all vectors from MongoDB (only the fields needed for ranking)
cursor = collection.find({}, {"values": 1, "metadata.text": 1})

best_doc = None
best_similarity = float("-inf")  # any real score beats the starting value

# Score every stored chunk against the query and keep the best one.
# The score is a dot product, which equals cosine similarity only when both
# vectors are unit-length — TODO confirm the embeddings are normalized.
for doc in cursor:
    stored_vec = np.array(doc["values"])
    sim = np.dot(vectorized_input, stored_vec)

    if sim > best_similarity:
        best_similarity = sim
        best_doc = doc

if best_doc is None:
    # Empty collection: there is no context to ground an answer on, so report
    # that instead of failing on best_doc["metadata"].
    final_answer = "I couldn't find any relevant medical information in my database."
else:
    # Extract best context
    context_text = best_doc["metadata"]["text"]

    # Build prompt
    final_prompt = prompt.format(context_text, user_input)

    # Generate answer
    answer = llm_model(final_prompt)

    # The generated text may include the prompt itself, so keep only what
    # follows the last <|assistant|> tag.
    full_output = answer[0]["generated_text"].strip()
    if "<|assistant|>" in full_output:
        final_answer = full_output.split("<|assistant|>")[-1].strip()
    else:
        final_answer = full_output

print(final_answer)

Inflammatory pseudotumor, also known as inflammatory mass-like lesion, is a rare condition that can affect various organs, including the liver. It is not a true tumor, but rather a collection of inflammatory cells that can mimic the appearance of a malignant tumor. The exact cause of inflammatory pseudotumor is unknown, but it is believed to be associated with chronic inflammation or infection. The symptoms of inflammatory pseudotumor can vary depending on the location of the lesion, but may include pain, fever, weight loss, and fatigue. Diagnosis is typically made through imaging studies and biopsy, and treatment may involve conservative management, such as antibiotics or anti-inflammatory medications, as the lesion is benign and can regress spontaneously. However, surgical intervention may be necessary in some cases to confirm the diagnosis or to manage complications. Overall, inflammatory pseudotumor is a rare and complex condition that requires a multidisciplinary approach for accu

## Chatbot GUI

In [None]:
class ModernChatGUI:
    """Tkinter chat window that answers questions with the notebook's RAG objects.

    The window consists of a header, a scrollable message area and an input
    bar. Answering a question uses the notebook-level names
    ``embedding_model``, ``collection``, ``prompt`` and ``llm_model``, which
    must be defined before a question is asked.
    """

    def __init__(self, root):
        """Keep a reference to the root window and build the interface.

        Args:
            root: The Tk window that hosts the chat UI.
        """
        self.root = root
        self.setup_gui()

    def setup_gui(self):
        """Configure the root window and create the header, chat and input areas."""
        # Modern color palette
        self.colors = {
            "bg_dark": "#0f172a",    # Dark blue background
            "bg_light": "#1e293b",   # Lighter blue
            "user_bubble": "#3b82f6", # Blue bubble (User)
            "ai_bubble": "#374151",   # Gray bubble (AI)
            "user_text": "#ffffff",
            "ai_text": "#f3f4f6",
            "input_bg": "#1f2937",
            "button_bg": "#10b981",   # Green
            "button_hover": "#0da271",
            "header_bg": "#111827",
            "chat_bg": "#1e293b"
        }

        # Configure root window
        self.root.title("MedAI Assistant")
        self.root.geometry("1200x800")
        self.root.configure(bg=self.colors["bg_dark"])
        self.root.minsize(1000, 700)

        # Create main container
        self.main_container = tk.Frame(self.root, bg=self.colors["bg_dark"])
        self.main_container.pack(fill="both", expand=True)

        self.create_header()
        self.create_chat_area()
        self.create_input_area()

        # Add welcome message shortly after the window has been laid out
        self.root.after(100, self.add_welcome_message)

    def create_header(self):
        """Create the fixed-height title bar with the icon and application name."""
        header_frame = tk.Frame(self.main_container, bg=self.colors["header_bg"], height=70)
        header_frame.pack(fill="x", side="top")
        # Keep the requested height instead of shrinking to the children.
        header_frame.pack_propagate(False)

        header_content = tk.Frame(header_frame, bg=self.colors["header_bg"])
        header_content.place(relx=0.5, rely=0.5, anchor="center")

        icon_label = tk.Label(header_content, text="⚕️", font=("Arial", 24),
                             bg=self.colors["header_bg"], fg="#10b981")
        icon_label.pack(side="left", padx=(0, 10))

        title_label = tk.Label(header_content, text="MedAI Assistant",
                              font=("Segoe UI", 20, "bold"),
                              bg=self.colors["header_bg"], fg="white")
        title_label.pack(side="left")

    def create_chat_area(self):
        """Create the scrollable message area (a canvas hosting ``chat_frame``)."""
        chat_outer_frame = tk.Frame(self.main_container, bg=self.colors["bg_dark"])
        chat_outer_frame.pack(fill="both", expand=True, padx=0, pady=0)

        self.chat_canvas = tk.Canvas(chat_outer_frame, bg=self.colors["chat_bg"],
                                    highlightthickness=0)
        self.scrollbar = ttk.Scrollbar(chat_outer_frame, orient="vertical",
                                     command=self.chat_canvas.yview)

        # Messages are packed into this frame, which lives inside the canvas.
        self.chat_frame = tk.Frame(self.chat_canvas, bg=self.colors["chat_bg"])

        self.chat_window_id = self.chat_canvas.create_window(
            (0, 0),
            window=self.chat_frame,
            anchor="nw"
        )

        self.chat_canvas.configure(yscrollcommand=self.scrollbar.set)

        self.scrollbar.pack(side="right", fill="y")
        self.chat_canvas.pack(side="left", fill="both", expand=True, padx=0, pady=0)

        def on_frame_configure(event):
            # Content size changed: refresh the scrollable region.
            self.chat_canvas.configure(scrollregion=self.chat_canvas.bbox("all"))

        def on_canvas_configure(event):
            # Canvas resized: keep the message frame as wide as the canvas.
            self.chat_canvas.itemconfig(self.chat_window_id, width=event.width)

        self.chat_frame.bind("<Configure>", on_frame_configure)
        self.chat_canvas.bind("<Configure>", on_canvas_configure)

        def on_mousewheel(event):
            # Wheel reporting differs by platform: Windows sends multiples of
            # 120 in event.delta, macOS sends small integers, and X11 sends
            # button 4/5 presses. Dividing by 120 alone rounds small deltas to
            # zero, so they are mapped to a single scroll unit instead.
            button = getattr(event, "num", None)
            delta = getattr(event, "delta", 0) or 0
            if button == 4:
                step = -1
            elif button == 5:
                step = 1
            elif abs(delta) >= 120:
                step = int(-1 * (delta / 120))
            elif delta:
                step = -1 if delta > 0 else 1
            else:
                step = 0
            if step:
                self.chat_canvas.yview_scroll(step, "units")
            return "break"

        self.chat_canvas.bind_all("<MouseWheel>", on_mousewheel)
        self.chat_frame.bind("<MouseWheel>", on_mousewheel)
        # Only bind the button events on X11; elsewhere buttons 4/5 can be
        # ordinary extra mouse buttons.
        if self.chat_canvas.tk.call("tk", "windowingsystem") == "x11":
            self.chat_canvas.bind_all("<Button-4>", on_mousewheel)
            self.chat_canvas.bind_all("<Button-5>", on_mousewheel)

    def create_input_area(self):
        """Create the bottom bar with the multi-line question box and the Ask button."""
        input_container = tk.Frame(self.main_container, bg=self.colors["input_bg"], height=120)
        input_container.pack(fill="x", side="bottom")
        input_container.pack_propagate(False)

        input_frame = tk.Frame(input_container, bg=self.colors["input_bg"])
        input_frame.pack(fill="both", padx=20, pady=15)

        self.question_entry = scrolledtext.ScrolledText(input_frame, height=3,
                                                       font=("Segoe UI", 11),
                                                       wrap=tk.WORD,
                                                       bg="#2d3748",
                                                       fg="white",
                                                       insertbackground="white",
                                                       relief="flat",
                                                       borderwidth=1,
                                                       padx=15, pady=12)
        self.question_entry.pack(side="left", fill="both", expand=True)
        # Enter submits the question; Shift+Enter falls through to the default
        # text-widget behaviour.
        self.question_entry.bind("<Return>", self.on_enter_pressed)
        self.question_entry.bind("<Shift-Return>", lambda e: None)

        self.ask_button = tk.Button(input_frame, text="Ask AI",
                                   font=("Segoe UI", 11, "bold"),
                                   bg=self.colors["button_bg"],
                                   fg="white",
                                   activebackground=self.colors["button_hover"],
                                   activeforeground="white",
                                   bd=0,
                                   relief="flat",
                                   cursor="hand2",
                                   command=self.ask_question,
                                   padx=25, pady=10)
        self.ask_button.pack(side="right", padx=(15, 0))

        self.root.after(200, lambda: self.question_entry.focus_set())

    def add_message(self, sender, message, is_typing=False, has_avatar=True):
        """Append one message bubble to the chat area.

        Args:
            sender: ``"user"`` renders a right-aligned bubble; any other value
                renders a left-aligned assistant bubble.
            message: Text shown inside the bubble.
            is_typing: For assistant messages, render in italics, skip the
                avatar and remember the bubble as the typing indicator.
            has_avatar: For assistant messages, show the avatar icon (ignored
                while ``is_typing`` is true).

        Returns:
            The bubble's container frame when ``is_typing`` is true, else None.
        """
        message_container = tk.Frame(self.chat_frame, bg=self.colors["chat_bg"])
        message_container.pack(fill="x", pady=10, padx=20)

        if sender == "user":
            content_frame = tk.Frame(message_container, bg=self.colors["chat_bg"])
            content_frame.pack(side="right", anchor="e")

            label = tk.Label(content_frame, text=message, font=("Segoe UI", 11),
                            bg=self.colors["user_bubble"], fg=self.colors["user_text"],
                            wraplength=550, justify="left",
                            padx=15, pady=10,
                            relief="flat", borderwidth=0)
            label.pack(side="right")

        else:
            content_frame = tk.Frame(message_container, bg=self.colors["chat_bg"])
            content_frame.pack(side="left", anchor="w")

            if has_avatar and not is_typing:
                avatar = tk.Label(content_frame, text="⚕️", font=("Arial", 16),
                                bg=self.colors["chat_bg"], fg="#10b981",
                                padx=5, pady=0)
                avatar.pack(side="left", anchor="n", padx=(0, 10))

            font_style = ("Segoe UI", 11, "italic") if is_typing else ("Segoe UI", 11)

            label = tk.Label(content_frame, text=message, font=font_style,
                            bg=self.colors["ai_bubble"], fg=self.colors["ai_text"],
                            wraplength=600, justify="left",
                            padx=15, pady=10,
                            relief="flat", borderwidth=0)
            label.pack(side="left")

            if is_typing:
                # Remembered so remove_typing_indicator() can destroy it later.
                self.typing_indicator = message_container

        self.chat_frame.update_idletasks()
        self.chat_canvas.configure(scrollregion=self.chat_canvas.bbox("all"))

        # The very first message keeps the view at the top; later non-typing
        # messages scroll the view to the bottom.
        if not hasattr(self, '_first_message_added'):
            self._first_message_added = True
            self.root.after(50, lambda: self.chat_canvas.yview_moveto(0.0))
        elif not is_typing:
            self.root.after(50, lambda: self.chat_canvas.yview_moveto(1.0))

        if is_typing:
            return message_container
        return None

    def remove_typing_indicator(self):
        """Destroy the current typing-indicator bubble, if there is one."""
        if hasattr(self, 'typing_indicator'):
            self.typing_indicator.destroy()
            delattr(self, 'typing_indicator')
            self.chat_frame.update_idletasks()
            self.chat_canvas.configure(scrollregion=self.chat_canvas.bbox("all"))

    def add_welcome_message(self):
        """Show the assistant's introductory message and disclaimer."""
        welcome_text = """Hello! I'm MedAI, your AI-powered medical assistant. I'm here to help answer your health-related questions based on reliable medical information.

💡 What I can do:
• Answer medical questions
• Explain health conditions
• Provide general health information

⚠️ Important: I'm an AI assistant and not a substitute for professional medical advice. Always consult healthcare professionals for medical decisions and emergencies.

Feel free to ask me anything!"""
        self.add_message("ai", welcome_text, has_avatar=True)

    def on_enter_pressed(self, event):
        """Submit the question on Enter; do nothing when Shift is held.

        Returns:
            ``"break"`` after submitting, so the text widget does not also
            insert a newline; None when the Shift bit (0x1) is set.
        """
        if event.state & 0x1:
            return
        else:
            self.ask_question()
            return "break"

    def _generate_answer(self, user_input):
        """Retrieve the best-matching stored chunk and generate an answer.

        Args:
            user_input: The question text typed by the user.

        Returns:
            The assistant's answer, or a fixed fallback sentence when the
            collection yields no documents.
        """
        # 1. Embed user query
        vectorized_input = embedding_model.embed_query(user_input)

        # 2. Search Database
        cursor = collection.find({}, {"values": 1, "metadata.text": 1})

        best_doc = None
        best_similarity = float("-inf")  # any real score beats the starting value

        # 3. Find best match (dot-product score against every stored vector)
        for doc in cursor:
            stored_vec = np.array(doc["values"])
            sim = np.dot(vectorized_input, stored_vec)
            if sim > best_similarity:
                best_similarity = sim
                best_doc = doc

        if best_doc is None:
            return "I couldn't find any relevant medical information in my database."

        context_text = best_doc["metadata"]["text"]
        final_prompt = prompt.format(context_text, user_input)

        answer = llm_model(final_prompt)

        # 4. Extract text response: keep only what follows the last
        #    <|assistant|> tag, since the output may include the prompt.
        full_response = answer[0]["generated_text"].strip()
        if "<|assistant|>" in full_response:
            return full_response.split("<|assistant|>")[-1].strip()
        return full_response

    def ask_question(self):
        """Send the typed question through the RAG pipeline and show the answer.

        The Ask button is disabled while the question is processed and is
        re-enabled in a ``finally`` block, so it cannot stay stuck in the
        disabled state when rendering or error reporting itself fails.
        Failures from retrieval or generation are shown as a chat message.

        NOTE(review): retrieval and generation run synchronously inside this
        callback, so no other callbacks run until the answer is ready.
        """
        user_input = self.question_entry.get("1.0", tk.END).strip()
        if not user_input:
            return

        self.ask_button.config(state=tk.DISABLED)

        try:
            # Add user message
            self.add_message("user", user_input)
            self.question_entry.delete("1.0", tk.END)

            # Show typing indicator and force a redraw before the slow work.
            self.add_message("ai", "Thinking...", is_typing=True, has_avatar=False)
            self.chat_frame.update()

            try:
                final_answer = self._generate_answer(user_input)

                # 5. Show Response
                self.remove_typing_indicator()
                self.add_message("ai", final_answer, has_avatar=True)

            except Exception as e:
                self.remove_typing_indicator()
                # Show error message in chat if something fails
                error_msg = f"Error: {str(e)}\n\n(Make sure embedding_model, collection, and llm_model are initialized)"
                self.add_message("ai", error_msg, has_avatar=True)
                print(f"Error: {e}")
        finally:
            # Re-enable button whatever happened above
            self.ask_button.config(state=tk.NORMAL)


# Create the Tk root window, attach the chat UI to it and enter the Tk event loop.
if __name__ == "__main__":
    root = tk.Tk()
    app = ModernChatGUI(root)
    root.mainloop()