### In this section, I will build a basic RAG with a re-rank mechanism

Previously, I used a medical-domain LLM to generate hypothetical questions for each chunk. 

The project's main task is medical Q&A. So I am going to implement Multi-Vector as the foundation of the RAG system.

That means I am going to:
* Embed those questions into the vectorstore and put the chunked documents into the docstore.
* Use the doc_id that was generated at the chunking stage as the link between the vectorstore and the docstore.

In [None]:
# Multi-Vector implementation
from mytools import timed, login_huggingface
import os
import json
import copy
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder

In [None]:
import os
import json
import copy
import torch
import uuid
from mytools import timed
import settings
from typing import List
from dotenv import load_dotenv
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder
from huggingface_hub import login
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory # Short-term Memory
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser

W0922 16:00:32.725000 64104 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


##### I choose sentence-transformers/embeddinggemma-300m-medical, as it is a sentence-transformers model finetuned from google/embeddinggemma-300m on the miriad/miriad-4.4M dataset. It maps sentences & documents to a 768-dimensional dense vector space and can be used for medical information retrieval, specifically designed for searching for passages (up to 1k tokens) of scientific medical papers using detailed medical questions.

* Reference: https://huggingface.co/sentence-transformers/embeddinggemma-300m-medical

@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}

@misc{gao2021scaling,
    title={Scaling Deep Contrastive Learning Batch Size under Memory Limited Setup},
    author={Luyu Gao and Yunyi Zhang and Jiawei Han and Jamie Callan},
    year={2021},
    eprint={2101.06983},
    archivePrefix={arXiv},
    primaryClass={cs.LG}
}


### I use a cross-encoder (ncbi/MedCPT-Cross-Encoder) to rerank the retrieved documents and output the top_k most relevant ones.
##### This cross-encoder is a BERT-style model (30,522-token vocabulary) that was trained on large-scale PubMed search logs for biomedical information retrieval.

Citation:

@article{jin2023medcpt,
  title={MedCPT: Contrastive Pre-trained Transformers with large-scale PubMed search logs for zero-shot biomedical information retrieval},
  author={Jin, Qiao and Kim, Won and Chen, Qingyu and Comeau, Donald C and Yeganova, Lana and Wilbur, W John and Lu, Zhiyong},
  journal={Bioinformatics},
  volume={39},
  number={11},
  pages={btad651},
  year={2023},
  publisher={Oxford University Press}
}


In [None]:
# Wrap them up

class Rerank_RAG():
    """
    Multi-vector retriever with cross-encoder re-ranking.

    Hypothetical questions are embedded into a Chroma vectorstore, the
    original chunks are kept in an in-memory docstore, and the ``doc_id``
    metadata key links the two.  At query time the retriever fetches
    candidate documents and a cross-encoder re-orders them.

    Attributes:
        workspace_base_path: The current working directory.
        dataset_path: Path to the JSON dataset with hypothetical questions.
        embedding_model_id: Name of the embedding model.
        cross_encoder_model_id: Name of the cross-encoder used for re-ranking.
        embedding_model: The loaded embedding model (``None`` until loaded).
        retriever: The MultiVectorRetriever (``None`` until built).
        cross_encoder: The loaded cross-encoder (``None`` until loaded).

    Methods:
        load_embedding_model: Load the embedding model.
        load_crossencoder: Load the cross-encoder model.
        load_json_list: Read the dataset file and return the parsed JSON.
        build_medicine_retriever: Load the models and fill the vectorstore
            (questions) and the docstore (original documents).
        retrieve: Fetch candidates, re-rank them and return the best ``top_k``.
    """
    def __init__(self) -> None:

        self.workspace_base_path = os.getcwd()
        self.dataset_path = os.path.join(self.workspace_base_path, "datasets", "medicine_data_hypotheticalquestions.json")
        self.embedding_model_id = "sentence-transformers/embeddinggemma-300m-medical"
        self.cross_encoder_model_id = "ncbi/MedCPT-Cross-Encoder"
        self.embedding_model = None
        self.retriever = None
        self.cross_encoder = None

    @timed
    def load_embedding_model(self):
        """Instantiate the HuggingFace embedding model and store it on the instance."""
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=self.embedding_model_id,
            model_kwargs = {'device': 'cpu'},
            # Normalizing helps cosine similarity behave better across models
            encode_kwargs={"normalize_embeddings": True},
        )

    @timed
    def load_crossencoder(self):
        """Instantiate the cross-encoder used for re-ranking and store it on the instance."""
        self.cross_encoder = CrossEncoder(self.cross_encoder_model_id)

    @timed
    def load_json_list(self):
        """Return the parsed content of the JSON file at ``self.dataset_path``."""
        with open(self.dataset_path, mode = "r", encoding="utf-8") as f:
            return json.load(f)

    @timed
    def build_medicine_retriever(self, max_docs: "int | None" = 50):
        """
        Load the models and index the dataset.

        Each dataset entry is read through its ``doc_id``, ``original_doc``
        and ``questions`` keys.  Every question becomes a vectorstore
        document and every original document goes to the docstore under its
        ``doc_id``.

        Args:
            max_docs: Number of leading dataset entries to index.  Defaults
                to 50 (the value that used to be hard-coded); pass ``None``
                to index the whole dataset.
        """
        data = self.load_json_list()
        login_huggingface()
        self.load_embedding_model()
        self.load_crossencoder()
        docstore = InMemoryStore()
        id_key = "doc_id"

        # The vectorstore to use to index the questions
        vectorstore = Chroma(collection_name = "medicine_data", embedding_function = self.embedding_model)
        # The Multi-Vector retriever
        self.retriever = MultiVectorRetriever(
            vectorstore=vectorstore,
            docstore=docstore,
            id_key=id_key,
        )

        doc_ids = list()
        questions = list()
        docs = list()
        # data[:None] is the full list, so None means "no limit".
        for d in data[:max_docs]:
            doc_id = d["doc_id"]
            doc_ids.append(doc_id)
            docs.append(Document(metadata={id_key: doc_id}, page_content=d["original_doc"]))
            for q in d["questions"]:
                questions.append(Document(metadata={id_key: doc_id}, page_content=q))

        self.retriever.vectorstore.add_documents(questions)
        self.retriever.docstore.mset(list(zip(doc_ids,docs)))

    @timed
    def retrieve(self, query: str, top_k: int=5, fetch_k: int=10):
        """
        Retrieve candidate documents for ``query`` and re-rank them.

        Args:
            query: The user question.
            top_k: Number of re-ranked documents to return.
            fetch_k: Number of vectorstore hits to request before re-ranking
                (raised to ``top_k`` when smaller).

        Returns:
            Up to ``top_k`` deep-copied documents, best first, each carrying
            a ``rerank_score`` entry in its metadata.

        Raises:
            RuntimeError: If ``build_medicine_retriever`` has not been run.
        """
        if self.retriever is None or self.cross_encoder is None:
            raise RuntimeError("Call build_medicine_retriever() before retrieve().")

        # The candidate count has to go through the retriever's search_kwargs;
        # an extra keyword given to invoke() is not forwarded to the
        # vectorstore search, so the previous `kwargs={"k":10}` had no effect.
        self.retriever.search_kwargs["k"] = max(fetch_k, top_k)
        retrieved_docs = self.retriever.invoke(query)
        if not retrieved_docs:
            # Nothing to score; skip the cross-encoder call entirely.
            return []
        retrieved_docs = copy.deepcopy(retrieved_docs) # Avoid rerank changes original documents

        # Rerank: score every (query, document) pair and sort by that score.
        pairs = [[query, d.page_content] for d in retrieved_docs]
        scores = self.cross_encoder.predict(pairs, batch_size=32)
        for r_d, score in zip(retrieved_docs, scores):
            r_d.metadata["rerank_score"] = float(score)
        retrieved_docs.sort(key= lambda d: d.metadata["rerank_score"], reverse=True)
        return retrieved_docs[ :top_k]

In [None]:
rag = Rerank_RAG()

In [7]:
rag.build_medicine_retriever()

build_medicine_retriever starts runing!
load_json_list starts runing!
load_json_list took 3.4837s
Login HuggingFace!
load_embedding_model starts runing!


You are trying to use a model that was created with Sentence Transformers version 5.2.0.dev0, but you're currently using version 5.1.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


load_embedding_model took 6.9730s
load_crossencoder starts runing!
load_crossencoder took 1.6666s
build_medicine_retriever took 22.0846s


In [10]:
rag.retrieve("My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?",top_k=2)

retrieve starts runing!
retrieve took 4.0924s


[Document(metadata={'doc_id': '1bf5880b-93ec-4ac9-a0cb-eb35693ccce4', 'rerank_score': 0.9999985694885254}, page_content='phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine'),
 Document(metadata={'doc_id': 'bb119108-9008-4636-bda2-7f7ad0d185ed', 'rerank_score': 0.09486568719148636}, page_content='Hydrocortisone Injection may be prescribed for other uses; ask your doctor or pharmacist for more information.')]

#### The rerank model works very well. Keep it and move on.

#### Besides reranking, I think the query is the most important factor in whether the RAG retrieves the most relevant documents.
#### But in a real conversation, users can ask anything, and we cannot predict it ahead of time. 
For example:
In the third turn, the user really wants to ask 'How do I take Phenylephrine?'

But they type 'How do I take it?'. From the context, 'it' means 'Phenylephrine'.

If we retrieve with the query 'How do I take it?', we may get irrelevant documents. Retrieving with 'How do I take Phenylephrine?' makes more sense.

Other scenarios:

1. In the first turn, the user just greets without asking any question.
2. The user asks an unrelated question in the middle of the conversation.
3. .........

#### To handle all of these, I need to use an LLM as a master agent that decides what to do next in each situation.
#### So I will bring in LangGraph, memory, a local LLM, a web-search tool, ... and make them work together so that the RAG becomes agentic.

#### First of all, local LLM

In [20]:
model_id = "ContactDoctor/Bio-Medical-Llama-3-8B"

In [21]:
def best_dtype():
    """Pick the torch dtype for model weights: bf16/fp16 on CUDA, fp32 otherwise."""
    # Without CUDA, stay in full precision.
    if not torch.cuda.is_available():
        return torch.float32

    # On CUDA prefer bfloat16 when the device reports support for it.
    return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

def best_device():
    """Return "cuda" when a CUDA device is available, otherwise "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

In [22]:
# Load the tokenizer for the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the causal-LM weights using the dtype and device chosen by
# best_dtype() / best_device().
# NOTE(review): the "" key in device_map presumably places the whole model on
# that single device — confirm against the transformers/accelerate docs.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype = best_dtype(),
    device_map={"":best_device()}, 
    low_cpu_mem_usage=True     
)
print("Load tokenizer and base model done!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Load tokenizer and base model done!


In [26]:
print(model)                    # full architecture tree (long but useful)
print(model.config)             # core hyperparameters (dims, layers, heads…)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_

In [None]:
original_pipeline = pipeline(
    "text-generation", 
    model=model, 
    tokenizer=tokenizer,
    return_full_text=False,   
)

Device set to use cuda


In [24]:
# Wrap the plain transformers pipeline in a HuggingFacePipeline
hug_pipeline = HuggingFacePipeline(pipeline=original_pipeline)

In [25]:
master_agent = ChatHuggingFace(llm=hug_pipeline)

#### Then Memory

In [26]:
class Short_Term_Memory():
    """
    Per-session chat histories kept in memory.

    Attributes:
        session_store: Maps a session id to that session's message history.
        current_session_id: The id most recently passed to ``get_history``.
    """
    def __init__(self) -> None:
        self.session_store: dict[int,BaseChatMessageHistory] = {}
        self.current_session_id: int = 0

    def get_history(self, session_id: int) -> BaseChatMessageHistory:
        """Return the history for ``session_id``, creating an empty one on first use.

        Also records ``session_id`` as the current session.
        """
        self.current_session_id = session_id
        if session_id not in self.session_store:
            self.session_store[session_id] = ChatMessageHistory()
        return self.session_store[session_id]

    def get_current_history(self) -> BaseChatMessageHistory:
        """Return the history of the current session."""
        return self.get_history(self.current_session_id)

    def delete_history(self, session_id: int) -> bool:
        """Remove the history stored for ``session_id``.

        Returns:
            ``True`` once the session is absent from the store, whether it
            was just removed or never existed.
        """
        # The result must not depend on the truthiness of the removed object:
        # a history that evaluates as falsy would otherwise be reported as a
        # failed delete even though it has been popped.
        self.session_store.pop(session_id, None)
        return True

    def delete_current_history(self) -> bool:
        """Remove the history of the current session."""
        return self.delete_history(self.current_session_id)
    
# Convert history message to a string
def history_as_text(history: BaseChatMessageHistory) -> str:
    """Render a chat history as one line per message, e.g. "HUMAN: ..." / "AI: ..."."""
    rendered = []
    for message in history.messages:
        role = message.type.upper()
        rendered.append(f"{role}: {message.content}")
    return "\n".join(rendered)

In [27]:
from typing import List, TypedDict

class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        session_id: id of the current session
        query: the user's query or the augmented query
        retrieval_doc: the retrieved document
        generation: the LLM generation
    """
    session_id: int
    query: str
    retrieval_doc: str        
    generation: str

In [28]:
# Node: combine_node will check the global session_id and short_term_memory

def combine_node(query: str):
    """Pair ``query`` with the chat history of the session tracked in ``settings``.

    While ``settings.SESSION_ID`` is still 0 the session id is incremented and
    a fresh ``Short_Term_Memory`` is stored in ``settings.SHORT_TERM_MEMORY``.

    Returns:
        A dict with the keys ``query``, ``history`` and an empty ``generation``.
    """
    needs_setup = settings.SESSION_ID == 0
    if needs_setup:
        settings.SESSION_ID += 1
        settings.SHORT_TERM_MEMORY = Short_Term_Memory()

    memory = settings.SHORT_TERM_MEMORY
    session_history = memory.get_history(settings.SESSION_ID)

    state = {"query": query}
    state["history"] = session_history
    state["generation"] = ""
    return state

In [29]:
# First turn: ask a question
query_1 = "My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?"

In [30]:
combine_return = combine_node(query_1)

In [32]:
combine_return

{'query': 'My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?',
 'history': InMemoryChatMessageHistory(messages=[]),
 'generation': ''}

In [33]:
settings.SESSION_ID

1

In [34]:
len(combine_return["history"].messages) # Check how many message it has.

0

In [35]:
combine_return["history"].add_user_message(query_1)

In [36]:
# Retrieve through the Rerank_RAG instance built above; `ask` and `retriever`
# are not defined anywhere in this notebook.
doc_1 = rag.retrieve(query_1, top_k=1)

In [37]:
doc_1

[Document(metadata={'doc_id': '1bf5880b-93ec-4ac9-a0cb-eb35693ccce4', 'rerank_score': 0.9999985694885254}, page_content='phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine')]

In [38]:
# The first document is exactly what I am expecting
# Put it into the history store
combine_return["history"].add_ai_message(doc_1[0].page_content)

In [39]:
combine_return["history"]

InMemoryChatMessageHistory(messages=[HumanMessage(content='My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?', additional_kwargs={}, response_metadata={}), AIMessage(content='phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine', additional_kwargs={}, response_metadata={})])

In [40]:
# second turn: ask a question
query_2 = "How can I use it?"

In [41]:
combine_return_2 = combine_node("How can I use it?")

In [42]:
combine_return_2

{'query': 'How can I use it?',
 'history': InMemoryChatMessageHistory(messages=[HumanMessage(content='My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?', additional_kwargs={}, response_metadata={}), AIMessage(content='phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine', additional_kwargs={}, response_metadata={})]),
 'generation': ''}

In [43]:
# Test whether the LLM can determine the query has no ambiguity
query_grader_prompt = PromptTemplate(
    template="""You are a grader for a question. \n 
    You need to determine if a question is meaningful, clear, self-contained without any ambiguity, if you don't know the conversation context. \n    
    Here is the user's question: {question} \n   
    Give a binary score 'yes' or 'no' score to indicate whether the question is meaningful and self-contained. \n     
    Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or  {{"score": "no"}}.\n
    No premable or explanation.""",
    input_variables=["question"],
)

query_grader_chain = query_grader_prompt | master_agent | JsonOutputParser()

In [44]:
result = query_grader_chain.invoke({"question": query_2})

In [45]:
print(result)
    

{'score': 'no'}


In [120]:
### Test whether the master agent can tell a pure greeting from a greeting plus something else.
#If the input includes a greeting PLUS any request (e.g., "hi, can you...") or any content beyond a greeting, it is NOT just a greeting.
#The question might contain a greeting plus a greeting, for example "Good morning! How are you?". It is also a greeting.
# greeting_grader_prompt = PromptTemplate(
#     template="""You are a grader for a question.
#     You need to determine if the user's input is JUST a greeting/pleasantry and contains no request for information or action and no substantive topic.
#     Treat common variants (e.g., "hi", "hello", "hey", "good morning", "Good afternoon!", "good evening", "how are you?", "what's up", "How are you doing?", emojis, or greetings in other languages) as greetings.
#     The input may contain a few sentences. If one of them is not greeting, score it 'no'. If all of them are greeting, it is a greeting.for example "Good morning! How are you?". It is a greeting.
#     Here is the user's input: {question}
#     Give a binary score 'yes' or 'no' to indicate whether the input is just a greeting.
#     Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or {{"score": "no"}}.
#     No preamble or explanation.""",
#     input_variables=["question"],
# )

greeting_grader_prompt = PromptTemplate(
    template="""You are a classifier.
Decide if the given message is a **pure greeting** with no question or request.

Pure greeting examples:
- Hi
- Hello
- Hey!
- Good morning
- Good afternoon
- Good evening
- Greetings
- Howdy
- Hi there
- Hello there
- Morning!
- Evening!
- Hey there
- Hi everyone
- Yo
- Welcome!
- Hi friend
- Hi all
- Hello team
- Hi folks
- Hi buddy
- Hi mate
- Bonjour
- Salut
- Bonsoir
- Ça va
- "how are you?"
- "How are you doing?"
- "Good morning! How are you?"

Not pure greeting examples:
- "Hello, can you help me with my code?"
- "Good morning, what's the weather today?"\n
    Here is the user's input: {question}
    Give a binary score 'yes' or 'no' to indicate whether the input is just a greeting.
    Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or {{"score": "no"}}.
    No preamble or explanation.""",
    input_variables=["question"],
)

# NOTE(review): this rebinds `query_grader_chain`, which an earlier cell built
# from `query_grader_prompt`; from here on the name refers to the greeting
# grader. Consider a distinct name (and updating the loop that invokes it).
query_grader_chain = greeting_grader_prompt | master_agent | JsonOutputParser()

In [171]:

questions = [
"Hi",
"Hi buddy",
"what's up?",
"Hello there ",
"How are you?",
"How are you doing?",
"Good morning! How are you?",
"Hey, can you help me set up LangGraph?",
"Bonjour",
"Yo, what's the ETA on the build?",
"Hi — quick question: what's our API rate limit?",
"Good evening from Montréal! I don't feel good today.",
"thanks!",
"Hello, please translate this paragraph."
]

for q in questions:
    result = query_grader_chain.invoke({"question": q})
    print(f"question:{q}, score:{result}")

question:Hi, score:{'score': 'yes'}
question:Hi buddy, score:{'score': 'yes'}
question:what's up?, score:{'score': 'yes'}
question:Hello there , score:{'score': 'yes'}
question:How are you?, score:{'score': 'yes'}
question:How are you doing?, score:{'score': 'no'}
question:Good morning! How are you?, score:{'score': 'no'}
question:Hey, can you help me set up LangGraph?, score:{'score': 'no'}
question:Bonjour, score:{'score': 'yes'}
question:Yo, what's the ETA on the build?, score:{'score': 'no'}
question:Hi — quick question: what's our API rate limit?, score:{'score': 'no'}
question:Good evening from Montréal! I don't feel good today., score:{'score': 'yes'}
question:thanks!, score:{'score': 'yes'}
question:Hello, please translate this paragraph., score:{'score': 'no'}


In [111]:
### Test whether the master agent can answer a non-clinical question or small talk
#If unsure, say you don't know briefly and suggest one next step.
#If the question is "How are you?" or "How are you doing?", just greeting without any other words.
# polite_answer_prompt = PromptTemplate(
#     template="""You are a polite, honest, and helpful assistant.
# Answer the user's non-clinical question clearly and concisely.
# If the question is "How are you?" or "How are you doing?", just greeting.\n

# No preamble or meta commentary.
# Question: {question}
# Answer:""",
#     input_variables=["question"],
# )

polite_answer_prompt = PromptTemplate(
    template="""You are a polite, honest, and helpful assistant.
Answer the user's non-clinical question clearly and concisely.
If the question is "How are you?" or "How are you doing?", just greeting.
If unsure, say you don't know briefly and suggest one next step.
No preamble or meta commentary.
Question: {question}
Answer:""",
    input_variables=["question"],
)
polite_answer_chain = polite_answer_prompt | master_agent | StrOutputParser()

In [113]:
questions = [
"Hi",
"Hello there ",
"Good morning!",
"Hey, can you help me set up LangGraph?",
"Bonjour",
"Yo, what's the ETA on the build?",
"Hi — quick question: what's our API rate limit?",
"Good evening from Montréal! I don't feel good today.",
"thanks!",
"Hello, please translate this paragraph."
]

for q in questions:
    result = polite_answer_chain.invoke({"question": q})
    print(f"question:{q}, answer:{result}")

question:Hi, answer: Hi
question:Hello there , answer: Hello!
question:Good morning!, answer: Good morning!
question:Hey, can you help me set up LangGraph?, answer: I'm not trained to set up LangGraph, but I can give you some general tips on how to use the platform. Please carefully read the instructions below and follow them to set up LangGraph. If you need more assistance, you can ask the platform support team or refer to the LangGraph documentation. 

To set up LangGraph, you need to have Python installed on your computer (version 3.7 or above). You can download Python from the official Python website if you haven’t already. 

Once you have Python installed, you can install LangGraph using pip. Open a terminal or command prompt and type the following command: 

pip install langgraph

This might take a few minutes to complete, depending on your internet connection. Once the installation is complete, you can start using LangGraph. You can type ‘langgraph –h’ in the terminal or command

In [123]:
### Test whether the master LLM can determine if a question or description is clinical/medical.
clinical_grader_prompt = PromptTemplate(
    template="""You are a grader for a question.
    You need to determine if the user's question is a clinical/medical question.
    Consider clinical if it asks about diagnosis, symptoms, treatment, medications (dose, interactions, side effects), test/lab interpretation, procedures, triage ("should I see a doctor/ER?"), risks/prognosis, or health advice for humans or animals.
    Non-clinical includes general health trivia/news, biology concepts without personal care decisions, admin/insurance/scheduling, or unrelated topics.
    Here is the user's question: {question} \n
    Give a binary score 'yes' or 'no' to indicate whether it is a clinical question.
    Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or {{"score": "no"}}.
    No preamble or explanation.""",
    input_variables=["question"],
)

# clinical_grader_prompt = PromptTemplate(
#     template="""You are a grader for a question.
#     You need to determine if the user's question is a clinical/medical question.    
#     Here is the user's question: {question} \n
#     Give a binary score 'yes' or 'no' to indicate whether it is a clinical question.
#     Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or {{"score": "no"}}.
#     No preamble or explanation.""",
#     input_variables=["question"],
# )
# NOTE(review): this rebinds `polite_answer_chain`, which an earlier cell built
# from `polite_answer_prompt` with a StrOutputParser; from here on the name
# refers to the clinical grader (JSON output). Consider a distinct name (and
# updating the loop that invokes it).
polite_answer_chain = clinical_grader_prompt | master_agent | JsonOutputParser()

In [128]:
questions = [
"Hi",
"Hello there ",
"Good morning! How are you?",
"Hey, can you help me set up LangGraph?",
"Bonjour",
"Yo, what's the ETA on the build?",
"Hi — quick question: what's our API rate limit?",
"Good evening from Montréal! I don't feel good today.",
"thanks!",
"Hello, please translate this paragraph.",
"what side effects can Pseudoephedrine cause?",
"In what situations should you place medication safe location â one up?",
"What is the guidance on ask doctor pharmacist advice which product best?",
"Are there any dietary instructions while using Acetaminophen?"
]

for q in questions:
    result = polite_answer_chain.invoke({"question": q})
    print(f"question:{q}, score:{result}")

question:Hi, score:{'score': 'no'}
question:Hello there , score:{'score': 'no'}
question:Good morning! How are you?, score:{'score': 'no'}
question:Hey, can you help me set up LangGraph?, score:{'score': 'no'}
question:Bonjour, score:{'score': 'no'}
question:Yo, what's the ETA on the build?, score:{'score': 'no'}
question:Hi — quick question: what's our API rate limit?, score:{'score': 'no'}
question:Good evening from Montréal! I don't feel good today., score:{'score': 'yes'}
question:thanks!, score:{'score': 'no'}
question:Hello, please translate this paragraph., score:{'score': 'no'}
question:what side effects can Pseudoephedrine cause?, score:{'score': 'yes'}
question:In what situations should you place medication safe location â one up?, score:{'score': 'yes'}
question:What is the guidance on ask doctor pharmacist advice which product best?, score:{'score': 'no'}
question:Are there any dietary instructions while using Acetaminophen?, score:{'score': 'yes'}


In [1]:
### Test whether the Wikipedia search tool is working.
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

wiki = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
result = wiki.invoke({"query": "Pseudoephedrine"})


In [2]:
result

'Page: Pseudoephedrine\nSummary: Pseudoephedrine, sold under the brand name Sudafed among others, is a sympathomimetic medication which is used as a decongestant to treat nasal congestion. It has also been used off-label for certain other indications, like treatment of low blood pressure. At higher doses, it may produce various additional effects including stimulant, appetite suppressant, and performance-enhancing effects. In relation to this, non-medical use of pseudoephedrine has been encountered. The medication is taken by mouth.\nSide effects of pseudoephedrine include insomnia, elevated heart rate, increased blood pressure, restlessness, dizziness, anxiety, and dry mouth, among others. Rarely, pseudoephedrine has been associated with serious cardiovascular complications like heart attack and hemorrhagic stroke. Some people may be more sensitive to its cardiovascular effects. Pseudoephedrine acts as a norepinephrine releasing agent, thereby indirectly activating adrenergic receptor

In [6]:
import json
import re

In [None]:
def wiki_to_json(s: str) -> list[dict[str, str]]:
    """Parse the text returned by the Wikipedia tool into a list of records.

    The input is expected to be a series of entries, each made of a
    ``Page: <title>`` line followed by a ``Summary: <text>`` field, with
    entries separated from one another by blank lines.

    Args:
        s: Raw text output of the Wikipedia tool.

    Returns:
        One ``{"Page": ..., "Summary": ...}`` dict per entry, in input order.
        Chunks that lack either field are skipped, so input without any
        recognisable entry (including the empty string) yields ``[]``.
    """
    # Split only at blank lines that are followed by a new "Page:" header.
    # Splitting on every blank line would cut a summary that itself contains
    # a blank line, and the cut-off tail (no "Page:" field) would be dropped.
    records = re.split(r"\n\s*\n\s*(?=Page:)", s.strip())

    data = []
    for record in records:
        page_match = re.search(r"Page:\s*(.+)", record)
        # DOTALL lets the summary capture span several lines/paragraphs.
        summary_match = re.search(r"Summary:\s*(.+)", record, re.DOTALL)
        if page_match and summary_match:
            data.append({
                "Page": page_match.group(1).strip(),
                "Summary": summary_match.group(1).strip(),
            })

    return data

In [8]:
result = wiki_to_json(result)

In [10]:
result[1]["Summary"]

'Pseudoephedrine/loratadine, sold under the brand name Claritin-D among others, is an orally administered combination medication used for the treatment of allergic rhinitis (hay fever) and the common cold. Pseudoephedrine, one of the naturally occurring alkaloids of ephedra, is a sympathomimetic used as a decongestant. It produces a decongestant effect that is facilitated by the vasoconstriction in the mucosal capillaries of the upper respiratory areas. Loratadine is a long-acting antihistamine (H1 histamine antagonist) that is less sedating than older substances of its type.\nIn 2023, it was the 300th most commonly prescribed medication in the United States, with more than 400,000 prescriptions.'

In [130]:
# Test whether the LLM can rewrite a follow-up question into a standalone,
# retrieval-friendly query by using the conversation history.
# Template variables: {document} receives the conversation history text,
# {question} receives the user's latest question.
rewrite_prompt = PromptTemplate(
    template="""You are question re-writer that converts an input question to a better version that is optimized \n 
     for vectorstore retrieval. Use the history conversation to resolve references. Keep the contextual meaning. \n
     Here is the history conversation: \n\n {document} \n\n
     Here is the initial question: \n\n {question}. Improved question with no preamble: \n """,
    input_variables=["question", "document"],
)

# Pipeline: fill the prompt -> run the model (`master_agent`, defined outside
# this cell) -> parse the model output into a plain string.
query_rewrite_chain = rewrite_prompt | master_agent | StrOutputParser()

In [131]:
doc_txt = history_as_text(combine_return_2["history"])
print(doc_txt)

HUMAN: My nasal is disconfort. Do you have a medicine to relieve sinus congestion and pressure?
AI: phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine


In [140]:
result = query_rewrite_chain.invoke({"question": "How can I take it?", "document": doc_txt})

In [141]:
print(result)

 How to take phenylephrine for nasal congestion?


In [142]:
# Test whether the LLM can judge if the retrieved documents are relevant
# enough to the question.
# Template variables: {document} receives the retrieved text, {question} the
# user question. The model is asked to answer with a JSON object
# {"score": "yes"} or {"score": "no"}.
doc_relevance_prompt = PromptTemplate(
    template="""You are a grader assessing relevance of a retrieved document to a user question. \n 
    Here is the retrieved document: \n\n {document} \n\n
    Here is the user question: {question} \n
    If the document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or  {{"score": "no"}}.\n
    No premable or explanation.""",
    input_variables=["question", "document"],
)

# Pipeline: fill the prompt -> run the model (`master_agent`, defined outside
# this cell) -> parse the JSON answer into a Python dict.
retrieval_grader_chain = doc_relevance_prompt | master_agent | JsonOutputParser()

In [155]:
# Retrieve documents for a sample question (`ask` and `retriever` are defined
# outside this cell), then flatten their text into one space-separated string.
docs = ask("how should Phenylephrine be used?", retriever, top_k=2)
doc_txt = " ".join(d.page_content for d in docs)

In [156]:
doc_txt

"phenylephrine comes as a tablet, a liquid, or a dissolving strip to take by mouth. it is usually taken every 4 hours as needed. follow the directions on your prescription label or the package label carefully, and ask your doctor or pharmacist to explain any part you do not understand. take phenylephrine exactly as directed. do not take more or less of it or take it more often than prescribed by your doctor or directed on the label.phenylephrine comes alone and in combination with other medications. ask your doctor or pharmacist for advice on which product is best for your symptoms. check nonprescription cough and cold product labels carefully before using two or more products at the same time. these products may contain the same active ingredient(s) and taking them together could cause you to receive an overdose. this is especially important if you will be giving cough and cold medications to a child.nonprescription cough and cold combination products, including products that contain 

In [157]:
result = retrieval_grader_chain.invoke({"question": "how should Phenylephrine be used?", "document": doc_txt})

In [158]:
print(result)

{'score': 'yes'}


In [159]:
result = retrieval_grader_chain.invoke({"question": "how should Acarbose be used?", "document": doc_txt})

In [160]:
print(result)

{'score': 'no'}


In [114]:
# Test whether the LLM can generate an answer from retrieved context.

# Prompt: pulled from the LangChain hub; it takes {question} and {context}
# (see the pretty-printed template in the next cell's output).
answer_prompt = hub.pull("rlm/rag-prompt")

# Chain: fill the prompt -> run the model (`master_agent`, defined outside
# this cell) -> parse the model output into a plain string.
answer_chain = answer_prompt | master_agent | StrOutputParser()



In [116]:
answer_prompt.pretty_print()


You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: [33;1m[1;3m{question}[0m 
Context: [33;1m[1;3m{context}[0m 
Answer:


In [126]:
# Run the answer chain on the current retrieval result.
# `docs` and `query_2` are defined outside this cell.
# NOTE(review): `docs` is passed as-is. If it is a list of Document objects
# (as the `ask(...)` result appears to be elsewhere in this notebook), the
# prompt receives their repr, metadata included, rather than plain text;
# consider joining `page_content` first. TODO confirm.
generation = answer_chain.invoke({"context": docs, "question": query_2})
print(generation)


 You can use pyrethrin and piperonyl butoxide shampoo to treat head lice and scabies. It is applied to the hair and scalp, and then washed off after 10 minutes. Two treatments are usually needed, seven to ten days apart, and a third treatment may be necessary if some lice or nits are still present after the second treatment.


In [163]:
# Hallucination test
# Test whether the LLM can determine if an answer is grounded in the given facts.
# Template variables: {documents} receives the supporting facts, {generation}
# the answer to check. The model is asked to answer with a JSON object
# {"score": "yes"} or {"score": "no"}.
hallucination_grader_prompt = PromptTemplate(
    template="""You are a grader assessing whether an answer is grounded in / supported by a set of facts. \n 
    Here are the facts:
    \n ------- \n
    {documents} 
    \n ------- \n
    Here is the answer: {generation} \n
    Only provide the binary score as a JSON with a single key 'score', for example {{"score": "yes"}} or  {{"score": "no"}}.\n     
    Don't do preamble or explanation.""",
    input_variables=["generation", "documents"],
)

# Pipeline: fill the prompt -> run the model (`master_agent`, defined outside
# this cell) -> parse the JSON answer into a Python dict.
hallucination_grader_chain = hallucination_grader_prompt | master_agent | JsonOutputParser()

In [161]:
result = hallucination_grader_chain.invoke({"documents": "You can use pyrethrin and piperonyl butoxide shampoo to treat head lice and scabies. ", "generation": "pseudoephedrine may cause side effects. "})

In [162]:
print(result)

{'score': 'no'}


In [164]:
result = hallucination_grader_chain.invoke({"documents": "phenylephrine comes as a tablet, a liquid, or a dissolving strip to take by mouth. it is usually taken every 4 hours as needed. follow the directions on your prescription label or the package label carefully, and ask your doctor or pharmacist to explain any part you do not understand. take phenylephrine exactly as directed. do not take more or less of it or take it more often than prescribed by your doctor or directed on the label.phenylephrine comes alone and in combination with other medications. ask your doctor or pharmacist for advice on which product is best for your symptoms. check nonprescription cough and cold product labels carefully before using two or more products at the same time. these products may contain the same active ingredient(s) and taking them together could cause you to receive an overdose. this is especially important if you will be giving cough and cold medications to a child.nonprescription cough and cold combination products, including products that contain phenylephrine, can cause serious side effects or death in young children. do not give these products to children younger than 4 years of age. if you give these products to children 4 to 11 years of age, use caution and follow the package directions carefully.if you are giving phenylephrine or a combination product that contains phenylephrine to a child, read the package label carefully to be sure that it is the right product for a child of that age. do not give phenylephrine products that are made for adults to children.before you give a phenylephrine product to a child, check the package label to find out how much medication the child should receive. give the dose that matches the child's age on the chart. ask the child's doctor if you don't know how much medication to give the child.if you are taking the liquid, do not use a household spoon to measure your dose. 
use the measuring spoon or cup that came with the medication or use a spoon made especially for measuring medication.if your symptoms do not get better within 7 days or if you have a fever, stop taking phenylephrine and call your doctor.if you are taking the dissolving strips, place one strip on your tongue and allow it to dissolve.about Phenylephrine phenylephrine is used to relieve nasal discomfort caused by colds, allergies, and hay fever. it is also used to relieve sinus congestion and pressure. phenylephrine will relieve symptoms but will not treat the cause of the symptoms or speed recovery. phenylephrine is in a class of medications called nasal decongestants. it works by reducing swelling of the blood vessels in the nasal passages.about Phenylephrine", "generation": "pseudoephedrine may cause side effects. "})

In [165]:
print(result)

{'score': 'yes'}


#### This is the point where an agent can turn a context-dependent question such as "How can I use it?" into a retrievable query.

#### The query rewriting part is working well now.
#### But sometimes the user types a question that has multiple meanings. For example: How can I have phenylephrine?
* It could mean "What is the dosage instruction to take phenylephrine?"
* It could also mean "Where and how can I buy phenylephrine?"
#### This example reminds me to bring in the Query Decomposition technique, which turns one query into several queries from different angles.

In [166]:
# The same LLM is reused for this job, only with a different prompt.

# Test whether the LLM can produce an alternative-angle variant of a query,
# taking the conversation history into account.
# Template variables: {document} receives the history/context text,
# {question} the user's original question. The prompt asks for a single
# question in return.
expand_query_prompt = PromptTemplate(
    template="""You are a medical doctor. You generate exactly one distinct, clinically-relevant question variants from the user's Original question, \n 
     covering different angles (e.g., indications/contraindications, dosing vs. administration, \n
     adult vs. pediatric, interactions vs. adverse effects). \n
     Here is the history conversation: \n\n {document} \n\n
     Here is the initial question: \n\n {question}. \n 
     Return only one question""",
    input_variables=["question", "document"],
)

# Pipeline: fill the prompt -> run the model (`master_agent`, defined outside
# this cell) -> parse the model output into a plain string.
expand_query_chain = expand_query_prompt | master_agent | StrOutputParser()

In [167]:
result = expand_query_chain.invoke({"question": "How can I have Phenylephrine?", "document": doc_txt})

In [168]:
print(result)

 Is it safe to give Phenylephrine to children under the age of 4?


### Using LangGraph to coordinate the retriever, reranker, query_rewriter and relevance grader so that together they produce the most relevant answer.
##### First of all, how nodes and edges work:

Each node will -

1/ Either be a function or a runnable.

2/ Modify the state.

The edges choose which node to call next, depending on the state.

In [None]:
### Nodes



def combine_node(state):
    """Entry node: resolve the session and load its conversation history.

    Reads ``state["session_id"]`` and ``state["query"]``. An empty or missing
    session id starts a new conversation: a fresh UUID is stored in
    ``settings.SESSION_ID`` and ``settings.SHORT_TERM_MEMORY`` is replaced
    with a new ``Short_Term_Memory``. Otherwise the caller's session id is
    reused with the existing ``settings.SHORT_TERM_MEMORY``.

    Side effects: prints progress messages and increments ``settings.STEP``.

    Args:
        state: Graph state; must support ``.get`` and item access.

    Returns:
        A state update with keys ``session_id`` (the resolved id), ``query``
        (passed through), ``history`` (whatever ``get_history`` returns for
        the resolved id) and ``generation`` (reset to ``""``).
    """
    print("----------------Enter the combine node------------------\n")
    print("Got a new question \n")

    print(f"Step {settings.STEP}:  Extract previous conversation from memory.\n")

    session_id = state.get("session_id")
    if not session_id:
        print("This is a new conversation.\n")
        session_id = str(uuid.uuid4())
        settings.SESSION_ID = session_id
        settings.SHORT_TERM_MEMORY = Short_Term_Memory()
    else:
        print("This is multi-turn conversation.\n")

    # Look the history up under the resolved id, not the raw incoming one:
    # for a new conversation the incoming id is empty, and the history handed
    # to later nodes must belong to the same id that is returned below.
    history = settings.SHORT_TERM_MEMORY.get_history(session_id)
    query = state["query"]

    print(f"There are {len(history.messages)} turn before!\n")

    print("Wrap up session, query and history to next step.\n")

    print("----------------Step out the combine node---------------\n")

    settings.STEP += 1

    return {"session_id": session_id, "query": query, "history": history, "generation": ""}

In [None]:
### Routers


In [None]:
### Edges
def decide_query_relevance(state):
    print("---Enter the edge who determines whether the query is relevant---\n")
