# Baseline RAG model

This is the baseline RAG model, run on a single sample patient record.

In [None]:
!pip install -r '/content/requirements.txt' -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset and index paths
# used by this notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter("ignore")

In [None]:
from langchain_community.document_loaders import TextLoader
from langchain_mistralai.chat_models import ChatMistralAI
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain

from langchain.embeddings.huggingface import HuggingFaceEmbeddings


import os

# Mistral API key. Prefer the MISTRAL_API_KEY environment variable so the real
# key never has to be pasted into (and saved with) the notebook; fall back to
# the placeholder when the variable is unset or empty.
api_key = os.environ.get("MISTRAL_API_KEY") or "YOUR_API_KEY"

# Locations on the mounted Google Drive.
drive_dataset_path = '/content/drive/MyDrive/dataset_folder/'  # patient health reports
drive_faiss_path = '/content/drive/MyDrive/faiss_index_full'  # saved FAISS index
drive_trad_model_path = '/content/drive/MyDrive/traditional_med_model/'

# Expected layout of a report on Drive:
# /content/drive/MyDrive/dataset_folder/health_report_{0}/health_report_{0}.txt

# --- 1. Load one sample health record to exercise the framework draft ---
# NOTE(review): the `{0}` placeholders are handed to the loader literally (no
# `.format(...)` call is applied) -- confirm the folder on Drive is really
# named this way, or substitute the intended report index.
sample_report_path = drive_dataset_path + "health_report_{0}/health_report_{0}.txt"
loader = TextLoader(file_path=sample_report_path)
docs = loader.load()

# --- 2. Chunk the record (splitter is used with its default settings) ---
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)

# --- 3. Embedding model ---
# A locally loaded sentence-transformer is used here. The hosted alternative
# would be:
#   MistralAIEmbeddings(model="mistral-embed", mistral_api_key=api_key)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# --- 4. Per-user vector store and its retriever interface ---
vector = FAISS.from_documents(documents=documents, embedding=embeddings)
retriever = vector.as_retriever()

# --- 5. Chat LLM ---
# NOTE(review): no `model=` argument is passed, so the library default is
# used; the original comment mentioned "open-mixtral-8x7b" -- confirm which
# model is intended.
model = ChatMistralAI(mistral_api_key=api_key)


# Prompt text for the baseline RAG: the retrieved chunks are substituted for
# {context} and the user's question for {input}.
BASELINE_TEMPLATE = """
You are a helpful, respectful and honest medical bot. Always answer as
helpfully as possible, while being safe.

If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.

Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""

prompt = ChatPromptTemplate.from_template(BASELINE_TEMPLATE)

# Wire everything together: the document chain feeds the retrieved documents
# plus the question to the LLM; the retrieval chain runs the retriever first
# and then calls the document chain.
document_chain = create_stuff_documents_chain(llm=model, prompt=prompt)
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [None]:
# Ask the baseline RAG chain a sample question and print only the generated
# answer from the chain's result.
baseline_query = {"input": "What is are the symptoms of the person in question?"}
response = retrieval_chain.invoke(baseline_query)
print(response["answer"])

The symptoms described by the 57-year-old female in the context are:

1. Suddenly vomiting blood (or spitting out blood) for two consecutive days, which occurs in the night.
2. Bleeding from the gums, which she notices when she wakes up.
3. Swollen teeth.
4. Shortness of breath (panting) while walking.
5. She also mentions softness, which could be referring to weakness, but it's not entirely clear from the context.

However, it's important to note that she has a history of hypertension. The healthcare professional in the conversation also suggests that her symptoms could be related to cardiopulmonary function, indicating potential heart or lung issues. 

Please consult with a healthcare professional for a thorough examination and accurate diagnosis.


# RAG with Relevant Medical Facts

Here we perform RAG over both the patient's own record and a store of medical facts, retrieving relevant information from each so the model can generate a response augmented by both sources.

In [None]:
# Load the pre-built vector store of the medical textbooks from Drive.
# The path comes from `drive_faiss_path` (defined with the other Drive
# locations) instead of repeating the literal string here, so the index
# location is configured in one place.
#
# SECURITY NOTE: `allow_dangerous_deserialization=True` lets the loader
# unpickle the stored index. Unpickling can execute arbitrary code, so this is
# only acceptable for an index file you created yourself or otherwise trust.
vector_db = FAISS.load_local(
    drive_faiss_path,
    embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
# Run a simple similarity search against the medical vector store; each
# result is a (Document, score) pair.
docs_and_scores = vector_db.similarity_search_with_score("what is first aid?")

# Display the first result. (The original `[:][:][0]` only made two throwaway
# copies of the list before indexing; `[0]` yields the same element.)
docs_and_scores[0]

(Document(page_content='First Aid for the Medicine Clerkship $42.95 Review McGraw-Hill, 2005, 2nd ed., 416 pages, ISBN 9780071448758 A high-yield review of symptoms and diseases. Pros: A comprehensive review that is well organized by symptom with good illustrations, scenarios, diagrams, algorithms, and mnemonics. Cons: May not be suited to readers who prefer information arranged in text form. May be too basic for certain topics. Summary: An excellent, concise review of medicine for those who prefer its format. Underground Clinical Vignettes: Emergency Medicine $22.95 Review', metadata={'source': '/content/textbooks/chunk/First_Aid_Step2.jsonl', 'seq_num': 1295}),
 0.9970927)

In [None]:
# Show the text of every chunk retrieved for the sample query above; the
# similarity scores are not printed.
for retrieved_doc, _score in docs_and_scores:
    print("Content:", retrieved_doc.page_content)

Content: First Aid for the Medicine Clerkship $42.95 Review McGraw-Hill, 2005, 2nd ed., 416 pages, ISBN 9780071448758 A high-yield review of symptoms and diseases. Pros: A comprehensive review that is well organized by symptom with good illustrations, scenarios, diagrams, algorithms, and mnemonics. Cons: May not be suited to readers who prefer information arranged in text form. May be too basic for certain topics. Summary: An excellent, concise review of medicine for those who prefer its format. Underground Clinical Vignettes: Emergency Medicine $22.95 Review
Content: First aid includes horizontal positioning (especially if there are cerebral manifestations), intravenous fluids if available, and sustained 100% oxygen administration. The latter accelerates inert gas washout from tissues and promotes resolution of bubbles. Definitive treatment of DCS or CAGE with recompression and hyperbaric oxygen is justified in most instances, although some mild or marginal DCS cases may be managed wi

In [None]:
# Prompt for RAG over both vector stores: chunks from the patient's own record
# fill {user_info}, passages retrieved from the medical vector store fill
# {context}, and the question fills {input}.
prompt = ChatPromptTemplate.from_template("""
You are a helpful, respectful and honest medical bot. Always answer as
helpfully as possible, while being safe.

If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please don't share false information.

Answer the following question based only on the user info:

<user_info>
{user_info}
</user_info>

You can look at the context to see if it is relevant and can help the medical suggestion you're making.
<context>
{context}
</context>

Question: {input}""")

# Retriever for the medical vector store
vector_db_retriever = vector_db.as_retriever()

# Retrieval chain that answers questions using the medical vector retriever.
# NOTE: this rebinds `document_chain` / `retrieval_chain` from the baseline
# section; from here on the chain also expects a "user_info" input.
document_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(vector_db_retriever, document_chain)

# Sample question for the LLM. Named `question` rather than `input` so the
# builtin `input()` is not shadowed for the rest of the notebook session.
question = "What is are the symptoms of the person in question?"

# Retrieve the relevant chunks of the patient's record from the user store
docs_and_scores = vector.similarity_search_with_score(question)
user_info = [doc.page_content for doc, _score in docs_and_scores]

# Join the chunks into one block of text: interpolating the list itself into
# the prompt would render its Python repr (brackets, quotes, escaped newlines)
# instead of readable text.
user_info_text = "\n\n".join(user_info)

# Generate the response with the medical retrieval chain
response = retrieval_chain.invoke({"input": question, "user_info": user_info_text})
print(response["answer"])

Based on the provided user information, the following symptoms can be identified for the person in question:

1. Pain around the navel for around 2-3 days.
2. The pain is not constant and comes and goes.
3. No medication or examination has been done yet.
4. Stool is normal.
5. No symptoms of vomiting or fainting.
6. There is a feeling of bloating and decreased appetite.
7. Initially, there were symptoms similar to diarrhea, but it has improved.

Additionally, based on the context provided, the following symptoms are also relevant to the conditions mentioned:

1. Two or more gastrointestinal symptoms (navel pain, bloating, and earlier diarrhea symptoms)
2. No specific sexual symptom is mentioned in the user information.
3. There is no pseudoneurological symptom mentioned in the user information.

The context also mentions that when symptoms persist, there can be symptoms of anxiety, depression, intolerance of noise, emotional excitement, and crowds, as well as other symptoms such as ten