In [None]:
!pip install --upgrade langchain
!pip install --upgrade langchain_core
!pip install --upgrade langchain_community
!pip install --upgrade langchain_google_genai

In [None]:
!pip install -U langchain-chroma

In [None]:
!pip install gradio

In [None]:
!pip install llama-cpp-python

In [6]:
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import LlamaCpp
import os
import time
import pandas as pd
import gradio as gr

### First Test

In [7]:
# Step 1 - create the Gemini chat client; the API key is read from the Colab secrets store.
chat_model = ChatGoogleGenerativeAI(
    model='gemini-2.5-flash',
    temperature=0,
    google_api_key=userdata.get('GOOGLE_API_KEY'),
)

# Step 2 - assemble the conversation.
# The SystemMessage fixes the assistant's role and restrictions (leave it out for unrestricted behaviour);
# the HumanMessage carries the user's actual query.
messages = [
    SystemMessage(
        content="You are an assistant knowledgeable about healthcare. Only answer healthcare-related questions."
    ),
    HumanMessage(content="What is Malaria?"),
]

# Step 3 - send the messages to the model and print only the text of the reply.
result = chat_model.invoke(messages)
print(result.content)

Malaria is a serious and sometimes fatal disease caused by a parasite that commonly infects a certain type of mosquito, which then feeds on humans.

Here's a breakdown:

1.  **Cause:** It is caused by Plasmodium parasites. There are five species that infect humans, with *Plasmodium falciparum* being the most dangerous and responsible for most malaria-related deaths worldwide.
2.  **Transmission:** Malaria is transmitted through the bite of an infected female *Anopheles* mosquito. When an infected mosquito bites a person, it injects the parasites into the bloodstream.
3.  **Life Cycle:** Once in the human body, the parasites travel to the liver, where they mature and multiply. After a period, they leave the liver and infect red blood cells, where they continue to multiply, causing the red blood cells to burst. This cycle leads to the characteristic symptoms of malaria.
4.  **Symptoms:** Symptoms typically appear 10 days to 4 weeks after infection, though they can appear as early as 7 da

In [8]:
# Call the model with a bare string (no SystemMessage); as the cell's last expression,
# the returned message object is displayed as-is rather than just its text.
chat_model.invoke("What is red blood cell?")

AIMessage(content="A **red blood cell (RBC)**, also known as an **erythrocyte**, is the most common type of blood cell and the principal means of delivering oxygen to the body tissues via the blood.\n\nHere's a breakdown of its key characteristics and functions:\n\n1.  **Primary Function: Oxygen Transport**\n    *   Red blood cells pick up oxygen from the lungs.\n    *   They transport this oxygen to all the tissues and organs throughout the body, which need oxygen to produce energy.\n    *   They also help transport carbon dioxide (a waste product of metabolism) from the tissues back to the lungs to be exhaled.\n\n2.  **Hemoglobin**\n    *   The distinctive red color of blood comes from **hemoglobin**, an iron-rich protein found inside red blood cells.\n    *   Hemoglobin is crucial because it's what actually binds to oxygen molecules in the lungs and releases them in the tissues. It also binds to carbon dioxide.\n\n3.  **Unique Shape**\n    *   Red blood cells have a unique **biconca

### Second Test

In [None]:
# Repeat the first test with a healthcare-only system prompt, but ask an off-topic question.
chat_model = ChatGoogleGenerativeAI(
    model='gemini-2.5-flash',
    temperature=0,
    google_api_key=userdata.get('GOOGLE_API_KEY'),
)

messages = [
    SystemMessage(
        content="You're an assistant knowledgeable about healthcare. Only answer healthcare-related questions."
    ),
    HumanMessage(content="Which country is the largest?"),
]

# Print only the text of the reply.
result = chat_model.invoke(messages)
print(result.content)

I can only answer healthcare-related questions.


### Chat Prompt Template for dynamic data.

In [10]:
# 1. Chat model
chat_model = ChatGoogleGenerativeAI(
    model='gemini-2.5-flash',
    temperature=0,
    google_api_key=userdata.get('GOOGLE_API_KEY'),
)

# 2. Prompt text with {context} and {question} placeholders, turned into a chat prompt template
instruction_str = """Your job is to use patient reviews to answer questions about their experience at a hospital.
Use the following context to answer questions. Be as detailed as possible, but don't make up any information that's not from the context.
If you don't know an answer, say you don't know.

Context: {context}

Question: {question}
"""

review_template = ChatPromptTemplate.from_template(instruction_str)

# 3. Values that will fill the two placeholders
context = "The discharge process was seamless!"
question = "Did anyone have a positive experience?"

# 4. Pipeline: prompt template -> chat model -> output parser (yields a plain string)
chain = review_template | chat_model | StrOutputParser()

# 5. Run the pipeline with the placeholder values and print the answer
result = chain.invoke({"context": context, "question": question})
print(result)

Yes, someone had a positive experience. They described the discharge process as "seamless."


### Using Prompt Templates & Message Templates

In [None]:
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=userdata.get('GOOGLE_API_KEY'),
)

# System-side instructions; only {context} is filled in here, the question is sent as a separate human message.
instruction_str = """Your job is to use patient reviews to answer questions about their experience at a hospital.
Use the following context to answer questions.
Be as detailed as possible, but don't make up any information that's not from the context.
If you don't know an answer, say you don't know.

Context: {context}
"""

# System message template: the instructions plus the review context.
review_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(template=instruction_str, input_variables=['context'])
)

# Human message template: just the user's question.
review_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(template="{question}", input_variables=['question'])
)

messages = [review_system_prompt, review_human_prompt]

# Reusable chat prompt combining both message templates (later cells use it as well).
review_prompt_template = ChatPromptTemplate(
    messages=messages,
    input_variables=["context", "question"],
)

# Sample input for a quick check
context = "The staff was very rude, also the prices are through the roof!"
question = "Did anyone have a positive or bad experience?"

# Pipeline: prompt -> model -> string parser
chain = review_prompt_template | chat_model | StrOutputParser()

# Run it and print the answer
result = chain.invoke({"context": context, "question": question})
print(result)



Based on the provided context, the patient had a bad experience. The staff was described as "very rude," and the prices were considered "through the roof."


In [12]:
# Reuse the same chain with a different review as context; the answer is shown as the cell's value.
context = "I had a good stay!"
question = "Did anyone have a positive experience?"

chain.invoke({"context": context, "question": question})

'Yes, one patient stated, "I had a good stay!"'

### Adding RAG

In [14]:
# CSV file holding the patient reviews
REVIEWS_CSV_PATH = "/content/data/reviews.csv"

# Directory where the Chroma vector database will be stored
REVIEW_CHROMA_PATH = 'chroma_data'

# Load the CSV rows as documents, with the `review` column passed as the loader's source column
loader = CSVLoader(file_path=REVIEWS_CSV_PATH, source_column='review')
reviews = loader.load()

# Gemini embedding model used to embed the reviews
embedding_function = GoogleGenerativeAIEmbeddings(
    model='models/gemini-embedding-001',
    google_api_key=userdata.get('GOOGLE_API_KEY'),
)

# Number of documents sent per batch
batch_size = 20
# Total number of batches (ceiling division of the document count by the batch size)
num_batches = -(-len(reviews) // batch_size)
# Filled in by the batching loop in the next cell
reviews_vector_db = None


This code uses the free `gemini-embedding-001` model. After processing the data the free quota is exhausted, so later in the notebook I'm switching to a local embedding model.

In [15]:
# Embed and store the reviews in batches so the embedding API's per-minute rate limit is respected.
# Relies on `reviews`, `batch_size`, `num_batches`, `embedding_function` and `REVIEW_CHROMA_PATH`
# from the previous cell.
pause_seconds = 30  # wait between two consecutive batches

for i in range(0, len(reviews), batch_size):
    batch_docs = reviews[i:i + batch_size]
    current_batch_num = i // batch_size + 1

    print(f"Processing batch {current_batch_num} / {num_batches}...")

    if i == 0:
        # The first batch creates the persisted Chroma store ...
        reviews_vector_db = Chroma.from_documents(
            documents=batch_docs,
            embedding=embedding_function,
            persist_directory=REVIEW_CHROMA_PATH
        )
    else:
        # ... and every later batch is appended to it.
        reviews_vector_db.add_documents(documents=batch_docs)

    if i + batch_size < len(reviews):
        # More batches follow: pause so the next request stays within the per-minute rate limit.
        print(f"Batch {current_batch_num} processed. Waiting for {pause_seconds} seconds...")
        time.sleep(pause_seconds)
    else:
        # Last batch: nothing left to send, so there is no reason to wait.
        print(f"Batch {current_batch_num} processed.")

if reviews:
    print("Vector database created successfully and saved to the specified directory.")
else:
    # With no documents the loop never ran, so no database was created.
    print("No reviews were loaded, so no vector database was created.")

Processing batch 1 / 51...
Batch 1 processed. Waiting for 30 seconds...
Processing batch 2 / 51...
Batch 2 processed. Waiting for 30 seconds...
Processing batch 3 / 51...
Batch 3 processed. Waiting for 30 seconds...
Processing batch 4 / 51...
Batch 4 processed. Waiting for 30 seconds...
Processing batch 5 / 51...
Batch 5 processed. Waiting for 30 seconds...
Processing batch 6 / 51...
Batch 6 processed. Waiting for 30 seconds...
Processing batch 7 / 51...
Batch 7 processed. Waiting for 30 seconds...
Processing batch 8 / 51...
Batch 8 processed. Waiting for 30 seconds...
Processing batch 9 / 51...
Batch 9 processed. Waiting for 30 seconds...
Processing batch 10 / 51...
Batch 10 processed. Waiting for 30 seconds...
Processing batch 11 / 51...
Batch 11 processed. Waiting for 30 seconds...
Processing batch 12 / 51...
Batch 12 processed. Waiting for 30 seconds...
Processing batch 13 / 51...
Batch 13 processed. Waiting for 30 seconds...
Processing batch 14 / 51...
Batch 14 processed. Waiting 

Switching the embedding model to local `all-mpnet-base-v2` for free embeddings.

In [None]:
# Read the reviews CSV and wrap every value of the `review` column in a Document
df = pd.read_csv("/content/data/reviews.csv")
reviews_docs = [Document(page_content=review_text) for review_text in df["review"]]

# Embedding model loaded through HuggingFaceEmbeddings
model_name = "sentence-transformers/all-mpnet-base-v2"
embedding_model = HuggingFaceEmbeddings(model_name=model_name)

# Build the Chroma database in one call and report how long it took
start = time.time()

print("Creating the Database...")
reviews_vector_db = Chroma.from_documents(
    reviews_docs,
    embedding=embedding_model,
    persist_directory="/content/chroma_db_mpnet",
)

print(f"Time taken: {time.time() - start} seconds.")
print("Vector database created successfully!")

### Retrieval

The `similarity_search` method embeds the query itself, so the question can be passed as plain text.

In [17]:
# Fetch the three reviews most similar to the question and list them, numbered from 1.
question = "Has anyone complained about communication with the hospital staff?"
relevant_chunks = reviews_vector_db.similarity_search(question, k=3)

for position, chunk in enumerate(relevant_chunks, start=1):
    print(position, ":", chunk.page_content)

1 : I encountered some communication issues during my stay. The medical staff seemed disorganized, and it led to confusion about my treatment plan.
2 : I encountered some issues with the nursing staff's communication. It seemed like there was a lack of coordination, leading to confusion about my medication schedule and treatment plan.
3 : The hospital staff were friendly and attentive, making my stay more pleasant. However, there were occasional lapses in communication that caused confusion about my treatment plan.


In [18]:
# Retriever returning the 10 most similar reviews per query.
# The result count has to be supplied through `search_kwargs`; a top-level `k=10`
# is not applied to the search, so the retriever would fall back to its default count.
reviews_retriever = reviews_vector_db.as_retriever(search_kwargs={"k": 10})

# Chain for querying and generating responses:
#   - "context" is filled with the reviews retrieved for the incoming question,
#   - "question" is the incoming question itself, passed through unchanged,
# then the prompt is rendered, sent to the chat model, and parsed to a plain string.
review_chain = (
    {"context": reviews_retriever, "question": RunnablePassthrough()}
    | review_prompt_template
    | chat_model
    | StrOutputParser()
)

In [19]:
# Ask the retrieval-backed chain; the answer string is displayed as the cell's value.
question = "Has anyone complained about communication with the hospital staff?"
review_chain.invoke(question)

"Yes, several patients have complained about communication with the hospital staff.\n\nSpecifically:\n*   One patient encountered communication issues during their stay, noting that the medical staff seemed disorganized, which led to confusion about their treatment plan.\n*   Another patient experienced issues with the nursing staff's communication, citing a lack of coordination that caused confusion about their medication schedule and treatment plan.\n*   Two separate reviews mentioned that while the hospital staff were friendly and attentive, there were occasional lapses in communication that caused confusion about their treatment plan."

### Adding a UI

In [None]:
def respond_to_user_question(question, history=None):
    """
    Respond to a user's question using the review_chain.

    Args:
        question: The user's latest message; passed unchanged to `review_chain`.
        history: Previous chat turns. `gr.ChatInterface` calls its `fn` with
            `(message, history)`, so the callback must accept this second
            argument. It is not used here: every question is answered
            independently of earlier turns. Defaults to None so the function
            can still be called with the question alone.

    Returns:
        Whatever `review_chain.invoke(question)` returns.
    """
    return review_chain.invoke(question)

Testing the `gemini-2.5-flash` chat model with the Gradio UI (the output cannot be rendered in the browser).

In [None]:
# Wrap the responder in a Gradio chat UI and start it.
# With debug=True the cell keeps running so errors and logs stay visible; interrupt the cell to stop the server.
interface = gr.ChatInterface(
    fn=respond_to_user_question,
    title="Review Helper Bot",
)
interface.launch(debug=True)

  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4acf096688f5242bd3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4acf096688f5242bd3.gradio.live




In [None]:
# Local directory that will hold the downloaded GGUF model file.
model_path = "/content/phi3_128k"

# Create the folder (no error if it already exists)
os.makedirs(model_path, exist_ok=True)

# Download only the Q4_K_M GGUF file from the repository into `model_path`.
# `$model_path` in the shell command refers to the Python variable defined above.
!hf download bartowski/Phi-3.1-mini-128k-instruct-GGUF \
    --include "Phi-3.1-mini-128k-instruct-Q4_K_M.gguf" \
    --local-dir "$model_path"

# List the directory contents to confirm the file arrived.
print("Download done, file at:", os.listdir(model_path))

Also switching the chat model from `gemini-2.5-flash` to `Phi-3.1-mini-128k-instruct-Q4_K_M`, because the free tier of Gemini has a limited token quota. `Phi-3.1-mini` is a local model that I've downloaded.

In [None]:
# Replace the Gemini chat model with the locally downloaded Phi-3.1-mini GGUF file, loaded via LlamaCpp.
chat_model = LlamaCpp(
    model_path="/content/phi3_128k/Phi-3.1-mini-128k-instruct-Q4_K_M.gguf",
    n_ctx=8096,          # context window reduced from 128k to ~8k because the free tier of Colab is limited on RAM
    # NOTE(review): 8096 may have been intended as 8192 - confirm
    temperature=0.0,
    max_tokens=512
)

In [None]:
# Rebuild the chain so it picks up the current `chat_model`; retriever and prompt are reused unchanged.
review_chain = (
    {
        "context": reviews_retriever,
        "question": RunnablePassthrough(),
    }
    | review_prompt_template
    | chat_model
    | StrOutputParser()
)

Testing `Phi-3.1-mini` (output not renderable)

In [23]:
# Start the Gradio chat UI again; `respond_to_user_question` now answers through the rebuilt `review_chain`.
# With debug=True the cell keeps running so errors and logs stay visible; interrupt the cell to stop the server.
interface = gr.ChatInterface(
    fn=respond_to_user_question,
    title="Review Helper Bot",
)
interface.launch(debug=True)

  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://e658227459ea7781bf.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://e658227459ea7781bf.gradio.live


