In [None]:
from google import genai
from google.genai import types

from IPython.display import HTML, Markdown, display

from google.api_core import retry

""" LangChain Packages"""
from langchain.vectorstores import Chroma
from langchain.embeddings.vertexai import VertexAIEmbeddings
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from chromadb import EmbeddingFunction, Embeddings
import chromadb
import json
from tqdm import tqdm

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.10/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.10/u/import-error

In [None]:
from openai import OpenAI
import os

# OpenRouter-backed OpenAI client; the key is read from the environment so it
# never appears in the notebook.
api_key = os.getenv("OPENROUTER_API_KEY")
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key,
)


def is_retriable(e):
    """Return True when ``e`` is a genai API error that should be retried.

    Only ``genai.errors.APIError`` instances whose ``code`` is 429 or 503
    qualify (presumably rate-limit / service-unavailable responses).
    """
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Wrap generate_content in the retry policy exactly once. Without the marker,
# every re-run of this cell would nest another Retry layer around the method
# and multiply the number of attempts.
_current_generate_content = genai.models.Models.generate_content
if not getattr(_current_generate_content, "_retry_patched", False):
    _retrying_generate_content = retry.Retry(
        predicate=is_retriable)(_current_generate_content)
    _retrying_generate_content._retry_patched = True
    genai.models.Models.generate_content = _retrying_generate_content

In [29]:
# Input JSON with the articles to index; the path points into the Kaggle
# input mount (file name suggests Merck articles).
file_path = "/kaggle/input/vetbot-sources/merck_articles.json"

def extract_contents_with_path(data, base_path="", metadata=None):
    """Collect every string stored under a ``"content"`` key as a Document.

    The structure is walked depth-first. Dict keys extend the path as
    ``parent.key`` and list positions as ``parent[i]``.

    Args:
        data: Nested combination of dicts and lists; any other value
            contributes nothing.
        base_path: Path of ``data`` inside the overall structure.
        metadata: Optional dict copied into each Document's metadata.

    Returns:
        A list of Documents in traversal order. Each one carries a copy of
        ``metadata`` plus a ``"path"`` entry holding the path of the
        container that owns the ``"content"`` key.
    """
    if isinstance(data, dict):
        found = []
        for field, payload in data.items():
            if field == "content" and isinstance(payload, str):
                # Copy so that sibling documents never share one metadata dict.
                if metadata:
                    leaf_metadata = metadata.copy()
                else:
                    leaf_metadata = {}
                # The path stops at the parent of the "content" key.
                leaf_metadata["path"] = base_path
                found.append(Document(page_content=payload, metadata=leaf_metadata))
                continue
            child_path = f"{base_path}.{field}" if base_path else field
            found += extract_contents_with_path(payload, child_path, metadata)
        return found

    if isinstance(data, list):
        return [
            doc
            for index, element in enumerate(data)
            for doc in extract_contents_with_path(element, f"{base_path}[{index}]", metadata)
        ]

    # Scalars (and anything else) hold no nested content.
    return []

# Load the JSON. The encoding is given explicitly because open() otherwise
# falls back to the platform's locale encoding, which can corrupt or reject
# non-ASCII text in the articles.
with open(file_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

# Extract documents with full metadata paths: each top-level item contributes
# one Document per "content" string found inside its "content" tree, tagged
# with the item's name and link.
docs = []
for item in raw_data:
    metadata = {
        "name": item.get("name"),
        "link": item.get("link"),
    }
    content_dict = item.get("content", {})
    docs.extend(
        extract_contents_with_path(content_dict, base_path="dog-owners", metadata=metadata)
    )

In [30]:
# Embedding model requested through the google-genai client
# (``client.models.embed_content``).
embedding_model = "models/text-embedding-004"

class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``client.models.embed_content``.

    The module-level ``client`` must be a google-genai client when the function
    is called; it is looked up at call time, not when the class is defined.
    """

    # Specify whether to generate embeddings for documents, or queries
    document_mode = True

    @retry.Retry(predicate=is_retriable)
    def __call__(self, input: list[str]) -> Embeddings:
        """Embed a batch of texts.

        Args:
            input: The texts to embed (the collection passes lists of strings).

        Returns:
            One embedding vector per input text, in input order.
        """
        # The task type tells the model whether the texts are corpus entries
        # or search queries.
        embedding_task = "retrieval_document" if self.document_mode else "retrieval_query"

        response = client.models.embed_content(
            model=embedding_model,
            contents=input,
            config=types.EmbedContentConfig(
                task_type=embedding_task,
            ),
        )
        return [e.values for e in response.embeddings]

In [31]:
db_name = "veterinary_articles"

# One embedding-function instance is shared by ingestion and querying; it
# starts in document mode for ingestion.
embed_fn = GeminiEmbeddingFunction()
embed_fn.document_mode = True

# Extract content, metadata, and generate ids
documents = [doc.page_content for doc in docs]
metadata = [doc.metadata for doc in docs]
# NOTE: several documents can share one article link, so these values are not
# unique per document; the ingestion cell builds its own "doc_<n>" ids.
ids = [doc.metadata["link"] for doc in docs]


In [32]:
chroma_client = chromadb.Client()
db = chroma_client.get_or_create_collection(name=db_name, embedding_function=embed_fn)

# Ingestion must embed with the document task type. Set it explicitly because
# retrieve_relevant_articles switches the shared embed_fn to query mode, so a
# re-run of this cell after a query would otherwise embed the corpus as queries.
embed_fn.document_mode = True

# NOTE(review): 100 texts per request presumably matches an embedding API
# batch limit - confirm before raising it.
batch_size = 100
for i in tqdm(range(0, len(documents), batch_size)):
    batch_docs = documents[i:i+batch_size]
    batch_meta = metadata[i:i+batch_size]
    # Positional ids are unique across the whole corpus.
    batch_ids = [f"doc_{j}" for j in range(i, i + len(batch_docs))]

    db.add(
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids
    )

100%|██████████| 10/10 [00:10<00:00,  1.05s/it]


In [33]:
# TODO: better urls (include subsections)

# Long instruction prompt describing the chatbot's role, sourcing rules and tone.
initial_instruction_prompt = """You are a knowledgeable, friendly, and responsible veterinary chatbot designed to assist with questions related to veterinary science. These questions may range from highly practical concerns of pet owners caring for sick animals to more theoretical or academic inquiries about veterinary topics.

When responding to practical health-related questions, please keep in mind that the user may not provide all the necessary information to properly assess the animal’s condition. In such cases, ask relevant and specific follow-up questions to gather additional information before making any conclusive statements or offering advice. Your follow-up questions should be informed by the veterinary articles provided to you.

Always prioritize accuracy and caution in your responses. Do not make assumptions or offer definitive diagnoses without sufficient information. Avoid making false claims, as incorrect information in this domain can have serious consequences for animal health and safety.

You will be given veterinary documents and their URLs. These articles may or may not be relevant to the user’s query—use them only if they are clearly applicable. 
When you do draw from them, reference them in a scientific and transparent manner, listing the sources below your response.
Your statements about veterinary science should only be based on the provided documents!

When answering:
	•	Be comprehensive and provide relevant background information where helpful.
	•	Use clear, simple language suited for a general audience with no technical background.
	•	Maintain a conversational and empathetic tone—your goal is to support and educate pet owners in a reassuring way.

If you need more information before answering, keep your reply brief and focused on the follow-up questions needed to proceed."""

# Condensed restatement of the same rules.
# NOTE(review): not referenced by any code visible in this notebook; the ToDo
# list mentions sending a shorter reminder every ten turns.
reminder_instruction_prompt =  """Reminder: You are a veterinary chatbot. 
Please continue to provide helpful, accurate, and cautious responses to both practical and theoretical veterinary questions. 
When a user input lacks crucial detail, ask relevant follow-up questions before offering advice. 
Only reference veterinary documents when they clearly apply, and cite them appropriately. 
Additionally, make sure that all your replies are based on the provided documents.
Maintain a clear, empathetic tone for non-expert pet owners."""

def retrieve_relevant_articles(query, n_results=10):
    """Look up the stored documents closest to ``query``.

    The shared ``embed_fn`` is switched to query mode only for the duration of
    the lookup and then restored, so a later ingestion does not silently embed
    documents with the query task type.

    Args:
        query: The search text.
        n_results: Number of documents to request from the collection.

    Returns:
        A ``(links, names, documents)`` tuple of three parallel lists.
    """
    previous_mode = embed_fn.document_mode
    embed_fn.document_mode = False
    try:
        results = db.query(query_texts=[query], n_results=n_results, include=['documents', 'metadatas'])
    finally:
        embed_fn.document_mode = previous_mode
    # One query text was sent, so each result field holds exactly one inner list.
    [documents], [metadata] = results['documents'], results['metadatas']
    return [x['link'] for x in metadata], [x['name'] for x in metadata], documents
# Smoke-test the retriever. The results use sample_* names so that the
# corpus-wide `documents` list used for ingestion is not overwritten.
sample_urls, sample_names, sample_documents = retrieve_relevant_articles("What are the symptoms of rabies?")

def augment_query(query, instruction_prompt=None):
    """Build the full model prompt: instructions, question, retrieved articles.

    Args:
        query: The user's question; newlines are flattened to spaces.
        instruction_prompt: Instruction text placed before the question.
            Defaults to ``initial_instruction_prompt``.

    Returns:
        The prompt string: the instructions, a ``QUESTION:`` line, then one
        ``url`` / ``name`` / ``document`` triple per retrieved article.
    """
    if instruction_prompt is None:
        instruction_prompt = initial_instruction_prompt

    query = query.replace("\n", " ")
    urls, names, documents = retrieve_relevant_articles(query)

    # A blank line keeps the question from running into the last instruction.
    sections = [f"{instruction_prompt}\n\nQUESTION: {query}\n\n"]
    # Add the retrieved documents to the prompt.
    for url, name, document in zip(urls, names, documents):
        flattened = document.replace("\n", " ")
        sections.append(f"url: {url}\nname: {name}\ndocument: {flattened}\n\n")
    return "".join(sections)


# ToDos:
- Clean up the notebook: reorder cells and write descriptions.
- Send the long initial instruction prompt once, then a shorter reminder message every ten turns.
- Clear the chat history when the cell is re-run.
- The loading GIF should have the same grey background color as the rest of the interface.
- URLs should be complete, including sections.
- Re-evaluate what is left to do.

## Chat Interface

In [34]:
# Load required libraries
from IPython.display import display, HTML
import ipywidgets as widgets
from datetime import datetime
import base64
import markdown

### Logic for the Chatbot/Chatbot Interface

In [35]:
# Event handler
def on_submit(change):
    """Handle a new value of the text input: show it, ask the model, show the reply.

    Args:
        change: Change notification from the input widget; ``change["new"]``
            holds the submitted text.

    Blank submissions are ignored. This includes the notification caused by
    this handler clearing the input box.
    """
    user_input = change["new"]
    if not user_input.strip():
        return
    input_widget.value = ""
    loading_widget.layout.display = "block"

    try:
        with chat_output:
            display(HTML(format_user_message(user_input)))
        augmented_query = augment_query(user_input)
        response = chat.send_message(augmented_query).text

        with chat_output:
            display(HTML(format_bot_message(response)))
    finally:
        # Always hide the spinner, even when retrieval or the model call fails;
        # otherwise the interface looks busy forever.
        loading_widget.layout.display = "none"

def format_user_message(msg):
    """Return the HTML for one right-aligned chat row showing the user's text.

    Args:
        msg: Raw text typed by the user. It is HTML-escaped before
            interpolation so that characters such as ``<`` or ``&`` are shown
            literally instead of being interpreted as markup.

    Returns:
        An HTML fragment with the user avatar, the current time (HH:MM:SS)
        and the escaped message in a chat bubble.
    """
    import html  # local import keeps this fix self-contained

    safe_msg = html.escape(msg)
    return f"""
    <div class="chat-message-right">
        <div>
            <img src="data:image/png;base64,{dog_base64}" class="rounded-circle" width="40" height="40">
            <div class="text-muted small text-nowrap mt-1">{datetime.now().strftime('%H:%M:%S')}</div>
        </div>
        <div class="chat-bubble">
            <div><strong>You</strong></div>
            {safe_msg}
        </div>
    </div>
    """

def format_bot_message(msg):
    """Return the HTML for one left-aligned chat row showing the bot's reply.

    Args:
        msg: Markdown text of the reply; it is converted to HTML with
            ``markdown.markdown`` before being placed in the bubble.

    Returns:
        An HTML fragment with the vet avatar, the current time (HH:MM:SS)
        and the rendered reply in a chat bubble.
    """
    sent_at = datetime.now().strftime('%H:%M:%S')
    rendered_reply = markdown.markdown(msg)
    return f"""
    <div class="chat-message-left">
        <div>
            <img src="data:image/png;base64,{vet_base64}" class="rounded-circle" width="40" height="40">
            <div class="text-muted small text-nowrap mt-1">{sent_at}</div>
        </div>
        <div class="chat-bubble">
            <div><strong>Bot</strong></div>
            {rendered_reply}
        </div>
    </div>
    """


def img_to_base64_str(path):
    """Read the file at ``path`` and return its bytes as a base64 text string.

    Args:
        path: Location of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's contents, decoded as UTF-8.
    """
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


### Inject CSS styles into HTML frontend

`display`: “Render this object’s representation to the notebook’s output cell, using whatever method it defines.”
This can mean showing:
- a widget,
- a plot,
- some styled HTML,
- or, in this case, just executing HTML that has side effects (like adding `<style>` tags or JavaScript).

In [1]:
# Style the <input> inside elements carrying the "rounded-input" class
# (the chat text box adds that class): rounded corners, padding, larger font.
display(HTML("""
<style>
    .rounded-input input {
        border-radius: 20px !important;
        padding: 10px !important;
        font-size: 16px;
    }
</style>
"""))

# Set the font stack for elements with the "widget-box" class.
display(HTML("""
<style>
    .widget-box {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
</style>
"""))


# Basic Bootstrap CSS
# Loads Bootstrap 4.5.3 from the jsDelivr CDN (pinned with an integrity hash)
# and defines the chat row layout used by format_user_message /
# format_bot_message: left-aligned rows for the bot, reversed right-aligned
# rows for the user, and differently coloured bubbles for each side.
display(HTML("""
<link rel="stylesheet"
      href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap.min.css"
      integrity="sha384-TX8t27EcRE3e/ihU7zmQxVncDAy5uIKz4rEkgIXeMed4M0jlfIDPvg6uqKI2xXr2"
      crossorigin="anonymous">
<style>
    .chat-message-left, .chat-message-right {
        display: flex;
        flex-shrink: 0;
        margin-bottom: 10px;
    }
    .chat-message-left {
        margin-right: auto;
    }
    .chat-message-right {
        flex-direction: row-reverse;
        margin-left: auto;
    }
    .chat-bubble {
        max-width: 75%;
        border-radius: 15px;
        padding: 10px 15px;
        margin: 5px;
    }
    .chat-message-left .chat-bubble {
        background: #f1f0f0;
    }
    .chat-message-right .chat-bubble {
        background: #d1e7dd;
    }
</style>
"""))

# Frame for the whole chat widget: border, rounded corners, padding, light
# background and a drop shadow for elements with the "chat-container" class.
display(HTML("""
<style>
    .chat-container {
        border: 2px solid #bbb;
        border-radius: 25px;
        padding: 20px;
        margin: 20px 0;
        background-color: #f8f9fa;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
    }
</style>
"""))

NameError: name 'HTML' is not defined

In [2]:
#from google.generativeai.types import Content, Part


# Rebind `client` to a google-genai client. The name was bound earlier to an
# OpenAI client pointed at OpenRouter; GeminiEmbeddingFunction.__call__ reads
# this global through client.models.embed_content.
# NOTE(review): GOOGLE_API_KEY is not defined anywhere in the visible part of
# the notebook - confirm where it is set.
client = genai.Client(api_key=GOOGLE_API_KEY)
# Start a chat session with an empty history; on_submit sends every user turn
# through chat.send_message.
chat = client.chats.create(model='gemini-2.0-flash', history=[])
# Disabled alternative that would seed the history with the initial
# instruction prompt (depends on the commented-out import at the top):
#chat = client.chats.create(model='gemini-2.0-flash', 
#                           history=[Content(role="user", parts=[Part(text=initial_instruction_prompt)]),
#                           ])

NameError: name 'genai' is not defined

In [38]:
# Avatars are embedded in every message, so encode them to base64 a single time.
dog_base64 = img_to_base64_str("/kaggle/input/vetbot-sources/dog.png")
vet_base64 = img_to_base64_str("/kaggle/input/vetbot-sources/veterinarian.png")

# Area that collects the rendered chat messages.
chat_output = widgets.Output()

# Spinner shown while a reply is pending; it starts hidden.
with open("/kaggle/input/vetbot-sources/loading.gif", "rb") as gif_file:
    loading_image = gif_file.read()

loading_widget = widgets.Image(
    value=loading_image,
    format="gif",
    width=24,
    height=24,
    layout={"display": "none", "margin": "0px 10px"},
)

# Text entry: the value is only synced on submit (continuous_update off), and
# each new value is passed to on_submit.
input_widget = widgets.Text(
    placeholder="Ask something...",
    layout=widgets.Layout(width="100%", height="40px"),
    style={'description_width': 'initial'},
    continuous_update=False,
)
input_widget.add_class("rounded-input")
input_widget.observe(on_submit, names="value")

# Assemble the interface: message history on top, spinner plus input below.
history_row = widgets.HBox(
    [chat_output],
    layout=widgets.Layout(flex_flow="column-reverse", height="600px", overflow="auto"),
)
entry_row = widgets.HBox([loading_widget, input_widget])
chat_layout = widgets.VBox([history_row, entry_row])

# The outer box carries the "chat-container" CSS class.
chat_container = widgets.VBox([chat_layout])
chat_container.add_class("chat-container")
display(chat_container)

VBox(children=(VBox(children=(HBox(children=(Output(),), layout=Layout(flex_flow='column-reverse', height='600…