In [11]:
""" LangChain Packages"""
from langchain.vectorstores import Chroma
from langchain.embeddings.vertexai import VertexAIEmbeddings
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

""" Chroma packages"""
from chromadb import EmbeddingFunction, Embeddings
import chromadb
from chromadb import PersistentClient

import json, requests
from tqdm import tqdm

""" Chatbot Interface """
from IPython.display import HTML, Markdown, display
import ipywidgets as widgets
from datetime import datetime
import base64
import markdown

## Extraction of JSON Articles as Langchain Documents

We load the authoritative veterinary literature from GitHub.
The `merck-articles.json` file is the result of scraping [https://www.merckvetmanual.com/](https://www.merckvetmanual.com/) for relevant veterinary articles.

Details can be found in the [respective repository](https://github.com/JohannesSchulz97/WebScraper).

In [4]:
url = "https://raw.githubusercontent.com/JohannesSchulz97/WebScraper/refs/heads/main/data/merck-articles.json"

# Retrieve the veterinary articles dataset from GitHub.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error response into an immediate, readable failure instead of a
# confusing JSON decoding error further down.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.json()

# Each entry is read through its "content", "name" and "link" keys below.
word_count = sum(len(article["content"].split()) for article in data)
print(f"Loaded {len(data)} articles with a total word count of {word_count}.")

# Create LangChain documents to be uploaded to the vector store later on.
documents = [
    Document(
        page_content=entry["content"],
        metadata={"name": entry["name"], "link": entry["link"]},
    )
    for entry in data
]

Loaded 2895 articles with a total word count of 2836584.


## Setup Chroma Database

In [8]:
db_name = "veterinary_articles"

# Extract content and metadata from the LangChain documents.
contents = [doc.page_content for doc in documents]
metadata = [doc.metadata for doc in documents]
# NOTE(review): these link-based ids are not used by the upload below, which
# assigns positional "doc_<n>" ids instead. Kept so existing references still work.
ids = [doc.metadata["link"] for doc in documents]

# Create a persistent Chroma DB client, if it has not been created already.
if "chroma_client" not in globals():
    chroma_client = PersistentClient(path="./chroma_db")

# Get (or create) the Chroma collection.
# NOTE(review): no embedding_function is passed here, so the collection does not
# use any custom embedding function from this notebook - confirm this is intended.
db = chroma_client.get_or_create_collection(name=db_name)

# Upload content in batches to stay within Chroma's defined limits.
# upsert (rather than add) keeps the cell re-runnable: the client is persistent,
# so on a second run the same "doc_<n>" ids already exist in the collection.
batch_size = 100
for i in tqdm(range(0, len(contents), batch_size)):
    batch_docs = contents[i:i + batch_size]
    batch_meta = metadata[i:i + batch_size]
    batch_ids = [f"doc_{j}" for j in range(i, i + len(batch_docs))]  # ensure unique IDs

    db.upsert(
        documents=batch_docs,
        metadatas=batch_meta,
        ids=batch_ids,
    )

100%|███████████████████████████████████████████| 29/29 [01:30<00:00,  3.12s/it]


In [9]:
from enum import Enum

class QueryType(Enum):
    """Kind of instruction text that is prepended to a user query.

    ``on_submit`` picks the member from the number of chat turns so far, and
    ``augment_query`` uses it to decide which instruction prompt (if any) to put
    in front of the question.
    """
    # First message of a conversation: the full instruction prompt is prepended.
    INITIAL = 1
    # Periodic refresher: the short reminder prompt is prepended.
    REMINDER = 2
    # Any other turn: the question is sent without instruction text.
    NO_INSTRUCTIONS = 3


# Full instruction text; augment_query prepends it to the question for
# QueryType.INITIAL. It ends with a blank line so the question starts on its own line.
initial_instruction_prompt = """You are a knowledgeable, friendly, and responsible veterinary chatbot designed to assist with questions related to veterinary science. 
These questions may range from highly practical concerns of pet owners caring for sick animals to more theoretical or academic inquiries about veterinary topics.

When responding to practical health-related questions, please keep in mind that the user may not provide all the necessary information to properly assess the animal’s condition. 
In such cases, ask relevant and specific follow-up questions to gather additional information before making any conclusive statements or offering advice. 
Your follow-up questions should be informed by the veterinary articles provided to you.

Always prioritize accuracy and caution in your responses. Do not make assumptions or offer definitive diagnoses without sufficient information. 
Avoid making false claims, as incorrect information in this domain can have serious consequences for animal health and safety.

You will be given veterinary documents and their URLs. These articles may or may not be relevant to the user’s query—use them only if they are clearly applicable. 
When you make statements and claims, reference the respective article.
Also, do not spam the same citation again and again, but rather make sure that it is cited where most appropriate and at least once.
List all sources in a separate section below your response.
The source citations should be in a consistent format (ideally the one that uses numbers) that is typical in scientific literature.
In general your statements about veterinary science should only be based on the provided documents!

To summarize:
	•	If you need more information before answering, keep your reply brief and focused on a few essential follow-up questions needed to proceed.
	•	Only base your statements on the provided articles and prioritize accuracy and caution.
	•	Only reference veterinary documents when they clearly apply and cite them as prescribed.
    •	Maintain a conversational and empathetic tone—your goal is to support and educate pet owners in a reassuring way.

"""

# Short refresher of the instructions; augment_query prepends it to the question
# for QueryType.REMINDER. Note that the text does not end with a newline.
reminder_instruction_prompt =  """Reminder: You are a veterinary chatbot. 
Please continue to provide helpful, accurate, and cautious responses to both practical and theoretical veterinary questions. 
When a user input lacks crucial detail, ask relevant follow-up questions before offering advice. 
Only reference veterinary documents when they clearly apply and cite them as prescribed. 
Additionally, make sure that all your replies are based on the provided documents.
Maintain a clear, empathetic tone for non-expert pet owners."""

def retrieve_relevant_articles(query, n_results=10):
    """Retrieve the articles most relevant to ``query`` from the Chroma collection.

    The query text is handed to the collection's ``query`` method, which returns
    the stored articles that best match it.

    Args:
        query: The user's question as plain text.
        n_results: Maximum number of articles to retrieve. Defaults to ten.

    Returns:
        A tuple ``(links, names, documents)`` of three parallel lists: the
        ``"link"`` and ``"name"`` metadata values and the text of each match.
    """
    # NOTE(review): `embed_fn` is not defined in the visible part of this notebook;
    # presumably it is a custom embedding function with a query/document switch.
    # Confirm it exists on a fresh kernel and is actually attached to `db`.
    embed_fn.document_mode = False
    results = db.query(
        query_texts=[query],
        n_results=n_results,
        include=['documents', 'metadatas'],
    )
    # Exactly one query text was sent, so each field holds a single inner list.
    [matched_documents], [matched_metadata] = results['documents'], results['metadatas']
    links = [entry['link'] for entry in matched_metadata]
    names = [entry['name'] for entry in matched_metadata]
    return links, names, matched_documents
    
"""
    Query augmentation varies, depending on the current turn. 
    Initially, we add lenghty and detailed instructions. After every ten rounds, we will remind the model of those
    with a shorter instruction reminder. 
    For all query types, we append relevant documents afeter the query.
"""
def augment_query(query, query_type: QueryType):
    query = query.replace("\n", " ")
    urls, names, documents = retrieve_relevant_articles(query)
    match query_type:
        case QueryType.INITIAL:
            prompt = initial_instruction_prompt + f"QUESTION: {query}\n\n"
        case QueryType.REMINDER:
            prompt = reminder_instruction_prompt + f"QUESTION: {query}\n\n"
        case QueryType.NO_INSTRUCTIONS:
            prompt = f"QUESTION: {query}\n\n"
    # Add the retrieved documents to the prompt.
    for url, name, document in zip(urls, names, documents):
        document = document.replace("\n", " ")
        prompt += f"url: {url}\n"
        prompt += f"name: {name}\n"
        prompt += f"document: {document}\n\n"
    return prompt


## Chat Interface

### Logic for the Chatbot/Chatbot Interface

In [14]:
def on_submit(change):
    """Event handler for query submission; the core logic of the user interface.

    It shows the user's message, augments the query with relevant articles (plus
    an instruction prompt when needed), sends it to the chat model and shows the
    model's reply.

    Args:
        change: The change dictionary passed by the widget observer. Only
            ``change["new"]`` (the submitted text) is read.
    """
    user_input = change["new"]
    if not user_input.strip():
        return
    # Clearing the box fires this observer again with an empty value, which the
    # guard above ignores.
    input_widget.value = ""
    loading_widget.layout.display = "block"

    # try/finally guarantees the spinner is hidden again even when retrieval or
    # the model call raises; otherwise it would stay visible forever.
    try:
        with chat_output:
            display(HTML(format_user_message(user_input)))

        # NOTE(review): `chat` is not defined in the visible part of this notebook;
        # presumably its history holds two entries per completed turn - confirm.
        num_turns = len(chat.get_history())/2

        if num_turns == 0:
            query_type = QueryType.INITIAL
        elif (num_turns%10) == 0:
            # Refresh the instructions after every ten turns.
            query_type = QueryType.REMINDER
        else:
            query_type = QueryType.NO_INSTRUCTIONS
        augmented_query = augment_query(user_input, query_type)

        response = chat.send_message(augmented_query).text

        with chat_output:
            display(HTML(format_bot_message(response)))
    finally:
        loading_widget.layout.display = "none"


def format_user_message(msg):
    """Return the HTML snippet for a right-aligned user chat bubble.

    The snippet shows the user avatar (module-level ``dog_base64``), the current
    time and the message text.

    Args:
        msg: The text typed by the user. It is HTML-escaped so that characters
            such as ``<``, ``>`` and ``&`` are shown literally instead of being
            interpreted as markup.

    Returns:
        An HTML string.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    safe_msg = escape(msg)
    return f"""
    <div class="chat-message-right">
        <div>
            <img src="data:image/png;base64,{dog_base64}" class="rounded-circle" width="40" height="40">
            <div class="text-muted small text-nowrap mt-1">{datetime.now().strftime('%H:%M:%S')}</div>
        </div>
        <div class="chat-bubble">
            <div><strong>You</strong></div>
            {safe_msg}
        </div>
    </div>
    """

def format_bot_message(msg):
    """Return the HTML snippet for a left-aligned chatbot bubble.

    The snippet shows the assistant avatar (module-level ``vet_base64``), the
    current time and ``msg`` converted from Markdown to HTML.
    """
    timestamp = datetime.now().strftime('%H:%M:%S')
    rendered_reply = markdown.markdown(msg)
    return f"""
    <div class="chat-message-left">
        <div>
            <img src="data:image/png;base64,{vet_base64}" class="rounded-circle" width="40" height="40">
            <div class="text-muted small text-nowrap mt-1">{timestamp}</div>
        </div>
        <div class="chat-bubble">
            <div><strong>Veterinary Assistant Chatbot</strong></div>
            {rendered_reply}
        </div>
    </div>
    """


def img_to_base64_str(path):
    """Read the file at ``path`` and return its bytes as a base64 text string."""
    with open(path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


### Inject CSS styles into HTML frontend

`display`: “Render this object’s representation to the notebook’s output cell, using whatever method it defines.”
This can mean showing:

- a widget,
- a plot,
- some styled HTML,
- or, as in this case, HTML that is rendered only for its side effects (such as adding `<style>` tags or JavaScript).

In [15]:
# Rounded corners and padding for the <input> inside elements carrying the
# "rounded-input" class (added to the text box when the UI is created).
# NOTE: a later style block sets `font-size: 15px !important` on the same
# selector, so the 22px declared here is not the final size.
display(HTML("""
<style>
    .rounded-input input {
        border-radius: 20px !important;
        padding: 10px !important;
        font-size: 22px;
    }
</style>
"""))

# Font stack for all widget boxes. The first entry previously read 'S/egoe UI',
# which is not a font name, so it never matched.
display(HTML("""
<style>
    .widget-box {
        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    }
</style>
"""))


# Basic Bootstrap CSS (loaded from the jsDelivr CDN, so this needs network access)
# plus the layout of the chat bubbles: bot messages ("chat-message-left") are
# pushed to the left, user messages ("chat-message-right") are mirrored to the right.
display(HTML("""
<link rel="stylesheet"
      href="https://cdn.jsdelivr.net/npm/bootstrap@4.5.3/dist/css/bootstrap.min.css"
      integrity="sha384-TX8t27EcRE3e/ihU7zmQxVncDAy5uIKz4rEkgIXeMed4M0jlfIDPvg6uqKI2xXr2"
      crossorigin="anonymous">
<style>
    .chat-message-left, .chat-message-right {
        display: flex;
        flex-shrink: 0;
        margin-bottom: 10px;
    }
    .chat-message-left {
        margin-right: auto;
    }
    .chat-message-right {
        flex-direction: row-reverse;
        margin-left: auto;
    }
    .chat-bubble {
        max-width: 75%;
        border-radius: 15px;
        padding: 10px 15px;
        margin: 5px;
    }
    .chat-message-left .chat-bubble {
        background: #f1f0f0;
        font-size: 15px;
    }
    .chat-message-right .chat-bubble {
        background: #d1e7dd;
        font-size: 15px;
    }
</style>
"""))

# Frame around the whole chat (applied through the "chat-container" class).
# A stray "|" after the margin declaration was removed; it was invalid CSS.
display(HTML("""
<style>
    .chat-container {
        border: 2px solid #bbb;
        border-radius: 25px;
        padding: 20px;
        background-color: #f8f9fa;
        box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        max-width: 1200px; 
        margin: 20px auto;
    }
</style>
"""))

# Font size of the text input box (the selector targets ".rounded-input input",
# not the output area). The "!important" font size overrides the 22px set for
# the same selector in an earlier style block.
display(HTML("""
<style>
.rounded-input input {
    font-size: 15px !important;
    padding: 6px 10px;
}
</style>
"""))

### Create User Interface

In [16]:
# Text box for the user's question. With continuous_update disabled, the
# "value" observer runs on submit rather than on every keystroke.
input_widget = widgets.Text(
    placeholder="Ask something...",
    layout=widgets.Layout(width="100%", height="40px"),
    style={'description_width': 'initial'},
    continuous_update=False,
)
input_widget.add_class("rounded-input")
input_widget.observe(on_submit, names="value")

# Load the loading animation; the image widget starts hidden and is toggled by
# the submit handler.
with open("./data/img/loading.gif", "rb") as file:
    loading_image = file.read()

loading_widget = widgets.Image(
    value=loading_image,
    format="gif",
    width=24,
    height=24,
    layout={"display": "none", "margin": "-12px 10px 0px 10px"},
)

# Output area that receives the rendered chat messages.
chat_output = widgets.Output()

# Convert the avatar images to base64 strings once; the message formatters
# embed them in every chat bubble.
dog_base64 = img_to_base64_str("./data/img/dog.png")
vet_base64 = img_to_base64_str("./data/img/veterinarian.png")

# Layout: scrollable history on top, spinner and input box below.
history_row = widgets.HBox(
    [chat_output],
    layout=widgets.Layout(flex_flow="column-reverse", height="750px", overflow="auto"),
)
input_row = widgets.HBox([loading_widget, input_widget])
chat_layout = widgets.VBox([history_row, input_row])

# Wrap everything in the styled container and show it.
chat_container = widgets.VBox([chat_layout])
chat_container.add_class("chat-container")
display(chat_container)

VBox(children=(VBox(children=(HBox(children=(Output(),), layout=Layout(flex_flow='column-reverse', height='750…

In [None]:
I have a small puppy. I would like to train him so that he respects my boundaries and is generally well behaved, while not being too strict in correcting him. Any advice?