In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a clear message if the key is not configured.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or .env file."
    )

EMBED_MODEL = "text-embedding-3-large"
LLM_MODEL = "gpt-4.1-mini"      # cheap + fast for RAG
# NOTE(review): the text splitter further down is built with literal
# 1000/200 and does not read these two constants - confirm which values
# are intended.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Use the constants above so the model names are defined in exactly one place.
embeddings = OpenAIEmbeddings(model=EMBED_MODEL)
model = init_chat_model(LLM_MODEL)

In [2]:
from langchain_chroma import Chroma

# Chroma collection that embeds with `embeddings`; persist_directory is
# where the data is saved locally (drop it if persistence is not needed).
vector_store = Chroma(
    persist_directory="./processed/chroma_langchain_db",
    collection_name="example_collection",
    embedding_function=embeddings,
)

In [6]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# No tag/attribute criteria are passed to SoupStrainer, so no selective
# filtering is configured here. Pass criteria (e.g. name= or class_=) to
# restrict which parts of the page are parsed.
bs4_strainer = bs4.SoupStrainer()
loader = WebBaseLoader(
    web_paths=("https://www.forgottenhomeapothecary.com/",),
    bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()
# Exactly one URL was requested, so exactly one document is expected back.
assert len(docs) == 1, f"expected 1 document, got {len(docs)}"
print(f"Total characters: {len(docs[0].page_content)}")

Total characters: 49921


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping character chunks.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # record each chunk's index in the original document
    chunk_overlap=200,  # overlap between chunks, in characters
    chunk_size=1000,  # chunk size, in characters
)
all_splits = text_splitter.split_documents(docs)

print(f"Split site into {len(all_splits)} sub-documents.")

Split site into 66 sub-documents.


In [8]:
# Add the web-page chunks to the vector store and keep the returned IDs.
document_ids = vector_store.add_documents(all_splits)

# Show only a small sample of the IDs.
print(document_ids[:3])

['e14e6832-e62c-4602-9bee-3cf4d99e6762', '9c8c2e4f-5b4f-4e9c-8119-d33e5f82dfbb', '71ff5a14-5c87-49d8-9a86-ea741ea48c20']


In [9]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
from typing import List

def load_documents(data_path:str="data") -> List:
    """Load every ``*.pdf`` file found directly inside ``data_path``.

    Args:
        data_path: Directory to scan (non-recursive). Defaults to ``"data"``.

    Returns:
        A flat list holding whatever ``PyPDFLoader.load()`` returns for each
        file, concatenated in sorted path order. The list is empty when no
        PDF file is found.
    """
    documents = []
    # Sort the paths so the load order (and therefore the chunk order) does
    # not depend on filesystem enumeration order.
    for pdf_path in sorted(Path(data_path).glob("*.pdf")):
        documents.extend(PyPDFLoader(str(pdf_path)).load())
    return documents


# Load the PDFs and chunk them with the splitter defined earlier.
docs = load_documents()
all_splits = text_splitter.split_documents(docs)
print(f"Split PDFs into {len(all_splits)} sub-documents.")

# Add the PDF chunks to the vector store and show a sample of the IDs.
document_ids = vector_store.add_documents(all_splits)
print(document_ids[:3])

Split blog post into 1023 sub-documents.
['8273cf9b-ddfd-474d-b1f8-1fdd1210158b', '955146ac-f160-4da1-b799-0de73637efa0', '00a041b4-9433-40db-a039-a967974259bc']


In [10]:
from langchain.tools import tool

@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    # NOTE(review): the docstring is presumably used as the tool description,
    # so its wording is kept unchanged - confirm before editing it.
    hits = vector_store.similarity_search(query, k=2)

    # Render each hit as a "Source/Content" section.
    sections = []
    for hit in hits:
        sections.append(f"Source: {hit.metadata}\nContent: {hit.page_content}")

    # Return the joined text together with the retrieved documents themselves.
    return "\n\n".join(sections), hits

In [11]:
from langchain.agents import create_agent


tools = [retrieve_context]
# System prompt: describe what the retrieval tool actually searches. The
# vector store is filled from a web page and from PDF files, not a blog post.
prompt = (
    "You have access to a tool that retrieves context from the indexed "
    "documents (a web page and PDF files). "
    "Use the tool to help answer user queries."
)
agent = create_agent(model, tools, system_prompt=prompt)

In [17]:
# Adjacent string literals are concatenated as-is, so the space separating
# the two sentences has to be written explicitly.
query = (
    "What are the recipes for treating fever but external use? "
    "Please provide me with the description and list of Ingredients for that remedy and the Instructions"
)

# Send the query as a single user message and print the last message of
# every streamed event.
agent_input = {"messages": [{"role": "user", "content": query}]}
for event in agent.stream(agent_input, stream_mode="values"):
    latest_message = event["messages"][-1]
    latest_message.pretty_print()


What are the recipes for treating fever but external use?Please provide me with the description and list of Ingredients for that remedy and the Instructions
Tool Calls:
  retrieve_context (call_cuPTJaSunj2tHgiHcFVx0AN2)
 Call ID: call_cuPTJaSunj2tHgiHcFVx0AN2
  Args:
    query: recipes for treating fever but external use
Name: retrieve_context

Source: {'creationdate': '2025-05-23T14:09:28+03:00', 'start_index': 0, 'page_label': '123', 'total_pages': 288, 'moddate': '2025-05-23T18:08:02+03:00', 'page': 122, 'creator': 'Adobe InDesign 20.3 (Windows)', 'producer': 'Adobe PDF Library 17.0', 'source': 'data\\Forgotten Home Apothecary.pdf', 'trapped': '/False'}
Content: 123
Forgotten Home Apothecary
INGREDIENTS:
• 2 cups water
• 1 tablespoon dried peppermint 
leaves (Mentha piperita)
• 1 tablespoon dried elderflower 
(Sambucus nigra)
• 1 tablespoon honey (optional, for 
taste)
NOTES:
SEEK IMMEDIATE MEDICAL 
ATTENTION for fevers over 
102°F (39°C), lasting more than 
3 days, or accompanied 