### RAG Document Loaders

In [76]:
from langchain_community.document_loaders import PyMuPDFLoader

# PyMuPDFLoader reads the PDF through PyMuPDF; .load() returns the parsed
# Document objects (the count printed in the next cell matches the PDF's
# page count, i.e. one Document per page).
pdf_data = PyMuPDFLoader("./harry.pdf").load()

In [77]:
# Report how many page-level Documents the loader produced.
page_count = len(pdf_data)
print(f"The pdf contains : {page_count} pages")

The pdf contains : 3623 pages


In [78]:
## Document Splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

# tiktoken is OpenAI's tokenizer: it maps text to numerical token ids.
# Building the splitter from a tiktoken encoder means the size and overlap
# below are counted with the "cl100k_base" encoding (tokens, not characters).
split_data = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=500,
    encoding_name="cl100k_base",
)

In [79]:
# Break every page-level Document into overlapping chunks using the
# token-based splitter configured above; these chunks are what gets embedded.
all_docs = split_data.split_documents(pdf_data)

In [80]:
# all_docs holds the splitter's chunks, not PDF pages, so label the count
# accordingly (the previous message reported chunks as "pages").
print(f"The pdf was split into : {len(all_docs)} chunks")

The pdf contains : 4425 pages


In [81]:
from langchain_chroma import Chroma
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Fail fast when the Chroma Cloud settings are absent. os.getenv() would pass
# None through to Chroma, which surfaces later as a confusing client error or
# a store that is not the intended cloud collection.
_required_env = ("CHROMA_API_KEY", "CHROMA_TENANT", "CHROMA_DATABASE")
_missing_env = [name for name in _required_env if not os.getenv(name)]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# embedding
embed_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Chroma Cloud collection that stores the embedded novel chunks.
vector_store = Chroma(
    collection_name="harry_tokens",
    embedding_function=embed_model,
    chroma_cloud_api_key=os.environ["CHROMA_API_KEY"],
    tenant=os.environ["CHROMA_TENANT"],
    database=os.environ["CHROMA_DATABASE"],
)


In [82]:
## add to vector store
# Upload in batches so each embedding/write request stays small.
# Every chunk gets a stable id derived from its position in all_docs, so
# re-running this cell writes to the same records instead of appending a
# second copy of the whole corpus (without ids, each run adds new random ids).
batch_size = 300
for start in range(0, len(all_docs), batch_size):
    batch = all_docs[start:start + batch_size]
    batch_ids = [f"harry_tokens-{start + offset}" for offset in range(len(batch))]
    vector_store.add_documents(batch, ids=batch_ids)

In [83]:
### Retrieval

# MMR (maximal marginal relevance) first fetches `fetch_k` nearest chunks and
# then selects the `k` most relevant-yet-diverse ones among them. With
# fetch_k == k there are no surplus candidates to choose from, so the
# candidate pool must be larger than the number of chunks returned.
RETRIEVER_K = 20
RETRIEVER_FETCH_K = 4 * RETRIEVER_K

# NOTE: the name `retriver` (sic) is kept because later cells reference it.
retriver = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs={
        "fetch_k": RETRIEVER_FETCH_K,
        "k": RETRIEVER_K,
    },
)

In [84]:
# Smoke-test the retriever on its own: the last expression displays the
# Documents returned for this query, before any LLM is involved.
prompt = "Reveal the identiy of the half-blood prince"
retriver.invoke(prompt)

[Document(id='12633d64-c5e4-41e1-bfe4-c121186178db', metadata={'author': 'Rowling, J.K.', 'trapped': '', 'file_path': './harry.pdf', 'creationDate': "D:20190221160423+00'00'", 'creationdate': '2019-02-21T16:04:23+00:00', 'creator': 'calibre 3.27.1 [https://calibre-ebook.com]', 'moddate': '', 'page': 2696, 'source': './harry.pdf', 'keywords': '', 'modDate': '', 'subject': '', 'total_pages': 3623, 'producer': 'calibre 3.27.1 [https://calibre-ebook.com]', 'format': 'PDF 1.4', 'title': 'Harry Potter: The Complete Collection'}, page_content='my particular brand of reasoned argument is making much headway against\nGreyback’s insistence that we werewolves deserve blood, that we ought to\nrevenge ourselves on normal people.”\n“But you are normal!” said Harry fiercely. “You’ve just got a — a problem\n—”\nLupin burst out laughing. “Sometimes you remind me a lot of James. He\ncalled it my ‘furry little problem’ in company. Many people were under the\nimpression that I owned a badly behaved rabbit

In [145]:
## agent
from langchain.agents import create_agent
from langchain.tools import tool
## context length of gpt5-nano = 400k (figure from the original note; not verified here)
@tool
def get_novel_context(query: str):
    """Get the harry potter novel context based on the query"""
    # The docstring above doubles as the tool description shown to the model,
    # so its wording is left exactly as it was.
    passages = (doc.page_content for doc in retriver.invoke(query))
    # Retrieved passages are handed back as one blank-line separated string.
    return "\n\n".join(passages)

# Instructions for the tool-calling agent: answer only from what the retrieval
# tool returns, quote the supporting sentence(s) first, and otherwise reply
# "I don't know."
system_prompt = """
You must answer ONLY using the tool result.

When answering:
1. First extract the exact sentence(s) from the tool output that support the answer.
2. Then provide the final answer.
3. If no supporting sentence exists in the tool output, say "I don't know."

Never answer without quoting supporting text from the tool result. And the answer should be brief and consise!
"""


# Agentic RAG: the model decides when to call get_novel_context and with what
# query, then answers from the returned passages.
agent = create_agent(
    model="gpt-5-nano",
    tools=[get_novel_context],
    system_prompt=system_prompt
)

In [149]:
# Ask the tool-calling agent a question; `res` is the resulting state dict
# whose "messages" entry holds the full conversation.
res = agent.invoke({"messages": "Who is half blood prince?"})

In [150]:
# Display the raw agent state (messages with their metadata) as the cell output.
res

{'messages': [HumanMessage(content='Who is half blood prince?', additional_kwargs={}, response_metadata={}, id='86bd65e1-4d65-451e-b3f6-89be090dbadc'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 352, 'prompt_tokens': 219, 'total_tokens': 571, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 320, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-DDPyUOmSlytVkYjblqFuB8ATBVCX0', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c98d2-d2f4-7111-b335-d2e1d7a653ea-0', tool_calls=[{'name': 'get_novel_context', 'args': {'query': 'Who is the Half-Blood Prince?'}, 'id': 'call_ONiZmfxfbS3d4Mrj2rvCwmH5', 'type': 'tool_call'}], invalid_tool_calls=[], usage_metadata={'input_

In [151]:
# Walk the conversation in order and render each message in readable form.
for message in res["messages"]:
    message.pretty_print()


Who is half blood prince?
Tool Calls:
  get_novel_context (call_ONiZmfxfbS3d4Mrj2rvCwmH5)
 Call ID: call_ONiZmfxfbS3d4Mrj2rvCwmH5
  Args:
    query: Who is the Half-Blood Prince?
Name: get_novel_context

my particular brand of reasoned argument is making much headway against
Greyback’s insistence that we werewolves deserve blood, that we ought to
revenge ourselves on normal people.”
“But you are normal!” said Harry fiercely. “You’ve just got a — a problem
—”
Lupin burst out laughing. “Sometimes you remind me a lot of James. He
called it my ‘furry little problem’ in company. Many people were under the
impression that I owned a badly behaved rabbit.”
He accepted a glass of eggnog from Mr. Weasley with a word of thanks,
looking slightly more cheerful. Harry, meanwhile, felt a rush of excitement:
This last mention of his father had reminded him that there was something he
had been looking forward to asking Lupin.
“Have you ever heard of someone called the Half-Blood Prince?”
“The Half-Blo

In [None]:
### Two step RAG
from langchain.agents.middleware import dynamic_prompt, ModelRequest

# The function must NOT be named `dynamic_prompt`: that would rebind the
# imported decorator to the middleware object, and re-running this cell would
# then fail because `@dynamic_prompt` no longer refers to the decorator.
@dynamic_prompt
def retrieval_system_prompt(req: ModelRequest) -> str:
    """Craft the system prompt with passages retrieved for the latest message.

    Args:
        req: The model request; the content of the last message in
            ``req.state["messages"]`` is used as the retrieval query.

    Returns:
        The system prompt text: answering rules followed by the retrieved
        passages, separated by blank lines.
    """
    latest_message = req.state["messages"][-1].content
    passages = retriver.invoke(latest_message)
    retrieved_context = "\n\n".join(chunk.page_content for chunk in passages)
    # This agent has no tools: the passages are injected straight into the
    # prompt, so the rules refer to "the context" rather than a tool result.
    system_prompt = f"""
            You must answer ONLY using the context provided below.

            When answering:
            1. First extract the exact sentence(s) from the context that support the answer.
            2. Then provide the final answer.
            3. If no supporting sentence exists in the context, say "I don't know."
            4. Avoid supporting text that just make guessing the answer without providing any useful information.
            5. Only use the supporting text that directly support the answer.

            Never answer without quoting supporting text from the context. And the answer should be brief and consise!
            Here is the context : \n\n {retrieved_context}"""

    return system_prompt




# Two-step RAG: retrieval always runs once (inside the prompt middleware) and
# the model answers in a single call, with no tool-calling round trip.
agent = create_agent(
    model="gpt-5-nano",
    middleware=[retrieval_system_prompt]
)
    

In [209]:
# Send a plain greeting to the context-injecting agent to see how it behaves
# when the message is not a question about the novel.
res = agent.invoke({"messages": "Hello"})

In [167]:
# Display the raw agent state (messages with their metadata) as the cell output.
res

{'messages': [HumanMessage(content='Hello', additional_kwargs={}, response_metadata={}, id='e388c606-4fbb-4d6c-8b52-08039b455145'),
  AIMessage(content='“Hello, Percy,” said Harry, trying not to laugh.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 2006, 'prompt_tokens': 6423, 'total_tokens': 8429, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 1984, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-DDQvAcROKNKKXZnHu49TsaNC5MhcV', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--019c990a-615b-73d1-9b01-2f7829a3dab9-0', tool_calls=[], invalid_tool_calls=[], usage_metadata={'input_tokens': 6423, 'output_tokens': 2006, 'total_tokens': 8429, 'input_token_details': {'audio': 0, 'cache_read': 0}, '

In [165]:
# Render each message of the latest result in readable form.
# NOTE(review): the execution counts suggest the saved output of this cell
# comes from an earlier run than the invoke cell above - re-run to refresh.
for message in res["messages"]:
    message.pretty_print()


Who is half blood prince?

“You dare use my own spells against me, Potter? It was I who invented them — I, the Half-Blood Prince!” 

“Snape must have been proud of being ‘half a Prince,’ you see? Tobias Snape was a Muggle from what it said in the Prophet.”

Severus Snape.
