In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI credentials used by the
# embedding/agent cells further down come from — confirm.
load_dotenv()

True

## Semantic search

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# Read the handbook PDF into a list of LangChain Document objects.
handbook_pdf = "resources/acmecorp-employee-handbook.pdf"
loader = PyMuPDFLoader(handbook_pdf)
data = loader.load()

In [3]:
# Display the Document list returned by the loader (metadata + page text).
data

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-20T23:23:16+00:00', 'source': 'resources/acmecorp-employee-handbook.pdf', 'file_path': 'resources/acmecorp-employee-handbook.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-11-20T23:23:16+00:00', 'trapped': '', 'modDate': "D:20251120232316+00'00'", 'creationDate': "D:20251120232316+00'00'", 'page': 0}, page_content='Employee Handbook\nNon-Disclosure Agreement (NDA) Policy\nEmployees must protect confidential information belonging to the company, its clients, and partners.\nThis includes, but is not limited to, product roadmaps, customer data, internal communications,\nproprietary algorithms, financial information, and unreleased features. Confidential information may not\nbe shared with unauthorized individuals inside or outside the organization. These 

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded document into overlapping character chunks.
# add_start_index stores each chunk's offset under metadata["start_index"].
splitter_options = {
    "chunk_size": 1000,
    "chunk_overlap": 200,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

all_splits = text_splitter.split_documents(data)

# Report how many chunks were produced.
chunk_count = len(all_splits)
print(chunk_count)

3


In [5]:
from pprint import pprint

# Pretty-print every chunk produced by the splitter, metadata included.
pprint(all_splits)

[Document(metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-20T23:23:16+00:00', 'source': 'resources/acmecorp-employee-handbook.pdf', 'file_path': 'resources/acmecorp-employee-handbook.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-11-20T23:23:16+00:00', 'trapped': '', 'modDate': "D:20251120232316+00'00'", 'creationDate': "D:20251120232316+00'00'", 'page': 0, 'start_index': 0}, page_content='Employee Handbook\nNon-Disclosure Agreement (NDA) Policy\nEmployees must protect confidential information belonging to the company, its clients, and partners.\nThis includes, but is not limited to, product roadmaps, customer data, internal communications,\nproprietary algorithms, financial information, and unreleased features. Confidential information may not\nbe shared with unauthorized individuals inside or outside the or

#### Chunk 1

In [6]:
# Show the full text of the first chunk.
first_chunk = all_splits[0]
pprint(first_chunk.page_content)

('Employee Handbook\n'
 'Non-Disclosure Agreement (NDA) Policy\n'
 'Employees must protect confidential information belonging to the company, '
 'its clients, and partners.\n'
 'This includes, but is not limited to, product roadmaps, customer data, '
 'internal communications,\n'
 'proprietary algorithms, financial information, and unreleased features. '
 'Confidential information may not\n'
 'be shared with unauthorized individuals inside or outside the organization. '
 'These obligations continue\n'
 'after employment ends.\n'
 'Workplace Conduct Policy\n'
 'Employees must maintain a respectful, professional environment free from '
 'harassment, discrimination,\n'
 'and intimidation. All employees are expected to follow organizational '
 'values, collaborate effectively,\n'
 'and communicate constructively. Disruptive behavior, verbal abuse, or misuse '
 'of company systems is\n'
 'prohibited. Violations may result in disciplinary action.\n'
 'Paid Time Off (PTO) Policy\n'
 'FullItim

#### Chunk 2 - Partial

In [7]:
# Show only the first 500 characters of the second chunk; its opening lines
# repeat the tail of chunk 1 because of the configured chunk overlap.
second_chunk_preview = all_splits[1].page_content[:500]
pprint(second_chunk_preview)

('prohibited. Violations may result in disciplinary action.\n'
 'Paid Time Off (PTO) Policy\n'
 'FullItime employees accrue PTO according to the following schedule: • 0–1 '
 'years of service: 10 days\n'
 'per year (0.833 days per month) • 1–3 years of service: 15 days per year '
 '(1.25 days per month) • 3+\n'
 'years of service: 20 days per year (1.67 days per month) PTO may be used for '
 'vacation, personal\n'
 'needs, or illness. Requests should be submitted in advance through the HR '
 'system unless related to\n'
 'an emergency. Employ')


Embedding Models:

https://docs.langchain.com/oss/python/integrations/text_embedding

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embedding model used for both the stored chunks and the queries.
embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

In [9]:
from langchain_core.vectorstores import InMemoryVectorStore

# Keep the chunk vectors in process memory; nothing is persisted to disk.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [10]:
# Embed and index every chunk; the store hands back one id per document.
ids = vector_store.add_documents(all_splits)
ids

['69014a58-8142-4a23-b5b9-f2b7ad2fd047',
 'e5eea517-d8e9-4cf1-b2f3-ce74a1cd9e8d',
 'bdf96b28-7744-46b0-926c-141065bcf293']

In [12]:
# 1) Inspect a stored document vector
# Look up the raw record the store keeps for the first indexed chunk.
doc_id = ids[0]
record = vector_store.store[doc_id]
stored_vector = record["vector"]

# Summary: record fields, a text preview, the vector length, and its first 10 values.
(record.keys(), record["text"][:120], len(stored_vector), stored_vector[:10])

(dict_keys(['id', 'vector', 'text', 'metadata']),
 'Employee Handbook\nNon-Disclosure Agreement (NDA) Policy\nEmployees must protect confidential information belonging to the',
 3072,
 [-0.04888039454817772,
  -0.02630968950688839,
  -0.004073366057127714,
  -0.02653767727315426,
  -0.01925729401409626,
  -0.0268416590988636,
  -0.01240248791873455,
  0.024349002167582512,
  -0.041676007211208344,
  0.029729492962360382])

In [11]:
# Query the vector store with a natural-language question.
# (Typo "eployee" corrected so this matches the question used in later cells.)
results = vector_store.similarity_search(
    "How many days of vacation does an employee get in their first year?"
)

# Show the first document returned for the question.
pprint(results[0])

Document(id='e5eea517-d8e9-4cf1-b2f3-ce74a1cd9e8d', metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-11-20T23:23:16+00:00', 'source': 'resources/acmecorp-employee-handbook.pdf', 'file_path': 'resources/acmecorp-employee-handbook.pdf', 'total_pages': 1, 'format': 'PDF 1.4', 'title': '(anonymous)', 'author': '(anonymous)', 'subject': '(unspecified)', 'keywords': '', 'moddate': '2025-11-20T23:23:16+00:00', 'trapped': '', 'modDate': "D:20251120232316+00'00'", 'creationDate': "D:20251120232316+00'00'", 'page': 0, 'start_index': 812}, page_content='prohibited. Violations may result in disciplinary action.\nPaid Time Off (PTO) Policy\nFullItime employees accrue PTO according to the following schedule: • 0–1 years of service: 10 days\nper year (0.833 days per month) • 1–3 years of service: 15 days per year (1.25 days per month) • 3+\nyears of service: 20 days per year (1.67 days per month) PTO may be used for vacation, persona

In [13]:
# Embed the question on its own to inspect the raw query vector.
q = "How many days of vacation does an employee get in their first year?"
q_vec = embeddings.embed_query(q)
q_vec_head = q_vec[:10]

# Vector length and its first 10 components.
(len(q_vec), q_vec_head)

(3072,
 [-0.03438050299882889,
  0.013325775973498821,
  0.007619920652359724,
  -0.003585844999179244,
  -0.01651184819638729,
  -0.04559838026762009,
  0.028541387990117073,
  0.05277007073163986,
  0.006287343334406614,
  -0.007062660995870829])

## RAG Agent

In [14]:
from langchain.tools import tool

@tool
def search_handbook(query: str) -> str:
    """
    Search the employee handbook for information

    Args:
        query: Natural-language question or keywords to look up.

    Returns:
        The text of the best-matching handbook passage, or a short notice
        when no passage is found.
    """
    # NOTE: with LangChain's @tool decorator this docstring also serves as the
    # tool description shown to the model, so keep it short and task-focused.
    results = vector_store.similarity_search(query)

    # Guard against an empty result list (e.g. nothing indexed yet); indexing
    # results[0] directly would raise IndexError inside the agent's tool call.
    if not results:
        return "No relevant passages were found in the employee handbook."

    # Only the first returned chunk is passed back to the agent.
    return results[0].page_content

In [15]:
from langchain.agents import create_agent
from langchain.messages import SystemMessage

# System instructions that frame the agent's role.
system_prompt = SystemMessage(
    content="You are a helpful agent that can search the employee handbook for information."
)

# Wire the chat model to the handbook search tool.
agent_model = "gpt-5-nano"
agent_tools = [search_handbook]
agent = create_agent(
    model=agent_model,
    tools=agent_tools,
    system_prompt=system_prompt,
)

In [16]:
from langchain.messages import HumanMessage

# Ask the agent the same PTO question used in the retrieval cells above.
user_query = HumanMessage(
    content="How many days of vacation does an employee get in their first year?"
)

# The agent takes the conversation as a list under the "messages" key.
agent_input = {"messages": [user_query]}
response = agent.invoke(agent_input)

In [17]:
# Pretty-print the full agent result, including the intermediate tool-call messages.
pprint(response)

{'messages': [HumanMessage(content='How many days of vacation does an employee get in their first year?', additional_kwargs={}, response_metadata={}, id='bc7bb062-88d8-4fd1-8b39-028d68341eae'),
              AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 92, 'prompt_tokens': 157, 'total_tokens': 249, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 64, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-5-nano-2025-08-07', 'system_fingerprint': None, 'id': 'chatcmpl-Ctc7lINxd2bpHbcnuFIEYOPfhR4Az', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019b7f7f-8ee8-76a3-a1e1-f78c169a86a1-0', tool_calls=[{'name': 'search_handbook', 'args': {'query': 'vacation days first year'}, 'id': 'call_DXXaU2qXoUa9HzaGDZHgZHzx', 'type': 'tool_call'}], u

In [25]:
# The last message in the transcript holds the agent's final answer.
final_message = response["messages"][-1]
pprint(final_message.content)

('In the first year (0–1 year of service), an employee gets 10 PTO days per '
 'year (about 0.833 days per month). PTO can be used for vacation, personal '
 'needs, or illness.')


https://smith.langchain.com/public/a02a342f-fe2e-4b74-b21e-6652ff3f67dc/r