In [31]:
from agents import function_tool, RunContextWrapper, Agent, Runner
import pandas as pd
import os
from pydantic import BaseModel
from dotenv import load_dotenv
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

if not api_key:
    raise ValueError("OPENAI_API_KEY is not set in the environment variables")

In [49]:
from tools.order_api import get_order_status, CustomerContext

In [40]:
class CustomerContext(BaseModel):
    customer_id: str 
    customer_name: str | None = None
    

In [50]:
orders = pd.read_json("tools/dummy_orders.json")

In [45]:
@function_tool
def get_order_status(context: RunContextWrapper[CustomerContext]):
    result = orders[orders["customer_id"] == context.context.customer_id]

    return result.to_dict(orient="records")[0]

In [46]:
cust = CustomerContext(customer_id="cust_001", customer_name="Maaz")


In [42]:
cust.customer_id

'cust_001'

In [47]:
tracking_agent = Agent(
    name="Tracking Agent",
    model="gpt-4o-mini",
    instructions="You are a tracking agent and your job is to track the order status, you can use the tools provided to get the order status.",
    tools=[
        get_order_status
    ]
)

In [48]:
text = "What is the status of my order?"
result = await Runner.run(tracking_agent, input=text, context=cust)
print(result.final_output)

Your order status is *Shipped*. Here are the details:

- **Order ID:** ORD1001
- **Customer ID:** cust_001
- **Order Date:** May 15, 2025
- **Items:** 
  - Wireless Mouse
  - USB-C Adapter


In [51]:
# rag 

from langchain_openai import ChatOpenAI


In [134]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
documents = loader.load()

# Step 2: Split into fixed-size chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=10
)
chunks = splitter.split_documents(pages)

In [135]:
len(chunks)

11

In [129]:
chunks[0]

Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-05-24T23:58:07+05:00', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_enabled': 'true', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_setdate': '2025-05-24T18:57:26Z', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_method': 'Standard', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_name': 'Internal', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_siteid': '5764b349-a60c-4df1-8cf5-62d06dd5b2c3', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_actionid': '30ecf6cd-eb01-4539-b679-9d325835b9a2', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_contentbits': '0', 'msip_label_0886e226-cfd0-4c2d-b2ff-bd9be0eaa4b4_tag': '10, 3, 0, 1', 'author': 'python-docx', 'moddate': '2025-05-24T23:58:07+05:00', 'source': 'data/Dummy_Return_Policy.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='Return Policy for E-commerce Platform \n1. Ov

In [115]:
chunks[12]

Document(metadata={}, page_content='y')

In [98]:
from langchain.embeddings import OpenAIEmbeddings

# Use OpenAI's embedding model
embedding_model = OpenAIEmbeddings()  # Uses API key from env

In [99]:
def sanitize_metadata(documents):
    for doc in documents:
        for key, value in doc.metadata.items():
            if isinstance(value, list):
                # Convert lists to comma-separated strings
                doc.metadata[key] = ", ".join(map(str, value))
            elif not isinstance(value, (str, int, float, bool, type(None))):
                # Drop or stringify other complex types
                doc.metadata[key] = str(value)
    return documents

In [92]:
documents = sanitize_metadata(docs)

In [123]:
from langchain.vectorstores import Chroma

# Save to a persistent directory
vectorstore = Chroma.from_documents(chunks, embedding_model, persist_directory="./data/chroma_store")

# Persist
vectorstore.persist()

AttributeError: 'dict' object has no attribute 'page_content'

In [101]:
retriever = vectorstore.as_retriever()

query = "non-returnable?"
docs = retriever.get_relevant_documents(query)

for i, doc in enumerate(docs):
    print(f"\n--- Relevant Chunk {i+1} ---\n{doc.page_content[:300]}")

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"



--- Relevant Chunk 1 ---
accuracy. Requests made after this period may not be accepted, unless mandated by 
consumer protection laws or exceptional cases such as shipping delays or product recalls. 
2.2 Condition of Items 
Returned items must be unused, undamaged, and in a condition suitable for resale. Items 
should be ret

--- Relevant Chunk 2 ---
- Perishable goods such as food and flowers, which cannot be restocked. 
- Personalized or custom-made products, unless they are defective or damaged upon arrival. 
- Gift cards, which are considered final sale items. 
- Downloadable digital products or software, due to licensing restrictions. 
3. R

--- Relevant Chunk 3 ---
2.2 Condition of Items 
Returned items must be unused, undamaged, and in a condition suitable for resale. Items 
should be returned with the original packaging, including boxes, tags, manuals, and any 
included accessories. Products showing signs of wear, usage, or modification may not be 
eligible 

--- Relevant Chunk

In [121]:
import re
import uuid
from typing import List, Dict

def chunk_text_for_rag(text: str, max_chunk_size: int = 2000) -> List[Dict]:
    """
    Split text by titles/subtitles into RAG-ready chunks with context preserved.

    Args:
        text: Full document text.
        max_chunk_size: Optional max size per chunk (characters).

    Returns:
        List of dicts with id, metadata, and content.
    """
    title_re = re.compile(r'^(?P<title>[A-Z][A-Z0-9 ,\-]{3,})$', re.MULTILINE)
    subtitle_re = re.compile(r'^(?P<subtitle>(\d+[\.\)]|[A-Z]\.)\s+[A-Z][^\n]+)$', re.MULTILINE)

    matches = sorted(
        list(title_re.finditer(text)) + list(subtitle_re.finditer(text)),
        key=lambda m: m.start()
    )

    chunks = []
    last_title, last_subtitle = None, None
    last_pos = 0

    for match in matches:
        current_pos = match.start()
        content = text[last_pos:current_pos].strip()
        if content:
            chunk = {
                "id": str(uuid.uuid4()),
                "metadata": {
                    "title": last_title,
                    "subtitle": last_subtitle
                },
                "content": f"{last_title or ''}\n{last_subtitle or ''}\n{content}".strip()
            }
            chunks.extend(split_long_chunk(chunk, max_chunk_size))

        # Update context
        if match.re == title_re:
            last_title = match.group("title")
            last_subtitle = None
        else:
            last_subtitle = match.group("subtitle")

        last_pos = current_pos

    # Final chunk
    content = text[last_pos:].strip()
    if content:
        chunk = {
            "id": str(uuid.uuid4()),
            "metadata": {
                "title": last_title,
                "subtitle": last_subtitle
            },
            "content": f"{last_title or ''}\n{last_subtitle or ''}\n{content}".strip()
        }
        chunks.extend(split_long_chunk(chunk, max_chunk_size))

    return chunks

def split_long_chunk(chunk: Dict, max_size: int) -> List[Dict]:
    """
    Splits a single chunk into smaller ones if it exceeds max_size.
    """
    content = chunk["content"]
    if len(content) <= max_size:
        return [chunk]

    sentences = re.split(r'(?<=[.!?])\s+', content)
    current = ""
    sub_chunks = []
    for sent in sentences:
        if len(current) + len(sent) + 1 <= max_size:
            current += sent + " "
        else:
            sub_chunks.append({
                "id": str(uuid.uuid4()),
                "metadata": chunk["metadata"],
                "content": current.strip()
            })
            current = sent + " "
    if current:
        sub_chunks.append({
            "id": str(uuid.uuid4()),
            "metadata": chunk["metadata"],
            "content": current.strip()
        })
    return sub_chunks


In [122]:
chunks = chunk_text_for_rag(documents[0].page_content, max_chunk_size=2000)

In [124]:
chunks

[{'id': '41d9069c-a8b5-4428-9fc1-0ab878b0a80c',
  'metadata': {'title': None, 'subtitle': None},
  'content': 'Return Policy for E-commerce Platform'},
 {'id': '88757f0f-b62d-4223-9a5e-bb42e35dd985',
  'metadata': {'title': None, 'subtitle': '1. Overview '},
  'content': '1. Overview \n1. Overview \nThis return policy outlines the terms and conditions under which customers may return \nproducts purchased from our e-commerce platform. Our aim is to provide a transparent and \nfair return process that ensures customer satisfaction while maintaining operational \nefficiency. Customers are encouraged to review this policy before making a return request.'},
 {'id': '60105542-301c-415d-9638-5e6f1bc5d657',
  'metadata': {'title': None, 'subtitle': '2. Eligibility for Returns '},
  'content': '2. Eligibility for Returns \n2. Eligibility for Returns \n2.1 Timeframe \nReturns must be initiated within 30 calendar days from the date of delivery. This timeframe \nensures the returned items remain i

In [130]:
from unstructured.partition.pdf import partition_pdf
from langchain_core.documents import Document

elements = partition_pdf(filename="your_file.pdf")

# Build structured chunks
chunks = []
current_chunk = ""
current_header = ""
for el in elements:
    if el.category in ["Title", "Header", "Subheader"]:
        if current_chunk:
            chunks.append(Document(
                page_content=current_chunk.strip(),
                metadata={"header": current_header}
            ))
            current_chunk = ""
        current_header = el.text
    else:
        current_chunk += el.text + "\n"

# Last chunk
if current_chunk:
    chunks.append(Document(
        page_content=current_chunk.strip(),
        metadata={"header": current_header}
    ))

# Output
for i, chunk in enumerate(chunks):
    print(f"\n--- Chunk {i+1} ---")
    print("Header:", chunk.metadata["header"])
    print(chunk.page_content[:300])


INFO: PDF text extraction failed, skip text extraction...


PDFInfoNotInstalledError: Unable to get page count. Is poppler installed and in PATH?

In [131]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./data/Dummy_Return_Policy.pdf")

In [136]:
pages = []
for page in loader.lazy_load():
    pages.append(page)

In [138]:
pages[0].page_content

'Return Policy for E-commerce Platform \n1. Overview \nThis return policy outlines the terms and conditions under which customers may return \nproducts purchased from our e-commerce platform. Our aim is to provide a transparent and \nfair return process that ensures customer satisfaction while maintaining operational \nefficiency. Customers are encouraged to review this policy before making a return request. \n2. Eligibility for Returns \n2.1 Timeframe \nReturns must be initiated within 30 calendar days from the date of delivery. This timeframe \nensures the returned items remain in good condition and helps us maintain inventory \naccuracy. Requests made after this period may not be accepted, unless mandated by \nconsumer protection laws or exceptional cases such as shipping delays or product recalls. \n2.2 Condition of Items \nReturned items must be unused, undamaged, and in a condition suitable for resale. Items \nshould be returned with the original packaging, including boxes, tags,

In [150]:
def rag_retriver(query: str):
    """
    Use this tool to retrieve relevant documents from the knowledge base.
    """
    embedding_model = OpenAIEmbeddings()
    vectordb = Chroma(persist_directory="./data/chroma_store", embedding_function=embedding_model)
    retriever = vectordb.as_retriever(search_kwargs={"k": 5})
    results = retriever.get_relevant_documents(query)
    results = [results[i].page_content for i in range(len(results))]
    return results

In [151]:
results = rag_retriver("non-returnable?")
print(results)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


['accuracy. Requests made after this period may not be accepted, unless mandated by \nconsumer protection laws or exceptional cases such as shipping delays or product recalls. \n2.2 Condition of Items \nReturned items must be unused, undamaged, and in a condition suitable for resale. Items \nshould be returned with the original packaging, including boxes, tags, manuals, and any \nincluded accessories. Products showing signs of wear, usage, or modification may not be \neligible for a full refund. \n2.3 Non-returnable Items \nCertain items are excluded from our return policy due to their nature or regulatory \nlimitations. These include:', '- Perishable goods such as food and flowers, which cannot be restocked. \n- Personalized or custom-made products, unless they are defective or damaged upon arrival. \n- Gift cards, which are considered final sale items. \n- Downloadable digital products or software, due to licensing restrictions. \n3. Return Process \n3.1 Initiating a Return \nCustome

In [144]:
[results[i].page_content for i in range(len(results))]

['accuracy. Requests made after this period may not be accepted, unless mandated by \nconsumer protection laws or exceptional cases such as shipping delays or product recalls. \n2.2 Condition of Items \nReturned items must be unused, undamaged, and in a condition suitable for resale. Items \nshould be returned with the original packaging, including boxes, tags, manuals, and any \nincluded accessories. Products showing signs of wear, usage, or modification may not be \neligible for a full refund. \n2.3 Non-returnable Items \nCertain items are excluded from our return policy due to their nature or regulatory \nlimitations. These include:',
 '- Perishable goods such as food and flowers, which cannot be restocked. \n- Personalized or custom-made products, unless they are defective or damaged upon arrival. \n- Gift cards, which are considered final sale items. \n- Downloadable digital products or software, due to licensing restrictions. \n3. Return Process \n3.1 Initiating a Return \nCustom