In [2]:
from langchain_community.document_loaders import JSONLoader
from pprint import pprint

In [3]:
# Path to your LlamaIndex documentation JSON file
file_path = './llamaindex_docs/test_llamaindex.json'

In [4]:
# Define the metadata extraction function
def metadata_func(record: dict, metadata: dict) -> dict:
    # Assuming each record in your JSON has a 'title' field you want to include in the metadata
    metadata["filename"] = record.get("filename")
    metadata["filepath"] = record.get("filepath")
    metadata["url"] = record.get("url")
    return metadata

In [5]:
# Initialize the JSONLoader with appropriate parameters
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.[]',  # Adjust this based on your actual JSON structure
    content_key="content",  # Key where the actual content is stored
    metadata_func=metadata_func  # Function to extract additional metadata from each record
)

In [6]:
# Load the data
data = loader.load()

In [14]:
# Print the loaded documents to verify
pprint(data)

 Document(page_content='# Building a Custom Agent\n\nIn this cookbook we show you how to build a custom agent using LlamaIndex.\n\nThe easiest way to build a custom agent is to simply subclass `CustomSimpleAgentWorker` and implement a few required functions. You have complete flexibility in defining the agent step-wise logic.\n\nThis lets you add arbitrarily complex reasoning logic on top of your RAG pipeline.\n\nWe show you how to build a simple agent that adds a retry layer on top of a RouterQueryEngine, allowing it to retry queries until the task is complete. We build this on top of both a SQL tool and a vector index query tool. Even if the tool makes an error or only answers part of the question, the agent can continue retrying the question until the task is complete.\n\n## Setup the Custom Agent\n\nHere we setup the custom agent.\n\n### Refresher\n\nAn agent in LlamaIndex consists of both an agent runner + agent worker. An agent runner is an orchestrator that stores state like mem

In [24]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [20]:
# Assuming we want to split on primary and secondary headers in Markdown
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3")
]

In [21]:
# Initialize the MarkdownHeaderTextSplitter with specified headers
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [22]:
# Perform the initial split based on headers
all_splits = []
for document in data:
    # Split the Markdown content of the document
    splits = markdown_splitter.split_text(document.page_content)
    # Append the resulting documents to the all_splits list
    all_splits.extend(splits)


In [23]:
# Now we have a list of Documents, each representing a chunk of Markdown content
# under a specific header level
pprint(all_splits)

[Document(page_content='<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/agent/agent_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>  \nInspired by GPTs interface, presented at OpenAI Dev Day 2023. Construct an agent with natural language.  \nHere you can build your own agent...with another agent!  \n```python\nfrom llama_index.tools import BaseTool, FunctionTool\n```  \n```python\nfrom llama_index.agent import OpenAIAgent\nfrom llama_index.prompts import PromptTemplate\nfrom llama_index.llms import ChatMessage, OpenAI\nfrom llama_index import ServiceContext\n```  \n```python\nllm = OpenAI(model="gpt-4")\nservice_context = ServiceContext.from_defaults(llm=llm)\n```', metadata={'Header 1': 'GPT Builder Demo'}),
 Document(page_content='We also define a tool retriever to retrieve candidate tools.  \nIn this setting we define tools as different Wikipedia pages.  \n``

In [25]:
# Now, for each chunk, check if further splitting is needed
chunk_size = 8000  # Or whatever size is appropriate for GPT-4
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=100,  # Some overlap to maintain context
)

In [26]:
# Apply further splitting if necessary
final_splits = []
for md_chunk in all_splits:
    # Check if the chunk is too large
    if len(md_chunk.page_content) > chunk_size:
        # Split further
        smaller_chunks = text_splitter.split_documents([md_chunk])
        final_splits.extend(smaller_chunks)
    else:
        final_splits.append(md_chunk)

# final_splits now contains the optimally chunked documents

In [27]:
pprint(final_splits)

[Document(page_content='<a href="https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/agent/agent_builder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>  \nInspired by GPTs interface, presented at OpenAI Dev Day 2023. Construct an agent with natural language.  \nHere you can build your own agent...with another agent!  \n```python\nfrom llama_index.tools import BaseTool, FunctionTool\n```  \n```python\nfrom llama_index.agent import OpenAIAgent\nfrom llama_index.prompts import PromptTemplate\nfrom llama_index.llms import ChatMessage, OpenAI\nfrom llama_index import ServiceContext\n```  \n```python\nllm = OpenAI(model="gpt-4")\nservice_context = ServiceContext.from_defaults(llm=llm)\n```', metadata={'Header 1': 'GPT Builder Demo'}),
 Document(page_content='We also define a tool retriever to retrieve candidate tools.  \nIn this setting we define tools as different Wikipedia pages.  \n``