In [1]:
import os
from openai import OpenAI as OpenAIClient
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI as LlamaOpenAI

import os
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_parse import LlamaParse

# Pull key/value pairs from a local .env file into the process environment so
# the os.getenv() lookups below can find them.
load_dotenv()

# --- OpenAI: fail fast when the key is missing or empty ---
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in .env file")

# One gpt-4o wrapper registered on the global llama_index Settings object,
# plus a separate client built from the OpenAI SDK with the same key.
llm = LlamaOpenAI(api_key=openai_api_key, model="gpt-4o")
Settings.llm = llm
openai_client = OpenAIClient(api_key=openai_api_key)

# --- LlamaCloud: same fail-fast check for the parsing service key ---
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
if not llama_cloud_api_key:
    raise ValueError("LLAMA_CLOUD_API_KEY is not set in .env file")

In [None]:
# LlamaParse client authenticated with the key read from the environment.
# NOTE(review): "markdown_with_metadata" cannot be confirmed as a supported
# result type from this file alone — verify against the installed llama_parse.
llama_parser = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown_with_metadata",
)

# Hierarchical splitter built with the library's default settings.
node_parser = HierarchicalNodeParser.from_defaults()

In [3]:
# Parse the runbook through LlamaParse. A forward slash is used because the
# previous "runbooks\sample..." literal contained the invalid escape "\s"
# (a SyntaxWarning) and only resolved to the intended file on Windows.
documents = llama_parser.load_data("runbooks/sample_runbook.docx")

  documents = llama_parser.load_data("runbooks\sample_runbook.docx")


Started parsing the file under job_id e78af953-ca97-45b0-b740-453eb8348c64


In [4]:
# Split the parsed documents into nodes with the node parser created earlier.
nodes = node_parser.get_nodes_from_documents(documents)

In [5]:
import re

# Terms that mark a node as related to disaster recovery.
keywords = ["DR", "disaster", "recovery", "failover", "fallback", "redundant"]

# Short acronyms such as "DR" are matched as whole words only; a plain
# case-insensitive substring test would also hit "address", "hundred",
# "drive", etc. Longer keywords keep substring semantics so inflected forms
# ("failovers") still match.
keyword_pattern = re.compile(
    "|".join(
        rf"\b{re.escape(kw)}\b" if len(kw) <= 3 else re.escape(kw)
        for kw in keywords
    ),
    re.IGNORECASE,
)

# Keep every node whose text mentions at least one keyword.
relevant_nodes = [node for node in nodes if keyword_pattern.search(node.text)]

In [None]:
import os
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_parse import LlamaParse

import re

# --- Configuration ---
# The LlamaCloud API key must be present in the environment, e.g.
# os.environ["LLAMA_CLOUD_API_KEY"] = "llx-..."
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
if not llama_cloud_api_key:
    raise ValueError("LlamaCloud API key not set. Please set the LLAMA_CLOUD_API_KEY environment variable.")

# --- Parsing the Document ---
# Pass the validated key explicitly so the check above guards the value that
# the parser actually uses.
llama_parser = LlamaParse(
    api_key=llama_cloud_api_key,
    result_type="markdown",
    verbose=True
)
documents = llama_parser.load_data("./runbooks/sample_runbook.pdf")

# Preview the first 500 characters of the first parsed document.
print("--- Full Markdown Content of Document ---")
print(documents[0].text[:500].strip() + "...\n")
print("---------------------------------------")

# --- Creating a Node Hierarchy ---
# Three chunk sizes, largest first; get_leaf_nodes() keeps only the leaves.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128]
)
nodes = node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)

# --- Lookup from node ID to node object ---
# node.parent_node only carries a reference (with a node_id), so the parent's
# text has to be resolved through this dictionary.
all_nodes_dict = {node.node_id: node for node in nodes}

# --- Finding Relevant Nodes ---
keywords = ["DR", "disaster", "recovery", "failover", "fallback", "redundant"]

# Short acronyms such as "DR" are matched as whole words only; a plain
# case-insensitive substring test would also hit "address", "hundred", etc.
# Longer keywords keep substring semantics.
keyword_pattern = re.compile(
    "|".join(
        rf"\b{re.escape(kw)}\b" if len(kw) <= 3 else re.escape(kw)
        for kw in keywords
    ),
    re.IGNORECASE,
)
relevant_nodes = [node for node in leaf_nodes if keyword_pattern.search(node.text)]

print(f"\nFound {len(relevant_nodes)} relevant nodes based on keywords.\n" + "="*50)

# --- Printing the Results with Metadata ---
# Only the first five matches are reported to keep the cell output short.
for i, node in enumerate(relevant_nodes[:5], start=1):
    print(f"--- Relevant Node {i} ---")

    # 1. Document name, as recorded in the node metadata.
    doc_name = node.metadata.get('file_path', 'N/A')
    print(f"Document Name: {doc_name}")

    # 2. Section: the stripped text of the parent node, when one exists and
    #    can be resolved through the ID lookup.
    section_name = "N/A (Top Level)"
    if node.parent_node:
        parent_node_obj = all_nodes_dict.get(node.parent_node.node_id)
        if parent_node_obj:
            section_name = parent_node_obj.text.strip()
    print(f"Section: {section_name}")

    # 3. Page number, when the metadata carries one.
    page_num = node.metadata.get('page_label', 'N/A (Not available for .docx)')
    print(f"Page Number: {page_num}")

    # Full metadata dict, for debugging.
    print(f"Original Metadata Dict: {node.metadata}")
    print("-" * 25 + "\n")

In [None]:
# Re-create the parser so each page is wrapped in explicit START/END markers.
# The key validated earlier in the notebook is passed instead of a hard-coded
# empty string.
llama_parser = LlamaParse(
    page_prefix="START OF PAGE: {pageNumber}\n",
    page_suffix="\nEND OF PAGE: {pageNumber}",
    api_key=llama_cloud_api_key,
    verbose=True,
    result_type="markdown",
)

# Load the document again with the new parser settings, attaching the file
# name and type as extra metadata.
file_name = "sample_runbook.docx"
extra_info = {"file_name": file_name, "file_type": "docx"}

documents = llama_parser.load_data(f"./runbooks/{file_name}", extra_info=extra_info)

# Number the returned documents 1..N.
# NOTE(review): this equals the real page number only if load_data returns one
# Document per page, in page order — confirm for .docx input.
for i, doc in enumerate(documents, start=1):
    doc.metadata["page_number"] = i



In [None]:
import re

# Preview the first 500 characters of the first parsed document.
print("--- Full Markdown Content of Document ---")
print(documents[0].text[:500].strip() + "...\n")
print("---------------------------------------")

# --- Creating a Node Hierarchy ---
# Three chunk sizes, largest first; get_leaf_nodes() keeps only the leaves.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128]
)
nodes = node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)

# --- Lookup from node ID to node object ---
# node.parent_node only carries a reference (with a node_id), so the parent's
# text has to be resolved through this dictionary.
all_nodes_dict = {node.node_id: node for node in nodes}

# --- Finding Relevant Nodes ---
keywords = ["DR", "disaster", "recovery", "failover", "fallback", "redundant"]

# Short acronyms such as "DR" are matched as whole words only; a plain
# case-insensitive substring test would also hit "address", "hundred", etc.
# Longer keywords keep substring semantics.
keyword_pattern = re.compile(
    "|".join(
        rf"\b{re.escape(kw)}\b" if len(kw) <= 3 else re.escape(kw)
        for kw in keywords
    ),
    re.IGNORECASE,
)
relevant_nodes = [node for node in leaf_nodes if keyword_pattern.search(node.text)]

print(f"\nFound {len(relevant_nodes)} relevant nodes based on keywords.\n" + "="*50)

# --- Printing the Results with Metadata ---
# Only the first five matches are reported to keep the cell output short.
for i, node in enumerate(relevant_nodes[:5], start=1):
    print(f"--- Relevant Node {i} ---")

    # 1. Document name. The loading cell stores the name under "file_name",
    #    so fall back to it when "file_path" is absent.
    doc_name = node.metadata.get('file_path') or node.metadata.get('file_name', 'N/A')
    print(f"Document Name: {doc_name}")

    # 2. Section: the stripped text of the parent node, when one exists and
    #    can be resolved through the ID lookup.
    section_name = "N/A (Top Level)"
    if node.parent_node:
        parent_node_obj = all_nodes_dict.get(node.parent_node.node_id)
        if parent_node_obj:
            section_name = parent_node_obj.text.strip()
    print(f"Section: {section_name}")

    # 3. Page number. The loading cell writes "page_number"; "page_label" is
    #    still preferred when a reader supplies it.
    page_num = node.metadata.get(
        'page_label',
        node.metadata.get('page_number', 'N/A (Not available for .docx)'),
    )
    print(f"Page Number: {page_num}")

    # Full metadata dict, for debugging.
    print(f"Original Metadata Dict: {node.metadata}")
    print("-" * 25 + "\n")

In [11]:
# Inspect the metadata attached to the first parsed document. `metadata` is
# used instead of the deprecated `extra_info` alias.
print(documents[0].metadata)

{}


  print(documents[0].extra_info)


In [12]:
import os
import docx  # The library we'll use for pre-processing
from llama_index.core import Document
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes

# --- Step 1: Manual Pre-processing to Simulate Pages ---

def create_paged_documents_from_docx(file_path, paragraphs_per_page=40):
    """
    Split a DOCX file into "simulated page" Document objects.

    The file's paragraphs are read in order and grouped into consecutive runs
    of ``paragraphs_per_page`` paragraphs; the final run may be shorter. Each
    run becomes one Document whose text is the paragraphs joined by newlines.

    Args:
        file_path: Path of the .docx file to read; also stored in each
            Document's metadata under "file_path".
        paragraphs_per_page: Number of paragraphs per simulated page. Must be
            at least 1.

    Returns:
        A list of Document objects, one per simulated page, each carrying
        "file_path" and a 1-based "simulated_page_number" in its metadata.
        The list is empty when the file has no paragraphs.

    Raises:
        ValueError: If ``paragraphs_per_page`` is smaller than 1.
    """
    if paragraphs_per_page < 1:
        raise ValueError("paragraphs_per_page must be a positive integer")

    print(f"Manually simulating pages for {file_path}...")
    doc = docx.Document(file_path)
    paragraph_texts = [para.text for para in doc.paragraphs]

    # Slice on fixed offsets so every page, including the first, holds exactly
    # paragraphs_per_page paragraphs (only the last page may be shorter).
    page_starts = range(0, len(paragraph_texts), paragraphs_per_page)
    paged_documents = [
        Document(
            text="\n".join(paragraph_texts[start:start + paragraphs_per_page]),
            metadata={
                "file_path": file_path,
                "simulated_page_number": page_number
            }
        )
        for page_number, start in enumerate(page_starts, start=1)
    ]

    print(f"Created {len(paged_documents)} simulated page documents.")
    return paged_documents

import re

# --- Step 2: Use the pre-processed documents in your LlamaIndex pipeline ---

# Build the simulated-page documents first. LlamaParse is not used in this workflow.
file_path = "./runbooks/sample_runbook.docx"
paged_docs = create_paged_documents_from_docx(file_path, paragraphs_per_page=40)

# Split the custom Document objects with a flat sentence splitter;
# HierarchicalNodeParser is not needed when LlamaParse headers are not used.
from llama_index.core.node_parser import SentenceSplitter
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)

# Each resulting node carries the metadata of the document it came from
# (see the printed metadata dicts below).
nodes = node_parser.get_nodes_from_documents(paged_docs)

# --- Step 3: Find relevant nodes and check their metadata ---
keywords = ["DR", "disaster", "recovery", "failover", "fallback", "redundant"]

# Short acronyms such as "DR" are matched as whole words only; a plain
# case-insensitive substring test would also hit "address", "hundred", etc.
# Longer keywords keep substring semantics.
keyword_pattern = re.compile(
    "|".join(
        rf"\b{re.escape(kw)}\b" if len(kw) <= 3 else re.escape(kw)
        for kw in keywords
    ),
    re.IGNORECASE,
)
relevant_nodes = [node for node in nodes if keyword_pattern.search(node.text)]

print(f"\nFound {len(relevant_nodes)} relevant nodes.\n" + "="*50)

for i, node in enumerate(relevant_nodes, start=1):
    print(f"--- Relevant Node {i} ---")
    print(f"Text Snippet: {node.text[:100].strip()}...")

    # Metadata set on the simulated-page documents.
    print(f"Node's Metadata Dict: {node.metadata}")

    # Custom page number written by create_paged_documents_from_docx.
    page_num = node.metadata.get('simulated_page_number', 'N/A')
    print(f"Simulated Page Number: {page_num}")
    print("-" * 25 + "\n")

Manually simulating pages for ./runbooks/sample_runbook.docx...
Created 8 simulated page documents.

Found 7 relevant nodes.
--- Relevant Node 1 ---
Text Snippet: Daily Health Reports: Sent by 10 AM 
Weekly Infra Summary: Includes uptime, incidents, changes 
Mont...
Node's Metadata Dict: {'file_path': './runbooks/sample_runbook.docx', 'simulated_page_number': 2}
Simulated Page Number: 2
-------------------------

--- Relevant Node 2 ---
Text Snippet: Cloud DR with snapshot restore. Fallback via cloud console.  
**Upstream Dependencies:** Database, S...
Node's Metadata Dict: {'file_path': './runbooks/sample_runbook.docx', 'simulated_page_number': 3}
Simulated Page Number: 3
-------------------------

--- Relevant Node 3 ---
Text Snippet: 5. Validate service status: sudo systemctl status <service>  
6. Notify stakeholders and update inci...
Node's Metadata Dict: {'file_path': './runbooks/sample_runbook.docx', 'simulated_page_number': 4}
Simulated Page Number: 4
-------------------------
