In [4]:
import os
from openai import OpenAI as OpenAIClient
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import MarkdownElementNodeParser
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI as LlamaOpenAI

import os
from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core.node_parser import get_leaf_nodes
from llama_parse import LlamaParse

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set in .env file")

llm = LlamaOpenAI(model="gpt-4o", api_key=openai_api_key)
Settings.llm = llm
openai_client = OpenAIClient(api_key=openai_api_key)

llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
if not llama_cloud_api_key:
    raise ValueError("LLAMA_CLOUD_API_KEY is not set in .env file")

In [5]:
llama_parser = LlamaParse(page_prefix="START OF PAGE: {pageNumber}\n",page_suffix="\nEND OF PAGE: {pageNumber}",api_key="",verbose=True,result_type="markdown")
# Load the document again with the new parser settings
file_name = "sample_runbook.docx"
extra_info = {"file_name": file_name, "file_type": "docx"}

documents = llama_parser.load_data("./runbooks/sample_runbook.docx", extra_info=extra_info)
for i, doc in enumerate(documents, start=1):
    doc.metadata["page_number"] = i



Started parsing the file under job_id f5ba448b-50cd-486c-ad1f-ee21a0ce1c63


In [6]:
print("--- Full Markdown Content of Document ---")
print(documents[0].text[:500].strip() + "...\n")
print("---------------------------------------")

# --- Creating a Node Hierarchy ---
# node_parser = HierarchicalNodeParser.from_defaults(
#     chunk_sizes=[2048, 512, 128]
# )
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=15)
nodes = node_parser.get_nodes_from_documents(documents)
leaf_nodes = get_leaf_nodes(nodes)

# --- [THE KEY FIX] Create a lookup dictionary from node ID to node object ---
# This allows us to easily retrieve a node by its ID.
all_nodes_dict = {node.node_id: node for node in nodes}

# --- Finding Relevant Nodes ---
relevant_nodes = []
keywords = ["DR", "disaster", "recovery", "failover", "fallback", "redundant"]

for node in leaf_nodes:
    if any(kw.lower() in node.text.lower() for kw in keywords):
        relevant_nodes.append(node)

print(f"\nFound {len(relevant_nodes)} relevant nodes based on keywords.\n" + "="*50)

# --- Printing the Results with Correct Metadata ---
for i, node in enumerate(relevant_nodes, start=1):
    if i<=10:
        print(f"--- Relevant Node {i} ---")
        # print(f"Text Snippet: {node.text[:100].strip()}...")

        # METADATA EXTRACTION:
        # 1. Document Name
        doc_name = node.metadata.get('file_path', 'N/A')
        print(f"Document Name: {doc_name}")

        # 2. Section Name - Corrected Logic
        section_name = "N/A (Top Level)"
        if node.parent_node:
            # a) Get the parent's ID from the RelatedNodeInfo object
            parent_id = node.parent_node.node_id
            
            # b) Look up the actual parent node in our dictionary
            parent_node_obj = all_nodes_dict.get(parent_id)
            
            if parent_node_obj:
                # c) Now you can safely access the parent's text
                section_name = parent_node_obj.text.strip()
                
        print(f"Section: {section_name}")

        # 3. Page Number
        page_num = node.metadata.get('page_label', 'N/A (Not available for .docx)')
        print(f"Page Number: {page_num}")
        
        # You can also print the full original metadata for debugging
        print(f"Original Metadata Dict: {node.metadata}")
        print("-" * 25 + "\n")

--- Full Markdown Content of Document ---
START OF PAGE: 1
🏦 Bank IT Production Management Runbook – Infrastructure Support

# 1. 📘 Overview

This runbook provides standardized procedures for monitoring, maintaining, and supporting the bank’s IT infrastructure in production environments. It ensures high availability, performance, and security of systems critical to banking operations.

# 2. 📘 Infrastructure Components Covered

- Servers: Physical and virtual (Windows/Linux)
- Storage Systems: SAN/NAS
- Network Devices: Routers, switches...

---------------------------------------


2it [00:00, ?it/s]
1it [00:00, ?it/s]
1it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
3it [00:00, ?it/s]
4it [00:00, ?it/s]
0it [00:00, ?it/s]


Found 15 relevant nodes based on keywords.
--- Relevant Node 1 ---
Document Name: N/A
Section: N/A (Top Level)
Page Number: N/A (Not available for .docx)
Original Metadata Dict: {'file_name': 'sample_runbook.docx', 'file_type': 'docx', 'page_number': 2}
-------------------------

--- Relevant Node 2 ---
Document Name: N/A
Section: N/A (Top Level)
Page Number: N/A (Not available for .docx)
Original Metadata Dict: {'file_name': 'sample_runbook.docx', 'file_type': 'docx', 'page_number': 3, 'col_schema': 'Column: Application\nType: string\nSummary: Names of different applications or systems.\n\nColumn: Production IP\nType: string\nSummary: IP addresses used in the production environment.\n\nColumn: DR IP\nType: string\nSummary: IP addresses used in the disaster recovery environment.'}
-------------------------

--- Relevant Node 3 ---
Document Name: N/A
Section: N/A (Top Level)
Page Number: N/A (Not available for .docx)
Original Metadata Dict: {'file_name': 'sample_runbook.docx', 'file_ty


