In [1]:

import os
import asyncio
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv("../config.env")
os.environ.get("OPENAI_API_KEY")
os.environ.get("LLAMAPARSE_API_KEY")

'llx-8X3oeM4OXz1Gmsu7YfI4AQdDkzkAZbE3ejHFXWbslhfBaOmS'

In [None]:
# instantiate LlamaParse
parser = LlamaParse(
    api_key=os.environ.get("LLAMAPARSE_API_KEY"),
    result_type="markdown",  # or "text"
    extract_charts=True,
    auto_mode=True,
    auto_mode_trigger_on_image_in_page=True,
    auto_mode_trigger_on_table_in_page=True,
    bbox_top=0.05,
    bbox_bottom=0.1,
    verbose=True
)

# documents = parser.load_data(f"GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf")
# # Write the output to a file
# with open("output.md", "w", encoding="utf-8") as f:
#    for doc in documents:
#        f.write(doc.text)
# filename="GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf"
# full_text = "\n\n".join(doc.text for doc in documents)
# combined_doc = Document(text=full_text)
# node_parser = SimpleNodeParser()
# nodes = node_parser.get_nodes_from_documents([combined_doc])
# # create the index
# index = VectorStoreIndex(nodes)
# # remove "Kenya-ARV-Guidelines-2022-" from filename
# short_filename = filename.replace("GuidelinesSections/Kenya-ARV-Guidelines-2022-","").replace(".pdf", "")
# # persist the index
# index.storage_context.persist(f"lp/indices/{short_filename}")


In [None]:
# iterate through all files in guidance_docs/GuidelinesSections
# first, load the data using the parser
# then, flatted the data in each doc to create a single large doc per section
# finally, chunk the data using SentenceSplitter (tight size control)
async def parse_docs():
    for filename in os.listdir("GuidelinesSections"):
        if filename.endswith(".pdf"):
            documents = parser.load_data(f"GuidelinesSections/{filename}")
            full_text = "\n\n".join(doc.text for doc in documents)
            combined_doc = Document(text=full_text)
            node_parser = SimpleNodeParser()
            nodes = node_parser.get_nodes_from_documents([combined_doc])
            # create the index
            index = VectorStoreIndex(nodes)
            # remove "Kenya-ARV-Guidelines-2022-" from filename
            short_filename = filename.replace("Kenya-ARV-Guidelines-2022-","").replace(".pdf", "")
            # persist the index
            index.storage_context.persist(f"lp/indices/{short_filename}")
        
await parse_docs()


In [None]:
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
# load lp_kenya_arv_guidelines
storage_context = StorageContext.from_defaults(persist_dir="../data/processed/lp_kenya_arv_guidelines")
index = load_index_from_storage(storage_context)
# raw_retriever = VectorIndexRetriever(index=index, similarity_top_k=3)

In [None]:
# get the number of nodes in the index
# Get all node IDs from the docstore
node_ids = list(index.docstore.docs.keys())  # dict of {node_id: node}
num_nodes = len(node_ids)

print(f"Number of nodes: {num_nodes}")

In [None]:
import random

# Get all nodes
nodes = list(index.docstore.docs.values())

# Pick a random node
random_node = random.choice(nodes)
print("Node ID:", random_node.node_id)
print("Node Text:", random_node.get_content())

Combine nodes into single global vectorstore

In [None]:
import os
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex

# path where all your chapter indices live
# e.g. ./indices/chapter1, ./indices/chapter2, ... ./indices/chapter30
BASE_DIR = "../data/processed/lp/indices"
GLOBAL_DIR = os.path.join(BASE_DIR, "Global")

# collect all subfolders under indices (excluding "global" if it already exists)
chapter_folders = [
    os.path.join(BASE_DIR, f) for f in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, f)) and f != "Global"
]

all_nodes = []

for folder in chapter_folders:
    print(f"Loading nodes from {folder} ...")
    sc = StorageContext.from_defaults(persist_dir=folder)
    idx = load_index_from_storage(sc)

    # FIX: access the underlying docs dict in the docstore
    nodes = list(idx.storage_context.docstore.docs.values())
    all_nodes.extend(nodes)

print(f"Total nodes collected: {len(all_nodes)}")

print("Building global index...")
global_index = VectorStoreIndex(all_nodes)

print(f"Persisting global index to {GLOBAL_DIR} ...")
global_index.storage_context.persist(persist_dir=GLOBAL_DIR)

print("Done! 🎉 Global index is ready.")

Collect all nodes per chapter and write a summary

In [8]:
import os
from llama_index.core import StorageContext, load_index_from_storage, VectorStoreIndex
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(temperature=0.0, model="gpt-4o")

# path where all your chapter indices live
# e.g. ./indices/chapter1, ./indices/chapter2, ... ./indices/chapter30
BASE_DIR = "../data/processed/lp/indices"
GLOBAL_DIR = os.path.join(BASE_DIR, "Global")

# collect all subfolders under indices (excluding "global" if it already exists)
chapter_folders = [
    os.path.join(BASE_DIR, f) for f in os.listdir(BASE_DIR)
    if os.path.isdir(os.path.join(BASE_DIR, f)) and f != "Global"
]

PROMPT_TEMPLATE = """
You are preparing a summary of a clinical guideline chapter for use in a retrieval-augmented generation (RAG) system.

Your goal is to create a summary that:
- Clearly differentiates this chapter from others in the same HIV guideline collection
- Lists all major clinical topics, drugs, diseases, populations, and interventions discussed
- Includes synonyms, abbreviations, and alternate terms clinicians might use
- Mentions any unique recommendations, exceptions, or special scenarios covered in this chapter
- Avoids copying large blocks of text, but is more detailed than a generic overview

Write the summary as a concise paragraph or a bulleted list. Include enough detail and keywords so that a search for any relevant clinical question about this chapter would match this summary.

Chapter text:
\"\"\"{chapter_text}\"\"\"
"""

all_summaries = []

for folder in chapter_folders:
    print(f"Loading nodes from {folder} ...")
    sc = StorageContext.from_defaults(persist_dir=folder)
    idx = load_index_from_storage(sc)

    # FIX: access the underlying docs dict in the docstore
    nodes = list(idx.storage_context.docstore.docs.values())
    
    # combine the text from all nodes
    chapter_text = "\n".join([node.text for node in nodes])

    # create a summary of the chapter
    summary = llm.invoke(PROMPT_TEMPLATE.format(chapter_text=chapter_text))

    # append summary to list
    all_summaries.append(summary)

# write out summaries to csv
import csv

with open("chapter_summaries.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Chapter", "Summary"])
    for i, summary in enumerate(all_summaries):
        writer.writerow([f"Chapter {i+1}", summary.content])

Loading nodes from ../data/processed/lp/indices/HIVTestingServicesandLinkagetoTreatment ...
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp/indices/HIVTestingServicesandLinkagetoTreatment/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp/indices/HIVTestingServicesandLinkagetoTreatment/index_store.json.
Loading nodes from ../data/processed/lp/indices/SummaryKeyRecommendations ...
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp/indices/SummaryKeyRecommendations/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp/indices/SummaryKeyRecommendations/index_store.json.
Loading nodes from ../data/processed/lp/indices/HTSAdultScreeningToolEnhancement ...
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp/indices/HTSAdultScreeningToolEnhancement/docstore.json.
Loading llama_index.core.storage.kvstore.simple_k

In [7]:
print(summary.content)

**Chapter Summary: HIV Testing Services and Linkage to Treatment and Prevention**

This chapter focuses on HIV Testing Services (HTS) as the initial step in linking individuals to comprehensive HIV treatment and prevention services, including voluntary medical male circumcision (VMMC), pre-exposure prophylaxis (PrEP), and post-exposure prophylaxis (PEP). It emphasizes the ethical conduct of testing, adhering to the six Cs: Consent, Confidentiality, Counselling, Correct results, Connection to care, and Creating an enabling environment. Key strategies include targeted HIV testing, index testing, HIV self-testing (HIVST), and voluntary counselling and testing (VCT). Facility-based and community-based settings are outlined for delivering HTS, with specific recommendations for different populations such as infants, children, adolescents, pregnant women, and key populations. The chapter details the use of HIV testing algorithms, including early infant diagnosis (EID) and dual HIV/syphilis te