In [1]:

import os
import asyncio
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv("../config.env")
# Confirm the required keys are present WITHOUT echoing their values.
# A bare `os.environ.get(...)` as the last expression of a cell is rendered as
# the cell output, which writes the secret into the saved notebook.
missing_keys = [
    key
    for key in ("OPENAI_API_KEY", "LLAMAPARSE_API_KEY")
    if not os.environ.get(key)
]
if missing_keys:
    print(f"Missing environment variables: {', '.join(missing_keys)}")

'<redacted: LLAMAPARSE_API_KEY value removed from saved output>'

In [2]:
# Build the LlamaParse client from an explicit options mapping so the settings
# can be inspected (or tweaked) separately from the constructor call.
# NOTE(review): the auto-mode and bbox_* values are forwarded unchanged; their
# exact semantics are defined by LlamaParse -- confirm against its docs before tuning.
_llamaparse_options = {
    "api_key": os.environ.get("LLAMAPARSE_API_KEY"),
    "result_type": "markdown",  # "text" is the other option
    "extract_charts": True,
    "auto_mode": True,
    "auto_mode_trigger_on_image_in_page": True,
    "auto_mode_trigger_on_table_in_page": True,
    "bbox_top": 0.05,
    "bbox_bottom": 0.1,
    "verbose": True,
}
parser = LlamaParse(**_llamaparse_options)

# documents = parser.load_data(f"GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf")
# # Write the output to a file
# with open("output.md", "w", encoding="utf-8") as f:
#    for doc in documents:
#        f.write(doc.text)
# filename="GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf"
# full_text = "\n\n".join(doc.text for doc in documents)
# combined_doc = Document(text=full_text)
# node_parser = SimpleNodeParser()
# nodes = node_parser.get_nodes_from_documents([combined_doc])
# # create the index
# index = VectorStoreIndex(nodes)
# # remove "Kenya-ARV-Guidelines-2022-" from filename
# short_filename = filename.replace("GuidelinesSections/Kenya-ARV-Guidelines-2022-","").replace(".pdf", "")
# # persist the index
# index.storage_context.persist(f"lp/indices/{short_filename}")


In [None]:
# iterate through all files in guidance_docs/GuidelinesSections
# first, load the data using the parser
# then, flatten the data in each doc to create a single large doc per section
# finally, chunk the data using SentenceSplitter (imported here under its SimpleNodeParser alias; tight size control)
async def parse_docs(
    source_dir="GuidelinesSections",
    index_root="lp/indices",
    filename_prefix="Kenya-ARV-Guidelines-2022-",
):
    """Parse every PDF in ``source_dir`` and persist one vector index per file.

    For each PDF the module-level ``parser`` is awaited to extract the
    document, the per-page texts are joined into a single ``Document``, that
    document is chunked into nodes, a ``VectorStoreIndex`` is built from the
    nodes, and the index is persisted under ``index_root``.

    Args:
        source_dir: Directory scanned (non-recursively) for ``.pdf`` files.
        index_root: Directory under which each index is persisted, in a
            sub-directory named after the shortened file name.
        filename_prefix: Text removed from the file name (without extension)
            to form the index directory name.

    Returns:
        None. The results are the index directories written to disk.
    """
    # One splitter is enough; it carries no per-document state we rely on.
    splitter = SimpleNodeParser()

    # Sort so that runs process files in a stable, reproducible order.
    for filename in sorted(os.listdir(source_dir)):
        stem, extension = os.path.splitext(filename)
        if extension.lower() != ".pdf":
            continue

        # Await the async API instead of calling the blocking `load_data`
        # from inside a coroutine.
        documents = await parser.aload_data(os.path.join(source_dir, filename))
        if not documents:
            # Nothing came back for this file; do not persist an empty index.
            print(f"Skipping {filename}: parser returned no documents")
            continue

        # Flatten all returned pieces into one large document per section.
        full_text = "\n\n".join(doc.text for doc in documents)
        nodes = splitter.get_nodes_from_documents([Document(text=full_text)])

        index = VectorStoreIndex(nodes)

        # Only the real extension was stripped above (via splitext), so a
        # ".pdf" occurring elsewhere in the name is preserved.
        short_filename = stem.replace(filename_prefix, "")
        index.storage_context.persist(os.path.join(index_root, short_filename))
        
# Kick off the parse -> index -> persist pass over the PDF directory.
# Top-level `await` only works where the front end runs cells inside an event
# loop (as a notebook does); in a plain script this would need asyncio.run(...).
await parse_docs()


In [2]:
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.retrievers import VectorIndexRetriever
# Re-open the previously persisted "lp_kenya_arv_guidelines" index from disk.
lp_index_dir = "../data/processed/lp_kenya_arv_guidelines"
storage_context = StorageContext.from_defaults(persist_dir=lp_index_dir)
index = load_index_from_storage(storage_context)
# raw_retriever = VectorIndexRetriever(index=index, similarity_top_k=3)

Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp_kenya_arv_guidelines/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ../data/processed/lp_kenya_arv_guidelines/index_store.json.


In [6]:
# Count the entries stored in the loaded index's docstore.
# `docs` maps node_id -> node, so unpacking it yields the node IDs.
node_ids = [*index.docstore.docs]
num_nodes = len(node_ids)

print("Number of nodes:", num_nodes)

Number of nodes: 288


In [8]:
import random

# Materialise every node held in the docstore so one can be sampled.
nodes = [*index.docstore.docs.values()]

# Spot-check: draw a single node at random and show its ID and content.
random_node = random.choice(nodes)
print(f"Node ID: {random_node.node_id}")
print(f"Node Text: {random_node.get_content()}")

Node ID: f2f1fb3d-78e9-4aa9-85f2-742e9df34e31
Node Text: Kenya HIV Prevention and Treatment Guidelines, 2022

▪ Pregnant or breastfeeding: at confirmation of pregnancy (if already on ART) or 3 months after ART initiation (if ART initiated during pregnancy/breastfeeding), and then every 6 months until complete cessation of breastfeeding

▪ Before any drug substitution (if no VL result available from the prior 6 months)

▪ Three months after any regimen modification (including single-drug substitution)

● PLHIV should receive differentiated care based on initial evaluation (advanced vs. well) and follow up (established vs not established on ART)

# 1.3 Standard Package of Care for PLHIV

Consists of 8 components:

1. Antiretroviral Therapy
- All PLHIV are eligible for ART irrespective of CD4 cell count or percentage, WHO clinical stage, age, pregnancy status, or comorbidities
- ART should be initiated as soon as the patient is ready to start, preferably within two weeks from time of HIV 