In [None]:

import os
import asyncio
from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import Document
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv

# Load API keys from the shared config file into the process environment.
load_dotenv("../config.env")

# Report whether each key this notebook looks up is set, WITHOUT echoing its
# value: a bare os.environ.get(...) as the last expression of a cell would be
# rendered into the saved notebook output and leak the secret.
{key_name: bool(os.environ.get(key_name)) for key_name in ("OPENAI_API_KEY", "LLAMAPARSE_API_KEY")}

In [None]:
# Build the LlamaParse client that the parsing cell below reuses for every PDF.
# NOTE(review): the per-option notes are inferred from the option names --
# confirm them against the LlamaParse documentation.
parser = LlamaParse(
    verbose=True,
    # Output format; "text" is the alternative noted by the original author.
    result_type="markdown",
    extract_charts=True,
    # Auto mode, with triggers for pages containing a table or an image.
    auto_mode=True,
    auto_mode_trigger_on_table_in_page=True,
    auto_mode_trigger_on_image_in_page=True,
    # Presumably fractional page margins to exclude (running header/footer)
    # -- TODO confirm units and direction.
    bbox_bottom=0.1,
    bbox_top=0.05,
    # Key is read from the environment rather than hardcoded.
    api_key=os.environ.get("LLAMAPARSE_API_KEY"),
)

# documents = parser.load_data(f"GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf")
# # Write the output to a file
# with open("output.md", "w", encoding="utf-8") as f:
#    for doc in documents:
#        f.write(doc.text)
# filename="GuidelinesSections/Kenya-ARV-Guidelines-2022-HepB-HepC-Coinfection.pdf"
# full_text = "\n\n".join(doc.text for doc in documents)
# combined_doc = Document(text=full_text)
# node_parser = SimpleNodeParser()
# nodes = node_parser.get_nodes_from_documents([combined_doc])
# # create the index
# index = VectorStoreIndex(nodes)
# # remove "Kenya-ARV-Guidelines-2022-" from filename
# short_filename = filename.replace("GuidelinesSections/Kenya-ARV-Guidelines-2022-","").replace(".pdf", "")
# # persist the index
# index.storage_context.persist(f"lp/indices/{short_filename}")


In [None]:
# iterate through all PDF files in GuidelinesSections
# first, load the data using the parser
# then, flatten the data in each doc to create a single large doc per section
# next, chunk the data using SimpleNodeParser (tight size control)
# finally, build a VectorStoreIndex from the chunks and persist it under lp/indices/
async def parse_docs():
    """Parse every PDF in ``GuidelinesSections`` and persist one vector index per file.

    For each ``*.pdf`` file (processed in sorted order):

    1. Parse it with the module-level ``parser``.
    2. Join the text of all returned documents (blank-line separated) into a
       single ``Document``.
    3. Split that document into nodes with ``SimpleNodeParser``.
    4. Build a ``VectorStoreIndex`` over the nodes.
    5. Persist the index to ``lp/indices/<short name>``, where the short name
       is the file name without its ``.pdf`` extension and without the text
       ``Kenya-ARV-Guidelines-2022-``.

    Files for which the parser returns no documents are skipped with a
    message instead of persisting an empty index.

    Returns:
        None
    """
    pdf_suffix = ".pdf"
    # Stateless splitter: build it once instead of once per file.
    node_parser = SimpleNodeParser()

    # os.listdir() order is arbitrary; sorting makes re-runs reproducible.
    for filename in sorted(os.listdir("GuidelinesSections")):
        if not filename.endswith(pdf_suffix):
            continue

        # Await the async API so this coroutine yields to the event loop
        # rather than blocking it with the synchronous load_data().
        documents = await parser.aload_data(f"GuidelinesSections/{filename}")
        if not documents:
            # Nothing came back for this file; do not write an empty index.
            print(f"Skipping {filename}: parser returned no documents")
            continue

        full_text = "\n\n".join(doc.text for doc in documents)
        combined_doc = Document(text=full_text)
        nodes = node_parser.get_nodes_from_documents([combined_doc])

        # create the index
        index = VectorStoreIndex(nodes)

        # Strip only the trailing extension (str.replace would also remove a
        # ".pdf" occurring mid-name), then drop the common guideline prefix.
        stem = filename[: -len(pdf_suffix)]
        short_filename = stem.replace("Kenya-ARV-Guidelines-2022-", "")

        # persist the index
        index.storage_context.persist(f"lp/indices/{short_filename}")
        
# Run the parse -> index -> persist loop over every PDF (top-level await in a notebook cell).
await parse_docs()
