In [10]:
import glob
import math
import os
import shutil
import chromadb

from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import BaseNode, Document
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llmsherpa.readers import LayoutPDFReader
from llmsherpa.readers.layout_reader import Block

from embedding import get_ollama_embedding

In [11]:
# --- Notebook configuration -------------------------------------------------
# Directory that is globbed for input PDF files.
DATA_PATH = "./pdf"

# Path name reserved for the Chroma store (not referenced by the cells visible here).
CHROMA_PATH = "chroma"

# LLM Sherpa parsing endpoint handed to LayoutPDFReader.
LLM_SHERPA_API_URL = (
    "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
)

In [57]:
# Parse the PDFs found under DATA_PATH with LLM Sherpa and wrap every layout
# chunk in a llama_index Document carrying page / source / tag metadata.

# Upper bound on how many PDFs are parsed per run (each one is a remote API
# call). Set to None to parse every PDF in DATA_PATH.
MAX_PDF_FILES = 1

pdf_reader = LayoutPDFReader(LLM_SHERPA_API_URL)
blocks = []     # raw layout chunks; the later cells that iterate `blocks` need these
documents = []  # one Document per chunk

# glob.glob returns matches in arbitrary order, so sort before truncating:
# otherwise the file that gets parsed can change between runs/machines.
pdf_files = sorted(glob.glob(DATA_PATH + "/*.pdf"))[:MAX_PDF_FILES]

for i, file in enumerate(pdf_files, start=1):
    print(file, "k=", i)
    doc = pdf_reader.read_pdf(file)
    block: Block
    for block in doc.chunks():
        metadata = {
            "page": block.page_idx,
            "source": os.path.basename(file),
            "tag": block.tag,
        }
        document = Document(text=block.to_text(), metadata=metadata)
        documents.append(document)
        # Keep the raw chunk as well: downstream cells call parent_text() and
        # to_context_text() on the entries of `blocks`, and would silently
        # process nothing on a fresh kernel if this list stayed empty.
        blocks.append(block)

./pdf/PMC10166749.pdf k= 1


In [58]:
# Inspect only the metadata attached to each Document; the chunk text is
# intentionally left out to keep the cell output short.
for parsed_document in documents:
    chunk_metadata = parsed_document.metadata
    print(chunk_metadata)

{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 0, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 1, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 2, 'source': 'PMC10166749.pdf', 'keyword': 'para'}
{'page': 2, 'source': 'PMC10166749.pdf',

In [31]:
# Rebuild `documents` from the raw layout chunks, this time storing each
# chunk's heading trail as metadata: parent_text() is split on ">" and every
# piece is whitespace-trimmed.
documents = []
for block in blocks:
    heading_trail = [part.strip() for part in block.parent_text().split(">")]
    chunk_document = Document(
        text=block.to_text(),
        metadata={"metadata": heading_trail},
    )
    documents.append(chunk_document)
documents

[Document(id_='8e1267ed-fbb5-4494-a866-ca099ceaf837', embedding=None, metadata={'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'M.Y Jayachandra, R. Gayathiri, C.N.Aruna, Padma K. Bhat, P.M. Arumugam']}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Department of Public Health Dentistry, Rajarajeswari Dental College and Hospital, Bengaluru, Karnataka, India', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='fb0f6f55-3fc3-4d6f-9f46-4e2f5101bbc5', embedding=None, metadata={'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'ABSTRACT']}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Background: The burden of oral diseases is increasing, which constitute a major public health problem.\nThe u

In [39]:
# Print every chunk's context text, each one preceded by a "---" separator line.
for chunk in blocks:
    print("---")
    context_text = chunk.to_context_text()
    print(context_text)

---
Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review > M.Y Jayachandra, R. Gayathiri, C.N.Aruna, Padma K. Bhat, P.M. Arumugam
Department of Public Health Dentistry, Rajarajeswari Dental College and Hospital, Bengaluru, Karnataka, India
---
Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review > ABSTRACT
Background: The burden of oral diseases is increasing, which constitute a major public health problem.
The use of probiotics as an adjuvant, along with routine dental care practice by an individual, can produce additional benefits in the maintenance of one’s oral health.
The study aimed to investigate the effect of Bifidobacterium as a probiotic on oral health.
---
Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review > ABSTRACT > Address for correspondence:
Dr. R. Gayathiri, Rajarajeswari Dental College and Hospital, No 14, Ramohalli Cross, Mysore Road, Kumbalgodu, Bengaluru ‑ 560 07

In [34]:
# Print the metadata dict of every Document under a "--- Doc ---" header.
# The loop variable is deliberately not named `doc`: in this notebook `doc`
# holds the parsed PDF returned by pdf_reader.read_pdf(...), and looping with
# that name would overwrite it with an unrelated Document object.
for indexed_document in documents:
    print("--- Doc ---")
    print(indexed_document.metadata)


--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'M.Y Jayachandra, R. Gayathiri, C.N.Aruna, Padma K. Bhat, P.M. Arumugam']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'ABSTRACT']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'ABSTRACT', 'Address for correspondence:']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'ABSTRACT', 'Address for correspondence:']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'ABSTRACT', 'Address for correspondence:']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a probiotic on oral health: A systematic review', 'INTRODUCTION']}
--- Doc ---
{'metadata': ['Clinical effects of Bifidobacterium as a