In [59]:
import glob
import math
import os
import shutil
from typing import Sequence
import chromadb

from llama_index.core import Settings
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import BaseNode, Document
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore

from llama_index.core.ingestion import IngestionPipeline
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)

from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

from llmsherpa.readers import LayoutPDFReader
from llmsherpa.readers.layout_reader import Block
from sympy import sequence

  from .autonotebook import tqdm as notebook_tqdm


In [60]:
# Folder that is globbed for input PDF files.
DATA_PATH = "./pdf"

# Path/name reserved for the Chroma store (not referenced in the cells visible here).
CHROMA_PATH = "chroma"

# LLMSherpa parsing endpoint passed to LayoutPDFReader; split across two
# literals only to keep the line short — the resulting string is unchanged.
LLM_SHERPA_API_URL = (
    "https://readers.llmsherpa.com/api/document/developer/parseDocument"
    "?renderFormat=all"
)

In [61]:
# Configure llama_index's global Settings object with local Ollama-backed models.
# NOTE(review): presumably these act as the defaults for the extractors and any
# index built later in the notebook, and both models must already be available
# to the Ollama server — confirm before a fresh run.

# Text-generation model: Ollama model "gemma:2b".
Settings.llm = Ollama(model="gemma:2b")
# Embedding model: Ollama model "snowflake-arctic-embed".
Settings.embed_model = OllamaEmbedding(model_name="snowflake-arctic-embed")

In [62]:
# Parse PDFs from DATA_PATH with LLMSherpa and turn every layout chunk into a
# llama_index Document carrying its page index, source file name and block tag.

# Maximum number of PDFs to parse in one run (the original cell stopped after
# the first file). Set to None to process every PDF in DATA_PATH.
MAX_PDF_FILES = 1

pdf_reader = LayoutPDFReader(LLM_SHERPA_API_URL)

# glob returns files in filesystem-dependent order; sorting makes the selected
# file(s) the same on every run and on every machine.
pdf_files = sorted(glob.glob(DATA_PATH + "/*.pdf"))[:MAX_PDF_FILES]
if not pdf_files:
    # Fail here rather than letting the slow ingestion cell run on nothing.
    raise FileNotFoundError(f"No PDF files found under {DATA_PATH!r}")

documents = []
for i, file in enumerate(pdf_files, start=1):
    print(file, "k=", i)
    doc = pdf_reader.read_pdf(file)
    block: Block
    for block in doc.chunks():
        metadata = {
            "page": block.page_idx,
            "source": os.path.basename(file),
            "tag": block.tag,
        }
        document = Document(text=block.to_text(), metadata=metadata)
        documents.append(document)

./pdf/PMC10166749.pdf k= 1


In [65]:
# Enrich every Document with LLM-extracted keywords and named entities.
import nest_asyncio

# nest_asyncio patches asyncio so an event loop can be re-entered.
# NOTE(review): presumably required because the pipeline/extractors drive
# asyncio while the notebook kernel's loop is already running — confirm.
nest_asyncio.apply()

transformations = [
    KeywordExtractor(),
    EntityExtractor(device="cpu"),  # entity extraction explicitly pinned to CPU
]

pipeline = IngestionPipeline(transformations=transformations)
# In the recorded run this step took several minutes for 119 chunks.
nodes = pipeline.run(show_progress=True, documents=documents)

# Show a count and a small sample instead of the repr of every node, which
# floods the cell output and bloats the saved notebook.
print(f"{len(nodes)} nodes produced")
nodes[:3]

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
100%|██████████| 119/119 [01:41<00:00,  1.18it/s] 
Extracting entities: 100%|██████████| 119/119 [02:01<00:00,  1.02s/it]


[Document(id_='8f41bb73-c78c-48b9-ab6a-ee3ad966ceb4', embedding=None, metadata={'page': 0, 'source': 'PMC10166749.pdf', 'tag': 'para', 'excerpt_keywords': 'Sure, here are 5 unique keywords for the document:\n\n1. PMC10166749\n2. Department of Public Health Dentistry\n3. Rajarajeswari Dental College and Hospital\n4. Bengaluru, Karnataka, India\n5. PMC10166749.pdf', 'entities': ['Rajarajeswari Dental College and Hospital', 'India', 'Department of Public Health Dentistry', 'Karnataka', 'Bengaluru']}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Department of Public Health Dentistry, Rajarajeswari Dental College and Hospital, Bengaluru, Karnataka, India', start_char_idx=None, end_char_idx=None, text_template='[Excerpt from document]\n{metadata_str}\nExcerpt:\n-----\n{content}\n-----\n', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='91976069-a6ba-4027-a06e-e3517565ca47', embedding=None, metadata={'page': 0, 'source': 