In [1]:
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack_integrations.components.embedders.ollama.document_embedder import OllamaDocumentEmbedder
from haystack_integrations.components.embedders.ollama.text_embedder import OllamaTextEmbedder
import pymupdf4llm


def extractWithImages(pdfPath):
    """Convert the PDF at ``pdfPath`` to Markdown text using ``pymupdf4llm``.

    The conversion is delegated to ``pymupdf4llm.to_markdown`` with
    ``write_images=True``.

    Args:
        pdfPath: Path of the PDF file to convert.

    Returns:
        The value returned by ``pymupdf4llm.to_markdown`` for the document.
    """
    return pymupdf4llm.to_markdown(pdfPath, write_images=True)

# Source PDF: converted to Markdown and wrapped in a single Document whose
# metadata records the originating file path.
pdf_path = "./documents/3.pdf"
dataset = extractWithImages(pdf_path)

documents = [Document(content=dataset, meta={"source": pdf_path})]

# Ollama model name and embeddings endpoint. The query-side text embedder
# reuses both values so documents and queries are embedded the same way.
# NOTE(review): "llama2" is primarily a text-generation model; a dedicated
# embedding model may give better retrieval -- confirm before changing, since
# stored and query embeddings must come from the same model.
model = "llama2"
url = "http://localhost:11434/api/embeddings"

document_store = InMemoryDocumentStore()

# Indexing pipeline: embed every document, then write it to the in-memory store.
indexing_pipeline = Pipeline()

indexing_pipeline.add_component(instance=OllamaDocumentEmbedder(model=model, url=url), name="embedder_doc")
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
indexing_pipeline.connect("embedder_doc.documents", "writer.documents")

# Last expression of the cell: the run result is shown as the cell output.
indexing_pipeline.run({"documents": documents})

  from .autonotebook import tqdm as notebook_tqdm
Calculating embeddings: 100%|██████████| 1/1 [00:18<00:00, 18.14s/it]


{'embedder_doc': {'meta': {'model': 'llama2'}},
 'writer': {'documents_written': 1}}

In [2]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder


# Extractive QA pipeline: query embedding -> embedding retrieval -> answer extraction.
extractive_qa_pipeline = Pipeline()

# The query embedder uses the same Ollama model and endpoint as the indexing pipeline.
extractive_qa_pipeline.add_component("embedder", OllamaTextEmbedder(model=model, url=url))

# Retrieves from the document store populated by the indexing pipeline.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
extractive_qa_pipeline.add_component("retriever", retriever)

# The reader is warmed up explicitly before it is registered on the pipeline.
reader = ExtractiveReader()
reader.warm_up()
extractive_qa_pipeline.add_component("reader", reader)

extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
# Kept as the final expression so the cell still displays the pipeline summary.
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x110eac7d0>
🚅 Components
  - embedder: OllamaTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

Try extracting some answers.

In [3]:
# The same question text is passed to the embedder (for retrieval) and to the reader.
query = "what is the Estimated online orders?"

# Run inputs, keyed by the component names registered on the pipeline.
qa_inputs = {
    "embedder": {"text": query},
    "retriever": {"top_k": 3},
    "reader": {"query": query, "top_k": 2},
}
extractive_qa_pipeline.run(data=qa_inputs)

{'embedder': {'meta': {'model': 'llama2',
   'duration': datetime.timedelta(seconds=11, microseconds=904542)}},
 'reader': {'answers': [ExtractedAnswer(query='what is the Estimated online orders?', score=0.8233605623245239, data='100 a day', document=Document(id=e0e0cebded82722dc00fa203859c616a5d5c6f91013c35ed4b7d8a7094b8b59a, content: '|Sample Software Architecture Document (version 0.7)|Col2|
   |---|---|
   
   
   # Software Architecture Docum...', meta: {'source': './documents/3.pdf'}, score: 1837.210506401047), context=None, document_offset=ExtractedAnswer.Span(start=20746, end=20755), context_offset=None, meta={}),
   ExtractedAnswer(query='what is the Estimated online orders?', score=0.5708017945289612, data='---|---|\n\n\nThe time left (8 hours) is reserved for any maintenance activities\n\n**3.6** **Performance**\n\nThe payment process (credit card authorization and confirmation) must be under 10 seconds.\n\n**3.7** **Internationalization (i18n)**\n\nThe online catering servi