In [4]:
from haystack import Document
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
import pymupdf4llm


def extractWithImages(pdfPath):
    md_text = pymupdf4llm.to_markdown(pdfPath, write_images=True)
    return md_text

pdf_path="./documents/3.pdf"
dataset = extractWithImages(pdf_path)

documents = [Document(content=dataset, meta={"source": pdf_path})]

model = "bert-base-uncased"

document_store = InMemoryDocumentStore()

indexing_pipeline = Pipeline()

indexing_pipeline.add_component(instance=SentenceTransformersDocumentEmbedder(model=model), name="embedder")
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
indexing_pipeline.connect("embedder.documents", "writer.documents")

indexing_pipeline.run({"documents": documents})

Batches: 100%|██████████| 1/1 [00:04<00:00,  4.61s/it]


{'writer': {'documents_written': 1}}

In [5]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder


retriever = InMemoryEmbeddingRetriever(document_store=document_store)
reader = ExtractiveReader()
reader.warm_up()

extractive_qa_pipeline = Pipeline()

extractive_qa_pipeline.add_component(instance=SentenceTransformersTextEmbedder(model=model), name="embedder")
extractive_qa_pipeline.add_component(instance=retriever, name="retriever")
extractive_qa_pipeline.add_component(instance=reader, name="reader")

extractive_qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
extractive_qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x29d262b10>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

Try extracting some answers.

In [6]:
query = "what is the Targeted availability?"
extractive_qa_pipeline.run(
    data={"embedder": {"text": query}, "retriever": {"top_k": 3}, "reader": {"query": query, "top_k": 2}}
)

Batches: 100%|██████████| 1/1 [00:00<00:00,  7.75it/s]


{'reader': {'answers': [ExtractedAnswer(query='what is the Targeted availability?', score=0.811827540397644, data='16/7: 16 hours a day, 7 days a week', document=Document(id=e0e0cebded82722dc00fa203859c616a5d5c6f91013c35ed4b7d8a7094b8b59a, content: '|Sample Software Architecture Document (version 0.7)|Col2|
   |---|---|
   
   
   # Software Architecture Docum...', meta: {'source': './documents/3.pdf'}, score: 34.41210952502766), context=None, document_offset=ExtractedAnswer.Span(start=13629, end=13664), context_offset=None, meta={}),
   ExtractedAnswer(query='what is the Targeted availability?', score=0.6228777170181274, data='Transparent failover mechanism', document=Document(id=e0e0cebded82722dc00fa203859c616a5d5c6f91013c35ed4b7d8a7094b8b59a, content: '|Sample Software Architecture Document (version 0.7)|Col2|
   |---|---|
   
   
   # Software Architecture Docum...', meta: {'source': './documents/3.pdf'}, score: 34.41210952502766), context=None, document_offset=ExtractedAnswer.Span