In [1]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()

In [3]:
from haystack.components.writers import DocumentWriter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors.document_splitter import DocumentSplitter
from haystack import Pipeline
from haystack.utils import ComponentDevice
from haystack.components.converters import PyPDFToDocument


document_splitter = DocumentSplitter(split_by="word", split_length=512, split_overlap=32)
document_embedder = SentenceTransformersDocumentEmbedder(
    model="BAAI/bge-small-en-v1.5"
)
document_writer = DocumentWriter(document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("document_converter", PyPDFToDocument())
indexing_pipeline.add_component("document_splitter", document_splitter)
indexing_pipeline.add_component("document_embedder", document_embedder)
indexing_pipeline.add_component("document_writer", document_writer)

indexing_pipeline.connect("document_converter", "document_splitter")
indexing_pipeline.connect("document_splitter", "document_embedder")
indexing_pipeline.connect("document_embedder", "document_writer")

indexing_pipeline.run({"document_converter": {"sources": ["nvidia-10q-april-2024.pdf"]}})


Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 129}}

In [4]:
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder

text_embedder = SentenceTransformersTextEmbedder(
    model="BAAI/bge-small-en-v1.5"
)
embedding_retriever = InMemoryEmbeddingRetriever(document_store)
bm25_retriever = InMemoryBM25Retriever(document_store)


In [5]:
from haystack.components.joiners import DocumentJoiner

document_joiner = DocumentJoiner()


In [6]:
from haystack.components.rankers import TransformersSimilarityRanker

ranker = TransformersSimilarityRanker(model="BAAI/bge-reranker-base")


In [7]:
from haystack import Pipeline

hybrid_retrieval = Pipeline()
hybrid_retrieval.add_component("text_embedder", text_embedder)
hybrid_retrieval.add_component("embedding_retriever", embedding_retriever)
hybrid_retrieval.add_component("bm25_retriever", bm25_retriever)
hybrid_retrieval.add_component("document_joiner", document_joiner)
hybrid_retrieval.add_component("ranker", ranker)

hybrid_retrieval.connect("text_embedder", "embedding_retriever")
hybrid_retrieval.connect("bm25_retriever", "document_joiner")
hybrid_retrieval.connect("embedding_retriever", "document_joiner")
hybrid_retrieval.connect("document_joiner", "ranker")


<haystack.core.pipeline.pipeline.Pipeline object at 0x000002353C1A7F50>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - embedding_retriever: InMemoryEmbeddingRetriever
  - bm25_retriever: InMemoryBM25Retriever
  - document_joiner: DocumentJoiner
  - ranker: TransformersSimilarityRanker
🛤️ Connections
  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])
  - embedding_retriever.documents -> document_joiner.documents (List[Document])
  - bm25_retriever.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> ranker.documents (List[Document])

In [10]:
query = "How much did Nvidia pay in income taxes"

result = hybrid_retrieval.run(
    {"text_embedder": {"text": query}, "bm25_retriever": {"query": query}, "ranker": {"query": query}}
)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [29]:
result['ranker']['documents'][0].content

'$ 1,553 $ 1,361 \nDeferred revenue (2) 704 573 \nDeferred income tax 583 462 \nLicenses payable 60 80 \nOther 66 65 \nTotal other long-term liabilities $ 2,966 $ 2,541 \n(1)    Income tax payable is comprised of the long-term portion of the one-time transition tax payable, unrecognized tax benefits, and related interest and penalties.\n(2)    Deferred revenue primarily includes deferrals related to hardware support and software support.\nDeferred Revenue\nThe following table shows the changes in short and long term deferred revenue during the first quarter of fiscal years 2025 and 2024:\n Apr 28, 2024 Apr 30, 2023\n(In millions)\nBalance at beginning of period $ 1,337 $ 572 \nDeferred revenue additions during the period 553 287 \nRevenue recognized during the period (341) (262)\nBalance at end of period $ 1,549 $ 597 \nWe recognized $188 million in revenue in the first quarter of fiscal year 2025 from deferred revenue as of January 28, 2024.\nRevenue allocated to remaining performance

In [30]:
def pretty_print_results(prediction):
    for doc in prediction["documents"]:
        print(doc.content, "\t", doc.score)
        print(doc.meta["abstract"])
        print("\n", "\n")
pretty_print_results(result["ranker"])

$ 1,553 $ 1,361 
Deferred revenue (2) 704 573 
Deferred income tax 583 462 
Licenses payable 60 80 
Other 66 65 
Total other long-term liabilities $ 2,966 $ 2,541 
(1)    Income tax payable is comprised of the long-term portion of the one-time transition tax payable, unrecognized tax benefits, and related interest and penalties.
(2)    Deferred revenue primarily includes deferrals related to hardware support and software support.
Deferred Revenue
The following table shows the changes in short and long term deferred revenue during the first quarter of fiscal years 2025 and 2024:
 Apr 28, 2024 Apr 30, 2023
(In millions)
Balance at beginning of period $ 1,337 $ 572 
Deferred revenue additions during the period 553 287 
Revenue recognized during the period (341) (262)
Balance at end of period $ 1,549 $ 597 
We recognized $188 million in revenue in the first quarter of fiscal year 2025 from deferred revenue as of January 28, 2024.
Revenue allocated to remaining performance obligations, whic

KeyError: 'abstract'