### Replica 9: RAG con recuperación por extracción + generador

#### A. Indexar documentos con un cleaner

In [1]:
from pprint import pprint
from datasets import load_dataset
from haystack.dataclasses import Document

#! Preparar documentos
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")
pprint(dataset)

documents = [Document(content=doc["content"], meta=doc['meta']) for doc in dataset]
print('\n', documents[0].content)

Dataset({
    features: ['id', 'content', 'content_type', 'meta', 'id_hash_keys', 'score', 'embedding'],
    num_rows: 151
})

 The Colossus of Rhodes (Ancient Greek: ὁ Κολοσσὸς Ῥόδιος, romanized: ho Kolossòs Rhódios Greek: Κολοσσός της Ρόδου, romanized: Kolossós tes Rhódou)[a] was a statue of the Greek sun-god Helios, erected in the city of Rhodes, on the Greek island of the same name, by Chares of Lindos in 280 BC. One of the Seven Wonders of the Ancient World, it was constructed to celebrate the successful defence of Rhodes city against an attack by Demetrius Poliorcetes, who had besieged it for a year with a large army and navy.
According to most contemporary descriptions, the Colossus stood approximately 70 cubits, or 33 metres (108 feet) high – approximately the height of the modern Statue of Liberty from feet to crown – making it the tallest statue in the ancient world.[2] It collapsed during the earthquake of 226 BC, although parts of it were preserved. In accordance with a cer

In [2]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.document_stores.in_memory import InMemoryDocumentStore

#! Definir componentes del pipeline
document_store = InMemoryDocumentStore(embedding_similarity_function='cosine')
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_length=200, split_overlap=50, split_by="sentence")
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

#! Construir pipeline
index_pipeline = Pipeline()
index_pipeline.add_component("cleaner", document_cleaner)
index_pipeline.add_component("splitter", document_splitter)
index_pipeline.add_component("embedder", document_embedder)
index_pipeline.add_component("writer", document_writer)

#! Conectar pipeline
index_pipeline.connect("cleaner", "splitter")
index_pipeline.connect("splitter", "embedder")
index_pipeline.connect("embedder", "writer")

#! Ejecutar pipeline
result = index_pipeline.run({"cleaner": {"documents": documents}})
pprint(result)



Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'writer': {'documents_written': 151}}


### B. Pipeline del generador

In [3]:
from haystack.components.builders import PromptBuilder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.readers import ExtractiveReader
from haystack.components.generators import OpenAIGenerator
from haystack.utils import Secret

#! Definir componentes del pipeline
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
reader = ExtractiveReader()
reader.warm_up()

template = """
You are a helpful assistant. Use the extracted answers and additional context provided to answer the question.

Question: {{question}}

Extracted Answers:
{% for answer in answers if answer.score > 0.5 %}
- {{ answer.data }} (Score: {{ answer.score }})
{% endfor %}

Context:
{% for document in documents %}
{{ document.content }}
{% endfor %}

Answer:
"""
prompt_builder = PromptBuilder(template=template)


generator = OpenAIGenerator(
    api_key= Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4o-mini-2024-07-18"
) 

In [4]:
from haystack import Pipeline

#! Construir pipeline
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", retriever)
query_pipeline.add_component("reader", reader)
query_pipeline.add_component("prompt_builder", prompt_builder)
query_pipeline.add_component("generator", generator)

#! Conectar pipeline
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever.documents", "reader.documents")
query_pipeline.connect("reader.answers", "prompt_builder.answers")
query_pipeline.connect("retriever.documents", "prompt_builder.documents")
query_pipeline.connect("prompt_builder.prompt", "generator.prompt")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7fa6942040d0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - reader.answers -> prompt_builder.answers (List[ExtractedAnswer])
  - prompt_builder.prompt -> generator.prompt (str)

In [7]:
# query_pipeline.show()

In [5]:
query = "Who was Pliny the Elder?"
result = query_pipeline.run(
    data={"text_embedder": {"text": query}, "retriever": {"top_k": 2}, "reader": {"query": query, "top_k": 2}}
)
pprint(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'generator': {'meta': [{'finish_reason': 'stop',
                         'index': 0,
                         'model': 'gpt-4o-mini-2024-07-18',
                         'usage': {'completion_tokens': 57,
                                   'completion_tokens_details': CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0),
                                   'prompt_tokens': 551,
                                   'prompt_tokens_details': PromptTokensDetails(audio_tokens=0, cached_tokens=0),
                                   'total_tokens': 608}}],
               'replies': ['Pliny the Elder was a Roman author known for his '
                           'extensive work, the Naturalis Historia, which '
                           'covered a wide range of knowledge and has survived '
                           'from the Roman Empire. He is often recognized as a '
                           'significant Roman writer and natu