### Replica 7: RAG con recuperación por extracción

#### A. Indexar documentos con un cleaner

In [17]:
from pprint import pprint
from datasets import load_dataset
from haystack.dataclasses import Document

#! Preparar documentos
dataset = load_dataset("bilgeyucel/seven-wonders", split="train")
pprint(dataset)

documents = [Document(content=doc["content"], meta=doc["meta"]) for doc in dataset]
print('\n',documents[0].content)

Dataset({
    features: ['id', 'content', 'content_type', 'meta', 'id_hash_keys', 'score', 'embedding'],
    num_rows: 151
})

 The Colossus of Rhodes (Ancient Greek: ὁ Κολοσσὸς Ῥόδιος, romanized: ho Kolossòs Rhódios Greek: Κολοσσός της Ρόδου, romanized: Kolossós tes Rhódou)[a] was a statue of the Greek sun-god Helios, erected in the city of Rhodes, on the Greek island of the same name, by Chares of Lindos in 280 BC. One of the Seven Wonders of the Ancient World, it was constructed to celebrate the successful defence of Rhodes city against an attack by Demetrius Poliorcetes, who had besieged it for a year with a large army and navy.
According to most contemporary descriptions, the Colossus stood approximately 70 cubits, or 33 metres (108 feet) high – approximately the height of the modern Statue of Liberty from feet to crown – making it the tallest statue in the ancient world.[2] It collapsed during the earthquake of 226 BC, although parts of it were preserved. In accordance with a cer

In [19]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack.document_stores.in_memory import InMemoryDocumentStore

#! Definir componentes del pipeline
document_store = InMemoryDocumentStore(embedding_similarity_function='cosine')
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_length=200, split_overlap=50, split_by="sentence")
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

#! Construir pipeline
pipeline = Pipeline()
pipeline.add_component("cleaner", document_cleaner)
pipeline.add_component("splitter", document_splitter)
pipeline.add_component("embedder", document_embedder)
pipeline.add_component("writer", document_writer)

#! Conecar pipeline
pipeline.connect("cleaner", "splitter")
pipeline.connect("splitter", "embedder")
pipeline.connect("embedder", "writer")

#! Ejecutar pipeline
result = pipeline.run({"cleaner": {"documents": documents}})
pprint(result)




Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'writer': {'documents_written': 151}}


#### B. Construir un pipeline para extraer por similitud

In [21]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.embedders import SentenceTransformersTextEmbedder

#! Definir componentes del pipeline
text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/multi-qa-mpnet-base-dot-v1")
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
reader = ExtractiveReader()
reader.warm_up()

#! Construir pipeline
qa_pipeline = Pipeline()
qa_pipeline.add_component("embedder", text_embedder)
qa_pipeline.add_component("retriever", retriever)
qa_pipeline.add_component("reader", reader)

#! Conecar pipeline
qa_pipeline.connect("embedder.embedding", "retriever.query_embedding")
qa_pipeline.connect("retriever.documents", "reader.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f1970ea0590>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - reader: ExtractiveReader
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> reader.documents (List[Document])

In [27]:
query = "Who was Pliny the Elder?"
result = qa_pipeline.run(
    data={"embedder": {"text": query}, "retriever": {"top_k": 3}, "reader": {"query": query, "top_k": 2}}
)
pprint(result)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'reader': {'answers': [ExtractedAnswer(query='Who was Pliny the Elder?',
                                        score=0.8304970860481262,
                                        data='Roman writer',
                                        document=Document(id=b29ec3cdee4191168d2654d66e72a6e5a2a3a32c35a78751ff7df67a86962968, content: 'The Roman writer Pliny the Elder, writing in the first century AD, argued that the Great Pyramid had...', meta: {'url': 'https://en.wikipedia.org/wiki/Great_Pyramid_of_Giza', '_split_id': 16, 'source_id': 'f442d4b35d66aa01ba9aef22155dbecb0711cd430fd31da92a0b752dc9c93f85', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0, '_split_overlap': []}, score: 0.5954216479225498),
                                        context=None,
                                        document_offset=Span(start=4,
                                                             end=16),
                                        context_offset=None,
                            

In [25]:
valid_answers = [
    answer for answer in result["reader"]["answers"]
    if answer.data is not None and answer.score > 0.1  # Ajusta el umbral según sea necesario
]

for ans in valid_answers:
    print(f"Answer: {ans.data}, Score: {ans.score}")


Answer: Roman writer, Score: 0.8304970860481262
Answer: a Roman author, Score: 0.7340795993804932
