### Réplica 2: Mejorando la indexación de datos
* Probando nuevos modelos para el Embedder

In [6]:
# Define a single test document to drive a simple RAG example

from haystack.dataclasses import Document

documents = [
    Document(
        # Adjacent string literals are joined by Python into one single-line text,
        # one literal per sentence to keep the source readable.
        content=(
            "Munich, the vibrant capital of Bavaria in southern Germany, exudes a perfect blend of rich cultural heritage and modern urban sophistication. "
            "Nestled along the banks of the Isar River, Munich is renowned for its splendid architecture, including the iconic Neues Rathaus (New Town Hall) at Marienplatz and the grandeur of Nymphenburg Palace. "
            "The city is a haven for art enthusiasts, with world-class museums like the Alte Pinakothek housing masterpieces by renowned artists. "
            "Munich is also famous for its lively beer gardens, where locals and tourists gather to enjoy the city's famed beers and traditional Bavarian cuisine. "
            "The city's annual Oktoberfest celebration, the world's largest beer festival, attracts millions of visitors from around the globe. "
            "Beyond its cultural and culinary delights, Munich offers picturesque parks like the English Garden, providing a serene escape within the heart of the bustling metropolis. "
            "Visitors are charmed by Munich's warm hospitality, making it a must-visit destination for travelers seeking a taste of both old-world charm and contemporary allure."
        )
    )
]

In [7]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

from haystack.document_stores.types import DuplicatePolicy

# Indexing pipeline: compute an embedding for each document, then persist the
# embedded documents in an in-memory store.
document_store = InMemoryDocumentStore()
embedder_pipeline = Pipeline()

embedder_pipeline.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
# An explicit OVERWRITE policy keeps the run cell re-runnable: with the writer's
# default policy the in-memory store rejects documents whose id is already
# stored, so running the pipeline twice on the same documents would raise.
# NOTE: the component name "writter" (sic) is kept unchanged because it is also
# the key of the dict returned by run().
embedder_pipeline.add_component(
    "writter",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)

# Last expression of the cell: connect() hands back the pipeline, so the
# notebook displays its component/connection summary.
embedder_pipeline.connect("embedder", "writter")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f9d591f8e50>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writter: DocumentWriter
🛤️ Connections
  - embedder.documents -> writter.documents (List[Document])

In [8]:
# Feed the documents to the "embedder" component; the returned dict reports
# how many documents the writer stored.
embedder_pipeline.run(
    data={
        "embedder": {"documents": documents},
    }
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'writter': {'documents_written': 1}}

### Prueba con más datos

In [11]:
from datasets import load_dataset
from haystack import Document

# Load the "train" split of the bilgeyucel/seven-wonders dataset.
dataset = load_dataset(path="bilgeyucel/seven-wonders", split="train")

# Bare expression so the notebook displays the dataset summary (features, row count).
dataset

Dataset({
    features: ['id', 'content', 'content_type', 'meta', 'id_hash_keys', 'score', 'embedding'],
    num_rows: 151
})

In [21]:
# Turn every dataset row into a Haystack Document, keeping its text and metadata.
documents = [
    Document(content=row["content"], meta=row["meta"])
    for row in dataset
]

# Preview the first ten converted documents.
documents[:10]

[Document(id=75fd8474f2c88337f7e0dad69eba0f24ba293cb06693fb746ec403df01a1c0c5, content: 'The Colossus of Rhodes (Ancient Greek: ὁ Κολοσσὸς Ῥόδιος, romanized: ho Kolossòs Rhódios Greek: Κολο...', meta: {'url': 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', '_split_id': 0}),
 Document(id=5e4115a663f0afb5f51c3aba9d04daf6f4fae39031cc55e553e31d7be7f1d734, content: '[6]
 In 653, an Arab force under Muslim general Muawiyah I conquered Rhodes, and according to the Chr...', meta: {'url': 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', '_split_id': 1}),
 Document(id=c674d039894fc1fdcfef9d3801ae976919c7fbe6b81a189a8ca630cd4e1d7961, content: 'Construction[edit]
 Timeline and map of the Seven Wonders of the Ancient World, including the Colossu...', meta: {'url': 'https://en.wikipedia.org/wiki/Colossus_of_Rhodes', '_split_id': 2}),
 Document(id=6554927b94b6bdbb39d6276775c18233900dbdef5e47b4222130c7b861be8fba, content: 'Philo of Byzantium wrote in De septem mundi miraculis that Chares create

In [22]:
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack import Pipeline

from haystack.document_stores.types import DuplicatePolicy

# Indexing pipeline for the larger dataset: compute an embedding for each
# document, then persist the embedded documents in a fresh in-memory store.
document_store = InMemoryDocumentStore()
pipeline = Pipeline()

pipeline.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
# An explicit OVERWRITE policy keeps the run cell re-runnable: with the writer's
# default policy the in-memory store rejects documents whose id is already
# stored, so running the pipeline twice on the same documents would raise.
pipeline.add_component(
    "writer",
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
)

# Last expression of the cell: connect() hands back the pipeline, so the
# notebook displays its component/connection summary.
pipeline.connect("embedder", "writer")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f9d587b2f90>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

In [23]:
# Embed and store every document; the returned dict reports how many
# documents the writer stored.
pipeline.run(
    data={
        "embedder": {"documents": documents},
    }
)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

{'writer': {'documents_written': 151}}