### Réplica 5: Preparación de documentos con metadatos y embeddings con metadatos

#### Indexación simple de metadatos

In [1]:
from datetime import datetime
from haystack import Document

# Three installation notes, one per Haystack release. Each tuple holds the
# note text, the release version and the release date; version and date are
# stored in the document's `meta` so they travel with the text.
documents = [
    Document(content=note, meta={"version": release, "date": released_on})
    for note, release, released_on in (
        (
            "Use pip to install a basic version of Haystack's latest release: pip install farm-haystack. All the core Haystack components live in the haystack repo. But there's also the haystack-extras repo which contains components that are not as widely used, and you need to install them separately.",
            1.15,
            datetime(2023, 3, 30),
        ),
        (
            "Use pip to install a basic version of Haystack's latest release: pip install farm-haystack[inference]. All the core Haystack components live in the haystack repo. But there's also the haystack-extras repo which contains components that are not as widely used, and you need to install them separately.",
            1.22,
            datetime(2023, 11, 7),
        ),
        (
            "Use pip to install only the Haystack 2.0 code: pip install haystack-ai. The haystack-ai package is built on the main branch which is an unstable beta version, but it's useful if you want to try the new features as soon as they are merged.",
            2.0,
            datetime(2023, 12, 4),
        ),
    )
]

In [2]:
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter

# Target store first, then the (still empty) indexing pipeline.
document_store = InMemoryDocumentStore()
pipeline = Pipeline()

# Two components: an embedder for the documents and a writer bound to the store.
pipeline.add_component(
    "embedder",
    SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
pipeline.add_component(
    "writer",
    DocumentWriter(document_store=document_store),
)

# Route the embedder's `documents` output into the writer's `documents` input.
# This call is the cell's last expression, so the notebook displays its result.
pipeline.connect("embedder.documents", "writer.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f781cb2db10>
🚅 Components
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - embedder.documents -> writer.documents (List[Document])

In [3]:
# Run the indexing pipeline: the documents are handed to the "embedder"
# component, and the run result is displayed as the cell output.
pipeline.run(
    {
        "embedder": {"documents": documents},
    }
)



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'writer': {'documents_written': 3}}

#### Indexación **CON** y **SIN** metadatos

In [7]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy

def create_pipeline(
    document_store,
    metadata_fields_to_embed=None,
    embedding_model: str = "thenlper/gte-large",
    split_by: str = "sentence",
    split_length: int = 2,
) -> Pipeline:
    """Build an indexing pipeline: cleaner -> splitter -> embedder -> writer.

    The defaults reproduce the original hard-coded configuration, so existing
    calls such as ``create_pipeline(document_store=store)`` behave as before.

    Args:
        document_store: Store that the ``DocumentWriter`` at the end of the
            pipeline writes into.
        metadata_fields_to_embed: Forwarded unchanged to the embedder's
            ``meta_fields_to_embed`` argument (e.g. ``["title"]``). Defaults
            to ``None``.
        embedding_model: Model name given to
            ``SentenceTransformersDocumentEmbedder``.
        split_by: Splitting unit given to ``DocumentSplitter``.
        split_length: Number of ``split_by`` units per chunk given to
            ``DocumentSplitter``.

    Returns:
        The assembled pipeline. It is not run here; callers feed documents to
        the ``"cleaner"`` component.
    """
    indexing = Pipeline()

    # Register the components in processing order.
    indexing.add_component("cleaner", DocumentCleaner())
    indexing.add_component(
        "splitter",
        DocumentSplitter(split_by=split_by, split_length=split_length),
    )
    indexing.add_component(
        "embedder",
        SentenceTransformersDocumentEmbedder(
            model=embedding_model,
            meta_fields_to_embed=metadata_fields_to_embed,
        ),
    )
    # The writer uses the OVERWRITE duplicate policy rather than the default.
    indexing.add_component(
        "writer",
        DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
    )

    # Chain the components linearly.
    indexing.connect("cleaner", "splitter")
    indexing.connect("splitter", "embedder")
    indexing.connect("embedder", "writer")

    return indexing

In [5]:
# Preparación de los documentos
import wikipedia
from haystack import Document

# Wikipedia page titles to download (comma-separated, split into a list).
some_bands = "The Beatles,The Cure".split(",")

# Fetch each page by its exact title (auto-suggestion disabled) and wrap it in
# a Document that keeps the page title and URL as metadata.
raw_docs = [
    Document(
        content=article.content,
        meta={"title": article.title, "url": article.url},
    )
    for article in (
        wikipedia.page(title=band, auto_suggest=False) for band in some_bands
    )
]

# Display the downloaded documents.
raw_docs

[Document(id=3bd51e0bccf9d0b49b0e428b403ff79a6d514cd4e75fcff000170e2bd82811e8, content: 'The Beatles were an English rock band formed in Liverpool in 1960. The core lineup of the band compr...', meta: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles'}),
 Document(id=42887854a14f7f08a00eeafb61dd668efdb4c12a87784066a2ef13a25e751ff9, content: 'The Cure are an English rock band formed in Crawley, West Sussex in 1976 by guitarist, lead vocalist...', meta: {'title': 'The Cure', 'url': 'https://en.wikipedia.org/wiki/The_Cure'})]

In [8]:
# Indexación sin metadatos
from haystack.document_stores.in_memory import InMemoryDocumentStore

# New store configured with "cosine" as its embedding similarity function, and
# an indexing pipeline that embeds no metadata fields.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
indexing_pipeline = create_pipeline(document_store)

# The raw documents enter the pipeline at the "cleaner" component.
indexing_pipeline.run(
    {"cleaner": {"documents": raw_docs}}
)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

{'writer': {'documents_written': 542}}

In [None]:
# Indexing with metadata: same setup as before, but the "title" meta field is
# passed to the pipeline so the embedder receives it as a field to embed.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
indexing_with_metadata_pipeline = create_pipeline(
    document_store,
    metadata_fields_to_embed=["title"],
)

indexing_with_metadata_pipeline.run(
    {"cleaner": {"documents": raw_docs}}
)

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

{'writer': {'documents_written': 542}}

#### Comparación de embeddings

In [25]:
# Indexación sin metadatos
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Fresh store, indexed without embedding any metadata fields.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
indexing_pipeline = create_pipeline(document_store)
indexing_pipeline.run({"cleaner": {"documents": raw_docs}})

# Read back the documents held by the store.
stored_docs = document_store.filter_documents()

# Print the first five stored documents; only the first five embedding values
# are shown to keep the output short.
for stored_doc in stored_docs[:5]:
    print("Content:", stored_doc.content)
    print("Metadata:", stored_doc.meta)
    if stored_doc.embedding is None:
        embedding_preview = "No embedding"
    else:
        embedding_preview = stored_doc.embedding[:5]
    print("Embedding (first 5 values):", embedding_preview)
    print()


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Content: The Beatles were an English rock band formed in Liverpool in 1960. The core lineup of the band comprised John Lennon, Paul McCartney, George Harrison, and Ringo Starr.
Metadata: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407314a2f3a63583b4996b425a76aae4193d6e0a757a3c', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}
Embedding (first 5 values): [-0.012269005179405212, -0.02238268032670021, 0.005331412423402071, -0.03672565519809723, 0.008923281915485859]

Content:  They are widely regarded as the most influential band of all time and were integral to the development of 1960s counterculture and the recognition of popular music as an art form. Rooted in skiffle, beat, and 1950s rock 'n' roll, their sound incorporated elements of classical music and traditional pop in innovative ways.
Metadata: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407314a

In [26]:
# Indexing with metadata: fresh store, with the "title" meta field passed on
# to the embedder.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
indexing_with_metadata_pipeline = create_pipeline(
    document_store,
    metadata_fields_to_embed=["title"],
)
indexing_with_metadata_pipeline.run({"cleaner": {"documents": raw_docs}})

# Read back the stored documents to get at their embeddings.
stored_docs = document_store.filter_documents()

# Print the first five stored documents; only the first five embedding values
# are shown to keep the output short.
for stored_doc in stored_docs[:5]:
    print("Content:", stored_doc.content)
    print("Metadata:", stored_doc.meta)
    if stored_doc.embedding is None:
        embedding_preview = "No embedding"
    else:
        embedding_preview = stored_doc.embedding[:5]
    print("Embedding (first 5 values):", embedding_preview)
    print()


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Content: The Beatles were an English rock band formed in Liverpool in 1960. The core lineup of the band comprised John Lennon, Paul McCartney, George Harrison, and Ringo Starr.
Metadata: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407314a2f3a63583b4996b425a76aae4193d6e0a757a3c', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}
Embedding (first 5 values): [-0.014133794233202934, -0.021062562242150307, 0.0007249260088428855, -0.028358029201626778, 0.0006535885040648282]

Content:  They are widely regarded as the most influential band of all time and were integral to the development of 1960s counterculture and the recognition of popular music as an art form. Rooted in skiffle, beat, and 1950s rock 'n' roll, their sound incorporated elements of classical music and traditional pop in innovative ways.
Metadata: {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407

In [22]:
# One store and pipeline per variant: without metadata, and with the "title"
# meta field passed to the embedder.
document_store_no_meta = InMemoryDocumentStore(embedding_similarity_function="cosine")
document_store_with_meta = InMemoryDocumentStore(embedding_similarity_function="cosine")
pipeline_no_meta = create_pipeline(document_store_no_meta)
pipeline_with_meta = create_pipeline(
    document_store_with_meta,
    metadata_fields_to_embed=["title"],
)

# Index the same raw documents with both pipelines (no-metadata run first).
pipeline_no_meta.run({"cleaner": {"documents": raw_docs}})
pipeline_with_meta.run({"cleaner": {"documents": raw_docs}})

# Read back the documents of each store.
docs_no_meta = document_store_no_meta.filter_documents()
docs_with_meta = document_store_with_meta.filter_documents()

# Compare the first five documents of each store, paired by position.
for plain_doc, enriched_doc in list(zip(docs_no_meta, docs_with_meta))[:5]:
    print("Content:", plain_doc.content)
    print("Metadata (no meta):", plain_doc.meta)
    print("Metadata (with meta):", enriched_doc.meta)
    same_embedding = plain_doc.embedding == enriched_doc.embedding
    print("Embedding equal?:", same_embedding)
    print()


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Batches:   0%|          | 0/17 [00:00<?, ?it/s]

Content: The Beatles were an English rock band formed in Liverpool in 1960. The core lineup of the band comprised John Lennon, Paul McCartney, George Harrison, and Ringo Starr.
Metadata (no meta): {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407314a2f3a63583b4996b425a76aae4193d6e0a757a3c', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}
Metadata (with meta): {'title': 'The Beatles', 'url': 'https://en.wikipedia.org/wiki/The_Beatles', 'source_id': 'd587551b97b66e9d0b407314a2f3a63583b4996b425a76aae4193d6e0a757a3c', 'page_number': 1, 'split_id': 0, 'split_idx_start': 0}
Embedding equal?: False

Content:  They are widely regarded as the most influential band of all time and were integral to the development of 1960s counterculture and the recognition of popular music as an art form. Rooted in skiffle, beat, and 1950s rock 'n' roll, their sound incorporated elements of classical music and traditional pop in innovative way