### Réplica 4: Limpieza de múltiples fuentes de datos, joiner, cleaner y, al final, escritura en memoria

In [1]:
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# Store that the writer at the end of the pipeline fills with embedded chunks.
document_store = InMemoryDocumentStore()

# Routing stage: the router exposes one output per MIME type listed here, and
# each output is later connected to the converter for that file type.
file_type_router = FileTypeRouter(mime_types=["text/markdown", "application/pdf", "text/plain"])
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()

# Pre-processing stage: merge the converter outputs into one list, clean the
# text, then split it into 150-word chunks that overlap by 50 words.
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(split_by="word", split_length=150, split_overlap=50)

# Indexing stage: embed each chunk and persist it in `document_store`.
document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
# OVERWRITE keeps the notebook re-runnable: with the writer's default policy the
# in-memory store rejects documents whose id is already present, so running the
# pipeline a second time in the same kernel fails with a duplicate-document error.
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

In [2]:
from haystack import Pipeline

# Build the indexing pipeline. Every component is registered under the name
# that the `pipeline.connect(...)` calls use to refer to it; the mapping below
# preserves insertion order, so components are added in the listed order.
pipeline = Pipeline()

components_by_name = {
    "file_type_router": file_type_router,
    "text_file_converter": text_file_converter,
    "markdown_converter": markdown_converter,
    "pypdf_converter": pdf_converter,
    "document_joiner": document_joiner,
    "document_cleaner": document_cleaner,
    "document_splitter": document_splitter,
    "document_embedder": document_embedder,
    "document_writer": document_writer,
}

for component_name, component in components_by_name.items():
    pipeline.add_component(name=component_name, instance=component)

In [3]:
# Wire the graph as (sender, receiver) pairs, connected in the listed order:
# router -> per-type converter -> joiner -> cleaner -> splitter -> embedder -> writer.
connections = [
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
]

for sender, receiver in connections:
    pipeline.connect(sender, receiver)

# Bare expression so the cell still displays the pipeline summary as its output.
pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f2fde713150>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Docume

In [None]:
from pathlib import Path

# Directory that holds the raw files to index (it is only read from here).
output_dir = "../tutorials/recipe_files"

# `glob("**/*")` also yields sub-directories, which are not ingestible sources:
# keep regular files only, and sort them so the ingestion order is the same on
# every run instead of depending on filesystem traversal order.
source_files = sorted(path for path in Path(output_dir).glob("**/*") if path.is_file())

pipeline.run({"file_type_router": {"sources": source_files}})

In [None]:
# Optional debugging aid: uncomment the lines below to enable DEBUG-level logging.
# import logging

# logging.basicConfig(level=logging.DEBUG)  # Set the log level

# # With logging configured, running the pipeline prints detailed logs
# pipeline.run({"file_type_router": {"sources": list(Path(output_dir).glob("**/*"))}})


INFO:haystack.core.pipeline.base:Warming up component document_embedder...
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/11" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): eu.i.posthog.com:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/11" 200 0
DEBUG:urllib3.connectionpool:https://hugging

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:haystack.core.pipeline.pipeline:Running component document_writer


{'document_writer': {'documents_written': 14}}