### [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline)

In [13]:
# Telemetry only: lets Haystack know which tutorial is being executed
# (30 matches the tutorial number in the link in the title above).
from haystack.telemetry import tutorial_running
tutorial_running(30)

### 1. Descargar los archivos

In [14]:
# The gdown import and the download call below are left commented out,
# so this cell only defines the two constants.
# import gdown

# Google Drive folder URL; only referenced by the commented-out download call.
url = "https://drive.google.com/drive/folders/1n9yqq5Gl_HWfND5bTlrCwAOycMDt5EMj"
# Local directory that the indexing pipeline later globs for input files.
output_dir = "recipe_files"

# gdown.download_folder(url, quiet=True, output=output_dir)

### 2. Crear un pipeline para indexar documentos

In [15]:
from haystack.components.writers import DocumentWriter
from haystack.components.converters import MarkdownToDocument, PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Store shared by the indexing pipeline (writer) and the query pipeline (retriever).
document_store = InMemoryDocumentStore()

# The router exposes one output per MIME type listed here; each output is
# connected to the matching converter further down.
file_type_router = FileTypeRouter(
    mime_types=[
        "text/plain",
        "application/pdf",
        "text/markdown",
    ]
)

# One converter per routed MIME type.
text_file_converter = TextFileToDocument()
markdown_converter = MarkdownToDocument()
pdf_converter = PyPDFToDocument()

# Receives the documents of all three converters.
document_joiner = DocumentJoiner()

In [16]:
"""
A partir de ahí, los pasos de este proceso de indexación son un poco más estándar. DocumentCleaner elimina los espacios en blanco y luego DocumentSplitter divide los documentos en fragmentos de 150 palabras, con un poco de superposición para evitar perder contexto.
"""

# Cleaner with its default settings, followed by a word-based splitter:
# 150-word chunks where consecutive chunks share 50 words of overlap.
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter(
    split_by="word",
    split_length=150,
    split_overlap=50,
)

In [17]:
# Next, a SentenceTransformersDocumentEmbedder creates embeddings from the
# documents, and a DocumentWriter targets the shared document store.
document_embedder = SentenceTransformersDocumentEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
document_writer = DocumentWriter(
    document_store=document_store,
)

In [18]:
# Create the indexing pipeline and register every component under the name
# that the connection step refers to. Registration order is kept as-is.
preprocessing_pipeline = Pipeline()

for component_name, component in (
    ("file_type_router", file_type_router),
    ("text_file_converter", text_file_converter),
    ("markdown_converter", markdown_converter),
    ("pypdf_converter", pdf_converter),
    ("document_joiner", document_joiner),
    ("document_cleaner", document_cleaner),
    ("document_splitter", document_splitter),
    ("document_embedder", document_embedder),
    ("document_writer", document_writer),
):
    preprocessing_pipeline.add_component(instance=component, name=component_name)

In [19]:
# Wire the components together: router -> converters -> joiner -> cleaner
# -> splitter -> embedder -> writer. Edges are connected in the listed order.
for sender, receiver in (
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pypdf_converter.sources"),
    ("file_type_router.text/markdown", "markdown_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pypdf_converter", "document_joiner"),
    ("markdown_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
):
    preprocessing_pipeline.connect(sender, receiver)

# Bare last expression so the cell still displays the pipeline summary.
preprocessing_pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f4aaa8fe090>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - markdown_converter: MarkdownToDocument
  - pypdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.application/pdf -> pypdf_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.text/markdown -> markdown_converter.sources (List[Union[str, Path, ByteStream]])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - markdown_converter.documents -> document_joiner.documents (List[Document])
  - pypdf_converter.documents -> document_joiner.documents (List[Docume

In [20]:
# preprocessing_pipeline.show()

In [21]:
from pathlib import Path

# Path.glob("**/*") yields directories as well as files, so keep only regular
# files; sorting makes the input order independent of the filesystem.
source_files = sorted(path for path in Path(output_dir).glob("**/*") if path.is_file())

# The download cell is commented out, so the directory may be missing or empty.
# Fail with an explicit message instead of running the pipeline with no input.
if not source_files:
    raise FileNotFoundError(
        f"No input files found under {output_dir!r}; download the recipe files first."
    )

# Index the files; the cell result reports how many documents were written.
preprocessing_pipeline.run({"file_type_router": {"sources": source_files}})

Converting markdown files to Documents: 100%|██████████| 1/1 [00:00<00:00, 876.19it/s]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 14}}

### 3. **Opcional** - Crear una canalización para consultar documentos

In [22]:
# Build a RAG pipeline to answer questions based on the indexed documents.
# The generator configured for that pipeline (OpenAIGenerator) reads its key
# from the OPENAI_API_KEY environment variable, so that is the credential to
# ask for when it is not already set.

import os
from getpass import getpass

if "OPENAI_API_KEY" not in os.environ:
    # getpass keeps the key out of the notebook source and its saved output.
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

In [23]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.utils import Secret
from haystack.components.generators import OpenAIGenerator

# Prompt template handed to PromptBuilder below. It loops over `documents`,
# emitting each document's content as context, then appends `question`.
template = """
Answer the questions based on the given context.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{ question }}
Answer:
"""
# Query pipeline: embed the question, retrieve documents from the shared
# store, render the prompt, and send it to the generator.
pipe = Pipeline()

pipe.add_component(
    name="embedder",
    instance=SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"),
)
pipe.add_component(
    name="retriever",
    instance=InMemoryEmbeddingRetriever(document_store=document_store),
)
pipe.add_component(
    name="prompt_builder",
    instance=PromptBuilder(template=template),
)
pipe.add_component(
    name="llm",
    instance=OpenAIGenerator(
        model="gpt-4o-mini-2024-07-18",
        # The key is read from the environment, never hardcoded.
        api_key=Secret.from_env_var("OPENAI_API_KEY"),
    ),
)

for sender, receiver in (
    ("embedder.embedding", "retriever.query_embedding"),
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    pipe.connect(sender, receiver)

# Bare last expression so the cell still displays the pipeline summary.
pipe


<haystack.core.pipeline.pipeline.Pipeline object at 0x7f4b0aaab550>
🚅 Components
  - embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [24]:
# The same question text feeds both the embedder (for retrieval) and the
# prompt builder (for the rendered prompt).
question = "What ingredients would I need to make vegan keto eggplant lasagna, vegan persimmon flan, and vegan hemp cheese?"

query_inputs = {
    "embedder": {"text": question},
    "prompt_builder": {"question": question},
}
pipe.run(query_inputs)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'llm': {'replies': ['To make vegan keto eggplant lasagna, vegan persimmon flan, and vegan hemp cheese, you would need the following ingredients:\n\n**Vegan Keto Eggplant Lasagna:**\n- 2 large eggplants\n- Salt (a lot)\n- 1/2 cup store-bought vegan mozzarella (optional)\n- **For the Pesto:**\n  - 4 oz basil\n  - 1/4 cup almonds\n  - 1/4 cup nutritional yeast\n  - 1/4 cup olive oil\n- **For the Spinach Tofu Ricotta:**\n  - 14 oz firm or extra firm tofu\n  - 10 oz spinach\n  - 1 tsp garlic powder\n  - Juice of half a lemon\n  - Salt to taste\n- **For the Macadamia Nut Cheese:**\n  - 1 cup macadamia nuts (unsalted, unroasted)\n  - Juice of 1 lemon\n  - Garlic powder to taste\n  - Salt to taste\n\n**Vegan Persimmon Flan:**\n- ½ cup persimmon pulp (strained from 2 average-sized fuyu persimmons)\n- 1 tbsp cornstarch\n- ½ tsp agar agar\n- 1 tbsp agave nectar (or to taste)\n- 2 tbsp granulated sugar\n- ¼ cup coconut cream\n- ½ cup almond milk\n- ½ tsp vanilla\n\n**Vegan Hemp Cheese:**\n- ½ cup