### Solución **Haystack**

In [1]:
import getpass
import os

# Ask for the LangChain API key only when it is missing or empty (same check as
# the OpenAI key cell), and label the prompt so it is clear which secret is
# being requested.
if not os.environ.get("LANGCHAIN_API_KEY"):
    # Tracing is switched on together with a freshly entered key.
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LANGCHAIN_API_KEY: ")

In [2]:
# Ask for the OpenAI key only when it is missing or empty; the labelled prompt
# tells the user which secret is expected instead of a generic "Password:".
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY: ")

#### A. Indexar

In [3]:
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.joiners import DocumentJoiner
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy

# In-memory store that the indexing pipeline writes into and that the
# retrievers defined later in the notebook read from.
document_store = InMemoryDocumentStore()

# Route incoming files by MIME type to the matching converter.
file_type_router = FileTypeRouter(mime_types=["application/pdf", "text/plain"])
text_file_converter = TextFileToDocument()
pdf_converter = PyPDFToDocument()

# Merge the converter outputs, then clean and split the documents.
document_joiner = DocumentJoiner()
document_cleaner = DocumentCleaner()
document_splitter = DocumentSplitter()

document_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
# OVERWRITE keeps the indexing run repeatable: re-running it against the same
# store replaces documents with identical ids instead of failing on duplicates.
document_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)

In [4]:
from haystack import Pipeline

# Assemble the indexing graph: register every component under its name, then
# wire outputs to inputs in processing order.
index_pipeline = Pipeline()

indexing_components = {
    "file_type_router": file_type_router,
    "text_file_converter": text_file_converter,
    "pdf_converter": pdf_converter,
    "document_joiner": document_joiner,
    "document_cleaner": document_cleaner,
    "document_splitter": document_splitter,
    "document_embedder": document_embedder,
    "document_writer": document_writer,
}
for component_name, component in indexing_components.items():
    index_pipeline.add_component(name=component_name, instance=component)

# (sender, receiver) pairs; both converters feed the same joiner.
indexing_connections = [
    ("file_type_router.text/plain", "text_file_converter.sources"),
    ("file_type_router.application/pdf", "pdf_converter.sources"),
    ("text_file_converter", "document_joiner"),
    ("pdf_converter", "document_joiner"),
    ("document_joiner", "document_cleaner"),
    ("document_cleaner", "document_splitter"),
    ("document_splitter", "document_embedder"),
    ("document_embedder", "document_writer"),
]
for sender, receiver in indexing_connections:
    index_pipeline.connect(sender, receiver)

# Bare expression so the cell still renders the pipeline summary.
index_pipeline

<haystack.core.pipeline.pipeline.Pipeline object at 0x7ff7fbfc00d0>
🚅 Components
  - file_type_router: FileTypeRouter
  - text_file_converter: TextFileToDocument
  - pdf_converter: PyPDFToDocument
  - document_joiner: DocumentJoiner
  - document_cleaner: DocumentCleaner
  - document_splitter: DocumentSplitter
  - document_embedder: SentenceTransformersDocumentEmbedder
  - document_writer: DocumentWriter
🛤️ Connections
  - file_type_router.text/plain -> text_file_converter.sources (List[Union[str, Path, ByteStream]])
  - file_type_router.application/pdf -> pdf_converter.sources (List[Union[str, Path, ByteStream]])
  - text_file_converter.documents -> document_joiner.documents (List[Document])
  - pdf_converter.documents -> document_joiner.documents (List[Document])
  - document_joiner.documents -> document_cleaner.documents (List[Document])
  - document_cleaner.documents -> document_splitter.documents (List[Document])
  - document_splitter.documents -> document_embedder.documents (List[

In [5]:
from pathlib import Path

output_dir = "./data"
# "**/*" also yields the sub-directories themselves; keep regular files only so
# the router receives actual documents. Sorting makes the order reproducible.
list_docs = sorted(path for path in Path(output_dir).glob("**/*") if path.is_file())

# Last expression of the cell: displays the pipeline's run result.
index_pipeline.run({"file_type_router": {"sources": list_docs}})



Batches:   0%|          | 0/13 [00:00<?, ?it/s]

{'document_writer': {'documents_written': 411}}

#### B. Retriever pipeline

In [6]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder


# Prompt skeleton: the retrieved documents are rendered as context, followed by
# the user's question.
ret_template = """
Dada la siguiente información, responde a la pregunta

Contexto:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Pregunta:{{ question }}
Respuesta:
"""

# Query-side embedder; uses the same model name as the document embedder.
ret_text_embedder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
ret_prompt_builder = PromptBuilder(template=ret_template)

In [7]:
from haystack import Pipeline

# Retrieval-only pipeline: embed the query, fetch matching documents, and
# render them into the prompt (no generator attached).
ret_pipeline = Pipeline()
for ret_name, ret_component in (
    ("text_embedder", ret_text_embedder),
    ("retriever", retriever),
    ("prompt_builder", ret_prompt_builder),
):
    ret_pipeline.add_component(ret_name, ret_component)

ret_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
# Kept as the final expression so the cell displays the pipeline summary.
ret_pipeline.connect("retriever", "prompt_builder.documents")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7ff74ea9a310>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])

In [8]:
from pprint import pprint

question = "Tienes el articulo Acceso de los grupos de interés a la arena gubernamental"

# The same question feeds both the embedder (for retrieval) and the prompt
# template (for rendering).
ret_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
}
response = ret_pipeline.run(ret_inputs)
pprint(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'prompt_builder': {'prompt': '\n'
                              'Dada la siguiente información, responde a la '
                              'pregunta\n'
                              '\n'
                              'Contexto:\n'
                              '\n'
                              '    del Gobierno 35 1,5\n'
                              'Vicepresidenta del Gobierno 6 0,3\n'
                              'Interior Interior 108 4,5\n'
                              'Justicia Justicia 230 9,5\n'
                              'Salud\n'
                              'Sanidad 28 1,2\n'
                              'Sanidad, Consumo y Bienestar Social 75 3,1\n'
                              'Sanidad, Servicios Sociales e Igualdad 89 3,7\n'
                              'Trabajo\n'
                              'Empleo y Seguridad Social 16 0,7\n'
                              'Empleo y Seguridad Social 33 1,4\n'
                              'Inclusión, Seguridad Social y M

#### C. RAG pipeline

In [9]:
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

# Prompt skeleton: retrieved documents as context, then the user's question.
rag_template = """
Dada la siguiente información, responde a la pregunta

Contexto:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Pregunta:{{ question }}
Respuesta:
"""

# Query-side embedder and retriever over the shared document store.
rag_text_embeder = SentenceTransformersTextEmbedder(
    model="sentence-transformers/all-MiniLM-L6-v2",
)
rag_retriever = InMemoryEmbeddingRetriever(document_store=document_store)

# Prompt rendering followed by the OpenAI generator that answers it.
rag_prompt_builder = PromptBuilder(template=rag_template)
rag_generator = OpenAIGenerator(model="gpt-4o-mini-2024-07-18")

In [10]:
from haystack import Pipeline

# Full RAG pipeline: embed the query, retrieve documents, build the prompt and
# send it to the generator.
rag_pipeline = Pipeline()
for rag_name, rag_component in (
    ("text_embedder", rag_text_embeder),
    ("retriever", rag_retriever),
    ("prompt_builder", rag_prompt_builder),
    ("generator", rag_generator),
):
    rag_pipeline.add_component(rag_name, rag_component)

rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever", "prompt_builder.documents")
# Kept as the final expression so the cell displays the pipeline summary.
rag_pipeline.connect("prompt_builder", "generator")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7ff74e961ad0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - generator: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> generator.prompt (str)

In [16]:
from pprint import pprint

question = "dame mas información del articulo Acceso de los grupos de interés a la arena gubernamental"

# The same question feeds both the embedder (for retrieval) and the prompt
# template (for rendering).
rag_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
}
response = rag_pipeline.run(rag_inputs)
pprint(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'generator': {'meta': [{'finish_reason': 'stop',
                         'index': 0,
                         'model': 'gpt-4o-mini-2024-07-18',
                         'usage': {'completion_tokens': 613,
                                   'completion_tokens_details': CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0),
                                   'prompt_tokens': 3172,
                                   'prompt_tokens_details': PromptTokensDetails(audio_tokens=0, cached_tokens=0),
                                   'total_tokens': 3785}}],
               'replies': ['El artículo titulado "Acceso de los grupos de '
                           'interés a la arena gubernamental" de Iván Medina y '
                           'Laura Chaqués-Bonafont, publicado en la revista '
                           '*Revista Española de Investigaciones '
                           'Sociológicas*, se centra en el análisis del a