In [1]:
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.builders import PromptBuilder
from haystack import Pipeline
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.builders.answer_builder import AnswerBuilder

# --- Retrieval -------------------------------------------------------------
# Chroma collection "eidc_datasets", persisted on disk under ./chroma-data.
document_store = ChromaDocumentStore(
    persist_path="chroma-data",
    collection_name="eidc_datasets",
)
# Text-query retriever over that store, configured with top_k=3.
retriever = ChromaQueryTextRetriever(document_store, top_k=3)

# --- Prompt ----------------------------------------------------------------
print("Creating prompt template...")

# Jinja template: the question first, then the content of every document
# passed in as `documents`, then a trailing "Answer:" cue for the model.
template = """
Given the following information, answer the question.

Question: {{query}}

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Answer:
"""

prompt_builder = PromptBuilder(template=template)

# Candidate models, each paired with the transformers pipeline task it
# supports. GPT-2 is a decoder-only causal LM and needs "text-generation";
# the T5-family checkpoints are encoder-decoder and need
# "text2text-generation". Previously the task was hard-coded to
# "text2text-generation", so selecting GPT-2 from the list could not work.
model_tasks = {
    "openai-community/gpt2": "text-generation",
    "google/flan-t5-large": "text2text-generation",
    "MBZUAI/LaMini-Flan-T5-783M": "text2text-generation",
    "google/long-t5-tglobal-base": "text2text-generation",
}
# Kept as a list (same order as before) so index-based selection still works.
models = list(model_tasks)
model_name = models[1]
print(f"Setting up model ({model_name})...")
# NOTE(review): `temperature` normally only affects sampling-based decoding;
# without "do_sample": True it is presumably ignored and decoding is greedy.
# Confirm the intent before adding do_sample (it makes outputs stochastic).
# NOTE(review): the recorded run of this notebook warned that the prompt was
# 1482 tokens against a 512-token maximum for this model; consider lowering
# the retriever's top_k or shortening the document text in the prompt.
llm = HuggingFaceLocalGenerator(
    model=model_name,
    task=model_tasks[model_name],
    generation_kwargs={"max_new_tokens": 100, "temperature": 0.9},
)
print("Warming up model...")
llm.warm_up()

# --- Pipeline assembly -----------------------------------------------------
answer_builder = AnswerBuilder()
rag_pipe = Pipeline()

# Register every component under the name used in the connections below.
for component_name, component in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", llm),
    ("answer_builder", answer_builder),
):
    rag_pipe.add_component(component_name, component)

# Wiring: retrieved documents feed both the prompt and the answer builder;
# the rendered prompt goes to the LLM; the LLM replies go to the answer builder.
for sender, receiver in (
    ("retriever.documents", "prompt_builder.documents"),
    ("retriever.documents", "answer_builder.documents"),
    ("prompt_builder", "llm"),
    ("llm.replies", "answer_builder.replies"),
):
    rag_pipe.connect(sender, receiver)

# Leave the pipeline as the cell's final expression so its summary is displayed.
rag_pipe


  from .autonotebook import tqdm as notebook_tqdm


Creating prompt template...
Setting up model (google/flan-t5-large)...
Warming up model...


<haystack.core.pipeline.pipeline.Pipeline object at 0x795443edf3b0>
🚅 Components
  - retriever: ChromaQueryTextRetriever
  - prompt_builder: PromptBuilder
  - llm: HuggingFaceLocalGenerator
  - answer_builder: AnswerBuilder
🛤️ Connections
  - retriever.documents -> prompt_builder.documents (List[Document])
  - retriever.documents -> answer_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)
  - llm.replies -> answer_builder.replies (List[str])

In [4]:
# Write the assembled pipeline definition to test.yml.
with open("test.yml", "w") as yaml_file:
    rag_pipe.dump(yaml_file)

In [3]:
query = "Who collected the land cover map data?"

# The same question is passed to each of the three components that take a
# `query` input: the retriever, the prompt builder and the answer builder.
run_inputs = {
    component_name: {"query": query}
    for component_name in ("retriever", "prompt_builder", "answer_builder")
}

# Final expression of the cell, so the run result is displayed.
rag_pipe.run(run_inputs)

Token indices sequence length is longer than the specified maximum sequence length for this model (1482 > 512). Running this sequence through the model will result in indexing errors


{'answer_builder': {'answers': [GeneratedAnswer(data='the Institute of Terrestrial Ecology', query='Who collected the land cover map data?', documents=[Document(id=doc_1833, content: 'Land Cover Map 1990 (1km percentage target class, GB)
   The Land Cover Map of Great Britain 1990 (1km ...', meta: {'description': "The Land Cover Map of Great Britain 1990 (1km percentage target class, GB), is a raster digital dataset, providing a classification of land cover types into 25 classes, at a 1km resolution. The dataset consists of a set of 1km bands, each containing one of 25 target classes (or 'sub' classes). Each band of the dataset contains the percentage of the specified habitat class per 1km, derived from a higher resolution (25m) dataset. The map was produced using supervised maximum likelihood classifications of Landsat 5 Thematic Mapper satellite data. The 25 mapped classes include sea and inland waters, bare, suburban and urban areas, arable farmland, pastures and meadows, rough gras