In [1]:
from util import load_eidc_data

In [2]:
texts = load_eidc_data.load_title_description_lineage('data/catalogue_metadata.json')

In [3]:
from haystack.document_stores import InMemoryDocumentStore

doc_store = InMemoryDocumentStore(use_gpu=False, use_bm25=True)

In [4]:
from haystack.schema import Document

docs = [Document(content=d, id=i) for i, d in enumerate(texts)]
doc_store.write_documents(docs)

Updating BM25 representation...: 100%|██████████| 1867/1867 [00:00<00:00, 3876.07 docs/s]


In [5]:
import torch
from haystack.nodes import PromptNode, PromptTemplate, BM25Retriever, SentenceTransformersRanker

retriever = BM25Retriever(document_store=doc_store, top_k=5)
reranker = SentenceTransformersRanker(model_name_or_path='cross-encoder/ms-marco-MiniLM-L-12-v2', top_k=1)

lfqa_prompt = PromptTemplate(prompt='Answer the question using the provided context. If you are not sure about the answer, answer with "I do not know". Your answer should be in your own words and be no longer than 100 words. \\n\\n Context: {join(documents)} \\n\\n Question: {query} \\n\\n Answer:\",\n', output_parser={'type': 'AnswerParser'})
prompt = PromptNode(model_name_or_path='MBZUAI/LaMini-Flan-T5-783M', default_prompt_template=lfqa_prompt,
                    model_kwargs={'model_max_length': 2048, 'torch_dtype': torch.bfloat16})




In [6]:
from haystack import Pipeline
p = Pipeline()
p.add_node(component=retriever, name="Retriever", inputs=["Query"])
p.add_node(component=reranker, name="Reranker", inputs=["Retriever"])
p.add_node(component=prompt, name="prompt_node", inputs=["Reranker"])

In [7]:
a = p.run('Who collected the land cover map data?')
a['answers'][0].answer

'The data on surface height and reflectance were collected by the NERC Airborne Research and Survey Facility (ARSF).'

In [8]:
a = p.run('What can affect butterfly populations?')
a['answers'][0].answer

'Climate change and habitat destruction can affect butterfly populations.'

In [9]:
a = p.run('How old is the oldest man in the world?')
a['answers'][0].answer

'The provided context does not provide information about the age of the oldest man in the world.'

In [10]:
a = p.run('Where is the wettest soil in the UK?')
a['answers'][0].answer

'The provided context does not mention which site has the wettest soil in the UK.'