In [None]:
!pip install haystack-ai
!pip install sentence-transformers

In [1]:
import os
from getpass import getpass

if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

Enter OpenAI API key:··········


In [40]:
import json
from typing import Dict, List

from haystack import Pipeline, component
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

@component()
class LLMMetadataQueryExtractor:

    def __init__(self):
        prompt = """
        You are part of an information system that processes users queries.
        Given a user query you extract information from it that matches a given list of metadata fields.
        The information to be extracted from the query must match the semantics associated with the given metadata fields.
        The information that you extracted from the query will then be used as filters to narrow down the search space
        when querying an index.
        Don't include the name of metadata field in the extracted metadata, just the value.
        The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
        ###
        Example 1:
        Query: "What was the revenue of Nvidia in 2022?"
        Metadata fields: {"company", "year"}
        Extracted metadata fields: {"company": "nvidia", "year": 2022}
        ###
        Example 2:
        Query: "What were the most influential publications in 2023 regarding Alzheimers disease?"
        Metadata fields: {"disease", "year"}
        Extracted metadata fields: {"disease": "Alzheimers", "year": 2023}
        ###
        Example 3:
        Query: "{{query}}"
        Metadata fields: "{{metadata_fields}}"
        Extracted metadata fields:
        """
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=OpenAIGenerator(model="gpt-3.5-turbo"))
        self.pipeline.connect("builder", "llm")

    @component.output_types(query=str, filters=Dict[str, str])
    def run(self, query: str, metadata_fields: List[str]):
        result = self.pipeline.run({'builder': {'query': query, 'metadata_fields': metadata_fields}})
        metadata = json.loads(result['llm']['replies'][0])

        # this will probably need to be done with specific data structures and in a more sophisticated way
        filters = []
        for key, value in metadata.items():
            field = f"meta.{key}"
            filters.append({f"field": field, "operator": "==", "value": value})

        return {"query": query, "filters": {"operator": "AND", "conditions": filters}}

In [41]:
extractor = LLMMetadataQueryExtractor()
query = "What were the most influential publications in 2022 regarding Parkinsons disease?"
metadata_fields = ["disease", "year"]
result = extractor.run(query, metadata_fields)
print(result)

{'query': 'What were the most influential publications in 2022 regarding Parkinsons disease?', 'filters': {'operator': 'AND', 'conditions': [{'field': 'meta.disease', 'operator': '==', 'value': 'Parkinsons'}, {'field': 'meta.year', 'operator': '==', 'value': 2022}]}}


In [42]:
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy


documents = [
    Document(
        content="some publication about Alzheimer prevention research done over 2023 patients study",
        meta={"year": 2022, "topics": "Alzheimer", "author": "Michael Butter"}),
    Document(
        content="some text about investigation and treatment of Alzheimer disease",
        meta={"year": 2023, "topics": "Alzheimer", "author": "John Bread"}),
]
document_store = InMemoryDocumentStore(bm25_algorithm="BM25Plus")
document_store.write_documents(documents=documents, policy=DuplicatePolicy.OVERWRITE)

2

In [43]:
from haystack import Pipeline, Document
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever


pipeline = Pipeline()
metadata_extractor = LLMMetadataQueryExtractor()
retriever = InMemoryBM25Retriever(document_store=document_store)

pipeline.add_component(instance=metadata_extractor, name="metadata_extractor")
pipeline.add_component(instance=retriever, name="retriever")
pipeline.connect("metadata_extractor.query", "retriever.query")
pipeline.connect("metadata_extractor.filters", "retriever.filters")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7ea7e37d41c0>
🚅 Components
  - metadata_extractor: LLMMetadataQueryExtractor
  - retriever: InMemoryBM25Retriever
🛤️ Connections
  - metadata_extractor.query -> retriever.query (str)
  - metadata_extractor.filters -> retriever.filters (Dict[str, str])

In [44]:
query = "2023 publications about Alzheimer"
metadata_fields = ["year", "author", "topics"]

pipeline.run(data={"metadata_extractor": {"query": query, "metadata_fields": metadata_fields}})

Ranking by BM25...:   0%|          | 0/1 [00:00<?, ? docs/s]

{'retriever': {'documents': [Document(id=763ca34dd9c4d46ab6f9daed6819ff9a4481b95cf9369bf70692328002f56952, content: 'some text about investigation and treatment of Alzheimer disease', meta: {'year': 2023, 'topics': 'Alzheimer', 'author': 'John Bread'}, score: 2.772588722239781)]}}