Document store with original docs

In [1]:
from qdrant_haystack import QdrantDocumentStore

document_store = QdrantDocumentStore(
    path="qdrant",
    index="Document",
    embedding_dim=768,
    recreate_index=False,
    similarity = "dot_product",
)

  from .autonotebook import tqdm as notebook_tqdm


Document store to store generated questions. We'll save original text as metadata

In [2]:
from haystack.document_stores import FAISSDocumentStore
questions_document_store = FAISSDocumentStore(faiss_index_factory_str="Flat",embedding_dim=384,faiss_config_path="questions_document_store.json")  

ValueError: The number of documents in the SQL database (1771) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

Retriever we'll use

In [7]:
from haystack.nodes import EmbeddingRetriever
question_docs_retriever = EmbeddingRetriever(
    document_store=questions_document_store,
   embedding_model="sentence-transformers/all-MiniLM-L6-v2",
   model_format="sentence_transformers",
   #embed_meta_fields = ["article_keywords"],
   use_gpu=False,
   devices=["mps"]
)

  return self.fget.__get__(instance, owner)()


Question generator node

In [None]:
from haystack.nodes import QuestionGenerator
import torch
qg = QuestionGenerator(use_gpu=False,devices=["mps"])

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Using sep_token, but it is not set yet.


Go over all docs generating questions.

In [None]:
from tqdm import tqdm
from haystack.schema import Document

all_docs = document_store.get_all_documents()
for doc in tqdm(all_docs):
    out = qg.run([doc])
    for quest in out[0]["generated_questions"]:
        try:
            art_key = doc.meta['article_keywords']
        except:
            art_key = []
        
        try:
            ent_words = doc.meta['entity_words']
        except:
            ent_words = []
        
        try:
            summ = doc.meta["summary"]
        except:
            summ = ""

        q_d = {
            "document_id":quest["document_id"],
            "content":"\n".join(quest["questions"]),
            "article_keywords":art_key,
            "entity_words":ent_words,
            "article_content":doc.content,
            "article_summary":summ
        }
        questions_document_store.write_documents([Document.from_dict(q_d)])
        docss = []

Writing Documents: 10000it [00:00, 846598.71it/s]       
Writing Documents: 10000it [00:00, 2391824.82it/s]      
Writing Documents: 10000it [00:00, 2255851.13it/s]      
Writing Documents: 10000it [00:00, 1552525.91it/s]      
Writing Documents: 10000it [00:00, 1769599.19it/s]      
Writing Documents: 10000it [00:00, 2173327.12it/s]      
Writing Documents: 10000it [00:00, 2370064.98it/s]      
Writing Documents: 10000it [00:00, 2665250.05it/s]      
Writing Documents: 10000it [00:00, 2667962.60it/s]      
Writing Documents: 10000it [00:00, 2571142.03it/s]      
Writing Documents: 10000it [00:00, 1706250.10it/s]      
Writing Documents: 10000it [00:00, 2658829.79it/s]      
Writing Documents: 10000it [00:00, 2660347.58it/s]      
Writing Documents: 10000it [00:00, 2525776.23it/s]      
Writing Documents: 10000it [00:00, 2729245.18it/s]      
Writing Documents: 10000it [00:00, 2599990.08it/s]      
Writing Documents: 10000it [00:00, 1137160.83it/s]      
Writing Documents: 10000it [00:

In [None]:
#update embeddings
questions_document_store.update_embeddings(question_docs_retriever)

Batches: 100%|██████████| 56/56 [00:35<00:00,  1.59it/s]docs/s]
Documents Processed: 10000 docs [00:35, 280.60 docs/s]          


In [None]:
questions_document_store.save(index_path="generated_questions_docstore.faiss")

In case already processed load document store.

In [3]:
from haystack.document_stores import FAISSDocumentStore
questions_document_store = FAISSDocumentStore(faiss_index_path="generated_questions_docstore.faiss")

Integrating this new document_store retriever hack into a RAG pipeline

We have to create a custom node to swap the actual content of a document, (questions) with the original content for that document (original article text)

In [11]:
from haystack.nodes.base import BaseComponent
from haystack.nodes import EmbeddingRetriever
from haystack.schema import Document

class document_question_search_bridge(BaseComponent):
    outgoing_edges = 1

    def run(self, query: str, documents):
        
        new_docs = []
        for doc in documents:
            d = {
                "content":doc.meta["article_content"]
            }
            new_docs.append(Document.from_dict(d))
            
        
        output={
            "documents": new_docs,
            "_debug": {"anything": "you want"}
        }
        return output, "output_1"

    def run_batch(self, queries,**kwargs):
        # Insert code here to manipulate the input and produce an output dictionary
        ...
        output={
            "documents": ...,
        }
        return output, "output_1"

In [5]:
bridge = document_question_search_bridge()

Now the pipeline

In [8]:
from diversity_ranker import DiversityRanker
from haystack.nodes import TopPSampler
from haystack import Pipeline

p = Pipeline()
p.add_node(component=question_docs_retriever, name="Retriever", inputs=["Query"])
p.add_node(component=TopPSampler(top_p=0.95,use_gpu=True,devices=["mps"]), name="Sampler", inputs=["Retriever"])
p.add_node(component=DiversityRanker(use_gpu=True,devices=["mps"]), name="DiversityRanker", inputs=["Sampler"])
p.add_node(component=bridge,name="Bridge", inputs=["DiversityRanker"])

In [12]:
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
prompt_template = PromptTemplate(
prompt="""
Elaborate well written description about {query}, create it truthfully based solely on the given documents. 
Try to add as many different information as possible but avoid giving too many details. Be diverse, the description should be broad.
The description should be around 100 words. You must only use information from the given documents. 
Use an unbiased and journalistic tone. Do not repeat text. For every statement cite the documents where the information is extracted from using Document[number] notation. 
{join(documents, delimiter=new_line, pattern=new_line+'Document[$idx]: $content', str_replace={new_line: ' ', '[': '(', ']': ')'})}
Documents:{join(documents)}
Answer:
""",
output_parser=AnswerParser(),
)

prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo", 
    api_key="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", 
    default_prompt_template=prompt_template,
    max_length=400
)
p.add_node(component=prompt_node, name="Prompt_node", inputs=["Bridge"])

PipelineConfigError: A node named 'Prompt_node' is already in the pipeline. Choose another name.

In [13]:
place = "Oslo"
out = p.run(query =f"What is {place}famous for?\nWhat are the best things to do in {place}?\nWhat activities can I do in {place}?What are must see places in {place}? What are some tips and recommendations.",
params = {
    "Retriever":{
        "top_k":10,
        "filters": {"entity_words":[place,place.lower()]}
    },
})
out

Batches: 100%|██████████| 1/1 [00:00<00:00, 64.31it/s]
Query filters are not implemented for the FAISSDocumentStore.


{'answers': [<Answer {'answer': "Oslo, the capital of Norway, is famous for its innovative architecture, museums, and vibrant neighborhoods. The city is constantly featured in international media for its thriving food, fashion, art, and music scenes [Document 1]. Oslo is also known for its green spaces, with over half of the municipality covered by forests and parks. It was awarded the title of European Green Capital in 2019 [Document 1]. Visitors can enjoy various outdoor activities such as swimming, kayaking, island hopping, and exploring lush parks and forests [Document 1]. The city center is becoming increasingly car-free, making it easy to explore on foot or by bike. Oslo has a well-connected public transport system, making it convenient to travel around the city [Document 1]. The city offers a range of attractions and activities for families, including the TusenFryd amusement park, art museums with family-friendly activities, and the Inspiria science center [Document 4]. Oslo is 