# Hypothetical Document Embedding (HyDE)

### Imports and configs

In [2]:
from typing import List
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.schema import BaseNode, TransformComponent
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_pipeline import QueryPipeline
from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.llms.openai import OpenAI
import faiss
import os
import sys
from dotenv import load_dotenv
import hashlib
import pickle


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

EMBED_DIMENSION = 512
CHUNK_SIZE = 200
CHUNK_OVERLAP = 50

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

path = "../data/"
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()
### Set up vector store retriever
class TextCleaner(TransformComponent):
    """
    Transformation to be used within the ingestion pipeline.
    Cleans clutters from texts.
    """
    def __call__(self, nodes, **kwargs) -> List[BaseNode]:
        
        for node in nodes:
            node.text = node.text.replace('\t', ' ') # Replace tabs with spaces
            node.text = node.text.replace(' \n', ' ') # Replace paragraph seperator with spacaes
            
        return nodes
CACHE_DIR = "../cache"
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")

def hash_documents(documents):
    # combine all the texts into a single string
    all_titles = [doc.metadata['file_name'] for doc in documents]
    all_titles_distinct = list(set(all_titles))
    all_titles_distinct.sort()
    all_titles_str = " ".join(all_titles_distinct)
    # return a hash of the combined text which will stay consistent if the text is the same across multiple runs
    return hashlib.md5(all_titles_str.encode('utf-8')).hexdigest()

def load_or_create_vector_store(documents, embed_dim, chunk_size, chunk_overlap):
    os.makedirs(CACHE_DIR, exist_ok=True)
    
    current_hash = hash_documents(documents)
    
    if os.path.exists(HASH_PATH) and os.path.exists(VECTOR_STORE_PATH):
        with open(HASH_PATH, 'r') as f:
            stored_hash = f.read().strip()

        if stored_hash == current_hash:
            print("Loading vector store from cache...")
            with open(VECTOR_STORE_PATH, 'rb') as f:
                return pickle.load(f)
    
    print("Creating new vector store...")
    faiss_index = faiss.IndexFlatL2(embed_dim)
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    
    text_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    
    pipeline = IngestionPipeline(
        transformations=[
            TextCleaner(),
            text_splitter,
        ],
        vector_store=vector_store,
    )
    
    nodes = pipeline.run(documents=documents)
    vector_store_index = VectorStoreIndex(nodes)
    
    # Save the new vector store and hash
    with open(VECTOR_STORE_PATH, 'wb') as f:
        pickle.dump(vector_store_index, f)
    
    with open(HASH_PATH, 'w') as f:
        f.write(current_hash)
    
    return vector_store_index

vector_store_index = load_or_create_vector_store(documents, EMBED_DIMENSION, CHUNK_SIZE, CHUNK_OVERLAP)
retriever = vector_store_index.as_retriever(similarity_top_k=2)

Loading vector store from cache...


In [3]:
class HyDERetriever:
    def __init__(self, chunk_size=250, chunk_overlap=50):
        self.llm = OpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)

        self.embeddings = Settings.embed_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vectore_store_retriever = retriever    
        
        self.hyde_prompt = PromptTemplate(
            """Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the document size has be exactly {chunk_size} characters.""",
        )
        self.hyde_chain = QueryPipeline(chain=[self.hyde_prompt, self.llm], verbose=True)

    def generate_hypothetical_document(self, query):
        return self.hyde_chain.run(query=query, chunk_size=self.chunk_size)

    def retrieve(self, query):
        hypothetical_doc = self.generate_hypothetical_document(query)
        similar_docs = self.vectore_store_retriever.retrieve(query)
        return similar_docs, hypothetical_doc

hyde_retriever = HyDERetriever(path)
test_query = "What is the SNP's policy on climate change?"
results, hypothetical_doc = hyde_retriever.retrieve(test_query)
hypothetical_doc
results

[1;3;38;2;155;135;227m> Running module d1cffccb-c461-441d-a960-4c3e86dcf190 with input: 
query: What is the SNP's policy on climate change?
chunk_size: ../data/

[0m[1;3;38;2;155;135;227m> Running module 58e458de-301b-485e-a4ea-336b872cd024 with input: 
messages: Given the question 'What is the SNP's policy on climate change?', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the do...

[0m

[NodeWithScore(node=TextNode(id_='d0d6edf8-e094-432b-8dae-eadf23fb70a2', embedding=None, metadata={'page_label': '22', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/user/Projects/ragbrag_pycon_ie_24/notebooks/../data/2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_type': 'application/pdf', 'file_size': 3559498, 'creation_date': '2024-09-24', 'last_modified_date': '2024-09-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='60ca30b3-62aa-46d9-bf0c-25fe52165bcb', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '22', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/