# Hypothetical Document Embeddings (HyDE)

### Imports and configs

In [2]:
# --- Standard library ---
import os
import sys

# --- Third-party ---
from dotenv import load_dotenv
from llama_index.core import PromptTemplate
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from llama_index.core.query_pipeline import QueryPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Put the parent directory on sys.path *before* importing from `utils`, so the
# import does not depend on leftover kernel state if `utils.py` lives one level
# up. The membership check keeps re-running this cell from growing sys.path.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# --- Local ---
from utils import load_or_create_vector_store  # noqa: E402


# Tunable configuration shared by the embedding model and the vector store.
EMBED_DIMENSION = 512
CHUNK_SIZE = 250
CHUNK_OVERLAP = 25

# Load variables from a .env file into the process environment.
load_dotenv()

# Fail early with a readable message. Assigning os.getenv(...) back into
# os.environ would raise an opaque TypeError when the key is missing and is a
# no-op when it is present.
if not os.getenv('OPENAI_API_KEY'):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set. Define it in your environment or in a .env file."
    )

# Embedding model used globally by llama-index, requested at EMBED_DIMENSION dims.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

# Read every .txt / .pdf file found under the data directory.
path = "../data/"
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()
### Set up vector store retriever
# Locations of the cached index and of the hash file stored next to it.
CACHE_DIR = "../cache"
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")

# Build the index from `documents`, or load it from CACHE_DIR
# (the helper is imported from `utils`).
vector_store_index = load_or_create_vector_store(
    documents,
    EMBED_DIMENSION,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    cache_dir=CACHE_DIR,
    vector_store_path=VECTOR_STORE_PATH,
    hash_path=HASH_PATH,
)

# Retriever that returns the two most similar nodes per lookup.
retriever = vector_store_index.as_retriever(similarity_top_k=2)

Loading vector store from cache...


In [3]:
class HyDERetriever:
    """Retrieve nodes using Hypothetical Document Embeddings (HyDE).

    An LLM first writes a hypothetical document that answers the query. The
    text of that document, rather than the raw query, is then handed to the
    wrapped retriever to find similar nodes.
    """

    def __init__(self, chunk_size=250, chunk_overlap=50, retriever=None):
        """Set up the LLM, the HyDE prompt and the generation pipeline.

        Args:
            chunk_size: Target size (in characters) requested from the LLM for
                the hypothetical document.
            chunk_overlap: Stored on the instance; not used by this class.
            retriever: Object exposing ``retrieve(text)``; required before
                calling :meth:`retrieve`.
        """
        # The llama-index OpenAI wrapper takes the model via `model`; with
        # `model_name` the requested model is not applied.
        self.llm = OpenAI(temperature=0, model="gpt-4o", max_tokens=4000)
        self.embeddings = Settings.embed_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # NOTE: attribute name is misspelled ("vectore"); kept as-is so any
        # existing code reading it keeps working.
        self.vectore_store_retriever = retriever

        self.hyde_prompt = PromptTemplate(
            """Given the question '{query}', generate a hypothetical document 
            that directly answers this question. The document should be 
            detailed and in-depth.the document size has be exactly 
            {chunk_size} characters.""",
        )
        self.hyde_chain = QueryPipeline(chain=[self.hyde_prompt, self.llm], verbose=True)

    @staticmethod
    def _response_text(response):
        """Return the plain text carried by a pipeline response.

        Accepts a plain string, an object exposing ``.message.content``, or
        anything else (which is converted with ``str``).
        """
        if isinstance(response, str):
            return response.strip()
        message = getattr(response, "message", None)
        content = getattr(message, "content", None)
        if isinstance(content, str):
            return content.strip()
        return str(response).strip()

    def generate_hypothetical_document(self, query):
        """Run the HyDE pipeline and return its raw response for ``query``."""
        return self.hyde_chain.run(query=query, chunk_size=self.chunk_size)

    def retrieve(self, query):
        """Retrieve nodes similar to a hypothetical answer to ``query``.

        Args:
            query: The user's question.

        Returns:
            Tuple ``(similar_docs, hypothetical_doc)``: the retriever's result
            and the raw pipeline response holding the hypothetical document.

        Raises:
            ValueError: If no retriever was supplied at construction time.
        """
        # Check before generating so no LLM call is spent on a doomed lookup.
        if self.vectore_store_retriever is None:
            raise ValueError("HyDERetriever requires a `retriever`; got None.")

        hypothetical_doc = self.generate_hypothetical_document(query)

        # Core of HyDE: search with the generated document, not the question.
        # Fall back to the question only if the LLM returned no text.
        search_text = self._response_text(hypothetical_doc) or query
        similar_docs = self.vectore_store_retriever.retrieve(search_text)
        return similar_docs, hypothetical_doc

# Question used to exercise the retriever end to end.
test_query = "What is the SNP's policy on climate change?"

# Wire the vector-store retriever into the HyDE wrapper using the notebook's
# chunking configuration.
hyde_retriever = HyDERetriever(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, retriever=retriever)

# `retrieve` returns the retrieved nodes together with the generated
# hypothetical document.
results, hypothetical_doc = hyde_retriever.retrieve(test_query)

[1;3;38;2;155;135;227m> Running module b4649b05-da32-4af6-bef3-65fd06e3ade8 with input: 
query: What is the SNP's policy on climate change?
chunk_size: ../data/

[0m[1;3;38;2;155;135;227m> Running module 6b5b13eb-2924-457a-8d5e-9d27b4ed6b7c with input: 
messages: Given the question 'What is the SNP's policy on climate change?', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the do...

[0m

[NodeWithScore(node=TextNode(id_='13da08e2-6989-45d8-bd49-75da6745c23b', embedding=None, metadata={'page_label': '23', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/user/Projects/ragbrag_pycon_ie_24/notebooks/../data/2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_type': 'application/pdf', 'file_size': 3559498, 'creation_date': '2024-09-24', 'last_modified_date': '2024-09-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='be0b9553-2008-4c84-9a14-b267a3aa2ce4', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '23', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/

In [4]:
# Show the hypothetical document returned by `hyde_retriever.retrieve` above.
hypothetical_doc

ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content="The SNP's policy on climate change is centered around the goal of achieving net-zero greenhouse gas emissions by 2045. This ambitious target is in line with the recommendations of the Intergovernmental Panel on Climate Change (IPCC) to limit global warming to 1.5 degrees Celsius above pre-industrial levels. To achieve this target, the SNP has outlined a comprehensive plan that includes a range of measures to reduce emissions across all sectors of the economy.\n\nOne key aspect of the SNP's climate change policy is the promotion of renewable energy sources. The party has committed to increasing the share of renewable energy in Scotland's electricity mix to 100% by 2030. This will involve investing in wind, solar, and hydroelectric power, as well as supporting the development of new technologies such as tidal and wave energy. The SNP also aims to phase out the use of fossil fuels for electricity generati

In [5]:
# Show the nodes returned by `hyde_retriever.retrieve` above.
results

[NodeWithScore(node=TextNode(id_='13da08e2-6989-45d8-bd49-75da6745c23b', embedding=None, metadata={'page_label': '23', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/user/Projects/ragbrag_pycon_ie_24/notebooks/../data/2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_type': 'application/pdf', 'file_size': 3559498, 'creation_date': '2024-09-24', 'last_modified_date': '2024-09-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='be0b9553-2008-4c84-9a14-b267a3aa2ce4', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '23', 'file_name': '2024-06-20b-SNP-General-Election-Manifesto-2024_interactive.pdf', 'file_path': '/Users/