# Hypothetical Document Embedding (HyDE)

### Imports and configs

In [13]:
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_pipeline import QueryPipeline
from llama_index.core import Settings
from llama_index.core import PromptTemplate
from llama_index.llms.openai import OpenAI
import os
import sys
from dotenv import load_dotenv
from utils import load_or_create_vector_store


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

EMBED_DIMENSION = 512
CHUNK_SIZE = 250
CHUNK_OVERLAP = 25

load_dotenv()

os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')

Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

path = "../data/"
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()
### Set up vector store retriever
CACHE_DIR = "../cache"
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")

vector_store_index = load_or_create_vector_store(documents, EMBED_DIMENSION, CHUNK_SIZE, CHUNK_OVERLAP, cache_dir=CACHE_DIR, vector_store_path=VECTOR_STORE_PATH, hash_path=HASH_PATH)
retriever = vector_store_index.as_retriever(similarity_top_k=2)

Creating new vector store...




In [18]:
class HyDERetriever:
    def __init__(self, chunk_size=250, chunk_overlap=50, retriever=None):
        self.llm = OpenAI(temperature=0, model_name="gpt-4o", max_tokens=4000)
        self.embeddings = Settings.embed_model
        self.chunk_size = str(chunk_size)
        self.chunk_overlap = chunk_overlap
        self.vectore_store_retriever = retriever    
        
        self.hyde_prompt = PromptTemplate(
            """Given the question '{query}', generate a hypothetical document 
            that directly answers this question. The document should be 
            detailed and in-depth.the document size has be exactly 
            {chunk_size} words.""",
        )
        self.hyde_chain = QueryPipeline(chain=[self.hyde_prompt, self.llm], verbose=True)

    def generate_hypothetical_document(self, query):
        return self.hyde_chain.run(query=query, chunk_size=self.chunk_size)

    def retrieve(self, query):
        hypothetical_doc = self.generate_hypothetical_document(query)
        similar_docs = self.vectore_store_retriever.retrieve(query)
        return similar_docs, hypothetical_doc

hyde_retriever = HyDERetriever(
    chunk_size=CHUNK_SIZE, 
    chunk_overlap=CHUNK_OVERLAP, 
    retriever=retriever
)
test_query = "What is the SNP's policy on climate change?"
results, hypothetical_doc = hyde_retriever.retrieve(test_query)

[1;3;38;2;155;135;227m> Running module 732a6bd2-b8db-4d35-9248-40e56d2d8300 with input: 
query: What is the SNP's policy on climate change?
chunk_size: 250

[0m[1;3;38;2;155;135;227m> Running module afce82b9-a3a6-4c1f-bd93-c0efd60c8b2c with input: 
messages: Given the question 'What is the SNP's policy on climate change?', generate a hypothetical document 
            that directly answers this question. The document should be 
            detailed and in...

[0m

In [19]:
hypothetical_doc.message.content

"The Scottish National Party (SNP) has a strong commitment to tackling climate change and has set ambitious targets to reduce greenhouse gas emissions. The SNP's policy on climate change is centered around transitioning to a low-carbon economy, investing in renewable energy sources, and promoting sustainable practices.\n\nOne of the key initiatives of the SNP is the Climate Change (Emissions Reduction Targets) (Scotland) Act, which sets legally binding targets to reduce emissions by at least 75% by 2030 and achieve net-zero emissions by 2045. The SNP also supports the development of renewable energy sources such as wind, solar, and hydro power, with the goal of generating 100% of Scotland's electricity from renewable sources by 2020.\n\nIn addition to reducing emissions and promoting renewable energy, the SNP is committed to promoting sustainable practices in areas such as transportation, agriculture, and land use. The SNP has introduced policies to encourage the use of electric vehicl

In [20]:
results[0].text

'DECISIONS MADE IN SCOTLAND, FOR SCOTLAND.      21SNP General Election Manifesto  2024\nBan new coal licences. Follow the SNP Scottish Government’s lead and commit to no support for new coal mines, which would undermine  our action to reach net zero.\nProvide fair funding for climate. Scotland has over two thirds of the UK’s peatland, and currently plants over 60% of trees in the UK, yet funds restoration and planting within our budget, with no help from the UK Government. Westminster must ensure fair funding flows to devolved nations to enable our, and their, climate ambition given that for the whole of the UK to reach net zero by 2050, Scotland must do so  by 2045.\nEstablish a Four Nations Climate Response Group to agree climate plans across the UK  that deliver on our net-zero targets and ensure  the UK Government stops backtracking on climate ambition.'

In [21]:
results[1].text

'20BUILDING A FAIRER, GREENER ECONOMY Under the SNP, Scotland’s economy is already one of the best performing parts of the UK with both GDP per head and productivity growing faster in Scotland than the UK as a whole.  But we want to go further. Our commitment to tackling the twin crises of climate change and nature loss is unwavering and we believe emissions reduction and economic prosperity go hand in hand. We want  to share in the enormous economic opportunities of the global transition to net zero. SNP MPs will demand the UK Government:\nBring forward an immediate emergency budget following the election to reverse cuts to public spending and deliver meaningful investment in economic growth, including green energy.\nWork at pace with the Acorn Project and Scottish Cluster to secure the fastest possible deployment following the UK Government’s failure to support the Acorn carbon capture, utilisation and storage project at track 1.'