In [1]:
# Libraries 

from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

from transformers import AutoModelForCausalLM, AutoTokenizer
import re

import warnings
warnings.filterwarnings("ignore")

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Embedding Model
# One SentenceTransformer instance is shared by every later cell that calls
# `embedder.encode`, so the corpus, the raw query and the hypothetical
# documents are all embedded by the same model.

embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [3]:
# Documents to embed
# Toy corpus for the demo. List order matters: the embeddings are added to the
# FAISS index in this order, and retrieval maps result ids back to text with
# `documents[idx]`.

documents = [
    "The Eiffel Tower is located in Paris.",
    "The Great Wall of China is one of the Seven Wonders.",
    "Python is a programming language known for its simplicity.",
    "AI is transforming industries with machine learning.",
]

In [4]:
# Encode the documents
# The result is indexed as a 2-D array further down: its second axis is read as
# the embedding dimension when the FAISS index is created.

document_embeddings = embedder.encode(documents)

In [5]:
# Initialize a FAISS index
# IndexFlatL2 is an exact (brute-force) index; the distances it reports are
# squared L2 distances, so smaller means closer.

embedding_dim = document_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
# FAISS only accepts C-contiguous float32 matrices, so enforce both explicitly
# instead of relying on whatever dtype/layout the encoder happened to return.
index.add(np.ascontiguousarray(document_embeddings, dtype=np.float32))
# Retrieval maps FAISS row ids back to `documents[idx]`, so both must line up.
assert index.ntotal == len(documents), "index rows and documents are out of sync"


In [6]:
# Function for retrieval

def retrieve(query, k=2):
    """Embed a text query and look up its `k` nearest documents.

    The query is encoded as a one-element batch with the shared `embedder`;
    the actual index lookup is delegated to `retrieve_with_embedding`, whose
    result is returned unchanged.
    """
    encoded_query = embedder.encode([query])
    nearest = retrieve_with_embedding(encoded_query, k)
    return nearest

In [7]:
# Function for retrieval with embeddings

def retrieve_with_embedding(query_embedding, k=2):
    """Return the `k` documents closest to a pre-computed query embedding.

    Args:
        query_embedding: A single embedding, either flat (``(dim,)``) or as a
            one-row batch (``(1, dim)``). Any numeric dtype is accepted; it is
            converted to float32 because that is the only dtype FAISS searches.
        k: Number of neighbours to request from the index.

    Returns:
        A list of ``(document_text, distance)`` tuples ordered from nearest to
        farthest. ``distance`` is the value reported by the index (squared L2
        for ``IndexFlatL2``), rounded to two decimals. The list is shorter
        than ``k`` when the index holds fewer than ``k`` vectors.

    Raises:
        ValueError: If the embedding size does not match the index dimension.
    """
    # Flatten to a single row without hard-coding the model's embedding size.
    query_vector = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"query embedding has {query_vector.shape[1]} values, "
            f"but the index expects {index.d}"
        )

    distances, indices = index.search(query_vector, k)

    # FAISS pads missing neighbours with id -1; without the filter,
    # documents[-1] would silently return the last document.
    return [
        (documents[idx], round(float(distance), 2))
        for distance, idx in zip(distances[0], indices[0])
        if idx != -1
    ]

In [8]:
# Initializing LLM
# Checkpoint identifier passed to both from_pretrained calls below, so the
# model and its tokenizer always come from the same checkpoint.
model_name = "Qwen/Qwen2.5-7B-Instruct"

# torch_dtype="auto" and device_map="auto" leave dtype selection and device
# placement to the library instead of fixing them in the notebook.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [9]:
# HyDE System Prompt
# Used as the system message for every hypothetical-document generation. The
# user message that follows it is the search query, so the prompt describes
# its input as a query rather than as a document.

system_prompt = """
You are a hypothetical document generator. Given a user query, your task is to write a hypothetical document that resembles the actual document
that should be retrieved, i.e. the document in which the correct answer would be present.

YOU ARE NOT SUPPOSED TO MENTION ANYWHERE IN THE GENERATED DOCUMENT THAT IT IS A HYPOTHETICAL DOCUMENT.
""".strip()

In [10]:
# LLM Setup

class LLM:
    """Single-turn chat wrapper around a causal language model and its tokenizer.

    Every call with a non-empty message replaces ``self.messages`` with a fresh
    conversation (optional system prompt + the user message) and generates one
    sampled reply. No history is carried over between calls.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        system: Optional system prompt. When empty, no system message is sent.
        max_tokens: Upper bound on newly generated tokens per reply.
    """

    def __init__(self, model, tokenizer, system: str = "", max_tokens: int = 512) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.system = system
        self.max_tokens = max_tokens
        self.messages: list = []

        # Always define the attribute (None when no system prompt was given) so
        # that __call__ never hits an AttributeError for the default system="".
        self.system_prompt_message = (
            {"role": "system", "content": system} if system else None
        )

    def __call__(self, message: str) -> str:
        """Generate a reply to ``message``.

        An empty ``message`` leaves ``self.messages`` untouched and generates
        again from whatever conversation is currently stored.
        """
        if message:
            conversation = [{"role": "user", "content": message}]
            if self.system_prompt_message is not None:
                conversation.insert(0, self.system_prompt_message)
            self.messages = conversation

        return self.execute()

    def execute(self) -> str:
        """Generate a reply for the messages currently stored in ``self.messages``.

        Returns:
            The decoded reply, or the fixed string
            ``"Error: Unable to generate a response."`` if anything in the
            pipeline raises (the exception is printed, not re-raised).
            Callers that post-process the reply should check for that string.
        """
        try:
            # Render the chat template to text, then tokenize on the model's device.
            text = self.prepare_input_text()
            model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

            # do_sample=True makes the sampling explicit: temperature only has
            # an effect when sampling is on, and otherwise that would depend on
            # the checkpoint's own generation config.
            generated_ids = self.model.generate(
                **model_inputs,
                do_sample=True,
                temperature=0.7,
                max_new_tokens=self.max_tokens,
            )

            # generate() returns prompt + continuation; drop the prompt tokens.
            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
            ]

            return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        except Exception as e:
            print(f"Error during execution: {e}")
            return "Error: Unable to generate a response."

    def prepare_input_text(self) -> str:
        """Render ``self.messages`` with the tokenizer's chat template.

        The template is returned as text (not token ids) with the generation
        prompt appended.
        """
        return self.tokenizer.apply_chat_template(
            self.messages,
            tokenize=False,
            add_generation_prompt=True
        )

In [11]:
# Hypothetical Document Generator

def hypothetical_document_generator(query, n_docs=5):
    """Generate several hypothetical documents for one query.

    A single `LLM` wrapper is built from the notebook-level `model`,
    `tokenizer` and `system_prompt`, then called once per requested document
    with the same query.

    Args:
        query: The search query to write hypothetical documents for.
        n_docs: How many documents to generate. Defaults to 5.

    Returns:
        A list of `n_docs` generated strings, in generation order.

    Raises:
        ValueError: If `n_docs` is smaller than 1; an empty list would leave
            nothing to embed or average downstream.
    """
    if n_docs < 1:
        raise ValueError("n_docs must be at least 1")

    generator = LLM(model=model, tokenizer=tokenizer, system=system_prompt)
    # NOTE(review): LLM.execute returns a fixed error string when generation
    # fails; such a string would be returned here like any other document.
    return [generator(query) for _ in range(n_docs)]

In [12]:
# Query
# Question used for both the plain retrieval and the HyDE retrieval below.

query = "Where is the Eiffel Tower located?"

In [13]:
# Generate Hypothetical Documents
# Calls the LLM repeatedly with the same query and collects the replies in a list.

docs = hypothetical_document_generator(query)

In [14]:
# Embeddings for Hypothetical Documents
# Encoded with the same `embedder` as the corpus, so these vectors live in the
# same space as the ones stored in the FAISS index.

hypothetical_embeddings = embedder.encode(docs)

In [15]:
# Average embedding representation of Hypothetical Documents
# Averaging over axis 0 collapses the per-document embeddings into one vector,
# which is later passed to `retrieve_with_embedding` as the search query.

hyde = np.mean(hypothetical_embeddings, axis=0)

In [16]:
# Normal Retrieval Result
# Baseline: embed the raw query and take the single nearest document.
# The displayed tuple is (document text, distance rounded to 2 decimals).

retrieved_docs = retrieve(query, k=1)
retrieved_docs[0] 

('The Eiffel Tower is located in Paris.', 25.29)

In [17]:
# HyDE Retrieval Result
# Same lookup, but searching with the averaged hypothetical-document embedding
# instead of the query embedding. Note that this rebinds `retrieved_docs`,
# replacing the baseline result from the previous cell.

retrieved_docs = retrieve_with_embedding(hyde, k=1)
retrieved_docs[0]

('The Eiffel Tower is located in Paris.', 18.28)