# Query Transformations

### Imports and configs

In [1]:
# Standard library
import hashlib
import os
import sys
from typing import List

# Extend the module search path BEFORE importing `utils`. The original cell
# appended to sys.path only after `from utils import ...`, so on a fresh
# kernel the import could not benefit from it.
# NOTE(review): presumably `utils` lives in the parent directory - confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Third party
from dotenv import load_dotenv
from llama_index.core import PromptTemplate, Settings, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Local (resolved through the sys.path entry added above)
from utils import load_or_create_vector_store  # noqa: E402

# Embedding size requested from the embedding model, and the chunking
# parameters handed to load_or_create_vector_store further down.
EMBED_DIMENSION = 512
CHUNK_SIZE = 250
CHUNK_OVERLAP = 25

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a readable message when the key is absent. The original
# `os.environ[...] = os.getenv(...)` was a no-op when the key was set and
# raised an opaque TypeError (str expected, not NoneType) when it was not.
if os.getenv("OPENAI_API_KEY") is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )

# Register the embedding model globally through llama_index's Settings object.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small", dimensions=EMBED_DIMENSION)

# Read every .txt / .pdf file found in the data directory.
path = "../data/"
# NOTE: despite its name this is a directory reader, not a node parser; the
# name is kept because other cells may refer to it.
node_parser = SimpleDirectoryReader(input_dir=path, required_exts=['.txt', '.pdf'])
documents = node_parser.load_data()
### Set up vector store retriever
# Directory holding the cached artefacts, relative to the working directory.
CACHE_DIR = "../cache"
# File names inside CACHE_DIR; both paths are passed to
# load_or_create_vector_store below.
HASH_PATH = os.path.join(CACHE_DIR, "documents_hash.txt")
VECTOR_STORE_PATH = os.path.join(CACHE_DIR, "faiss_index.pkl")

def hash_documents(documents):
    """Return an MD5 hex digest identifying the set of source file names.

    Only ``doc.metadata['file_name']`` is read from each document. The names
    are de-duplicated, sorted and joined with single spaces before hashing,
    so the digest does not depend on document order or on how many documents
    share a file name. Document text is NOT part of the digest.

    Args:
        documents: Iterable of objects exposing ``metadata['file_name']``.

    Returns:
        The hexadecimal MD5 digest of the joined, UTF-8 encoded file names.
    """
    unique_names = sorted({doc.metadata['file_name'] for doc in documents})
    fingerprint = " ".join(unique_names)
    return hashlib.md5(fingerprint.encode('utf-8')).hexdigest()

# Obtain the vector index over `documents` from the utils helper; judging by
# its name and the recorded output it reuses the cached copy when available.
vector_store_index = load_or_create_vector_store(
    documents,
    EMBED_DIMENSION,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    cache_dir=CACHE_DIR,
    vector_store_path=VECTOR_STORE_PATH,
    hash_path=HASH_PATH,
)
# Retriever over the index, configured with similarity_top_k=2.
retriever = vector_store_index.as_retriever(similarity_top_k=2)




Loading vector store from cache...


### Query transformations
#### Query rewriting

In [2]:
# Sample question used to demonstrate query rewriting.
test_query = "What is the SNP's policy on climate change?"

# Rewriting prompt; `{query}` is the placeholder for the user's question.
# The trailing space at the end of the first line is part of the original
# prompt text and is preserved as-is.
query_gen_str = (
    "You are an AI assistant tasked with reformulating user queries to improve retrieval in a RAG system. \n"
    "Given the original query, rewrite it to be more specific, detailed, and likely to retrieve relevant information.\n"
    "Original Query: {query}\n"
    "Rewritten Query:"
)
query_gen_prompt = PromptTemplate(query_gen_str)

# LLM used for every query transformation in this notebook.
llm = OpenAI(model="gpt-4o", temperature=0, max_tokens=4000)

def generate_query(query: str, llm, query_gen_prompt):
    """Transform ``query`` by running ``query_gen_prompt`` through ``llm``.

    Args:
        query: The original user query; forwarded to ``llm.predict`` as the
            ``query`` keyword argument.
        llm: Object exposing ``predict(prompt, **kwargs)``.
        query_gen_prompt: Prompt passed to ``llm.predict`` unchanged.

    Returns:
        Whatever ``llm.predict`` returns, unmodified.
    """
    return llm.predict(query_gen_prompt, query=query)

# Rewrite the sample query with the LLM and show it next to the original.
generated_query = generate_query(test_query, llm, query_gen_prompt)
print(f"original query: {test_query}")
print(f"generated query: {generated_query}")
# Answer both the original and the rewritten query with the same query engine
# so the two responses can be compared side by side.
# NOTE(review): the engine is created with default arguments; the `retriever`
# (similarity_top_k=2) built earlier is not used in this cell - confirm that
# is intended.
query_engine = vector_store_index.as_query_engine()
response_simple = query_engine.query(test_query)
response_improved = query_engine.query(generated_query)
print(f"Simple query response: {response_simple}")
print(f"Improved query response: {response_improved}")

original query: What is the SNP's policy on climate change?
generated query: What specific measures and initiatives does the Scottish National Party (SNP) propose in their policy to address climate change, including their targets for reducing carbon emissions and transitioning to renewable energy sources?
Simple query response: The SNP's policy on climate change includes banning new coal licenses, ensuring fair funding for climate initiatives, establishing a Four Nations Climate Response Group to meet net-zero targets, devolving powers for a bespoke migration system, mitigating the harm of Brexit on productivity, providing sustainable funding for farming, and giving Scotland its rightful share of marine funding.
Improved query response: The Scottish National Party (SNP) proposes specific measures and initiatives to address climate change by advocating for an immediate emergency budget to reverse cuts to public spending and invest in green energy, supporting projects like the Acorn carb

#### Step-back Prompting: Generating broader queries for better context retrieval.

In [3]:
# Sample question for the step-back demonstration.
test_query = "What is the SNP's policy on carbon emissions?"

# Step-back prompt: asks for a broader, more general version of `{query}`.
step_back_template = (
    "You are an AI assistant tasked with generating broader, more general queries to improve context retrieval in a RAG system.\n"
    "Given the original query, generate a step-back query that is more general and can help retrieve relevant background information.\n"
    "\n"
    "Original query: {query}\n"
    "\n"
    "Step-back query:"
)

# Rebinds the module-level prompt, then generates and prints the result.
query_gen_prompt = PromptTemplate(step_back_template)
generated_query = generate_query(test_query, llm, query_gen_prompt)
print(f"original query: {test_query}")
print(f"generated query: {generated_query}")

original query: What is the SNP's policy on carbon emissions?
generated query: What are the general policies and positions of the SNP on environmental issues?


#### Sub-query decomposition

In [4]:
# Sample question for the sub-query decomposition demonstration.
test_query = "What is the Conservative Party's stance on immigration?"

# Prompt asking the LLM to split `{query}` into 2-4 simpler sub-queries; it
# includes one worked example.
# NOTE(review): the worked example comes AFTER the real query and the prompt
# ends on the example's answers, with no trailing "Sub-queries:" cue for the
# real query. Consider moving the example above "Original query" - confirm
# against model output before changing, since the text is behaviour-critical.
subquery_decomposition_template = """You are an AI assistant tasked with breaking down complex queries into simpler sub-queries for a RAG system.
Given the original query, decompose it into 2-4 simpler sub-queries that, when answered together, would provide a comprehensive response to the original query.

Original query: {query}

example: What are the impacts of climate change on the environment?

Sub-queries:
1. What are the impacts of climate change on biodiversity?
2. How does climate change affect the oceans?
3. What are the effects of climate change on agriculture?
4. What are the impacts of climate change on human health?"""

# Rebinds the module-level prompt, then generates and prints the sub-queries.
query_gen_prompt = PromptTemplate(subquery_decomposition_template)
generated_query = generate_query(test_query, llm, query_gen_prompt)
print(f"original query: {test_query}")
print(f"generated query: {generated_query}")

original query: What is the Conservative Party's stance on immigration?
generated query: Sub-queries:
1. What are the key policies of the Conservative Party regarding immigration?
2. How has the Conservative Party's stance on immigration evolved over recent years?
3. What are the main arguments the Conservative Party uses to support its immigration policies?
4. How do the Conservative Party's immigration policies compare to those of other major political parties?
