In [1]:
# import nest_asyncio
# nest_asyncio.apply()

import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

import json
import torch
from pathlib import Path
import pandas as pd
pd.set_option("display.max_colwidth", 90)

from copy import deepcopy

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core.prompts import PromptTemplate
# from llama_index.llms import HuggingFaceLLM
from llama_index.core import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceSplitter
from langchain.embeddings import HuggingFaceEmbeddings

from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine.transform_query_engine import TransformQueryEngine

from IPython.display import Markdown, display
from llama_index.core.response.notebook_utils import display_source_node

from llama_index.core.query_engine import RetrieverQueryEngine
from IPython.display import Markdown, display, HTML
from llama_index.core.retrievers import VectorIndexRetriever

from sentence_transformers import SentenceTransformer

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
from dotenv import load_dotenv

#https://github.com/akashmathur-2212/LLMs-playground/blob/main/LlamaIndex-applications/Advanced-RAG/advanced_query_transformations/Advanced_Query_Transformations.ipynb

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [2]:
PDFReader = download_loader("PDFReader")
loader = PDFReader()
docs = loader.load_data(file=Path(r"D:\Git\NLP\LLMA_Index\data\QLORA.pdf"))

load_dotenv(dotenv_path=r"D:\RAG\git_tmp\llmapps\web_scrapper\.env")


  PDFReader = download_loader("PDFReader")


True

In [3]:
node_parser = SentenceSplitter(chunk_size=256)
nodes = node_parser.get_nodes_from_documents(docs)

In [4]:
len(nodes)

383

# Models

## LLM (`zephyr-7b-alpha`)

In [5]:
# from google.colab import userdata

# # huggingface api token
# hf_token = userdata.get('hf_token')

# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )


# def messages_to_prompt(messages):
#   prompt = ""
#   for message in messages:
#     if message.role == 'system':
#       prompt += f"<|system|>\n{message.content}</s>\n"
#     elif message.role == 'user':
#       prompt += f"<|user|>\n{message.content}</s>\n"
#     elif message.role == 'assistant':
#       prompt += f"<|assistant|>\n{message.content}</s>\n"

#   # ensure we start with a system prompt, insert blank if needed
#   if not prompt.startswith("<|system|>\n"):
#     prompt = "<|system|>\n</s>\n" + prompt

#   # add final assistant prompt
#   prompt = prompt + "<|assistant|>\n"

#   return prompt


# llm = HuggingFaceLLM(
#     model_name="HuggingFaceH4/zephyr-7b-alpha",
#     tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
#     query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
#     context_window=3900,
#     max_new_tokens=256,
#     model_kwargs={"quantization_config": quantization_config},
#     # tokenizer_kwargs={},
#     generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95, "do_sample":True},
#     messages_to_prompt=messages_to_prompt,
#     device_map="auto",
# )

## Embedding (`bge-large-en-v1.5`)

We will use **BGE embedding**. It is a general Embedding Model.

In [6]:
# embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

## Configure Index and Retriever

In [7]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-4")


# ServiceContext
# service_context = ServiceContext.from_defaults(llm=llm,
#                                                embed_model=embed_model
#                                                )
service_context = ServiceContext.from_defaults(llm=llm,
                                               
                                               )

# index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

  service_context = ServiceContext.from_defaults(llm=llm,


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


# 1. HyDE Query Transformation

First, we query without transformation. Then, the same query string is used for embedding lookup and also summarization.

## Example 1

In [8]:
query_str = "What are the different approaches to reduce memory usage without sacrificing performance?"

In [9]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>The different approaches to reduce memory usage without sacrificing performance include the use of 4-bit NormalFloat, which is an information theoretically optimal quantization data type for normally distributed data that yields better empirical results than 4-bit Integers and 4-bit Floats. Another approach is Double Quantization, a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter. Lastly, Paged Optimizers are used, which utilize NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. These contributions are combined into a better tuned LoRA approach that includes adapters at every network layer and thereby avoids almost all of the accuracy tradeoffs seen in prior work.</b>

In [10]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>The different approaches to reduce memory usage without sacrificing performance include the use of 4-bit NormalFloat, which is an optimal quantization data type for normally distributed data that provides better results than 4-bit Integers and 4-bit Floats. Another approach is Double Quantization, a method that quantizes the quantization constants, saving an average of about 0.37 bits per parameter. Lastly, Paged Optimizers are used, which utilize NVIDIA unified memory to avoid the gradient checkpointing memory spikes that occur when processing a mini-batch with a long sequence length. These methods are combined into a better tuned LoRA approach that includes adapters at every network layer, thereby avoiding almost all of the accuracy tradeoffs seen in prior work.</b>

## Example 2

In [11]:
query_str = "How QLORA differentiate itself from earlier finetuning approachs and how it is better?"

In [12]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>QLORA differentiates itself from earlier finetuning approaches by enabling privacy-preserving usage of Language Learning Models (LLMs). It allows users to own and manage their own data and models, making LLMs easier to deploy. While finetuning is a dual-use technology that can potentially be misused, QLORA aims to equalize access to this rapidly proliferating technology. This approach allows for more independent analysis, as opposed to keeping the power of LLMs in the hands of large corporations that do not release models or source code for auditing. Overall, QLORA is believed to have a broadly positive impact by making the finetuning of high-quality LLMs more widely and easily accessible.</b>

In [13]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>QLORA differentiates itself from earlier finetuning approaches by making the process more accessible and widely available. It enables privacy-preserving usage of Language Learning Models (LLMs), allowing users to own and manage their own data and models. This makes LLMs easier to deploy. QLORA also levels the playing field by making high-quality LLMs accessible not just to large corporations, but also to researchers with fewer resources. 

One of the significant improvements QLORA brings is its potential for mobile deployment. It might enable the critical milestone of finetuning LLMs on phones and other low-resource settings. For instance, with an iPhone 12 Plus, QLORA can finetune 3 million tokens per night while the phone is charging. This is a first, as 7B models were shown to be able to run on phones before, but QLORA is the first method that would enable the finetuning of such models. 

Moreover, QLORA's best 33B model trained on the Open Assistant dataset can rival ChatGPT on the Vicuna benchmark, demonstrating its high quality. This makes it a valuable tool for transforming raw pretrained LLMs into ChatGPT-like chatbots, further enhancing the accessibility of state-of-the-art NLP technology.</b>

## Example 3

In [14]:
query_str = "Describe the trade-offs between using BFloat16 as the computation data type and other possible choices. When would you choose one over the other?"

In [15]:
query_engine = vector_index.as_query_engine()
response = query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>BFloat16 (BF16), 8-bit Integer (Int8), 4-bit Float (FP4), and 4-bit NormalFloat (NF4) are all computation data types that can be used in machine learning models. Each has its own advantages and trade-offs. 

BF16 offers a good balance of performance and accuracy, as shown in the experiments on the GLUE and Super-NaturalInstructions datasets. It provides consistent results, with the BF16 replication showing similar performance to the original BF16.

Int8, used in QLORA, shows comparable performance to BF16 on the same datasets. It might be a good choice when you need to balance performance and memory usage, as it uses less memory than BF16.

FP4, another data type used in QLORA, also shows similar performance to BF16. It uses even less memory than Int8, making it a good choice when memory footprint is a concern.

NF4, when combined with Double Quantization (DQ), offers similar performance to the other data types. However, it can lead to performance degradation relative to 16-bit. Despite this, NF4 is information-theoretically optimal and might still be a good choice when bit-for-bit accuracy is a priority.

In conclusion, the choice between these data types depends on the specific requirements of your machine learning model, including performance, memory usage, and accuracy.</b>

In [16]:
hyde = HyDEQueryTransform(include_original=True, llm=llm)
hyde_query_engine = TransformQueryEngine(query_engine, hyde)
response = hyde_query_engine.query(query_str)
display(Markdown(f"<b>{response}</b>"))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


<b>BFloat16 (BF16), 8-bit Integer (Int8), 4-bit Float (FP4), and 4-bit NormalFloat (NF4) are all computation data types that can be used in machine learning models. Each has its own trade-offs. 

BF16 offers a good balance between performance and accuracy, as seen in the RoBERTa-large and T5 models. However, it may not always provide the highest accuracy. For example, in the T5-3B model, the Int8 data type from QLORA outperforms BF16.

Int8, on the other hand, provides a higher accuracy in some cases, such as the T5-3B model, but it doesn't always outperform other data types. 

FP4 and NF4, both 4-bit data types, offer a more compact memory footprint, which can be beneficial for fitting larger models into certain GPUs. However, their performance can be slightly lower compared to BF16 and Int8. 

The choice between these data types depends on the specific requirements of the task. If memory footprint is a concern, then 4-bit data types like FP4 and NF4 might be more suitable. If accuracy is the primary concern, then BF16 or Int8 might be a better choice. However, it's important to note that the optimal data type can vary depending on the specific model and dataset.</b>

In [17]:
query_bundle = hyde(query_str)
hyde_doc = query_bundle.embedding_strs[0]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Let's look at the hypothetical document.
We use HyDEQueryTransform to generate a hypothetical document and use it for embedding lookup.

In [18]:
hyde_doc

"BFloat16 is a floating-point data type that is commonly used in machine learning and artificial intelligence computations. It is a 16-bit format that provides a good balance between precision and dynamic range, making it suitable for many machine learning tasks. However, there are trade-offs to consider when choosing BFloat16 over other data types.\n\nOne of the main advantages of BFloat16 is that it requires less memory and computational resources than other data types, such as the 32-bit single-precision floating-point format (FP32). This can lead to faster computation times and lower power consumption, which is particularly beneficial in large-scale machine learning applications.\n\nHowever, the reduced precision of BFloat16 can also be a disadvantage. While it is generally sufficient for machine learning tasks, it may not provide enough precision for certain scientific computations or other applications that require high numerical accuracy. In these cases, a higher-precision data 

**Conclusion** - In Example 1, 2 and 3, HyDE improves output quality significantly, by hallucinating accurately, thus improving the embedding quality, and final output.

# 2. Sub-Question Query Engine

Now, we will see how to use a sub question query engine to tackle the problem of answering a complex query.

It first breaks down the complex query into sub questions for each relevant data source, then gather all the intermediate reponses and synthesizes a final response.

In [20]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

import nest_asyncio
nest_asyncio.apply()

In [21]:
# Using the LlamaDebugHandler to print the trace of the sub questions and calls made to the LLM and final response
# captured by the SUB_QUESTION callback event type
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# ServiceContext
service_context = ServiceContext.from_defaults(llm=llm,
                                            #    embed_model=embed_model,
                                               callback_manager=callback_manager
                                               )

# index
vector_query_engine = VectorStoreIndex.from_documents(
    docs, service_context=service_context, use_async=True
).as_query_engine()

  service_context = ServiceContext.from_defaults(llm=llm,


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
**********
Trace: index_construction
    |_node_parsing -> 0.071751 seconds
      |_chunking -> 0.00251 seconds
      |_chunking -> 0.00265 seconds
      |_chunking -> 0.000998 seconds
      |_chunking -> 0.001008 seconds
      |_chunking -> 0.002 seconds
      |_chunking -> 0.003002 seconds
      |_chunking -> 0.004 seconds
      |_chunking -> 0.003003 seconds
      |_chunking -> 0.003 seconds
      |_chunking -> 0.003997 seconds
      |_chunking -> 0.001001 seconds
      |_chunking -> 0.0 seconds
      |_chunking -> 0.000988 seconds
      |_chunking -> 0.001 seconds
      |_chunking -> 0.002999 seconds
      |_chunking -> 0.0 seconds
      |_chunking -> 0.009024 seconds
      |_chunking -> 0.009108 seconds
      |_chunking -> 0.004992 seconds
      |_chunking -> 0.005992 seconds
      |_chunking -> 0.0 seconds
      |_chunking 

In [22]:
# setup base query engine as tool
query_engine_tools = [
    QueryEngineTool(
        query_engine=vector_query_engine,
        metadata=ToolMetadata(
            name="qlora_paper",
            description="Efficient Finetuning of Quantized LLMs",
        ),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context,
    use_async=True,
)

In [23]:
response = query_engine.query("Describe the trade-offs between using BFloat16 as the computation data type and other possible choices. When would you choose one over the other?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Generated 3 sub questions.
[1;3;38;2;237;90;200m[qlora_paper] Q: What is the efficiency of finetuning Quantized LLMs using BFloat16 as the computation data type?
[0m[1;3;38;2;90;149;237m[qlora_paper] Q: What are the trade-offs mentioned in the paper when using BFloat16 as the computation data type in Quantized LLMs?
[0m[1;3;38;2;11;159;203m[qlora_paper] Q: Under what circumstances, according to the paper, would BFloat16 be chosen as the computation data type over other options in Quantized LLMs?
[0mINFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
I

In [24]:
print(response)

The context does not provide specific information about the trade-offs between using BFloat16 as the computation data type and other possible choices. Additionally, it does not specify circumstances under which one might be chosen over the other. However, it does mention that in the QLORA system, BFloat16 is the standard computation data type used for performing the forward and backward pass after dequantizing the storage data type.


# 3. Router Query Engine

Now, we will define a custom router query engine that selects one out of several candidate query engines to execute a query.

In [26]:
from llama_index.core import VectorStoreIndex, SummaryIndex, SimpleKeywordTableIndex

In [27]:
service_context = ServiceContext.from_defaults(llm=llm,
                                            #    embed_model=embed_model
)

  service_context = ServiceContext.from_defaults(llm=llm,


In [28]:
## Define all the different indexes over same data

# vector index
vector_index = VectorStoreIndex(
    nodes, service_context=service_context
)

# summary index - Summary Index. The summary index is a simple data structure where nodes are stored in a sequence. During index construction, the document texts are chunked up, converted to nodes, and stored in a list. During query time, the summary index iterates through the nodes with some optional filter parameters, and synthesizes an answer from all the nodes.
summary_index = SummaryIndex(
    nodes, service_context=service_context
    )

# keyword index - This index uses a simple regex extractor to extract keywords from the text.
keyword_index = SimpleKeywordTableIndex(nodes, service_context=service_context)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


## Define Query Engines and Set Metadata

In [29]:
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    service_context=service_context
)

vector_query_engine = vector_index.as_query_engine(service_context=service_context)

keyword_query_engine = keyword_index.as_query_engine(service_context=service_context)

## Define Query Engine and Tool for these Indices
We define a Query Engine for each Index. We then wrap these with our QueryEngineTool.

In [30]:
from llama_index.core.tools.query_engine import QueryEngineTool

summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to Efficient Finetuning QLORA reserach paper"
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from QLORA reserach paper related to Efficient Finetuning "
    ),
)

keyword_tool = QueryEngineTool.from_defaults(
    query_engine=keyword_query_engine,
    description=(
        "Useful for retrieving specific context from QLORA reserach paper related to Efficient Finetuning "
        "using entities mentioned in query"
    ),
)

## Define Router Query Engine
There are several selectors available, each with some distinct attributes.

1. The `LLM selectors` use the LLM to output a JSON that is parsed, and the corresponding indexes are queried.

2. The `Pydantic selectors` (currently only supported by gpt-4-0613 and gpt-3.5-turbo-0613 (the default)) use the OpenAI Function Call API to produce pydantic selection objects, rather than parsing raw JSON.

3. For each type of selector, there is also the option to select `1 index to route to, or multiple`.

4. Then, define the `RouterQueryEngine` with a desired selector module. Here, we use the `LLMSingleSelector`, which uses LLM to choose a underlying query engine to route the query to.

## LLMSingleSelector
We can use OpenAI or any other LLM to parse generated JSON under the hood to select a sub-index for routing.

In [31]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors.llm_selectors import LLMSingleSelector, LLMMultiSelector

In [32]:
router_query_engine  = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(service_context=service_context),
    query_engine_tools=[
        summary_tool,
        vector_tool,
        keyword_tool,
    ],
    service_context=service_context,
)

In [33]:
response = router_query_engine.query("What is Double Quantization?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'..
Selecting query engine 1: This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'..
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [34]:
print(str(response))

Double Quantization, also known as DQ, is a process that quantizes the quantization constants for additional memory savings. It involves treating the quantization constants of the first quantization as inputs to a second quantization. This second step results in the quantized quantization constants and the second level of quantization constants. The process is beneficial as it helps reduce the memory footprint of quantization constants.


In [35]:
# [optional] look at selected results
print(str(response.metadata["selector_result"]))

selections=[SingleSelection(index=1, reason="This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'.")]


## LLMMultiSelector
If we want to route our query to multiple indexes, we can use a multi selector. The multi selector sends to query to multiple sub-indexes, and then aggregates all responses using a summary index to form a complete answer.

In [36]:
router_query_engine  = RouterQueryEngine(
    selector=LLMMultiSelector.from_defaults(service_context=service_context),
    query_engine_tools=[
        summary_tool,
        vector_tool,
        keyword_tool,
    ],
    service_context=service_context,
)

In [50]:
response = router_query_engine.query("What is Double Quantization?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 1: This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'..
Selecting query engine 1: This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'..
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:llama_index.core.query_engine.router_query_engine:Selecting query engine 2: Thi

In [51]:
# [optional] look at selected results
print(str(response.metadata["selector_result"]))

selections=[SingleSelection(index=1, reason="This choice is most relevant as it involves retrieving specific context from the QLORA research paper, which may include the term 'Double Quantization'."), SingleSelection(index=2, reason="This choice could also be relevant if 'Double Quantization' is considered an entity mentioned in the query, and the context from the QLORA research paper is needed.")]


# 4. Multi-Step Query Engine

Multi-step query engine is able to decompose a complex query into sequential subquestions.

In [46]:
from llama_index.core.indices.query.query_transform.base import StepDecomposeQueryTransform
from llama_index.core.query_engine.multistep_query_engine import MultiStepQueryEngine

# set Logging to DEBUG for more detailed outputs
from llama_index.core.query_engine.multistep_query_engine import (
    MultiStepQueryEngine,
)

step_decompose_transform = StepDecomposeQueryTransform(llm=llm, verbose=True)
query_engine = vector_index.as_query_engine(service_context=service_context)

In [47]:
query_engine = MultiStepQueryEngine(
    query_engine=query_engine,
    query_transform=step_decompose_transform
)

In [48]:
query_engine.query("What is Double Quantization and what makes it different?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;33m> Current query: What is Double Quantization and what makes it different?
[0m[1;3;38;5;200m> New query: None
[0m

Response(response='Empty Response', source_nodes=[], metadata={'sub_qa': []})

ValueError:
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

**Note**: I was getting the above error. It looks like MultiStepQueryEngine supports ***only*** OpenAI `GPT-4` and `GPT-3.5` model.

Please correct me if there is a way to run MultiStepQueryEngine using any other open source model.

# END

In [52]:
import pandas as pd

In [60]:
tmp=pd.read_parquet(r"C:\Users\sri.karan\Documents\Train_data_LAC_Linking_PL.parquet")

In [61]:
tmp.shape

(7685397, 14)

In [62]:
tmp.head(2)

Unnamed: 0,F_XCI_NAN_KEY,F_NAN_MOD_CODE,F_XCI_XCODE_GROUP_CODE,R_XAD_GROUP_PROD_DESC,IS_BEST_DESC,F_XCI_CODE_TYPE_MNEMONIC,F_XCI_EXTERNAL_CODE,R_XAD_DEPARTMENT_CODE,R_XAD_DEPARTMENT_DESC,R_XAD_SUPPLIER_DESC,R_XAD_SUPPLIER_CODE,R_XAD_CAT_PRICE,R_XAD_CREATED_DATE,F_NAN_CREATED_DATE
0,33634762,617,PL171,kawa rozpuszczalna nescafe classic saszetka/100g,Y,EAN,7613035782112,,new kawa rozpuszczalna__new kawa rozpuszczalna__new kawy rozpuszczalne__new kawy__new ...,,,13.296,2021-05-04 11:37:50,2016-12-21 07:56:47
1,33634762,617,PL121,"nescafe classic saszetka 100g ,",Y,EAN,7613035782112,10303.0,01_delikatesy__kawy__kawa rozpuszczalna,,,13.99,2020-04-27 11:10:54,2016-12-21 07:56:47


: 