In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch asyncio so event loops can be nested; the Jupyter kernel already runs one.
nest_asyncio.apply()

# Load variables from a discovered .env file into os.environ. Calling load_dotenv()
# without a path lets python-dotenv search for the file; an empty-string path is
# treated as an explicit (non-existent) file and results in nothing being loaded.
load_dotenv()

# Extend the import path so the `utils` imports that follow can be resolved
# (presumably ../helpers/utils.py — confirm against the repository layout).
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [None]:
# Take the key from the environment when present; otherwise prompt for it.
# .get() returns None for a missing variable (indexing would raise KeyError),
# which keeps the getpass() fallback reachable.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [None]:
# Qdrant location passed to setup_vector_store later in this notebook.
# NOTE(review): ":memory:" presumably selects qdrant-client's local in-memory mode
# (no external server) — confirm how helpers/utils.setup_vector_store uses it.
QDRANT_URL = ":memory:"

In [None]:
# Take the key from the environment when present; otherwise prompt for it.
# .get() returns None for a missing variable (indexing would raise KeyError),
# which keeps the getpass() fallback reachable.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# System prompt instructing the model to answer strictly from the supplied context.
grounded_system_prompt = """Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
    Your response must be grounded in the provided context and relevant to the essence of the user's query.
    """

# Configure the OpenAI chat model through the project helper.
# NOTE(review): presumably the helper stores the model on llama_index Settings — confirm in helpers/utils.
setup_llm(
    provider="openai",
    api_key=OPENAI_API_KEY,
    model="gpt-4o",
    temperature=0.75,
    system_prompt=grounded_system_prompt,
)

# Configure the OpenAI embedding model with the helper's default settings.
setup_embed_model(
    provider="openai",
    api_key=OPENAI_API_KEY,
)

In [None]:
import random
from utils import get_documents_from_docstore, group_documents_by_author, sample_documents

# Load the stored documents from the local docstore directory.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Seed Python's global RNG before sampling so reruns are repeatable.
# NOTE(review): assumes sample_documents draws from the `random` module — confirm in helpers/utils.
random.seed(42)

# Group the loaded documents by author (return structure is defined by the helper).
documents_by_author = group_documents_by_author(documents)

# Sample from the grouped documents with num_samples=10.
# NOTE(review): whether that is 10 per author or 10 overall depends on the helper — confirm.
senpai_documents = sample_documents(documents_by_author, num_samples=10)

In [None]:
from llama_index.core import StorageContext
from llama_index.core.settings import Settings

from llama_index.core.node_parser import SentenceSplitter
from utils import create_index, create_query_engine, ingest, setup_vector_store

COLLECTION_NAME = "flare"

# Vector store for this notebook's collection, created via the project helper.
vector_store = setup_vector_store(QDRANT_URL, QDRANT_API_KEY, COLLECTION_NAME)

# Sentence-aware splitter configured with chunk_size=256 and chunk_overlap=16.
sentence_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=16)

# Ingestion steps, applied in order: split into chunks, then embed each chunk.
ingestion_transformations = [sentence_splitter, Settings.embed_model]

nodes = ingest(
    documents=senpai_documents,
    transformations=ingestion_transformations,
    vector_store=vector_store,
)

# Build the index on top of the vector store that was just populated.
index = create_index(
    from_where="vector_store",
    vector_store=vector_store,
    embed_model=Settings.embed_model,
)

# 🌟 FLARE: Forward-Looking Active REtrieval augmented generation

[Active Retrieval Augmented Generation](https://arxiv.org/pdf/2305.06983.pdf) is a promising approach to enhance the factual accuracy of LLMs by retrieving relevant information from external knowledge sources throughout the generation process. 

FLARE (Forward-Looking Active REtrieval augmented generation) is a novel active retrieval augmented generation (ARAG) method that actively decides when and what to retrieve, leading to improved performance in long-form, knowledge-intensive generation tasks.

#### 🔍 The Limitations of Single-Retrieval Approaches

- LLMs often hallucinate and generate factually inaccurate output

- Existing retrieval-augmented LMs mostly retrieve information only once based on the input

- Single retrieval is insufficient for generating long texts, where continually gathering information is essential

#### ✨ Actively Retrieving Information as Needed

- FLARE iteratively predicts the upcoming sentence to anticipate future content

- The predicted sentence is used as a query to retrieve relevant documents

- If the predicted sentence contains low-confidence tokens, FLARE regenerates it using the retrieved documents

- This process continues until the entire response is generated

#### 🎯 Two Variants of FLARE

1. `FLAREinstruct`: Prompts the LM to generate retrieval queries when necessary using retrieval-encouraging instructions

2. `FLAREdirect`: Directly uses the LM's generated sentence as the retrieval query if it contains uncertain tokens

#### 🔎 Confidence-Based Retrieval and Query Formulation

- FLARE employs confidence-based active retrieval, triggering document retrieval only when the LM lacks necessary knowledge

- Confidence-based query formulation methods include using masked sentences as implicit queries and generating questions as explicit queries

LlamaIndex implements the `FLAREinstruct` variant as a class called `FLAREInstructQueryEngine`, a query engine based on the FLARE (Forward-Looking Active REtrieval augmented generation) paper.

# [`FLAREInstructQueryEngine`](https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/query_engine/flare/base.py)

Combines retrieval and generation capabilities to generate responses based on the FLARE approach, leveraging retrieval-encouraging instructions and iterative refinement.

## Arguments you need to know

- `query_engine`: The underlying query engine to use for retrieval.

- `llm` (optional): The language model to use for generating responses.

- `instruct_prompt` (optional): The prompt template for generating instructions.

- `lookahead_answer_inserter` (optional): The component for inserting lookahead answers.

- `done_output_parser` (optional): The parser for determining if the response is complete.

- `query_task_output_parser` (optional): The parser for extracting query tasks from the response.

- `max_iterations` (optional): The maximum number of iterations for generating the response.

- `max_lookahead_query_tasks` (optional): The maximum number of query tasks to consider for lookahead.


### Under the hood

The key idea behind the `FLAREInstructQueryEngine` is to break down the query answering process into smaller steps. Instead of generating a complete response in one shot, it generates a partial response with placeholders, retrieves specific information to fill those placeholders, and iteratively refines the response.


1. It receives a query from the user.

2. It generates a "lookahead response" based on the query and the current state of the response. The lookahead response is a tentative response that includes placeholders for additional information to be retrieved.

3. It analyzes the lookahead response to identify specific sub-queries or "query tasks" that need to be answered to complete the response.

4. It sends these query tasks to an underlying query engine to retrieve relevant information from a knowledge base or corpus.

5. It incorporates the retrieved information into the lookahead response, replacing the placeholders with the actual retrieved content.

6. It updates the current response by appending the relevant parts of the updated lookahead response.

7. It repeats steps 2-6 iteratively, refining the response with each iteration until a maximum number of iterations is reached or the response is considered complete.


The "instructions" in the name refer to the prompts and templates used to guide the language model in generating the lookahead responses and identifying the query tasks. These instructions encourage the model to focus on retrieving relevant information rather than generating everything from scratch.

### Instantiate query engine and FLARE query engine

In [None]:
from llama_index.core.query_engine import FLAREInstructQueryEngine

# Base query engine over the index, retrieving the 3 most similar nodes per query;
# it is handed to the FLARE engine as its underlying query engine.
index_query_engine = index.as_query_engine(similarity_top_k=3)

# FLARE options: cap the refinement loop at 7 iterations and enable verbose output.
flare_settings = dict(max_iterations=7, verbose=True)

flare_query_engine = FLAREInstructQueryEngine(
    query_engine=index_query_engine,
    **flare_settings,
)

In [None]:
from utils import display_prompt_dict

# Fetch the prompt templates used by the FLARE engine, then render them with the project helper.
flare_prompts = flare_query_engine.get_prompts()
display_prompt_dict(flare_prompts)

In [None]:
from utils import create_query_pipeline
from llama_index.core.query_pipeline import InputComponent

# Entry component of the pipeline.
input_component = InputComponent()

# Ordered chain: the pipeline input, followed by the FLARE query engine.
flare_chain = [
    input_component,
    flare_query_engine,
]

# Assemble the chain into a query pipeline via the project helper.
flare_query_pipeline = create_query_pipeline(flare_chain)

In [None]:
# Question posed to the FLARE pipeline.
user_question = "How can I ensure that outward distractions do not interrupt my good thoughts and focus?"

# Left as the cell's trailing expression so the notebook displays the result.
flare_query_pipeline.run(input=user_question)