In [None]:
%%capture
!pip install llama-index==0.10.37 llama-index-embeddings-openai==0.1.9 qdrant-client==1.9.1 llama-index-vector-stores-qdrant==0.2.8 llama-index-llms-openai==0.1.19

In [2]:
import os
import sys
from getpass import getpass
import nest_asyncio

from IPython.display import Markdown, display

from dotenv import load_dotenv

# Patch the already-running notebook event loop so async code can be run from cells.
nest_asyncio.apply()

# Load variables from a .env file into os.environ. Called without a path so that
# python-dotenv searches for the nearest .env file; an empty-string path disables
# that search and results in nothing being loaded.
load_dotenv()

# Make the local helper package importable (the `utils` module imported below).
sys.path.append('../helpers')

from utils import setup_llm, setup_embed_model, setup_vector_store

In [3]:
# Take the key from the environment, falling back to an interactive prompt when the
# variable is missing or empty. `.get` is required: indexing os.environ raises KeyError
# for an unset variable, which would prevent the getpass fallback from ever running.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') or getpass("Enter your OpenAI API key: ")

In [4]:
# Location of the Qdrant instance; this value is passed to setup_vector_store later on.
# NOTE(review): ":memory:" is presumably qdrant-client's in-process, non-persistent mode,
# in which case the collection is lost when the kernel restarts — confirm against utils.
QDRANT_URL = ":memory:"

In [5]:
# Take the key from the environment, falling back to an interactive prompt when the
# variable is missing or empty. `.get` is required: indexing os.environ raises KeyError
# for an unset variable, which would prevent the getpass fallback from ever running.
QDRANT_API_KEY = os.environ.get('QDRANT_API_KEY') or getpass("Enter your Qdrant API Key:")

In [None]:
from llama_index.core.settings import Settings
from utils import setup_llm, setup_embed_model

# Configure the answer-generating LLM. The system prompt instructs the model to
# answer strictly from the retrieved context.
# NOTE(review): setup_llm comes from ../helpers/utils; presumably it registers the
# model on llama_index's global Settings — confirm there.
setup_llm(
    system_prompt="""Use ONLY the provided context and generate a complete, coherent answer to the user's query. 
    Your response must be grounded in the provided context and relevant to the essence of the user's query.
    """,
    temperature=0.75,
    model="gpt-4o",
    api_key=OPENAI_API_KEY,
    provider="openai",
)

# Configure the embedding model; later cells read it back via Settings.embed_model.
setup_embed_model(
    api_key=OPENAI_API_KEY,
    model="text-embedding-3-small",
    provider="openai",
)

In [None]:
import random
from utils import get_documents_from_docstore, group_documents_by_author, sample_documents

# Load the source documents from the local docstore directory.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Seed Python's global `random` module before sampling.
# NOTE(review): this only makes the sample reproducible if sample_documents draws
# from the global `random` module — confirm in ../helpers/utils.
random.seed(42)

# Group the loaded documents by author, then draw a sample from the groups.
documents_by_author = group_documents_by_author(documents)

# NOTE(review): whether num_samples=10 is per author or overall is decided inside
# sample_documents — confirm in ../helpers/utils.
senpai_documents = sample_documents(documents_by_author, num_samples=10)

In [None]:
from llama_index.core import StorageContext
from llama_index.core.settings import Settings

from llama_index.core.node_parser import SentenceSplitter
from utils import create_index, create_query_engine, ingest, setup_vector_store

# Name of the Qdrant collection that backs this notebook's index.
COLLECTION_NAME = "prompt-compression"

# Vector store handle for the collection above.
vector_store = setup_vector_store(QDRANT_URL, QDRANT_API_KEY, COLLECTION_NAME)

# Split documents into chunks of 256 with an overlap of 16 between neighbours.
sentence_splitter = SentenceSplitter(chunk_overlap=16, chunk_size=256)

# Run the sampled documents through the splitter and the embedding model,
# writing the results to the vector store.
nodes = ingest(
    vector_store=vector_store,
    transformations=[sentence_splitter, Settings.embed_model],
    documents=senpai_documents,
)

# Build the index on top of the populated vector store.
index = create_index(
    embed_model=Settings.embed_model,
    vector_store=vector_store,
    from_where="vector_store",
)

# Prompt Compression using LongLLMLingua

[LongLLMLingua](https://llmlingua.com/) is a method for improving the performance and efficiency of RAG and long context scenarios by compressing prompts. It addresses issues such as the "lost in the middle" problem, high costs, and context window limitations in RAG.

<img src="https://llmlingua.com/videos/figures/motivation.png" style="width:80%; height:auto;">

Image Source: [LongLLMLingua Project](https://llmlingua.com/)

The key components of LongLLMLingua are:

1. Question-aware Coarse-Grained prompt compression: Evaluates the relevance between the context and the question based on the perplexity corresponding to the question.

2. Question-aware Fine-grained Prompt Compression: Uses contrastive perplexity to extract key tokens from documents that are relevant to the question.

3. Adaptive granular control during compression: Dynamically allocates different compression ratios to different documents based on the rank information obtained from the coarse-grained compression.

4. Subsequence recovery: Recovers the original prompt content by establishing the mapping relationship between the response subsequence that appears in the compressed prompt and the subsequence of the original prompt.

Experiments show that LongLLMLingua can improve performance by up to 21.4 points at a 4x compression rate in RAG scenarios, and it outperforms retrieval-based and compression-based methods in long context benchmarks like LongBench and ZeroScrolls.

<img src="https://llmlingua.com/videos/figures/LLMLingua.png"  style="width:80%; height:auto;">


# [`LongLLMLinguaPostprocessor`](https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/postprocessor/llama-index-postprocessor-longllmlingua/llama_index/postprocessor/longllmlingua/base.py)

The `LongLLMLinguaPostprocessor` is a postprocessor that optimizes the nodes by compressing the context using the LongLLMLingua method described in the paper. It aims to shorten the node text based on the given query to improve efficiency and reduce computational costs.

**Arguments you need to know:**

- `model_name`: The name of the pre-trained language model to use for compression (default: `"NousResearch/Llama-2-7b-hf"`).

- `device_map`: The device to use for running the model (default: `"cuda"`).

- `model_config`: Additional configuration options for the language model (default: empty dictionary).

- `open_api_config`: Configuration options for the OpenAI API, if used (default: empty dictionary).

- `metadata_mode`: The mode for handling metadata during postprocessing (default: `MetadataMode.ALL`).

- `instruction_str`: The instruction string to provide context for the compression (default: "Given the context, please answer the final question").

- `target_token`: The target number of compressed tokens (default: 300).

- `rank_method`: The ranking method to use for compression (default: "longllmlingua").

- `additional_compress_kwargs`: Additional keyword arguments to pass to the compression method (default: empty dictionary).

**What happens under the hood:**

- Extracts the content of each node based on the `metadata_mode`.

- Splits the context texts by "\n\n" to create a new list of context texts.

- Calls the `compress_prompt` method of the `PromptCompressor` instance, passing the new context texts, instruction string, query string, target token count, ranking method, and additional compression keyword arguments.

- The `compress_prompt` method returns a compressed prompt, which is then split by "\n\n" to separate the individual compressed context texts.

- The instruction and the question, which sit at the top and bottom of the compressed prompt, are stripped off.

- The remaining compressed context texts are used to create new `TextNode` instances, which are then wrapped in `NodeWithScore` objects and returned as the optimized nodes.


The **question-aware fine-grained compression** feature is NOT implemented in LlamaIndex. The compression is primarily based on the ***coarse-grained approach*** and the ranking method specified.


In [12]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.postprocessor.longllmlingua import LongLLMLinguaPostprocessor


In [None]:
# Postprocessor that compresses the retrieved nodes with LongLLMLingua before they
# reach the response synthesizer. The compression options are forwarded through
# additional_compress_kwargs.
node_postprocessor = LongLLMLinguaPostprocessor(
    rank_method="longllmlingua",
    target_token=300,
    instruction_str="Given the context, please answer the final question",
    additional_compress_kwargs=dict(
        condition_compare=True,
        condition_in_question="after",
        context_budget="+100",
        reorder_context="sort",  # enable document reorder
        dynamic_context_compression_ratio=0.4,  # enable dynamic compression ratio
    ),
)

It has the same usage as the other node postprocessors:


```python
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.postprocessor import YourPostProcessOfChoice

... # prior steps to this point: index is defined

your_post_processor = YourPostProcessOfChoice(WhateverArgumentsYouNeedToPass)

query_engine = index.as_query_engine(
    ..., # all your other query engine arguments
    node_postprocessors=[your_post_processor]  # must be a list of postprocessors
)

# for just a single query
response = query_engine.query("your query")

# use the query pipeline and apply to your entire dataset if you'd like
```


