# In this notebook, we showcase how to improve retrieval performance using per-layer compression.

In [1]:
import datasets
import numpy as np
import torch
from transformers import pipeline

from kvpress import (
    ExpectedAttentionPress,
    KnormPress,
    ObservedAttentionPress,
    RandomPress,
    SnapKVPress,
    StreamingLLMPress,
    ExpectedAttentionScorer,
    PerLayerCompressionPress
)

  from .autonotebook import tqdm as notebook_tqdm


# Load the pipeline and data

In [2]:
# Build the kv-press text-generation pipeline.
# Flash attention is requested explicitly: the per-layer compression wrapper
# used later in this notebook warns that it only works with flash attention.

device = "cuda:0"
ckpt = "microsoft/Phi-3.5-mini-instruct"
attn_implementation = "flash_attention_2"
pipe = pipeline(
    "kv-press-text-generation",
    model=ckpt,
    device=device,
    torch_dtype="auto",
    model_kwargs={"attn_implementation": attn_implementation},
)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  4.81it/s]


In [3]:
# Load the "test" split of the "4096" configuration of the RULER dataset and keep
# only the rows of the "niah_single_3" task, with a fresh 0-based index.
# `datasets` is imported in the import cell at the top of the notebook.
# Built as a single chained expression so that re-running this cell always
# starts from the freshly loaded data instead of re-filtering a previous `df`.
df = (
    datasets.load_dataset("simonjegou/ruler", "4096")["test"]
    .to_pandas()
    .loc[lambda frame: frame["task"] == "niah_single_3"]
    .reset_index(drop=True)
)

# Use the pipeline with a press

In [4]:
# Pick a press with a compression ratio, you can run the following cells with different presses
# (several other press classes are imported in the first cell of this notebook).
# A single ratio is used here; per-layer ratios are introduced further down.
compression_ratio = 0.3
press = ExpectedAttentionPress(compression_ratio)

In [5]:
# Evaluate the press on one sample of the filtered dataset
idx = 0
sample = df.iloc[idx]
context = sample["context"]
question = sample["question"]
true_answer = sample["answer"][0]  # the first entry of "answer" is used as the reference

# The press is handed to the pipeline together with the context and question
pred_answer = pipe(context, question=question, press=press)["answer"]

# The prediction counts as correct when it contains the reference answer verbatim
print(
    f"Question:   {question}",
    f"Answer:     {true_answer}",
    f"Prediction: {pred_answer}",
    f"Correctly predicted: {true_answer in pred_answer}",
    sep="\n",
)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


Question:   What is the special magic uuid for amused-quart mentioned in the provided text? 
Answer:     1ff49b78-8946-4e85-b59c-de66bacfb3d0
Prediction: The special magic uuid for amused-quart mentioned in the text is: 1ff49b78-8946-4e85-b63d-a7e3c0a1c
Correctly predicted: False


# Apply per-layer compression with approximately the same overall compression ratio

In [6]:
# Per-layer compression ratios (32 entries): some layers are compressed more
# aggressively, a few hardly or not at all (ratio 0.0).
# The mean printed below is close to the single 0.3 ratio used for the original press.

PHI_35_COMPRESSION_RATIOS = [
    0.37, 0.3, 0.37, 0.37, 0.37, 0.37, 0.07, 0.37,
    0.29, 0.37, 0.36, 0.13, 0.37, 0.0, 0.37, 0.37,
    0.37, 0.36, 0.28, 0.0, 0.09, 0.37, 0.37, 0.37,
    0.37, 0.37, 0.04, 0.37, 0.37, 0.37, 0.37, 0.37,
]
print(np.mean(PHI_35_COMPRESSION_RATIOS))

0.3028125


In [7]:
# Wrap an ExpectedAttentionScorer in the per-layer wrapper, passing the list of
# ratios defined above (presumably one ratio per model layer; confirm in kvpress).
press_per_layer = PerLayerCompressionPress(
    compression_ratios=PHI_35_COMPRESSION_RATIOS,
    press=ExpectedAttentionScorer(),
)

Per layer compression wrapper is an experimental feature and only works with flash attention. Please make sure that the model uses flash attention.


In [8]:
# Re-run the same context/question, this time with the per-layer press
pred_answer = pipe(context, question=question, press=press_per_layer)["answer"]

# The prediction counts as correct when it contains the reference answer verbatim
print(
    f"Question:   {question}",
    f"Answer:     {true_answer}",
    f"Prediction: {pred_answer}",
    f"Correctly predicted: {true_answer in pred_answer}",
    sep="\n",
)

Question:   What is the special magic uuid for amused-quart mentioned in the provided text? 
Answer:     1ff49b78-8946-4e85-b59c-de66bacfb3d0
Prediction: The special magic uuid mentioned in the text for amused-quart is: 1ff49b78-8946-4e85-b59c-de66bacfb3d0
Correctly predicted: True
