In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
HF_TOKEN = "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"

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
device

'cuda'

In [5]:
def get_response(model, inputs, max_new_tokens=10, return_attentions=True):
    attentions = []

    for _ in range(max_new_tokens):
        outputs = model.forward(**inputs, output_attentions=True)

        attentions.append(outputs.attentions)

        next_token_logits = outputs.logits[:, -1, :]
        next_token_ids = next_token_logits.argmax(dim=-1).unsqueeze(-1)

        inputs["input_ids"] = torch.cat([inputs["input_ids"], next_token_ids], dim=-1)

        new_attention_mask = torch.ones_like(next_token_ids, device=inputs["attention_mask"].device)
        inputs["attention_mask"] = torch.cat([inputs["attention_mask"], new_attention_mask], dim=-1)
    
    if return_attentions:
        return inputs["input_ids"], attentions
    return inputs["input_ids"]

In [6]:
def calc_lookback_ratio(attentions):
    n_layers = len(attentions[0])
    n_heads = attentions[0][0].shape[1]
    generated_len = len(attentions)

    lookback_ratio = torch.zeros((n_layers, n_heads, generated_len))

    prompt_len = attentions[0][0].shape[-1]
    
    for i in range(generated_len):
        for l in range(n_layers):
            attn_on_context = attentions[i][l][0, :, -1, :prompt_len].mean(-1)
            attn_on_new_tokens = attentions[i][l][0, :, -1, prompt_len:].mean(-1)
            lookback_ratio[l, :, i] = attn_on_context / (attn_on_context + attn_on_new_tokens)
            
    return lookback_ratio

In [7]:
text = "Who are you? Please, answer in pirate-speak."

## google/gemma-2-2b-it

In [None]:
MODEL_ID = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_TOKEN)
model.to(device)

In [9]:
inputs = tokenizer(text, return_tensors="pt").to(device)
inputs

{'input_ids': tensor([[     2,   6571,    708,    692, 235336,   5651, 235269,   3448,    575,
          55331, 235290,  53013, 235265]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [13]:
outputs = model.forward(**inputs, output_attentions=True)

In [19]:
outputs.attentions[0].shape

torch.Size([1, 8, 23, 23])

In [22]:
attentions[0][0].shape

torch.Size([1, 8, 23, 23])

In [25]:
attentions[3][0].shape

torch.Size([1, 8, 26, 26])

In [20]:
model_completion_ids, attentions = get_response(model, inputs)

In [11]:
model_completion_ids

tensor([[     2,   6571,    708,    692, 235336,   5651, 235269,   3448,    575,
          55331, 235290,  53013, 235265,    109, 235280,  65226, 235269,  17380,
         235267, 235341,    590,    614,    476]], device='cuda:0')

In [12]:
len(model_completion_ids[0])

23

In [27]:
def calc_lookback_ratio(attentions):
    
    n_layers = len(attentions[0])
    n_heads = attentions[0][0].shape[1]
    generated_len = len(attentions)
    prompt_len = attentions[0][0].shape[-1] - 1  # Initial sequence length minus 1

    # Initialize the result tensor
    lookback_ratio = torch.zeros((n_layers, n_heads, generated_len))

    for i in range(generated_len):
        # Stack attentions for all layers at this generation step
        layer_attentions = torch.stack([attentions[i][l] for l in range(n_layers)])
        
        # Calculate attention on context and new tokens
        attn_on_context = layer_attentions[:, 0, :, -1, :prompt_len].mean(-1)
        attn_on_new_tokens = layer_attentions[:, 0, :, -1, prompt_len:].mean(-1)
        
        # Calculate lookback ratio for this generation step
        lookback_ratio[:, :, i] = attn_on_context / (attn_on_context + attn_on_new_tokens)

        # Update prompt_len for the next iteration
        prompt_len += 1

    return lookback_ratio

In [28]:
lookback_ratio = calc_lookback_ratio(attentions)

In [30]:
lookback_ratio.shape

torch.Size([26, 8, 10])

In [13]:
len(attentions[0]) # n_layers

26

In [14]:
attentions[0][0].shape # batch x n_heads x seq_len x seq_len

torch.Size([1, 8, 13, 13])

In [15]:
lookback_ratio = calc_lookback_ratio(attentions)

In [16]:
lookback_ratio.shape # n_layers x n_heads x num of new generated tokens

torch.Size([26, 8, 10])

In [17]:
n_layers, n_heads, generated_len = lookback_ratio.shape
clf_input = lookback_ratio.reshape(n_layers*n_heads, generated_len).mean(dim=1)
clf_input.shape # this clf_input can serve as an input to a hallucination detector

torch.Size([208])

In [18]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

21

## meta-llama/Llama-2-7b-chat-hf

In [7]:
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, token=HF_TOKEN)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (no

In [8]:
inputs = tokenizer(text, return_tensors="pt").to(device)
inputs

{'input_ids': tensor([[    1, 11644,   526,   366, 29973,  3529, 29892,  1234,   297, 21625,
           403, 29899,  5965,   557, 29889]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [9]:
model_completion_ids, attentions = get_response(model, inputs)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [10]:
tokenizer.decode(model_completion_ids[0])

"<s> Who are you? Please, answer in pirate-speak.\n\nArrgh, I be Cap'"

In [11]:
lookback_ratio = calc_lookback_ratio(attentions)

In [12]:
lookback_ratio.shape # n_layers x n_heads x num of new generated tokens

torch.Size([32, 32, 10])

In [None]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

## meta-llama/Meta-Llama-3.1-8B-Instruct

In [19]:
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, token=HF_TOKEN)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [20]:
model.forward(**inputs, output_attentions=True)

IndexError: index out of range in self

In [None]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

## unsloth/Meta-Llama-3.1-8B-bnb-4bit

In [7]:
!pip install xformers

Collecting xformers
  Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl (20.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: xformers
Successfully installed xformers-0.0.27.post2


In [8]:
!python -m xformers.info

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
xFormers 0.0.27.post2
memory_efficient_attention.ckF:                    unavailable
memory_efficient_attention.ckB:                    unavailable
memory_efficient_attention.ck_decoderF:            unavailable
memory_efficient_attention.ck_splitKF:             unavailable
memory_efficient_attention.cutlassF:               unavailable
memory_efficient_attention.cutlassB:               unavailable
memory_efficient_attention.decoderF:               unavailable
memory_efficient_attention.flshattF@2.5.6-pt:      available
memory_efficient_attention.flshattB@2.5.6-pt:      available
memory_efficient_attention.smallkF:                unavailable
memory_efficient_attention.smallkB:                unavailable
memory_efficient_attention.triton_splitKF:         unavailable
indexing.scaled_index_addF:                        unavailable
indexing.scaled_index_addB:                

In [9]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install "xformers==0.0.26.post1" trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-w2dfivrk/unsloth_be5f76004e484582a52925dfa654cfbd
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-w2dfivrk/unsloth_be5f76004e484582a52925dfa654cfbd
  Resolved https://github.com/unslothai/unsloth.git to commit 12b437e12204532f82542c12ac1ab00d19e3ebbf
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting hf-transfer (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+

In [10]:
!pip install trl peft accelerate bitsandbytes

Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes, trl, peft
Successfully installed bitsandbytes-0.43.3 peft-0.12.0 trl-0.9.6


In [12]:
!pip install triton

Collecting triton
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: triton
Successfully installed triton-3.0.0


In [None]:
MODEL_ID = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN
)

In [13]:
from unsloth import FastLanguageModel

MODEL_ID = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length=128,
    dtype = None,
    load_in_4bit = True,
    device_map={"": DEVICE},
    token=HF_TOKEN
)

ImportError: Unsloth: Xformers was not installed correctly.
Please install xformers separately first.
Then confirm if it's correctly installed by running:
python -m xformers.info

Longer error message:
xFormers can't load C++/CUDA extensions. xFormers was built for:
    PyTorch 2.4.0+cu121 with CUDA 1201 (you have 2.4.0)
    Python  3.10.14 (you have 3.10.14)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.