In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
HF_TOKEN = "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"

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def get_response(model, inputs, max_new_tokens=10, return_attentions=True):
    attentions = []

    for _ in range(max_new_tokens):
        outputs = model.forward(**inputs, output_attentions=True)

        attentions.append(outputs.attentions)

        next_token_logits = outputs.logits[:, -1, :]
        next_token_ids = next_token_logits.argmax(dim=-1).unsqueeze(-1)

        inputs["input_ids"] = torch.cat([inputs["input_ids"], next_token_ids], dim=-1)

        new_attention_mask = torch.ones_like(next_token_ids, device=inputs["attention_mask"].device)
        inputs["attention_mask"] = torch.cat([inputs["attention_mask"], new_attention_mask], dim=-1)
    
    if return_attentions:
        return inputs["input_ids"], attentions
    return inputs["input_ids"]

In [5]:
def calc_lookback_ratio(attentions):
    n_layers = len(attentions[0])
    n_heads = attentions[0][0].shape[1]
    generated_len = len(attentions)

    lookback_ratio = torch.zeros((n_layers, n_heads, generated_len))

    prompt_len = attentions[0][0].shape[-1]
    
    for i in range(generated_len):
        for l in range(n_layers):
            attn_on_context = attentions[i][l][0, :, -1, :prompt_len].mean(-1)
            attn_on_new_tokens = attentions[i][l][0, :, -1, prompt_len:].mean(-1)
            lookback_ratio[l, :, i] = attn_on_context / (attn_on_context + attn_on_new_tokens)
            
    return lookback_ratio

In [6]:
text = "Who are you? Please, answer in pirate-speak."

## google/gemma-2-2b-it

In [7]:
MODEL_ID = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_TOKEN)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2SdpaAttention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), 

In [8]:
inputs = tokenizer(text, return_tensors="pt").to(device)
inputs

{'input_ids': tensor([[     2,   6571,    708,    692, 235336,   5651, 235269,   3448,    575,
          55331, 235290,  53013, 235265]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [9]:
model_completion_ids, attentions = get_response(model, inputs)



In [10]:
model_completion_ids

tensor([[     2,   6571,    708,    692, 235336,   5651, 235269,   3448,    575,
          55331, 235290,  53013, 235265,    109, 235280,  65226, 235269,  17380,
         235267, 235341,    590,    614,    476]], device='cuda:0')

In [11]:
tokenizer.decode(model_completion_ids[0])

'<bos>Who are you? Please, answer in pirate-speak.\n\nAhoy, matey! I be a'

In [12]:
len(attentions) # num of new generated tokens

10

In [13]:
len(attentions[0]) # n_layers

26

In [14]:
attentions[0][0].shape # batch x n_heads x seq_len x seq_len

torch.Size([1, 8, 13, 13])

In [15]:
lookback_ratio = calc_lookback_ratio(attentions)

In [16]:
lookback_ratio.shape # n_layers x n_heads x num of new generated tokens

torch.Size([26, 8, 10])

In [17]:
n_layers, n_heads, generated_len = lookback_ratio.shape
clf_input = lookback_ratio.reshape(n_layers*n_heads, generated_len).mean(dim=1)
clf_input.shape # this clf_input can serve as an input to a hallucination detector

torch.Size([208])

In [18]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

21

## meta-llama/Meta-Llama-3.1-8B-Instruct

In [19]:
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, token=HF_TOKEN)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 99.12 MiB is free. Process 2228 has 15.79 GiB memory in use. Of the allocated memory 15.38 GiB is allocated by PyTorch, and 125.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.forward(**inputs, output_attentions=True)

In [None]:
del model
del tokenizer
torch.cuda.empty_cache()
import gc
gc.collect()

## unsloth/Meta-Llama-3.1-8B-bnb-4bit

In [20]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install "xformers==0.0.27" trl peft accelerate bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-vogaz1q7/unsloth_e07d5eb0497646d799f34c8c707eb402
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-vogaz1q7/unsloth_e07d5eb0497646d799f34c8c707eb402
  Resolved https://github.com/unslothai/unsloth.git to commit d0ca3497eb5911483339be025e9924cf73280178
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tyro (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.8-py3-none-any.whl.metadata (8.4 kB)
Collecting hf-transfer (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting xformers==0.0.27
  Downloading xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting torch==2.3.1 (from xformers==0.0.27)
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.1->xformers==0.0.27)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylin

In [21]:
# MODEL_ID = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.bfloat16,
#     token=HF_TOKEN
# )

In [22]:
from unsloth import FastLanguageModel

MODEL_ID = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length=128,
    dtype = None,
    load_in_4bit = True,
    device_map={"": DEVICE},
    token=HF_TOKEN
)

  return torch.library.impl_abstract(qualname, func, _stacklevel=2)


ImportError: Unsloth: Xformers was not installed correctly.
Please install xformers separately first.
Then confirm if it's correctly installed by running:
python -m xformers.info

Longer error message:
xFormers can't load C++/CUDA extensions. xFormers was built for:
    PyTorch 2.3.1+cu121 with CUDA 1201 (you have 2.4.0)
    Python  3.10.14 (you have 3.10.14)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.