In [36]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Setup ---
model_name = "meta-llama/Llama-2-7b-chat-hf" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    dtype=torch.float16, 
    device_map="auto"          
)

# Force the 'eager' implementation
model.set_attn_implementation('eager')

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.52it/s]


In [44]:
text = "The Mona Lisa was painted by"
inputs = tokenizer(text, return_tensors='pt').to(model.device)
original_tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
num_original_tokens = len(original_tokens)

In [45]:
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)

In [46]:
attention = outputs.attentions
last_layer_attention = attention[-1].cpu().float()
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

In [47]:
last_token_scores = last_layer_attention[0, :, -1, :]
s_token_scores = last_token_scores[:, 0]
best_head_index = s_token_scores.argmin()
specialist_scores = last_token_scores[best_head_index]
probabilities = torch.softmax(specialist_scores, dim=0)

In [48]:
print(f"Found 'Specialist' Head: #{best_head_index.item()}\n")
print(f"Influence on predicting the token *after* '{tokens[-1]}':\n")
print("Token".ljust(12) + "Influence Score")
print("-" * 30)

for token, score in zip(tokens, probabilities):
    print(f"{token.ljust(12)}: {score.item():.4f} ({(score.item() * 100):.2f}%)")

Found 'Specialist' Head: #24

Influence on predicting the token *after* '▁by':

Token       Influence Score
------------------------------
<s>         : 0.1102 (11.02%)
▁The        : 0.1402 (14.02%)
▁Mon        : 0.1233 (12.33%)
a           : 0.1216 (12.16%)
▁Lisa       : 0.1449 (14.49%)
▁was        : 0.1134 (11.34%)
▁painted    : 0.1226 (12.26%)
▁by         : 0.1239 (12.39%)


In [42]:
# head_view(
#     [a.cpu() for a in attention], # Move tensors to CPU for viz
#     tokens
# )