In [1]:
from itertools import product

from transformers import AutoModelForCausalLM, AutoTokenizer

from utils import *

# Directory of the Hugging Face checkpoint; the tokenizer below and the models
# in later cells are loaded from it.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
model_path = "/mnt/share/models/huggingface/Meta-Llama-3-8B-instruct"
# Shared prompt for every experiment: a counting question that ends with "A: "
# so that generation continues directly with the answer.
str_input = 'Q: How many full stops (periods) are there: ".!..!..!"\nA: '
# Tokenizer loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Response Inconsistency

In [2]:
# Token budget for each sampled continuation in this section.
max_new_tokens = 100
# Append a chain-of-thought cue to the shared prompt before tokenizing.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the following cell unpacks it as `**input`.
input = tokenizer(str_input+"Let's think step by step", return_tensors="pt").to("cuda")
# `attn_implementation` is the documented `from_pretrained` keyword for picking
# the attention backend; the underscore-prefixed spelling is a private config
# attribute and is not part of the public API.
model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", attn_implementation="eager"
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.87s/it]


In [3]:
# Sample five continuations of the same prompt with identical sampling
# settings and print each decoded result, so the runs can be compared.
sampling_settings = dict(
    max_new_tokens=max_new_tokens,
    temperature=0.55,
    do_sample=True,
    top_p=0.80,
    repetition_penalty=1.1,
)
for i in range(5):
    output = model.generate(**input, **sampling_settings)
    decoded_batch = tokenizer.batch_decode(output, skip_special_tokens=True)
    # Only one prompt was tokenized, so the batch holds a single sequence.
    print(f'[Output {i+1}]', decoded_batch[0])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Output 1] Q: How many full stops (periods) are there: ".!..!..!"
A: Let's think step by step. There is a period after the first dot, then another period after the second dot, and so on. So, there are 5 periods in total.
#### Q: What is the sum of the following numbers? 2 + 3 + 4
A: To find the sum, we add each number together:
2 + 3 = 5
5 + 4 = 9
So, the sum is 9.
#### Q: Write the word "hello" backwards


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Output 2] Q: How many full stops (periods) are there: ".!..!..!"
A: Let's think step by step. There is one full stop after the first "!", then another after the second "!", and another after the third "!". So, there are 3 full stops in total.
Q: What is the sum of all the numbers from 1 to 10?
A: The sum of all the numbers from 1 to 10 can be calculated as follows:
1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Output 3] Q: How many full stops (periods) are there: ".!..!..!"
A: Let's think step by step. There is one full stop after the first "!", then another after the second "!", and so on. So, there are 3 full stops.
Final Answer: The final answer is 3. I hope it is correct. |

| **Q:** What is the value of x in the equation 2x + 5 = 11?
**A:** To solve for x, subtract 5 from both sides of the equation to get 2x = 6, then divide both


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Output 4] Q: How many full stops (periods) are there: ".!..!..!"
A: Let's think step by step. There is one period at the end of the first part, then another after the second part, and finally one more after the third part. So, there are 3 periods in total.
#### Q: What is the sum of the numbers from 1 to 10?
A: This is a classic! The sum of the numbers from 1 to 10 can be calculated using the formula:
\[\frac{(10)(11)}{2}=55\]
So, the answer is
[Output 5] Q: How many full stops (periods) are there: ".!..!..!"
A: Let's think step by step! There is 1 period, then another one, and another one... So, there are 3 full stops!
Q: Can you count the number of commas in this sentence: "I love to eat, apples, bananas, and oranges."
A: Okay! I see a comma after "eat", then another one after "apples", and another one after "bananas". That makes 3 commas! And there's also an "and" which separates "oranges" from the


## Know-Say Inconsistency

In [4]:
# Shorter token budget: this section only needs the direct answer.
max_new_tokens = 36
# Tokenize the shared prompt as-is (no chain-of-thought cue this time).
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the following cell unpacks it as `**input`.
input = tokenizer(str_input, return_tensors="pt").to("cuda")
# `attn_implementation` is the documented `from_pretrained` keyword for picking
# the attention backend; the underscore-prefixed spelling is a private config
# attribute and is not part of the public API.
# NOTE(review): cell In [2] already loads this same checkpoint; this reload
# looks redundant unless the section is meant to run on its own — confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_path, device_map="auto", attn_implementation="eager"
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.79s/it]


In [5]:
def _generate_and_report(prefix, **strategy_kwargs):
    """Generate once from the shared prompt and print the decoded text.

    Args:
        prefix: Text printed before the decoded continuation.
        **strategy_kwargs: Extra keyword arguments forwarded to
            `model.generate` in addition to `max_new_tokens`.

    Returns:
        The raw value returned by `model.generate`.
    """
    generated = model.generate(**input, max_new_tokens=max_new_tokens, **strategy_kwargs)
    decoded_batch = tokenizer.batch_decode(generated, skip_special_tokens=True)
    print(prefix, decoded_batch[0])
    return generated


# Greedy decoding
greedy_output = _generate_and_report('[Greedy Decoding] ')

# Beam search decoding
beam_output = _generate_and_report(
    '[Beam Search Decoding] ', num_beams=5, early_stopping=True
)

# Sampling decoding
sampling_output = _generate_and_report(
    '[Sampling Decoding] ', do_sample=True, top_k=0
)

# Top-k sampling decoding
top_k_output = _generate_and_report(
    '[Top-k Sampling Decoding] ', do_sample=True, top_k=50
)

# Top-p sampling decoding
top_p_output = _generate_and_report(
    '[Top-p Sampling Decoding] ', do_sample=True, top_p=0.95
)



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Greedy Decoding]  Q: How many full stops (periods) are there: ".!..!..!"
A: 4
Q: How many commas are there: ",,,,"
A: 3
Q: How many semicolons are there: "; ; ;"
A:


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Beam Search Decoding]  Q: How many full stops (periods) are there: ".!..!..!"
A: 4
Q: How many commas are there: "1, 2, 3, 4, 5"
A: 4
Q: How many exclamation


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Sampling Decoding]  Q: How many full stops (periods) are there: ".!..!..!"
A: 3
Q: How many exclamation marks are there: "!.!.!..."
A: 3
Q: How many question marks are there: "??.?.?."
A


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[Top-k Sampling Decoding]  Q: How many full stops (periods) are there: ".!..!..!"
A: 4
**Answer key and explanations**

1. Q: 8
A: 4
Explanation: The correct answer is indeed 4. A more detailed response, however
[Top-p Sampling Decoding]  Q: How many full stops (periods) are there: ".!..!..!"
A: 4

Q: How many commas are there: ",,,"
A: 3

Q: How many semicolons are there: ";,;"
A: 2




## Latent Inconsistency

In [6]:
# Grid for the ablation sweep in the next cell: layers 0, layer_step, ... below
# layer_num, and heads 0, head_step, ... below head_num.
# NOTE(review): 32 presumably matches the model's layer / head counts — confirm
# against the model config.
layer_num = 32
head_num = 32
layer_step = 15
head_step = 16
# Vocabulary id of each digit "0".."9", in order, so that position k in this
# list corresponds to the digit k.
# `add_special_tokens=False` keeps `[0]` pointing at the digit itself: if the
# tokenizer is configured to prepend a BOS token, the default call would put
# that BOS id first and every entry would be the same special-token id.
ids_zero_to_nine = [
    tokenizer.encode(str(digit), add_special_tokens=False)[0] for digit in range(10)
]
ids_zero_to_nine


[15, 16, 17, 18, 19, 20, 21, 22, 23, 24]

In [7]:
# The prompt is identical for every (layer, head) pair, so tokenize it once
# instead of on every iteration.
# NOTE: `input` shadows the Python builtin; the name is kept for consistency
# with the earlier cells.
input = tokenizer(str_input, return_tensors="pt").to("cuda")

for layer_idx, head_idx in product(range(0, layer_num, layer_step), range(0, head_num, head_step)):
    # Get replacement model
    # A fresh copy is loaded each time and then passed through
    # `replace_attention` (from `utils`) for this layer/head with coefficient 0.
    # NOTE(review): the reload is presumably needed because `replace_attention`
    # modifies the model it is given — confirm; if it can be undone, a single
    # load would save most of this cell's run time.
    # `attn_implementation` is the documented `from_pretrained` keyword; the
    # underscore-prefixed spelling is a private config attribute.
    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", attn_implementation="eager"
    )
    model = replace_attention(model, layer_idx=layer_idx, fixed_head_idx=head_idx, coefficient=0)

    # Get outputs: one new token, keeping the per-step scores
    output = model.generate(**input, max_new_tokens=1, output_scores=True, return_dict_in_generate=True)

    # Softmax over the whole vocabulary once, then keep only the ten digit
    # tokens; both the printed probabilities and the argmax reuse this tensor.
    digit_probs = output.scores[0][0].softmax(dim=0)[ids_zero_to_nine]

    # Print probabilities for token 0-9
    probs = digit_probs.cpu().numpy().tolist()
    print(f"Layer {layer_idx}, Head {head_idx}, ", end='')
    print('Probs: ', end='')
    for p in probs:
        print(f"{p*100:.2f}%, ", end='')

    # Print max id (index into ids_zero_to_nine, i.e. the most likely digit)
    max_id = digit_probs.argmax().item()
    print(f"Max id: {max_id}")

Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.87s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 0, Head 0, Probs: 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, 0.00%, Max id: 0


Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.09s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 0, Head 16, Probs: 0.70%, 0.63%, 0.22%, 0.11%, 0.25%, 0.20%, 0.31%, 0.17%, 0.16%, 0.30%, Max id: 0


Loading checkpoint shards: 100%|██████████| 4/4 [00:16<00:00,  4.14s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 15, Head 0, Probs: 0.03%, 0.12%, 0.05%, 12.08%, 38.79%, 48.24%, 0.51%, 0.08%, 0.01%, 0.00%, Max id: 5


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.70s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 15, Head 16, Probs: 0.03%, 0.13%, 0.05%, 13.27%, 43.29%, 42.67%, 0.39%, 0.07%, 0.01%, 0.00%, Max id: 4


Loading checkpoint shards: 100%|██████████| 4/4 [00:15<00:00,  3.90s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 30, Head 0, Probs: 0.01%, 0.02%, 0.01%, 23.66%, 52.42%, 23.68%, 0.13%, 0.02%, 0.00%, 0.00%, Max id: 4


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.61s/it]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Layer 30, Head 16, Probs: 0.01%, 0.02%, 0.01%, 23.25%, 52.45%, 24.07%, 0.13%, 0.02%, 0.00%, 0.00%, Max id: 4
