In [1]:
import json
import os

import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Causal LM whose next-token distributions are analysed in the cells below.
model_path = "Qwen/QwQ-32B-Preview"

# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated
# (transformers emits a warning asking for a BitsAndBytesConfig instead),
# so the 4-bit request is expressed through `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=quantization_config,
    device_map="cuda",
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 17/17 [00:20<00:00,  1.20s/it]


In [3]:
# NLI cross-encoder used later to label pairs of text snippets; the comparison
# cells check its labels against "entailment" and "contradiction".
mnli_model_name = "cross-encoder/nli-deberta-v3-xsmall"
mnli_model = AutoModelForSequenceClassification.from_pretrained(mnli_model_name)
mnli_tokenizer = AutoTokenizer.from_pretrained(mnli_model_name)

data_dir = "data/val"
data_file = "mushroom.en-val.v2.jsonl"
data_path = os.path.join(data_dir, data_file)

# JSON Lines file: one JSON object per line.
# - explicit UTF-8 so decoding does not depend on the platform's default locale
# - blank lines (e.g. a trailing newline at end of file) are skipped instead of
#   crashing json.loads
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]



In [4]:
# Index (0-based) into `data` of the record inspected below; later cells read
# `sentence_idx` to fetch the same record.
sentence_idx = 2
# Dump the whole record, including every field stored with it.
print(data[sentence_idx])

{'id': 'val-en-3', 'lang': 'EN', 'model_input': 'Do all arthropods have antennae?', 'model_output_text': 'Yes, all arachnids have antennas. However, not all of them are visible to the naked eye.', 'model_id': 'tiiuae/falcon-7b-instruct', 'soft_labels': [{'start': 0, 'prob': 0.6, 'end': 3}, {'start': 3, 'prob': 0.4, 'end': 8}, {'start': 8, 'prob': 0.2, 'end': 9}, {'start': 9, 'prob': 0.6, 'end': 18}, {'start': 18, 'prob': 0.3, 'end': 24}, {'start': 24, 'prob': 0.4, 'end': 32}, {'start': 32, 'prob': 0.1, 'end': 34}, {'start': 34, 'prob': 0.4, 'end': 43}, {'start': 43, 'prob': 0.5, 'end': 63}, {'start': 63, 'prob': 0.7, 'end': 70}, {'start': 70, 'prob': 0.5, 'end': 78}, {'start': 78, 'prob': 0.7, 'end': 87}], 'hard_labels': [[0, 3], [9, 18], [63, 70], [78, 87]], 'model_output_logits': [-4.8190689087, -16.5279369354, -10.1344690323, -9.9476661682, -7.045940876, -12.0485277176, -14.586689949, -12.6738815308, -7.2347912788, -8.1060018539, -7.1135058403, -15.4829864502, -6.7274436951, -7.6964

In [5]:
# Show the model's answer together with the spans annotated as hallucinated.
record = data[sentence_idx]
answer = record["model_output_text"]

print("Answer:", answer)

# Each hard label is a pair of character offsets into the answer string;
# slicing the answer with it yields the annotated text.
hallucination_boundaries = record["hard_labels"]
print("Hallucination boundaries:", hallucination_boundaries)
for span in hallucination_boundaries:
    print("Hallucination:", answer[span[0]:span[1]])

Answer: Yes, all arachnids have antennas. However, not all of them are visible to the naked eye.
Hallucination boundaries: [[0, 3], [9, 18], [63, 70], [78, 87]]
Hallucination: Yes
Hallucination: arachnids
Hallucination: visible
Hallucination: naked eye


In [6]:
def get_mnli_label(sentence_1, sentence_2, model, tokenizer):
    """Classify a pair of texts with a sequence-classification model.

    Args:
        sentence_1: First text of the pair (passed first to the tokenizer).
        sentence_2: Second text of the pair.
        model: Sequence-classification model; must expose ``config.id2label``
            and return an object with a ``logits`` attribute.
        tokenizer: Tokenizer matching ``model``; called with both texts and
            ``return_tensors="pt"``.

    Returns:
        The label name from ``model.config.id2label`` for the highest-scoring
        class of the single encoded pair.
    """
    inputs = tokenizer(sentence_1, sentence_2, return_tensors="pt")

    # Keep the encoded inputs on the same device as the model so the call also
    # works when the classifier has been moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)

    # Pure inference: disabling autograd avoids building a graph on every call
    # (this function is invoked once per candidate token in the loops below).
    with torch.no_grad():
        outputs = model(**inputs)

    # One pair is encoded, so logits have a single row; take the arg-max over
    # the class dimension explicitly rather than over the flattened tensor.
    predicted_class_idx = outputs.logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class_idx]

In [7]:
# Run the question and the answer through the causal LM in a single pass.
# Later cells compare the logits at position i against the token actually
# found at position i + 1.

# Named `full_text` rather than `input` so the builtin input() is not shadowed
# for the rest of the kernel session.
full_text = data[sentence_idx]['model_input'] + " " + data[sentence_idx]['model_output_text']
input_ids = tokenizer.encode(full_text, return_tensors="pt").to(base_model.device)
print(input_ids.shape)

# Inference only: without no_grad() the forward pass tracks autograd state,
# which is wasted memory since nothing here is ever back-propagated.
with torch.no_grad():
    output = base_model(input_ids, return_dict=True)
print(output.logits.shape)

torch.Size([1, 32])




torch.Size([1, 32, 152064])


In [8]:
# Turn the logits into per-position probability distributions and keep only
# the `topk` most likely candidate tokens for each position.
probabilities = output.logits.softmax(dim=-1)

topk = 10
topk_probabilities, topk_indices = probabilities.topk(topk, dim=-1)

# Number of tokens in the prompt of the record being analysed. It must use
# `sentence_idx` (not a fixed record 0) so it matches the text that was
# actually fed to the model.
# NOTE(review): nothing in the visible cells uses this value to mask the
# prompt positions yet; the loops below iterate over every position.
input_token_length = len(tokenizer.encode(data[sentence_idx]['model_input']))

In [9]:
import torch
def generate_full_word(input_ids, model, tokenizer, threshold=0.99, max_new_tokens=32):
    """Greedily extend ``input_ids`` while the model stays highly confident.

    At each step the most likely next token is appended only if its
    probability is at least ``threshold``; the first less-confident
    prediction ends the extension.

    Args:
        input_ids: Tensor of token ids with a single sequence, shape (1, seq_len)
            (the code indexes row 0 of the probabilities).
        model: Callable returning an object with ``logits`` of shape
            (batch, seq_len, vocab).
        tokenizer: Only consulted for ``eos_token_id`` (if it has one) so that
            generation stops at end-of-sequence.
        threshold: Minimum probability the arg-max token needs to be appended.
        max_new_tokens: Upper bound on appended tokens. Guarantees termination
            when the model stays above ``threshold`` indefinitely, which would
            otherwise loop forever.

    Returns:
        Tensor of shape (1, seq_len + k) holding the input followed by the
        ``k`` (0 <= k <= max_new_tokens) appended token ids.
    """
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    generated_ids = input_ids

    # Inference only: no autograd graph is needed for any of these forward passes.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(generated_ids)
            next_token_logits = outputs.logits[:, -1, :]
            next_token_probs = torch.softmax(next_token_logits, dim=-1)
            next_token_id = torch.argmax(next_token_probs, dim=-1)
            next_token_prob = next_token_probs[0, next_token_id]

            # Stop as soon as the model is no longer (almost) certain.
            if next_token_prob.item() < threshold:
                break
            # Never continue past end-of-sequence.
            if eos_token_id is not None and next_token_id.item() == eos_token_id:
                break

            generated_ids = torch.cat((generated_ids, next_token_id.unsqueeze(0)), dim=1)

    return generated_ids

In [10]:
# Token-level score: for every position, compare the token that actually
# follows with the model's top-k single-token alternatives using the NLI model.
print(f"Full sentence: {tokenizer.decode(input_ids[0])}")
print("\n")

sequence_ids = input_ids[0]
# The final token has no successor to compare against, so stop one position early.
for i in range(len(sequence_ids) - 1):
    positive_influence = 0
    total_influence = 0
    # print the actual token and the next one generated
    print(f"Actual token: {tokenizer.decode(sequence_ids[i].item())}")
    next_generated_token = tokenizer.decode(sequence_ids[i + 1].item())
    print(f"Next generated token: {next_generated_token}")
    for j in range(topk):
        token_id = topk_indices[0][i][j].item()
        token_prob = topk_probabilities[0][i][j].item()
        token = tokenizer.decode(token_id)
        relateness = get_mnli_label(next_generated_token, token, mnli_model, mnli_tokenizer)
        # Entailing candidates count as support; contradicting ones only add to
        # the total. Candidates with any other label are ignored.
        if relateness == "entailment":
            positive_influence += token_prob
            total_influence += token_prob
        elif relateness == "contradiction":
            total_influence += token_prob
        print(f"Token: {token}, Relateness: {relateness}, Probability: {token_prob}")

    # If no candidate was labelled entailment or contradiction, the total is 0
    # and the ratio is undefined; report NaN instead of raising ZeroDivisionError.
    if total_influence > 0:
        hallucination_score = 1 - (positive_influence / total_influence)
    else:
        hallucination_score = float("nan")
    print(f"Hallucination Score: {hallucination_score}")
    print("\n")
    

Full sentence: Do all arthropods have antennae? Yes, all arachnids have antennas. However, not all of them are visible to the naked eye.


Actual token: Do
Next generated token:  all
Token: ,, Relateness: contradiction, Probability: 0.0095977783203125
Token: ᐉ, Relateness: contradiction, Probability: 0.00945281982421875
Token:  (, Relateness: neutral, Probability: 0.00830841064453125
Token:  an, Relateness: contradiction, Probability: 0.008148193359375
Token:  the, Relateness: entailment, Probability: 0.00804901123046875
Token:  and, Relateness: entailment, Probability: 0.00789642333984375
Token:  meiden, Relateness: contradiction, Probability: 0.00675201416015625
Token:  ontvang, Relateness: contradiction, Probability: 0.0061492919921875
Token:  you, Relateness: contradiction, Probability: 0.005687713623046875
Token:  a, Relateness: contradiction, Probability: 0.005619049072265625
Hallucination Score: 0.7632532850022655


Actual token:  all
Next generated token:  ar
Token:  the, Relat

In [11]:
# Same scoring as the previous cell, except that each top-k candidate token is
# first extended with generate_full_word() before being compared.
print(f"Full sentence: {tokenizer.decode(input_ids[0])}")
print("\n")

sequence_ids = input_ids[0]
# The final token has no successor to compare against, so stop one position early.
for i in range(len(sequence_ids) - 1):
    positive_influence = 0
    total_influence = 0
    # print the actual token and the next one generated
    print(f"Actual token: {tokenizer.decode(sequence_ids[i].item())}")

    # Debug output: ids of the prefix up to and including the next token.
    print(input_ids[:, :i+2])
    # NOTE(review): the actual next token stays a single token here, while the
    # candidates below are extended into longer continuations - confirm that
    # this asymmetric comparison is intended.
    next_generated_token = tokenizer.decode(sequence_ids[i + 1].item())
    print(f"Next generated token: {next_generated_token}")

    for j in range(topk):
        token_id = topk_indices[0][i][j].item()
        token_prob = topk_probabilities[0][i][j].item()

        # Replace the real next token with the candidate (created directly on
        # the same device as the prefix) and let the model extend it.
        candidate = torch.tensor([[token_id]], device=input_ids.device)
        candidate_prefix = torch.cat((input_ids[:, :i+1], candidate), dim=1)
        topk_token_ids = generate_full_word(candidate_prefix, base_model, tokenizer)
        # Decode only the part after the shared prefix: the candidate plus
        # whatever was appended to it.
        token = tokenizer.decode(topk_token_ids[0][i+1:], skip_special_tokens=True)

        relateness = get_mnli_label(next_generated_token, token, mnli_model, mnli_tokenizer)
        if relateness == "entailment":
            positive_influence += token_prob
            total_influence += token_prob
        elif relateness == "contradiction":
            total_influence += token_prob
        print(f"Token: {token}, Relateness: {relateness}, Probability: {token_prob}")

    # If no candidate was labelled entailment or contradiction, the total is 0
    # and the ratio is undefined; report NaN instead of raising ZeroDivisionError.
    if total_influence > 0:
        hallucination_score = 1 - (positive_influence / total_influence)
    else:
        hallucination_score = float("nan")
    print(f"Hallucination Score: {hallucination_score}")
    print("\n")
    

Full sentence: Do all arthropods have antennae? Yes, all arachnids have antennas. However, not all of them are visible to the naked eye.


Actual token: Do
tensor([[5404,  678]], device='cuda:0')
Next generated token:  all
Token: ,, Relateness: contradiction, Probability: 0.0095977783203125
Token: ᐉ, Relateness: contradiction, Probability: 0.00945281982421875
Token:  (, Relateness: neutral, Probability: 0.00830841064453125
Token:  an, Relateness: contradiction, Probability: 0.008148193359375
Token:  the, Relateness: entailment, Probability: 0.00804901123046875
Token:  and, Relateness: entailment, Probability: 0.00789642333984375
Token:  meiden, Relateness: contradiction, Probability: 0.00675201416015625
Token:  ontvang, Relateness: contradiction, Probability: 0.0061492919921875
Token:  you, Relateness: contradiction, Probability: 0.005687713623046875
Token:  a, Relateness: contradiction, Probability: 0.005619049072265625
Hallucination Score: 0.7632532850022655


Actual token:  all
tens

In [None]:
# Bare expression: when run as the last line of a cell, the notebook displays
# the NLI model's repr. Inspection only; nothing is assigned or modified.
mnli_model