In [1]:
import json
import os

from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
def generate_full_word(input_ids, model, tokenizer, threshold=0.99, max_new_tokens=32):
    """Greedily extend ``input_ids`` while the model stays confident.

    At every step the most likely next token is computed from the logits of
    the last position. It is appended only when its probability is at least
    ``threshold``; the first token below the threshold ends the generation.

    Args:
        input_ids: Tensor of token ids indexed as ``[batch, seq]``. Only a
            single sequence (batch size 1) is supported, because the chosen
            token is concatenated as a ``[1, 1]`` tensor.
        model: Callable that takes the id tensor and returns an object with a
            ``logits`` attribute indexed as ``[batch, seq, vocab]``.
        tokenizer: Only its ``eos_token_id`` attribute (if present and not
            ``None``) is read, to stop at end-of-sequence. May be ``None``.
        threshold: Minimum probability the greedy token must reach to be
            appended.
        max_new_tokens: Hard cap on the number of appended tokens, so a model
            that stays above ``threshold`` cannot loop forever.

    Returns:
        The id tensor ``input_ids`` followed by the accepted tokens (possibly
        none), with shape ``[1, seq + k]``.
    """
    eos_token_id = getattr(tokenizer, "eos_token_id", None)
    generated_ids = input_ids

    # Pure inference: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(generated_ids)
            next_token_probs = torch.softmax(outputs.logits[:, -1, :], dim=-1)
            next_token_id = torch.argmax(next_token_probs[0], dim=-1)
            next_token_prob = next_token_probs[0, next_token_id].item()

            if next_token_prob < threshold:
                break
            # A confident end-of-sequence prediction also ends the continuation.
            if eos_token_id is not None and next_token_id.item() == eos_token_id:
                break

            generated_ids = torch.cat((generated_ids, next_token_id.view(1, 1)), dim=1)

    return generated_ids

def get_mnli_label(sentence_1, sentence_2, model, tokenizer):
    """Classify the relation between two texts with a sequence-pair classifier.

    Args:
        sentence_1: First text of the pair.
        sentence_2: Second text of the pair.
        model: Classifier called with the tokenizer output as keyword
            arguments; must return an object with ``logits`` and expose
            ``config.id2label``. If it has a ``device`` attribute, the inputs
            are moved there before the forward pass.
        tokenizer: Callable that encodes the pair and returns a mapping of
            tensors (called with ``truncation=True, return_tensors="pt"``).

    Returns:
        The label name ``model.config.id2label[...]`` of the highest logit.
    """
    inputs = tokenizer(sentence_1, sentence_2, truncation=True, return_tensors="pt")

    # Keep inputs and weights on the same device (no-op when already there).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Prediction only: skip building an autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # A single pair is scored, so the flat argmax is the class index.
    predicted_class_idx = outputs.logits.argmax().item()
    return model.config.id2label[predicted_class_idx]

In [3]:
model_path = "Qwen/QwQ-32B-Preview"
# 4-bit quantization is requested through `quantization_config`; passing
# `load_in_4bit` directly to `from_pretrained` is deprecated.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="cuda",
)
base_tokenizer = AutoTokenizer.from_pretrained(model_path)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 17/17 [00:16<00:00,  1.04it/s]


In [9]:
# NLI cross-encoder used to judge entailment / contradiction between texts.
mnli_model_name = 'cross-encoder/nli-deberta-v3-large'
mnli_model = AutoModelForSequenceClassification.from_pretrained(mnli_model_name)
mnli_tokenizer = AutoTokenizer.from_pretrained(mnli_model_name)

data_dir = "data/val"
data_file = "mushroom.en-val.v2.jsonl"
data_path = os.path.join(data_dir, data_file)

# JSON Lines: one JSON object per line. Read as UTF-8 explicitly (the platform
# default encoding may differ) and ignore blank lines such as a trailing newline.
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]



In [None]:
# NLI sanity check: the two sentences differ only in "silver" vs "golden".
sentence_1 = "Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China."
sentence_2 = "Petra van Stoveren won a golden medal in the 2008 Summer Olympics in Beijing, China."

# Reuse the shared helper instead of repeating the tokenize / forward / argmax steps.
mnli_model.eval()
print(get_mnli_label(sentence_1, sentence_2, mnli_model, mnli_tokenizer))

In [None]:
# NLI sanity check: the two sentences differ only in "arachnids" vs "arthropods".
sentence_1 = "Yes, all arachnids have antennas. However, not all of them are visible to the naked eye."
sentence_2 = "Yes, all arthropods have antennas. However, not all of them are visible to the naked eye."

# Reuse the shared helper instead of repeating the tokenize / forward / argmax steps.
mnli_model.eval()
print(get_mnli_label(sentence_1, sentence_2, mnli_model, mnli_tokenizer))

In [None]:
# Select the record to inspect and print it in full.
sentence_idx = 0
print(data[sentence_idx])

In [6]:
# Print the answer and where it hallucinates (hard_labels).

sentence_idx = 0
record = data[sentence_idx]
print(record)

print("Answer:", record["model_output_text"])

# Hard labels are [start, end) character offsets into the model output text.
hallucination_boundaries = record["hard_labels"]
print("Hallucination boundaries:", hallucination_boundaries)
for start, end in hallucination_boundaries:
    hallucination = record["model_output_text"][start:end]
    print("Hallucination:", hallucination)

# Get the probability distribution for each token position of prompt + answer.
# (Named `full_text` rather than `input` so the builtin `input` is not shadowed.)
full_text = record['model_input'] + " " + record['model_output_text']
input_words = full_text.split(" ")
words_to_skip = len(record['model_input'].split(" "))
input_ids = base_tokenizer.encode(full_text, return_tensors="pt").to(base_model.device)
print(input_ids.shape)
# Inference only: avoid building an autograd graph for the forward pass.
with torch.no_grad():
    output = base_model(input_ids, return_dict=True)
print(output.logits.shape)

# Softmax over the vocabulary to get per-position probabilities; keep only the top-k.
probabilities = output.logits.softmax(dim=-1)

topk = 10
topk_probabilities, topk_indices = probabilities.topk(topk, dim=-1)

# Number of tokens in the prompt of the *selected* record (no masking is applied
# here; presumably intended for masking the prompt positions later).
input_token_length = len(base_tokenizer.encode(record['model_input']))

{'id': 'val-en-1', 'lang': 'EN', 'model_input': 'What did Petra van Staveren win a gold medal for?', 'model_output_text': 'Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.', 'model_id': 'tiiuae/falcon-7b-instruct', 'soft_labels': [{'start': 10, 'prob': 0.2, 'end': 12}, {'start': 12, 'prob': 0.3, 'end': 13}, {'start': 13, 'prob': 0.2, 'end': 18}, {'start': 25, 'prob': 0.9, 'end': 31}, {'start': 31, 'prob': 0.1, 'end': 37}, {'start': 45, 'prob': 1.0, 'end': 49}, {'start': 49, 'prob': 0.3, 'end': 65}, {'start': 65, 'prob': 0.2, 'end': 69}, {'start': 69, 'prob': 0.9, 'end': 83}], 'hard_labels': [[25, 31], [45, 49], [69, 83]], 'model_output_logits': [-5.5669536591, -11.90533638, -13.0743436813, -9.9514026642, -8.8359375, -5.2216725349, -8.8481779099, -9.2853775024, -7.6449022293, -8.7612609863, -9.1256427765, -5.7042989731, -5.7393956184, -8.409078598, -10.6083183289, -11.707988739, -5.3747014999, -6.5602250099, -5.1362328529, -5.7765812874, -8.4669551849



torch.Size([1, 36, 152064])


In [8]:
# Word-level entailment: for each answer word, compare the model's top next-word
# candidates against the word that was actually produced.

num_candidates = 10  # top next-token candidates expanded at each position

print(f"Full sentence: {base_tokenizer.decode(input_ids[0])}")
print("\n")

# Everything below is inference; no autograd graph is needed.
with torch.no_grad():
    for i, sample_to_evaluate in enumerate(input_words):
        # Skip prompt words (except the last one, whose successor is the first
        # answer word) and the final word, which has no successor.
        if i == len(input_words) - 1 or i < words_to_skip-1:
            continue

        positive_influence = 0
        total_influence = 0
        # Print the current word and the word that actually follows it.
        print(f"Actual token: {sample_to_evaluate}")
        next_generated_token = input_words[i+1]
        print(f"Next generated token: {next_generated_token}")

        sentence_until_now = " ".join(input_words[:i+1])
        print(f"Sentence until now: {sentence_until_now}")
        token_id_until_now = base_tokenizer.encode(sentence_until_now)
        token_id_until_now = torch.tensor(token_id_until_now).to(base_model.device).unsqueeze(0)
        len_token_id_until_now = token_id_until_now.shape[1]

        # Top-k candidates for the token following the prefix; only the last
        # position's logits are needed, so slice before the softmax.
        last_logits = base_model(token_id_until_now, return_dict=True).logits[:, -1, :]
        probabilities = last_logits.softmax(dim=-1)
        topk_probabilities, topk_indices = probabilities.topk(num_candidates, dim=-1)

        for j in range(num_candidates):
            token_id = topk_indices[0][j].item()
            token_prob = topk_probabilities[0][j].item()
            candidate_prefix = torch.cat(
                (token_id_until_now, torch.tensor([[token_id]]).to(base_model.device)), dim=1
            )
            topk_token_ids = generate_full_word(candidate_prefix, base_model, base_tokenizer)
            token = base_tokenizer.decode(topk_token_ids[0][len_token_id_until_now:], skip_special_tokens=True)

            relateness = get_mnli_label(next_generated_token, token, mnli_model, mnli_tokenizer)
            if relateness == "entailment":
                positive_influence += token_prob
                total_influence += token_prob
            elif relateness == "contradiction":
                total_influence += token_prob
            print(f"Token: {token}, Relateness: {relateness}, Probability: {token_prob}")

        # The score is undefined when every candidate was judged "neutral".
        if total_influence > 0:
            hallucination_score = 1 - (positive_influence/total_influence)
        else:
            hallucination_score = float("nan")
        print(f"Hallucination Score: {hallucination_score}")
        print("\n")
    

Full sentence: What did Petra van Staveren win a gold medal for? Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.


Actual token: for?
Next generated token: Petra
Sentence until now: What did Petra van Staveren win a gold medal for?
Token:  I, Relateness: contradiction, Probability: 0.1500244140625
Token:  Petra, Relateness: entailment, Probability: 0.10150146484375
Token:  As, Relateness: contradiction, Probability: 0.08221435546875
Token:  (, Relateness: contradiction, Probability: 0.033203125
Token:  , Relateness: entailment, Probability: 0.0243072509765625
Token:  The, Relateness: contradiction, Probability: 0.02301025390625
Token:  She, Relateness: neutral, Probability: 0.018341064453125
Token:  In, Relateness: contradiction, Probability: 0.0168304443359375
Token:  A, Relateness: contradiction, Probability: 0.01568603515625
Token:  To, Relateness: contradiction, Probability: 0.014068603515625
Hallucination Score: 0.7270048341169459


Actual toke

In [10]:
# Sentence-level entailment.
# At sentence level it could be interesting to have the large model generate the
# whole answer and then evaluate the relatedness.

num_candidates = 10  # top next-token candidates expanded at each position

print(f"Full sentence: {base_tokenizer.decode(input_ids[0])}")

output_sentence = data[sentence_idx]['model_output_text']
print(f"Output sentence: {output_sentence}")
print("\n")

# Words of the answer only (the prompt words come first in `input_words`).
output_words = input_words[words_to_skip:]

# Everything below is inference; no autograd graph is needed.
with torch.no_grad():
    for i, sample_to_evaluate in enumerate(input_words):
        # Skip prompt words (except the last one, whose successor is the first
        # answer word) and the final word, which has no successor.
        if i == len(input_words) - 1 or i < words_to_skip-1:
            continue

        positive_influence = 0
        total_influence = 0
        # Print the current word and the word that actually follows it.
        print(f"Actual token: {sample_to_evaluate}")
        next_generated_token = input_words[i+1]
        print(f"Next generated token: {next_generated_token}")

        sentence_until_now = " ".join(input_words[:i+1])
        print(f"Sentence until now: {sentence_until_now}")
        token_id_until_now = base_tokenizer.encode(sentence_until_now)
        token_id_until_now = torch.tensor(token_id_until_now).to(base_model.device).unsqueeze(0)
        len_token_id_until_now = token_id_until_now.shape[1]

        # Top-k candidates for the token following the prefix; only the last
        # position's logits are needed, so slice before the softmax.
        last_logits = base_model(token_id_until_now, return_dict=True).logits[:, -1, :]
        probabilities = last_logits.softmax(dim=-1)
        topk_probabilities, topk_indices = probabilities.topk(num_candidates, dim=-1)

        # Position of the word being replaced, relative to the answer.
        target_position = i + 1 - words_to_skip

        for j in range(num_candidates):
            token_id = topk_indices[0][j].item()
            token_prob = topk_probabilities[0][j].item()
            candidate_prefix = torch.cat(
                (token_id_until_now, torch.tensor([[token_id]]).to(base_model.device)), dim=1
            )
            topk_token_ids = generate_full_word(candidate_prefix, base_model, base_tokenizer)
            token = base_tokenizer.decode(topk_token_ids[0][len_token_id_until_now:], skip_special_tokens=True)

            # Substitute the candidate at exactly this word position.
            # `str.replace` would rewrite every occurrence of the substring
            # (e.g. "in" inside "Beijing"), and the decoded candidate carries a
            # leading space that must not end up in the sentence.
            candidate_words = list(output_words)
            candidate_words[target_position] = token.strip()
            output_sentence_temp = " ".join(candidate_words)
            print(f"Output sentence temp: {output_sentence_temp}")

            relateness = get_mnli_label(output_sentence, output_sentence_temp, mnli_model, mnli_tokenizer)
            if relateness == "entailment":
                positive_influence += token_prob
                total_influence += token_prob
            elif relateness == "contradiction":
                total_influence += token_prob
            print(f"Token: {token}, Relateness: {relateness}, Probability: {token_prob}")

        # The score is undefined when every candidate was judged "neutral".
        if total_influence > 0:
            hallucination_score = 1 - (positive_influence/total_influence)
        else:
            hallucination_score = float("nan")
        print(f"Hallucination Score: {hallucination_score}")
        print("\n")
    

Full sentence: What did Petra van Staveren win a gold medal for? Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
Output sentence: Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.


Actual token: for?
Next generated token: Petra
Sentence until now: What did Petra van Staveren win a gold medal for?
Output sentence temp:  I van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
Token:  I, Relateness: contradiction, Probability: 0.1500244140625
Output sentence temp:  Petra van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
Token:  Petra, Relateness: entailment, Probability: 0.10150146484375
Output sentence temp:  As van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
Token:  As, Relateness: contradiction, Probability: 0.08221435546875
Output sentence temp:  ( van Stoveren won a silver medal in the 2008 Summer Olympics in Beijing, China.
Tok

In [None]:
# Print the answer and where it hallucinates (hard_labels).

sentence_idx = 2
record = data[sentence_idx]
print(record)

print("Answer:", record["model_output_text"])

# Hard labels are [start, end) character offsets into the model output text.
hallucination_boundaries = record["hard_labels"]
print("Hallucination boundaries:", hallucination_boundaries)
for start, end in hallucination_boundaries:
    hallucination = record["model_output_text"][start:end]
    print("Hallucination:", hallucination)

# Get the probability distribution for each token position of prompt + answer.
# (Named `full_text` rather than `input` so the builtin `input` is not shadowed.)
full_text = record['model_input'] + " " + record['model_output_text']
input_words = full_text.split(" ")
words_to_skip = len(record['model_input'].split(" "))
# `base_tokenizer` is the tokenizer loaded with the base model; no plain
# `tokenizer` name is defined in this notebook.
input_ids = base_tokenizer.encode(full_text, return_tensors="pt").to(base_model.device)
print(input_ids.shape)
# Inference only: avoid building an autograd graph for the forward pass.
with torch.no_grad():
    output = base_model(input_ids, return_dict=True)
print(output.logits.shape)

# Softmax over the vocabulary to get per-position probabilities; keep only the top-k.
probabilities = output.logits.softmax(dim=-1)

topk = 10
topk_probabilities, topk_indices = probabilities.topk(topk, dim=-1)

# Number of tokens in the prompt of the *selected* record (no masking is applied
# here; presumably intended for masking the prompt positions later).
input_token_length = len(base_tokenizer.encode(record['model_input']))

In [None]:
# Word-level entailment for the currently selected record: for each answer word,
# compare the model's top next-word candidates against the word actually produced.

num_candidates = 10  # top next-token candidates expanded at each position

# `base_tokenizer` is the tokenizer loaded with the base model; no plain
# `tokenizer` name is defined in this notebook.
print(f"Full sentence: {base_tokenizer.decode(input_ids[0])}")
print("\n")

# Everything below is inference; no autograd graph is needed.
with torch.no_grad():
    for i, sample_to_evaluate in enumerate(input_words):
        # Skip prompt words (except the last one, whose successor is the first
        # answer word) and the final word, which has no successor.
        if i == len(input_words) - 1 or i < words_to_skip-1:
            continue

        positive_influence = 0
        total_influence = 0
        # Print the current word and the word that actually follows it.
        print(f"Actual token: {sample_to_evaluate}")
        next_generated_token = input_words[i+1]
        print(f"Next generated token: {next_generated_token}")

        sentence_until_now = " ".join(input_words[:i+1])
        print(f"Sentence until now: {sentence_until_now}")
        token_id_until_now = base_tokenizer.encode(sentence_until_now)
        token_id_until_now = torch.tensor(token_id_until_now).to(base_model.device).unsqueeze(0)
        len_token_id_until_now = token_id_until_now.shape[1]

        # Top-k candidates for the token following the prefix; only the last
        # position's logits are needed, so slice before the softmax.
        last_logits = base_model(token_id_until_now, return_dict=True).logits[:, -1, :]
        probabilities = last_logits.softmax(dim=-1)
        topk_probabilities, topk_indices = probabilities.topk(num_candidates, dim=-1)

        for j in range(num_candidates):
            token_id = topk_indices[0][j].item()
            token_prob = topk_probabilities[0][j].item()
            candidate_prefix = torch.cat(
                (token_id_until_now, torch.tensor([[token_id]]).to(base_model.device)), dim=1
            )
            topk_token_ids = generate_full_word(candidate_prefix, base_model, base_tokenizer)
            token = base_tokenizer.decode(topk_token_ids[0][len_token_id_until_now:], skip_special_tokens=True)

            relateness = get_mnli_label(next_generated_token, token, mnli_model, mnli_tokenizer)
            if relateness == "entailment":
                positive_influence += token_prob
                total_influence += token_prob
            elif relateness == "contradiction":
                total_influence += token_prob
            print(f"Token: {token}, Relateness: {relateness}, Probability: {token_prob}")

        # The score is undefined when every candidate was judged "neutral".
        if total_influence > 0:
            hallucination_score = 1 - (positive_influence/total_influence)
        else:
            hallucination_score = float("nan")
        print(f"Hallucination Score: {hallucination_score}")
        print("\n")
    