In [1]:
import pandas as pd
from scipy.stats import spearmanr
import numpy as np

import argparse as ap

def recompute_hard_labels(soft_labels):
    """Derive hard-label spans from a list of soft labels.

    The soft labels are ordered by ``(start, end)``; only those whose
    ``prob`` is strictly greater than 0.5 are kept.  A kept span that starts
    exactly where the previously kept span ended is fused into it.

    Returns a list of ``[start, end]`` lists.
    """
    ordered = sorted(soft_labels, key=lambda span: (span['start'], span['end']))

    merged = []
    last_end = -1
    for span in ordered:
        # Skip low-confidence spans (same strict threshold as the filter above describes).
        if not span['prob'] > 0.5:
            continue
        begin, finish = span['start'], span['end']
        if begin != last_end:
            merged.append([begin, finish])
        else:
            # Contiguous with the previous span: extend it instead of adding a new one.
            merged[-1][-1] = finish
        last_end = finish
    return merged


def infer_soft_labels(hard_labels):
    """Turn ``(start, end)`` hard labels into soft-label dicts with ``prob`` 1.0."""
    soft_labels = []
    for span in hard_labels:
        begin, finish = span
        soft_labels.append({'start': begin, 'end': finish, 'prob': 1.0})
    return soft_labels

def load_jsonl_file_to_records(filename, is_ref=True):
    """Load a JSONL file and return its rows as a list of dicts sorted by ``id``.

    For prediction files (``is_ref=False``) at least one of ``hard_labels`` /
    ``soft_labels`` must be present; a missing one is derived from the other.
    For reference files (``is_ref=True``) a ``text_len`` field holding the
    length of ``model_output_text`` is added.

    Only ``id``, ``soft_labels``, ``hard_labels`` (and ``text_len`` for
    references) are kept in the returned records.
    """
    frame = pd.read_json(filename, lines=True)

    if not is_ref:
        has_hard = 'hard_labels' in frame.columns
        has_soft = 'soft_labels' in frame.columns
        assert has_hard or has_soft, \
            f'File {filename} contains no predicted label!'
        if not has_hard:
            frame['hard_labels'] = frame.soft_labels.apply(recompute_hard_labels)
        elif not has_soft:
            frame['soft_labels'] = frame.hard_labels.apply(infer_soft_labels)

    kept_columns = ['id', 'soft_labels', 'hard_labels']
    if is_ref:
        # extra convenience column: character length of the model output
        frame['text_len'] = frame.model_output_text.apply(len)
        kept_columns.append('text_len')

    selected = frame[kept_columns]
    ordered = selected.sort_values('id')
    return ordered.to_dict(orient='records')

In [25]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import json
import os
import torch

data_dir = "data/test"
data_file = "mushroom.en-tst.v1.jsonl"
data_path = os.path.join(data_dir, data_file)

# Reload previously computed results when a results file exists.
# `processed_data` is always bound (empty when there is no results file) so
# the cells that read it do not fail with a NameError on a fresh run.
output_path = os.path.join(data_dir, "results_mult.jsonl")
processed_data = []
if os.path.exists(output_path):
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(output_path, "r", encoding="utf-8") as f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        processed_data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Preview only the first few records; displaying the whole list floods the
# notebook output and bloats the saved file.
processed_data[:5]

In [None]:
# Sanity check: count the records whose number of stored hallucination scores
# differs from the number of whitespace-separated words in the model output.
results = []
n_mismatches = 0
for sentence in processed_data:
    print(sentence)
    output = sentence['model_output_text']
    # NOTE(review): str.split() with no argument collapses runs of whitespace,
    # whereas evaluate_hallucination_cache in this notebook splits on single
    # spaces (" "); that difference may account for some mismatches - TODO confirm.
    n_words_output = len(output.split())
    n_scores = len(sentence['hallucination_scores_evaluated'])
    print(n_words_output, n_scores)
    if n_words_output != n_scores:
        n_mismatches += 1

# Guard the ratio: with no processed records report 0.0 instead of raising
# ZeroDivisionError.
mismatch_ratio = n_mismatches / len(processed_data) if processed_data else 0.0
print(n_mismatches, mismatch_ratio)

In [None]:
# Inspect the whitespace-separated words of `output`. `output` is assigned
# inside the loop over `processed_data` in the previous cell, so here it holds
# the `model_output_text` of the last record that loop visited.
output.split()

In [None]:
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
import json
import os
import torch

def get_mnli_label(sentence_1, sentence_2, model, tokenizer):
    """Classify the pair ``(sentence_1, sentence_2)`` with a sequence-classification model.

    Args:
        sentence_1: First text of the pair passed to ``tokenizer``.
        sentence_2: Second text of the pair passed to ``tokenizer``.
        model: Callable model returning an object with ``logits`` and exposing
            ``config.id2label``.
        tokenizer: Callable that encodes the pair into keyword inputs for ``model``.

    Returns:
        The label name from ``model.config.id2label`` for the highest logit.
    """
    inputs = tokenizer(sentence_1, sentence_2, return_tensors="pt")
    # Inference only: disable autograd so no graph/activations are retained
    # across the many calls made per evaluated word.
    with torch.no_grad():
        outputs = model(**inputs)
    # A single pair is scored, so the logits hold exactly one row of class scores.
    predicted_class_idx = outputs.logits.argmax(dim=-1).item()
    return model.config.id2label[predicted_class_idx]

def generate_full_word_cache(input_ids, model, tokenizer, past_key_values, max_new_tokens=32):
    """Greedily complete the word that starts with ``input_ids``.

    Starting from the single candidate token in ``input_ids`` (shape ``(1, 1)``
    as built by the caller in this notebook), the model is queried for the most
    likely next token until that token decodes to text beginning with a space,
    which is treated as the start of the next word.

    Args:
        input_ids: Tensor holding the first token of the word.
        model: Causal LM called with ``input_ids``, ``use_cache``,
            ``return_dict`` and ``past_key_values``.
        tokenizer: Tokenizer providing ``decode``; ``eos_token_id`` is honoured
            when present.
        past_key_values: KV cache of the context preceding ``input_ids`` (or
            ``None``). It is never modified by this function.
        max_new_tokens: Upper bound on continuation tokens, so a model that
            never emits a word boundary cannot loop forever.

    Returns:
        ``(generated_tokens, prob)``: the token ids of the word (first token
        included) and the product of the probabilities of the appended
        continuation tokens (``1.0`` when none was appended).
    """
    import copy  # local import: only needed to snapshot the cache

    generated_tokens = [input_ids[0][0].item()]
    # Work on a private copy: cache objects that are updated in place would
    # otherwise leak this candidate's tokens into the caller's cache.
    pkv = copy.deepcopy(past_key_values)
    # Tokens fed at the next step. Only tokens NOT yet in the cache are fed;
    # re-feeding earlier tokens together with the cache would duplicate them
    # in the model's context.
    step_ids = input_ids
    prob = 1.0
    eos_token_id = getattr(tokenizer, "eos_token_id", None)

    with torch.no_grad():  # inference only
        for _ in range(max_new_tokens):
            outputs = model(
                input_ids=step_ids,
                use_cache=True,
                return_dict=True,
                past_key_values=pkv,
            )
            pkv = outputs.past_key_values

            # Distribution over the next token, greedy pick.
            next_token_probs = torch.softmax(outputs.logits[:, -1, :], dim=-1)
            next_token_id = torch.argmax(next_token_probs, dim=-1)
            token_id = next_token_id.item()

            # End of sequence: nothing more can belong to this word.
            if eos_token_id is not None and token_id == eos_token_id:
                break
            # A token that starts with a space opens the next word.
            if tokenizer.decode([token_id]).startswith(" "):
                break

            generated_tokens.append(token_id)
            prob *= next_token_probs[0, token_id].item()
            step_ids = next_token_id.unsqueeze(0)

    return generated_tokens, prob

def evaluate_hallucination_cache(sentence, base_tokenizer, base_model, mnli_model, mnli_tokenizer,
                                 top_k=10, min_token_prob=0.01):
    """Compute a hallucination score for every word of the model output.

    The text ``model_input + " " + model_output_text`` is split on single
    spaces. For each output word, the base model's most likely next tokens
    after the preceding prefix are expanded into full candidate words
    (``generate_full_word_cache``) and each candidate is compared with the
    actual word via ``get_mnli_label``. Candidates labelled ``"entailment"``
    count as support, those labelled ``"contradiction"`` as evidence against;
    other labels are ignored. The score of a word is
    ``1 - support / (support + against)``, or ``0`` when neither label occurred.

    Args:
        sentence: Mapping with the keys ``'model_input'`` and ``'model_output_text'``.
        base_tokenizer: Tokenizer of ``base_model`` (``encode`` / ``decode``).
        base_model: Causal LM; must expose ``device``.
        mnli_model: Sequence-classification model passed to ``get_mnli_label``.
        mnli_tokenizer: Tokenizer passed to ``get_mnli_label``.
        top_k: Maximum number of next-token candidates examined per word.
        min_token_prob: Candidates whose first-token probability is below this
            value (and all less likely ones) are skipped.

    Returns:
        List of scores, one per element of ``model_output_text.split(" ")``.
    """
    import copy  # local import: only needed to snapshot the KV cache

    full_text = sentence['model_input'] + " " + sentence['model_output_text']
    input_words = full_text.split(" ")
    words_to_skip = len(sentence['model_input'].split(" "))

    labels = []

    print(f"Full sentence: {' '.join(input_words)}")
    print("\n")

    past_key_values = None
    # Token ids whose keys/values are already stored in `past_key_values`.
    cached_ids = []

    with torch.no_grad():  # inference only: do not retain autograd state
        # Start at the last input word (it predicts the first output word) and
        # stop before the final word, which has no successor to score.
        for i in range(words_to_skip - 1, len(input_words) - 1):
            sample_to_evaluate = input_words[i]
            positive_influence = 0
            total_influence = 0
            print(f"Actual token: {sample_to_evaluate}")
            next_generated_token = input_words[i + 1]
            print(f"Next generated token: {next_generated_token}")

            sentence_until_now = " ".join(input_words[:i + 1])
            print(f"Sentence until now: {sentence_until_now}")
            prefix_ids = base_tokenizer.encode(sentence_until_now)

            # Feed only the tokens that are not in the cache yet. Feeding the
            # whole prefix on top of a cache that already contains it would
            # duplicate the context. If the new tokenization does not extend
            # the cached one, drop the cache and start over.
            n_cached = len(cached_ids)
            if (past_key_values is not None
                    and len(prefix_ids) > n_cached
                    and prefix_ids[:n_cached] == cached_ids):
                new_ids = prefix_ids[n_cached:]
            else:
                past_key_values = None
                new_ids = prefix_ids

            model_input_ids = torch.tensor([new_ids]).to(base_model.device)
            outputs = base_model(model_input_ids, past_key_values=past_key_values,
                                 return_dict=True, use_cache=True)
            past_key_values = outputs.past_key_values
            cached_ids = prefix_ids

            # Distribution over the token following the prefix.
            probabilities = outputs.logits[:, -1, :].softmax(dim=-1)
            k = min(top_k, probabilities.shape[-1])
            topk_probabilities, topk_indices = probabilities.topk(k, dim=-1)

            for token_prob, token_id in zip(topk_probabilities[0].tolist(), topk_indices[0].tolist()):
                # topk is sorted in descending order, so everything after the
                # first candidate below the threshold is below it too.
                if token_prob < min_token_prob:
                    break
                # Each candidate gets its own copy of the prefix cache so that
                # expanding one candidate cannot alter the cache used for the
                # others or for the next word.
                topk_token_ids, prob = generate_full_word_cache(
                    torch.tensor([[token_id]]).to(base_model.device),
                    base_model,
                    base_tokenizer,
                    copy.deepcopy(past_key_values),
                )
                token = base_tokenizer.decode(topk_token_ids, skip_special_tokens=True)
                if " " in token:
                    # Keep the text after the first space (drops the leading
                    # space of a word-initial token).
                    token = token.split(" ")[1]

                prob = token_prob * prob
                relateness = get_mnli_label(next_generated_token, token, mnli_model, mnli_tokenizer)
                if relateness == "entailment":
                    positive_influence += prob
                    total_influence += prob
                elif relateness == "contradiction":
                    total_influence += prob
                print(f"Token: {token}, Relateness: {relateness}, Probability: {prob}, Token Probability: {token_prob}")
                torch.cuda.empty_cache()

            if total_influence == 0:
                hallucination_score = 0
            else:
                hallucination_score = 1 - (positive_influence / total_influence)
            labels.append(hallucination_score)
            print(f"Hallucination Score: {hallucination_score}")
            print("\n")
    return labels

# Restrict this process to GPU 1. The variable is only read when CUDA is
# initialised, so it has to be set BEFORE the first model is placed on a CUDA
# device; setting it afterwards has no effect. (If CUDA was already
# initialised earlier in this kernel, restart the kernel for it to apply.)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load the model and tokenizer
model_path = "Qwen/QwQ-32B-Preview"
base_model = AutoModelForCausalLM.from_pretrained(model_path, load_in_4bit = True, device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path)

# NLI cross-encoder used to compare candidate words with the actual word.
mnli_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/nli-deberta-v3-xsmall")
mnli_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/nli-deberta-v3-xsmall")

data_dir = "data/test"
data_file = "mushroom.en-tst.v1.jsonl"
data_path = os.path.join(data_dir, data_file)

# JSON is UTF-8 by specification; blank lines (e.g. a trailing newline) are skipped.
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

print(len(data))

In [None]:
# Score every sample and store the results on the record itself.
for i, sentence in enumerate(data):
    print(f"Processing sample {i}")
    # evaluate_hallucination_cache returns only the list of scores, so it must
    # not be unpacked into two values.
    labels = evaluate_hallucination_cache(sentence, tokenizer, base_model, mnli_model, mnli_tokenizer)
    # The scored words are the single-space-separated words of the model
    # output (the same split the scoring function applies to the text).
    words = sentence['model_output_text'].split(" ")
    sentence['words evaluated'] = words
    sentence['hallucination_scores_evaluated'] = labels

print(len(data))