In [1]:
import pandas as pd
from scipy.stats import spearmanr
import numpy as np

import argparse as ap

def recompute_hard_labels(soft_labels):
    """Collapse soft labels into a list of hard ``[start, end]`` spans.

    The soft labels are ordered by ``(start, end)`` and only those whose
    ``prob`` is strictly greater than 0.5 are kept. A kept span that starts
    exactly where the previously kept span ended is folded into that span;
    any other kept span opens a new hard label.

    Args:
        soft_labels: iterable of mappings with ``'start'``, ``'end'`` and
            ``'prob'`` keys.

    Returns:
        A list of two-element ``[start, end]`` lists.
    """
    ordered = sorted(soft_labels, key=lambda span: (span['start'], span['end']))
    spans = []
    last_end = -1
    for label in ordered:
        if label['prob'] > 0.5:
            begin, finish = label['start'], label['end']
            if begin == last_end:
                # contiguous with the previously kept span: extend it in place
                spans[-1][-1] = finish
            else:
                spans.append([begin, finish])
            # only kept spans move the "previous end" marker
            last_end = finish
    return spans


def infer_soft_labels(hard_labels):
    """Express hard ``(start, end)`` spans as soft labels with probability 1.0.

    Args:
        hard_labels: iterable of two-element ``(start, end)`` pairs.

    Returns:
        A list with one ``{'start', 'end', 'prob'}`` dict per input span, in
        input order, where ``'prob'`` is always ``1.0``.
    """
    soft = []
    for begin, finish in hard_labels:
        soft.append({'start': begin, 'end': finish, 'prob': 1.0})
    return soft

def load_jsonl_file_to_records(filename, is_ref=True):
    """Load a JSONL file and return its rows as a list of dicts sorted by ``id``.

    For prediction files (``is_ref=False``) at least one of the
    ``hard_labels`` / ``soft_labels`` columns must be present; whichever one
    is missing is derived from the other. For reference files
    (``is_ref=True``) a ``text_len`` column holding the length of
    ``model_output_text`` is added.

    Args:
        filename: path of the JSONL file, handed to ``pandas.read_json``.
        is_ref: whether the file holds reference data (default) or predictions.

    Returns:
        A list of records with the keys ``id``, ``soft_labels`` and
        ``hard_labels``, plus ``text_len`` when ``is_ref`` is true.

    Raises:
        AssertionError: if a prediction file has neither label column.
    """
    frame = pd.read_json(filename, lines=True)
    if not is_ref:
        has_hard = 'hard_labels' in frame.columns
        has_soft = 'soft_labels' in frame.columns
        assert has_hard or has_soft, \
            f'File {filename} contains no predicted label!'
        # fill in whichever label flavour the predictions did not provide
        if not has_hard:
            frame['hard_labels'] = frame.soft_labels.apply(recompute_hard_labels)
        elif not has_soft:
            frame['soft_labels'] = frame.hard_labels.apply(infer_soft_labels)
    kept = ['id', 'soft_labels', 'hard_labels']
    if is_ref:
        # extra convenience column for reference data: length of the output text
        frame['text_len'] = frame.model_output_text.apply(len)
        kept.append('text_len')
    return frame[kept].sort_values('id').to_dict(orient='records')

In [2]:
def calculate_soft_labels(words, hallucination_scores):
    """Assign each word a character span and attach its score as ``prob``.

    Spans are laid end to end starting at offset 0: a word's span begins where
    the previous word's span ended and is ``len(word)`` characters long. No
    separator is inserted between consecutive words. Words and scores are
    paired with ``zip``, so pairing stops at the shorter of the two inputs.

    Args:
        words: sequence of strings.
        hallucination_scores: sequence of scores, one per word.

    Returns:
        A list of ``{"start", "prob", "end"}`` dicts in word order.
    """
    spans = []
    cursor = 0  # offset at which the next word's span begins
    for token, prob in zip(words, hallucination_scores):
        stop = cursor + len(token)
        spans.append(dict(start=cursor, prob=prob, end=stop))
        # the next span starts exactly where this one ends
        cursor = stop
    return spans

# Example usage: one hand-written record with a list of words and a parallel
# list of per-word hallucination scores (same length, paired by position).
data = {
    "words evaluated": [" ", "No,", "Albero", "Foulois", "was", "not", "in", "any", "of", "the", "FIFA", "World", "Cup", "finals.\n"],
    "hallucination_scores_evaluated": [0, 1.0, 0.06837508948018978, 0.9810742402775567, 0.5219719747190859, 0.8440420620456921, 0.039483340157654756, 0.7283849860862854, 0.0, 0.12374613816374336, 0.9193932406560029, 0.0, 0.0, 0.0]
}

# Turn the word/score pairs into soft labels via calculate_soft_labels.
soft_labels = calculate_soft_labels(data["words evaluated"], data["hallucination_scores_evaluated"])

# Show the per-word soft labels
print(soft_labels)

# Derive the hard spans from the soft labels via recompute_hard_labels and show them.
hard_labels = recompute_hard_labels(soft_labels)
print(hard_labels)


[{'start': 0, 'prob': 0, 'end': 1}, {'start': 1, 'prob': 1.0, 'end': 4}, {'start': 4, 'prob': 0.06837508948018978, 'end': 10}, {'start': 10, 'prob': 0.9810742402775567, 'end': 17}, {'start': 17, 'prob': 0.5219719747190859, 'end': 20}, {'start': 20, 'prob': 0.8440420620456921, 'end': 23}, {'start': 23, 'prob': 0.039483340157654756, 'end': 25}, {'start': 25, 'prob': 0.7283849860862854, 'end': 28}, {'start': 28, 'prob': 0.0, 'end': 30}, {'start': 30, 'prob': 0.12374613816374336, 'end': 33}, {'start': 33, 'prob': 0.9193932406560029, 'end': 37}, {'start': 37, 'prob': 0.0, 'end': 42}, {'start': 42, 'prob': 0.0, 'end': 45}, {'start': 45, 'prob': 0.0, 'end': 53}]
[[1, 4], [10, 23], [25, 28], [33, 37]]


In [1]:
import json 
import os

# Read data/test/results_full.jsonl into a list, one parsed JSON value per line.
data_dir = "data/test"
output_path = os.path.join(data_dir, "results_full.jsonl")
# NOTE(review): when the file is absent nothing is loaded and this cell does
# not assign `processed_data`; any later cell reading that name then depends
# on whatever value (if any) it already has in the kernel.
if os.path.exists(output_path):
    with open(output_path, "r") as f:
        # each line of the file is parsed as its own JSON document
        processed_data = [json.loads(line) for line in f]

In [6]:
# Display the 'results' field of the first loaded record.
processed_data[0]['results']

[[{'token_id': 64681,
   'token_prob': 0.6198559403419495,
   'full_word': 'Alberto',
   'full_prob': 1.0,
   'mnli_probs': [[0.0374772883951664,
     0.014802361838519573,
     0.9477202892303467]]},
  {'token_id': 1416,
   'token_prob': 0.11288414895534515,
   'full_word': 'If',
   'full_prob': 1.0,
   'mnli_probs': [[0.05394162982702255,
     0.8792806267738342,
     0.06677769869565964]]},
  {'token_id': 21149,
   'token_prob': 0.04560931771993637,
   'full_word': 'Think',
   'full_prob': 1.0,
   'mnli_probs': [[0.027831140905618668,
     0.14599108695983887,
     0.826177716255188]]},
  {'token_id': 431,
   'token_prob': 0.02899104915559292,
   'full_word': 'Rationale',
   'full_prob': 0.9981532692909241,
   'mnli_probs': [[0.018568968400359154,
     0.02881893515586853,
     0.9526121020317078]]},
  {'token_id': 472,
   'token_prob': 0.02854158543050289,
   'full_word': 'H2S',
   'full_prob': 0.31982743673495406,
   'mnli_probs': [[0.4613741636276245,
     0.2806890308856964,
   

In [3]:
import json 
import os

# Read results_mult.jsonl, attach soft and hard labels to every record, and
# write the labelled records to final_results.jsonl (one JSON object per line).
data_dir = "data/test"
output_path = os.path.join(data_dir, "results_mult.jsonl")
result_path = os.path.join(data_dir, "final_results.jsonl")

# Fail loudly when the input is missing. Silently skipping the load would
# leave `processed_data` undefined, or still bound to whatever another cell
# loaded earlier, and the loop below would then label the wrong data.
if not os.path.exists(output_path):
    raise FileNotFoundError(f"Input file not found: {output_path}")

with open(output_path, "r") as f:
    processed_data = [json.loads(line) for line in f]

for entry in processed_data:
    try:
        words_evaluated = entry['words evaluated']
        hallucination_scores_evaluated = entry['hallucination_scores_evaluated']
    except KeyError:
        # Fallback layout: both lists are read from
        # 'hallucination_scores_evaluated' as [words, scores].
        words_evaluated = entry['hallucination_scores_evaluated'][0]
        hallucination_scores_evaluated = entry['hallucination_scores_evaluated'][1]

    # Validate the pairing before it is used: the zip() inside
    # calculate_soft_labels would otherwise silently drop the unmatched tail.
    assert len(words_evaluated) == len(hallucination_scores_evaluated), (
        f"{len(words_evaluated)} words but "
        f"{len(hallucination_scores_evaluated)} hallucination scores"
    )

    # An empty leading word is replaced (not removed) by a single space with
    # score 0, so it still occupies one character in the computed offsets.
    # The lists are edited in place, so the change is also part of the entry
    # that gets written out. The emptiness guard avoids an IndexError on
    # records with no words.
    if words_evaluated and words_evaluated[0] == "":
        words_evaluated[0] = " "
        hallucination_scores_evaluated[0] = 0

    soft_labels = calculate_soft_labels(words_evaluated, hallucination_scores_evaluated)
    hard_labels = recompute_hard_labels(soft_labels)

    # store the labels on the record itself
    entry['hard_labels'] = hard_labels
    entry['soft_labels'] = soft_labels

# Write the file once, after every record has been labelled, instead of
# rewriting the whole file on each loop iteration.
with open(result_path, "w") as f:
    for labelled_entry in processed_data:
        f.write(json.dumps(labelled_entry) + "\n")
    