Please note: this notebook uses the OpenTaal word list (the file `basiswoorden-gekeurd.txt`), which you can find at https://github.com/OpenTaal/opentaal-wordlist

In [None]:
# Install or upgrade (-U) sentence-transformers in the environment of the running kernel.
%pip install -U sentence-transformers

In [2]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import csv
from pathlib import Path
import numpy as np
import re
from tqdm import tqdm

In [None]:
# Input/output locations; replace the placeholder paths (marked ###) with your own.
# hyp_path: one sub-directory per model, each with a "tsv" folder of hypothesis files.
hyp_path = Path("path/to/your/data/analysis/hyp") ###
# ref_path: directory holding the "<subset>_reference.stm" reference transcripts.
ref_path = Path("path/to/your/data/analysis/ref") ###
# out_path: directory that receives the "<model>_<subset>_hallucination.tsv" result files.
out_path = Path("path/to/your/data/analysis/results/2. hallucination/raw") ###

In [4]:
# Read the word list: one entry per line, surrounding whitespace removed, blank lines dropped.
with open("basiswoorden-gekeurd.txt", "r", encoding="utf-8") as word_file:
    stripped_lines = (raw_line.strip() for raw_line in word_file)
    dutch_words = [entry for entry in stripped_lines if entry]
print(f"Loaded {len(dutch_words)} Dutch words")

# Sentence embedding model shared by every check in this notebook.
transformer_model = SentenceTransformer("distiluse-base-multilingual-cased-v1")

Loaded 199404 Dutch words


In [None]:
# Embed every dictionary word once; is_nonsense_word compares candidate words against these vectors.
dict_embeddings = transformer_model.encode(dutch_words, convert_to_tensor=True, show_progress_bar=True)

In [None]:
# Save the embeddings to a file so a later session can reload them instead of re-encoding.
# .cpu() moves the tensor to host memory before it is converted to a NumPy array.
embeddings_np = dict_embeddings.cpu().numpy()
np.save("dutch_embeddings.npy", embeddings_np)

In [None]:
#uncomment to load the precomputed dictionary embeddings if you've generated them before (np.load returns a NumPy array, not a tensor)
#dict_embeddings = np.load("dutch_embeddings.npy")

In [None]:
def is_nonsense_word(word, threshold=0.95):
    """Report whether ``word`` lacks a close match in the Dutch dictionary.

    The word is embedded with ``transformer_model`` and compared against
    every vector in ``dict_embeddings``.

    Args:
        word: The word to check.
        threshold: Cosine similarity the best dictionary match must reach
            for the word to count as known.

    Returns:
        True when the highest similarity is below ``threshold``.
    """
    candidate_vec = transformer_model.encode(
        word,
        convert_to_tensor=True,
        show_progress_bar=False,
    )
    best_match = util.cos_sim(candidate_vec, dict_embeddings).max().item()
    return best_match < threshold

def detect_nonsense_words(text):
    """Return the share of characters in ``text`` that belong to nonsense words.

    The text is split on whitespace and every distinct word is classified
    with ``is_nonsense_word`` exactly once; the verdict is reused for repeated
    occurrences. Each dictionary lookup is expensive, so repetitive
    predictions benefit most from this caching.

    Args:
        text: Whitespace-separated words.

    Returns:
        A float in [0, 1]: characters in nonsense words divided by the total
        number of non-whitespace characters (a fraction, not a percentage).
        Returns 0.0 when the text holds no words or no nonsense words.
    """
    words = text.split()
    total_chars = sum(len(word) for word in words)
    if total_chars == 0:
        return 0.0

    # One dictionary lookup per distinct word.
    verdicts = {}
    for word in words:
        if word not in verdicts:
            verdicts[word] = is_nonsense_word(word)

    nonsense_chars = sum(len(word) for word in words if verdicts[word])
    if nonsense_chars == 0:
        return 0.0
    return nonsense_chars / total_chars

In [None]:
def has_repeated_sequence(text, min_repeats=3, max_phrase_len=5):
    """Detect a word sequence that is repeated back-to-back.

    The text is lower-cased and its whitespace collapsed. For every phrase
    length from 1 to ``max_phrase_len`` words, the function looks for a phrase
    that is immediately followed by at least ``min_repeats`` further copies of
    itself, i.e. ``min_repeats + 1`` consecutive occurrences in total.

    Args:
        text: The text to inspect.
        min_repeats: Minimum number of additional consecutive copies required
            after the first occurrence of the phrase.
        max_phrase_len: Longest phrase, in words, that is considered.

    Returns:
        True if such a repetition exists, otherwise False.
    """
    normalized = ' '.join(text.lower().split())

    for length in range(1, max_phrase_len + 1):
        #   (\b(?:\w+\s+){length-1}\w+\b)   captures a phrase of 'length' words
        #   (?:\s+\1\b){min_repeats,}       matches that phrase at least min_repeats more times
        # The \b after the backreference ensures each copy ends on a word
        # boundary; without it "ja ja ja jammer" counts "jammer" as another "ja".
        pattern = rf'(\b(?:\w+\s+){{{length-1}}}\w+\b)(?:\s+\1\b){{{min_repeats},}}'
        if re.search(pattern, normalized):
            return True
    return False

In [None]:
def parse_stm_file(stm_path):
    """Parse an STM file into a list of ``(id, reference_text)`` tuples.

    Each record is split on whitespace into at most seven fields: the first
    field is used as the id and everything after the sixth field is the
    reference text. Blank lines and ``;;`` comment lines are skipped.

    Args:
        stm_path: Path of the STM file (read as UTF-8).

    Returns:
        A list of ``(id, text)`` tuples in file order; ``text`` is None when
        a record has no transcript after its six leading fields.
    """
    references = []
    with open(stm_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line has no fields (parts[0] would raise IndexError) and
            # ";;" introduces an STM comment; neither is an utterance.
            if not line or line.startswith(';;'):
                continue
            parts = line.split(maxsplit=6)
            utt_id = parts[0]
            text = parts[6] if len(parts) > 6 else None
            references.append((utt_id, text))
    return references

# Dutch labels for non-speech events. cosine_similarity treats a prediction that
# equals one of these like an empty prediction when there is no reference text.
NON_SPEECH_TOKENS = {
    # human, non-verbal sounds
    "applaus",
    "gelach",
    "hij zucht",
    "kuch",
    "niezen",
    "ze zucht",
    # music
    "muziek",
    "zang en muziek",
    # ambient sound and noise
    "geluid",
    "geluid van klaxon",
    "geluid van machines",
    "geluid van verkeer",
    "ruis",
    # nothing intelligible
    "onverstaanbaar",
    "stilte",
}

def cosine_similarity(id, ref_list, prediction):
    """Compare a prediction with the reference text stored under ``id``.

    Args:
        id: Key used to look up the reference in ``ref_list``.
        ref_list: Mapping from id to reference text (or None).
        prediction: The hypothesis text.

    Returns:
        If there is no reference text: False when the prediction is empty or
        a non-speech token (silence correctly identified), True otherwise
        (counted as a hallucination). If a reference exists: the cosine
        similarity of the two sentence embeddings, rounded to 3 decimals.
    """
    reference = ref_list.get(id)

    if reference is None:
        # Nothing was said: any real text in the prediction is a hallucination.
        predicted_silence = prediction == "" or prediction in NON_SPEECH_TOKENS
        return not predicted_silence

    reference_vec = transformer_model.encode(reference, convert_to_tensor=True)
    prediction_vec = transformer_model.encode(prediction, convert_to_tensor=True)
    return round(util.cos_sim(reference_vec, prediction_vec).item(), 3)

In [None]:
def check_tsv(input_path, output_path):
    """Run the hallucination checks on one hypothesis TSV and write the results.

    The subset is taken from the input file name (second "_"-separated part of
    the stem) and selects the reference file ``<subset>_reference.stm`` in
    ``ref_path``. The first input row is treated as a header. For every other
    row the third column is the prediction; it is lower-cased, stripped of
    digits and punctuation, whitespace-normalised and then scored for nonsense
    words, repeated sequences and similarity to the reference.

    Results are written to ``<output_path>.part`` and moved to ``output_path``
    only after all rows are processed, so an interrupted run never leaves a
    partial file that looks complete.

    Args:
        input_path: Hypothesis TSV with columns (file id, ..., prediction).
        output_path: Destination TSV with columns file, nonsense_words,
            repeating_words, incorrect_sentence_meaning, prediction.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    # Load the references first: a missing or malformed STM file then fails
    # before any output file has been created.
    subset = input_path.stem.split("_")[1]
    ref_list = dict(parse_stm_file(ref_path / f"{subset}_reference.stm"))

    # Row count for the progress bar (header excluded); the with-block closes
    # the handle instead of leaking it.
    with open(input_path, encoding="utf-8") as counting_file:
        num_lines = max(sum(1 for _ in counting_file) - 1, 0)

    partial_path = output_path.with_name(output_path.name + ".part")

    with open(input_path, newline='', encoding="utf-8") as infile, \
         open(partial_path, "w", newline='', encoding="utf-8") as outfile:

        reader = csv.reader(infile, delimiter="\t")
        writer = csv.writer(outfile, delimiter="\t")

        writer.writerow(["file", "nonsense_words", "repeating_words", "incorrect_sentence_meaning", "prediction"])
        next(reader, None)  # skip the header; tolerate an empty input file

        for row in tqdm(reader, total=num_lines, desc="Processing rows", unit="row"):
            if not row:
                continue  # blank line: there is no file id to report
            if len(row) < 3:
                writer.writerow([row[0], "", "", "", ""])
                continue

            file_id = row[0]
            prediction = row[2]
            prediction = prediction.strip().lower()
            prediction = re.sub(r'[0-9]+', '', prediction)  #remove numbers
            prediction = re.sub(r'[^\w\s]', '', prediction)  #remove punctuation
            # Removing characters can leave stray or doubled spaces; without this
            # "1 2" becomes " " and "muziek ." becomes "muziek ", which match
            # neither the empty string nor a non-speech token.
            prediction = ' '.join(prediction.split())

            nonsense = detect_nonsense_words(prediction)
            repeating = has_repeated_sequence(prediction)
            similarity = cosine_similarity(file_id, ref_list, prediction)

            writer.writerow([file_id, str(nonsense), str(repeating), str(similarity), prediction])

    # Publish the finished file under its final name.
    partial_path.replace(output_path)
        

In [14]:
# Run the hallucination checks for subsets 5 and 6 of every model directory under hyp_path.
# Existing result files are left untouched, so the cell can be re-run to resume.

# open(..., "w") does not create missing directories, so make sure the target exists.
out_path.mkdir(parents=True, exist_ok=True)

model_names = [f.name for f in hyp_path.iterdir() if f.is_dir()]

for model in model_names:
    model_path = hyp_path / model
    ctm_path = model_path / "tsv"

    for hyp_file in list(ctm_path.glob("*.tsv")):
        # File names are expected to look like "<model>_<subset>.tsv" with a numeric subset.
        try:
            subset = hyp_file.stem.split("_")[1]
            subset_number = int(subset)
        except (IndexError, ValueError):
            print(f"Skipping {hyp_file}: no numeric subset in file name")
            continue

        if subset_number == 5 or subset_number == 6:
            out_file = out_path / f"{model}_{subset}_hallucination.tsv"
            if out_file.exists():
                continue
            print(f"Processing {hyp_file} to {out_file}")
            check_tsv(hyp_file, out_file)

Processing C:\Users\Topicus\Documents\Datasets\analysis\hyp\whisper-small\tsv\whisper-small_6.tsv to C:\Users\Topicus\Documents\Datasets\analysis\results\2. hallucination\raw\whisper-small_6_hallucination.tsv


Processing rows: 100%|██████████| 2472/2472 [18:59:59<00:00, 27.67s/row]        
