In [1]:
import json
import os
import re
import subprocess
from getpass import getpass

import util.data_processing as dp
from llm import qwen, kimi, gpt, llama, mistral
from util import mark as mk

# The OpenAI key must come from the environment, never from the notebook source:
# anything written here is readable by everyone who can see the file or its history.
# SECURITY: a literal key used to be assigned on this line. Treat that key as
# leaked and revoke/rotate it with the provider.
if not os.environ.get("OPENAI_API_KEY"):
    # Interactive fallback so the value is typed at run time and never saved in the notebook.
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Registry of the available LLM wrappers, keyed by a short model name.
# Every wrapper is constructed eagerly here, even though only one is selected below.
model_list = {
    "qwen": qwen.Qwen(),
    "kimi": kimi.Kimi(),
    "4o-mini": gpt.GPT(),
    "llama": llama.Llama(),
    "mistral": mistral.Mistral(),
}

# Model under evaluation; change this key to switch models.
model_name = "mistral"
model = model_list[model_name]


In [2]:
# System prompt sent with every request (second argument to model.ask).
# It instructs the model to wrap hallucinated segments of a passage in
# <error></error> tags, lists six error categories, and ends with one worked
# example (question, response, marked response).
# The text is part of the experiment: editing it changes model behaviour.
SYS_PROMPT = """Given a question and a passage (which is the LLM's response to the question), identify any hallucinations in the passage. Mark each erroneous segment by enclosing it within the <error></error> tags. Ensure that you only tag the specific words or phrases that are hallucinating, without marking entire sentences unless the entire sentence is hallucinating. If there are no errors, return the passage with no tags. Any identified errors should be highlighted using the specified tags without altering the original text. Below are the error definitions followed by an example of the required format.
*Definitions:
1. **Contradictory Entity Error:** An entity (e.g., location name) in a sentence is incorrect and changing that single entity can make the entire sentence
factually correct. Entity errors often involve noun phrases or nouns (usually 1-3 words).
2. **Contradictory Relational Error:** A sentence is semantically incorrect due to a small part. Relational errors refer to incorrect semantic relationships (e.g., verbs, prepositions, or adjectives).
3. **Contradictory Sentence Error:** A full sentence entirely contradicts the known information, and cannot be solved via word or phrase-level edits.
4. **Invented Info Error:** Errors referring to statements that are not known or do not exist. This does not include fictional characters in books or movies. Invented errors include phrases or sentences with unknown entities or misleading information.
5. **Subjective Error:** An entire sentence or phrase that is subjective and cannot be verified, so it should not be included.
6. **Unverifiable Info Error:** A sentence where the whole sentence or phrase is unlikely to be factually grounded. Although it can be true, the sentence cannot be confirmed nor denied using available information or internet search (e.g., private details).
*Example:
```
Question:
"When was the restoration of the Sándor Palace completed?"
Response:
"The restoration of Sándor Palace, also known as the Buda Castle, was completed in several phases. The most significant restoration took place between 1950 and 1961 under the supervision of Hungarian architects Győző Csaplár and Lajos Késmárki. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, it is not accurate to pinpoint an exact completion date for the entire restoration project."
Marked:
"The restoration of Sándor Palace, also known as the <error>Buda Castle</error>, was completed in several phases. The most significant restoration took place between <error>1950 and 1961</error> <error>under the supervision of Hungarian architects</error> <error>Győző Csaplár and Lajos Késmárki</error>. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, <error>it is not accurate to pinpoint an exact completion date for the entire restoration project</error>."
```
"""

In [3]:
def load(file_name, input_dir="./input_data/", suffix=".jsonl"):
    """Load one JSONL input file and return its parsed contents.

    Args:
        file_name: Base name of the file, without directory or extension
            (for example a language code such as "en").
        input_dir: Directory that holds the input files.
            Defaults to "./input_data/".
        suffix: Extension appended to ``file_name``. Defaults to ".jsonl".

    Returns:
        Whatever ``dp.load_file_jsonl`` returns for the resolved path.
    """
    # os.path.join tolerates an input_dir given with or without a trailing separator.
    full_file_name = os.path.join(input_dir, file_name + suffix)
    return dp.load_file_jsonl(full_file_name)

In [4]:
def run_evaluation(ref_file: str, pred_file: str, output_file: str):
    """Run the scoring script ``util/score.py`` on one prediction file.

    Args:
        ref_file: Path to the reference JSONL file.
        pred_file: Path to the prediction JSONL file.
        output_file: Path handed to the scorer as its third argument.

    Returns:
        None. A non-zero exit status from the scorer is reported with a
        printed warning rather than an exception, so a caller looping over
        several files keeps going.
    """
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and rules out command injection through file names.
    command = ["python3", "util/score.py", ref_file, pred_file, output_file]
    result = subprocess.run(command)
    if result.returncode != 0:
        print(f"Warning: util/score.py exited with status {result.returncode} for {pred_file}")

In [None]:
# Matches a single opening or closing error tag.
_ERROR_TAG_RE = re.compile(r"</?error>")
# Matches the text the model was asked to wrap in triple single quotes.
_QUOTED_BLOCK_RE = re.compile(r"'''(.*?)'''", re.DOTALL)


def _build_user_prompt(ele):
    """Build the per-item user prompt from a {"Question", "Response"} dict."""
    return f"""
    Extra Instructions:
    Now, given a question and the LLM's response (passage), detect errors and include tags in the passage as demonstrated in the example above.
    Focus on marking ONLY the specific erroneous words or phrases, NOT entire sentences.
    If entire sentences is not correct, make sure the words or phrases with the most probability of errors are MARKED instead of the whole sentence.
    The content in '''{ele["Question"]}''' is correct at most of the time, which means they should not be marked.
    Use <error></error> tags to replace all the <error_type></error_type> around each identified error segment.
    Please ONLY output the marked response, here is the output format:
    '''{ele["Response"]}'''
    """


def _strip_error_tags(tagged_text):
    """Remove every <error>/</error> tag and locate the tagged spans.

    The text is scanned once, tag by tag, so the returned offsets always index
    into the returned clean text. This holds even when a span crosses a line
    break or the tags are unbalanced: a stray closing tag is ignored, a repeated
    opening tag keeps the first start, and an unclosed opening tag yields no span.

    Returns:
        (clean_text, spans) where spans is a list of (start, end) character
        offsets into clean_text, with end exclusive.
    """
    pieces = []
    spans = []
    clean_len = 0       # length of the clean text assembled so far
    cursor = 0          # position in tagged_text just after the previous tag
    open_start = None   # clean-text offset of the currently open <error>, if any
    for tag in _ERROR_TAG_RE.finditer(tagged_text):
        chunk = tagged_text[cursor:tag.start()]
        pieces.append(chunk)
        clean_len += len(chunk)
        cursor = tag.end()
        if tag.group() == "<error>":
            if open_start is None:
                open_start = clean_len
        elif open_start is not None:
            spans.append((open_start, clean_len))
            open_start = None
    pieces.append(tagged_text[cursor:])
    return "".join(pieces), spans


# Load the language files
file_list = ["ar", "de", "en", "es", "fi", "fr", "hi", "it", "sv", "zh"]

# Create the output locations up front so a fresh checkout does not fail on open().
os.makedirs("test", exist_ok=True)
os.makedirs(f"output/{model_name}", exist_ok=True)

for lan in file_list:
    language = lan
    input_lst = load(language)

    # USER PROMPT inputs: one question/response pair per record.
    prompt_user_lst = [
        {"Question": record["model_input"], "Response": record["model_output_text"]}
        for record in input_lst
    ]

    # Query the model once per record.
    resp_list = []
    for ele in prompt_user_lst:
        response = model.ask(_build_user_prompt(ele), SYS_PROMPT)
        print(response)
        resp_list.append(response)

    # Save intermediate results (raw responses; this file is overwritten for every language)
    with open(f"test/{model_name}_tmpplst.jsonl", "w", encoding="utf-8") as f:
        for ele in resp_list:
            f.write(json.dumps(ele) + "\n")

    # Copy each record without soft_labels and hard_labels, leaving input_lst untouched
    tmp_list = [
        {k: v for k, v in item.items() if k not in ("soft_labels", "hard_labels")}
        for item in input_lst
    ]

    soft_lst = []
    # Extract soft labels from the response
    for i, ele in enumerate(resp_list):
        # Keep only the first '''...''' section when the model used the requested format.
        model_text = _QUOTED_BLOCK_RE.findall(ele)
        if model_text:
            resp_list[i] = model_text[0]
        tmp_list[i]["hallucination_detection_output"] = resp_list[i]

        clean_text, spans = _strip_error_tags(resp_list[i])
        # NOTE(review): this replaces the record's text with the model's echoed
        # passage, so the offsets refer to that echo - confirm this is what
        # util/score.py expects when the echo differs from the original text.
        tmp_list[i]["model_output_text"] = clean_text
        soft_lst.append(
            [{"start": start, "end": end, "prob": 1.0} for start, end in spans]
        )

    for ele, sl in zip(tmp_list, soft_lst):
        ele["soft_labels"] = sl

    # Second intermediate dump (also overwritten for every language)
    with open("test/zhongjian.jsonl", "w", encoding="utf-8") as f:
        for ele in tmp_list:
            f.write(json.dumps(ele) + "\n")

    # Convert soft_labels to hard_labels and append to each item
    for item in tmp_list:
        item["hard_labels"] = [
            [label["start"], label["end"]] for label in item["soft_labels"]
        ]

    ref_file = f"input_data/{language}.jsonl"
    pred_file = f"output/{model_name}/{language}_{model_name}_pred.jsonl"
    output_file = f"output/{model_name}/{language}_{model_name}_scores.txt"

    # Save the result
    with open(pred_file, "w", encoding="utf-8") as f:
        for item in tmp_list:
            f.write(json.dumps(item) + "\n")

    # Run evaluation
    run_evaluation(ref_file, pred_file, output_file)


'''جليقة، التي تعرف الآن باسم <error>كوريا الجنوبية</error>، تتألف من 16 مقاطعة.'''
'''نعم، شارک داف فرانك في فيلم ليجو (2017) في دور <error>(هالوسينغ هيلبي)</error>.'''
'''ملعب وولز سيغنهايم هو ملعبهوكي في <error>نمسا، النرويج</error>. يعتبر من بين أكبر الملاجئ في أوروبا، مع استيعاب <error>15.005</error> مشجع. ما ييزمه عن الملاعيب الأخرى في البلاد هو أنه يقدم تجربة متكاملة للمشجعين، بما في ذلك مجموعة واسعة من المتاجر والباعة المتجولة والأنشطة والتسوق. كما يوفر الملعب خدمات نقل جيدة، مما يجعل من السهل على المشجعون الوصول إليه.'''
'''الاسم الكافي لتشنسلر جون ثان بَينيت هو جون ديفيد بَنيت.'''
'''تأسست نادي هميلتن أكادميكل في عام 1856.'''
'''مسلسل "Almo'aeb" أو "The Punisher" هو مسلسل سعودي من نوعية الحكاية الدرامية، وهو من إخراج محمد العوضي ومن تأليف عبد الله العيسى.
هذا المسلسل عرض لأول مرة على منصات البث التلفزيوني عبر الإنترنت، ومن بين هذه المنصّات:
1. منصه "<error>Anghami</error>"
2. موقع "<error>Jawhara</error>"

يُمكنك الإشتراك في هذه الخدمات من خلال الدفع الشهري أو السنوي، ويمكنك 