In [1]:
import json
import os

from llm import qwen, kimi, gpt
import util.data_processing as dp
from util import mark as mk

# Short name of the client to use; it is also embedded in the output file names
# written further down.
model_name = "gpt"

# One ready-to-use client per supported back-end, keyed by short name.
model_list = {
    "qwen": qwen.Qwen(),
    "kimi": kimi.Kimi(),
    "gpt": gpt.GPT(),
}

# Client that every request in this notebook goes through.
model = model_list[model_name]


In [2]:
# System prompt shared by every request: it states the tagging task (wrap
# hallucinated segments in <error></error> without altering the passage), lists
# the six error definitions, and ends with one worked example. The text is sent
# to the model as-is, so any edit here changes model behaviour.
SYS_PROMPT = """Given a question and a passage (which is the LLM's response to the question), identify any hallucinations in the passage. Mark each erroneous segment by enclosing it within the <error></error> tags. Ensure that you only tag the specific words or phrases that are hallucinating, without marking entire sentences unless the entire sentence is hallucinating. If there are no errors, return the passage with no tags. Any identified errors should be highlighted using the specified tags without altering the original text. Below are the error definitions followed by an example of the required format.
*Definitions:
1. **Contradictory Entity Error:** An entity (e.g., location name) in a sentence is incorrect and changing that single entity can make the entire sentence
factually correct. Entity errors often involve noun phrases or nouns (usually 1-3 words).
2. **Contradictory Relational Error:** A sentence is semantically incorrect due to a small part. Relational errors refer to incorrect semantic relationships (e.g., verbs, prepositions, or adjectives).
3. **Contradictory Sentence Error:** A full sentence entirely contradicts the known information, and cannot be solved via word or phrase-level edits.
4. **Invented Info Error:** Errors referring to statements that are not known or do not exist. This does not include fictional characters in books or movies. Invented errors include phrases or sentences with unknown entities or misleading information.
5. **Subjective Error:** An entire sentence or phrase that is subjective and cannot be verified, so it should not be included.
6. **Unverifiable Info Error:** A sentence where the whole sentence or phrase is unlikely to be factually grounded. Although it can be true, the sentence cannot be confirmed nor denied using available information or internet search (e.g., private details).
*Example:
```
Question:
"When was the restoration of the Sándor Palace completed?"
Response:
"The restoration of Sándor Palace, also known as the Buda Castle, was completed in several phases. The most significant restoration took place between 1950 and 1961 under the supervision of Hungarian architects Győző Csaplár and Lajos Késmárki. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, it is not accurate to pinpoint an exact completion date for the entire restoration project."
Marked:
"The restoration of Sándor Palace, also known as the <error>Buda Castle</error>, was completed in several phases. The most significant restoration took place between <error>1950 and 1961</error> <error>under the supervision of Hungarian architects</error> <error>Győző Csaplár and Lajos Késmárki</error>. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, <error>it is not accurate to pinpoint an exact completion date for the entire restoration project</error>."
```
"""

In [3]:
def load(file_name):
    """Load the JSONL input file named ``file_name`` from ``./input_data/``.

    The path is ``"./input_data/" + file_name + ".jsonl"``; it is handed to
    ``dp.load_file_jsonl`` and that helper's result is returned unchanged.
    """
    jsonl_path = "".join(["./input_data/", file_name, ".jsonl"])
    return dp.load_file_jsonl(jsonl_path)

In [4]:
# Load the language file
# Language codes listed for this task (kept for reference; nothing in the cells
# shown here reads this list).
file_list = [
    "ar", "de", "en", "es", "fi",
    "fr", "hi", "it", "sv", "zh",
]

# Language processed by this run; it selects the input file and is embedded in
# the output file names.
language = "zh"
input_lst = load(file_name=language)

In [5]:
# USER PROMPT
# Pair every question with the model answer that has to be checked. The loop
# variable is deliberately not called `input`: at notebook top level that name
# would shadow the builtin input() for the rest of the kernel session.
prompt_user_lst = [
    {"Question": record["model_input"], "Response": record["model_output_text"]}
    for record in input_lst
]

# print(prompt_user_lst)

In [6]:
# Raw model replies, appended in the same order as prompt_user_lst.
resp_list = []

for ele in prompt_user_lst:
    # Per-item instructions. The question and the response are interpolated
    # between ''' delimiters; the post-processing cell later looks for a
    # '''...''' section in the reply to recover the marked passage.
    # The text is sent verbatim, so wording changes alter model behaviour.
    USER_PROMPT = f"""
Extra Instructions:
Now, given a question and the LLM's response (passage), detect errors and include tags in the passage as demonstrated in the example above.
Focus on marking ONLY the specific erroneous words or phrases, NOT entire sentences.
If entire sentences is not correct, make sure the words or phrases with the most probability of errors are MARKED instead of the whole sentence.
The content in '''{ele["Question"]}''' is correct at most of the time, which means they should not be marked.
Use <error></error> tags to replace all the <error_type></error_type> around each identified error segment.
Please ONLY output the marked response, here is the output format:
'''{ele["Response"]}'''
"""
    # One request per item, passing the user prompt first and SYS_PROMPT second.
    # NOTE(review): presumably returns the reply text as a str — confirm in model.ask.
    response = model.ask(USER_PROMPT, SYS_PROMPT)
    # Echo each reply so progress is visible while the loop runs.
    print(response)
    resp_list.append(response)



'''
彼得·冯·斯塔芬在<error>1880年</error>出生，所以他参加第23届夏季奥林匹克运动会时是<error>64岁</error>。
'''
'''
 故事背景： 
故事发生在纽约市，纽约警察局警探尼克·查尔斯（Nick Charles）和警官乔·兰扎（Joe Lanza）是纽约警局里最出色的搭档，他们经常接到各种棘手的案件，但总能凭借着出色的智慧和勇气，将罪犯绳之以法。
故事梗概：
尼克和乔是警队里的传奇搭档。他们搭档了十年，破获了无数案件。尼克是名出色的警员，他聪明、机智、勇敢，总是能从罪犯的言语和行为中发现蛛丝马迹，从而找到破案的关键。乔则是一名出色的警察，他的身手敏捷，反应迅速，经常能在关键时刻救尼克于危难之中。他们的搭档关系非常默契，彼此信任，互相支持，是整个警队的楷模。

故事主要围绕尼克与乔的破案件经历展开，每集都会有一个新的案件挑战他们。这些案件包括但不限于：谋杀案、绑架案，甚至是涉及黑帮的复杂案件等。在破这些案的过程中，尼克会运用他的智慧，乔则会运用他出色的身法，两人配合默契。同时，故事也会穿插一些他们与同事、家人、朋友之间的互动，展现他们丰富多彩的生活。整个故事充满了紧张刺激的气氛，同时也充满了幽默和温情。
'''
'''
瑞典皇家海军（Kungliga flottan）是瑞典军队的一部分，负责保护瑞典的国家利益和领土。瑞典皇室海军的主要职责包括海上安全、海上执法、搜救行动以及参与国际军事行动。以下是瑞典皇宫海军的一些主要组成机构和部门：

1. 海军司令部（Marinen）：这是瑞典军方海军的最高指挥机构，负责对海军部队进行战略规划、组织和指挥。
2. 海战司令部(Flottans)：负责指挥和管理瑞典海战部队的日常运作，包括舰艇、潜艇和其他作战平台。 
3.海军航空兵司令部 (<error>Flygvapnet</error>) : 负责管理和指挥瑞典空军部队，其中包括海军战斗机、直升机和其他飞行器。  
4.海上战斗训练中心 (Sjöstridsskolan): 主要负责培训瑞典水兵和海军陆战队的战斗人员，以准备他们执行各种海上任务。   
5.海事安全局 (Marinedistrikt) 和地区海军指挥部 (<error>Mekaniserade amfibieregementet</error>)等：这些机构负责管理瑞

In [7]:
# Save intermediate results
# Persist the raw replies (one JSON string per line) so they can be inspected
# without repeating the requests.
tmp_resp_path = f"test/{model_name}_tmpplst.jsonl"
# open(..., "w") does not create missing directories; make sure the target
# directory exists so a fresh checkout does not fail right after the requests.
os.makedirs(os.path.dirname(tmp_resp_path), exist_ok=True)
with open(tmp_resp_path, "w", encoding="utf-8") as f:
    for ele in resp_list:
        f.write(json.dumps(ele) + "\n")

In [8]:
import re

# Remove soft_labels and hard_labels from input_lst
# (work on shallow copies so the loaded records themselves are left untouched).
tmp_list = [
    {k: v for k, v in item.items() if k not in ["soft_labels", "hard_labels"]}
    for item in input_lst
]

# Matches either an opening or a closing error tag.
ERROR_TAG_RE = re.compile(r"</?error>")


def split_error_tags(tagged_text):
    """Strip ``<error>``/``</error>`` tags and locate the tagged spans.

    Args:
        tagged_text: Text that may contain ``<error>...</error>`` markup.

    Returns:
        ``(clean_text, spans)`` where ``clean_text`` is ``tagged_text`` with every
        tag removed and ``spans`` is a list of ``(start, end)`` character offsets
        into ``clean_text`` (end exclusive), in order of appearance.

    The text is scanned once, tag by tag, and offsets are measured against the
    text emitted so far. This keeps offsets correct when a span contains line
    breaks or when tags are unbalanced: a stray ``</error>`` or an unclosed
    ``<error>`` is dropped from the text without producing a span, and an
    ``<error>`` seen while a span is already open is ignored.
    """
    pieces = []
    spans = []
    clean_len = 0      # length of the cleaned text emitted so far
    cursor = 0         # position in tagged_text right after the previous tag
    span_start = None  # clean-text offset of the currently open span, if any
    for tag in ERROR_TAG_RE.finditer(tagged_text):
        chunk = tagged_text[cursor:tag.start()]
        pieces.append(chunk)
        clean_len += len(chunk)
        cursor = tag.end()
        if tag.group() == "<error>":
            if span_start is None:
                span_start = clean_len
        elif span_start is not None:
            spans.append((span_start, clean_len))
            span_start = None
    pieces.append(tagged_text[cursor:])
    return "".join(pieces), spans


soft_lst = []
# Extract soft labels from the response
for i, ele in enumerate(resp_list):
    # The prompt asks for the marked passage between ''' delimiters; when they
    # are present keep only the first delimited section, otherwise use the
    # reply as-is.
    model_text = re.findall(r"'''(.*?)'''", ele, re.DOTALL)
    if model_text:
        resp_list[i] = model_text[0]
    tmp_list[i]["hallucination_detection_output"] = resp_list[i]

    clean_text, spans = split_error_tags(resp_list[i])
    # NOTE(review): the offsets below index the passage as echoed by the
    # detector, which replaces the original model_output_text here; if the
    # detector altered the passage they may not line up with the reference
    # file — confirm this is intended.
    tmp_list[i]["model_output_text"] = clean_text
    # Every tagged span is stored with a fixed probability of 1.0.
    soft_lst.append(
        [{"start": start, "end": end, "prob": 1.0} for start, end in spans]
    )

for ele, sl in zip(tmp_list, soft_lst):
    ele["soft_labels"] = sl

# Intermediate dump of the labelled records ("zhongjian" is presumably pinyin
# for "intermediate"). Create the directory first: open(..., "w") will not.
os.makedirs("test", exist_ok=True)
with open("test/zhongjian.jsonl", "w", encoding="utf-8") as f:
    for ele in tmp_list:
        f.write(json.dumps(ele) + "\n")

In [9]:
# Convert soft_labels to hard_labels and append to each item:
# every soft label contributes one [start, end] pair, in the same order.
for record in tmp_list:
    record["hard_labels"] = [
        [span["start"], span["end"]] for span in record["soft_labels"]
    ]

# Check the result
# for item in tmp_list:
#     print(item)


In [10]:
# Save the result
# One JSON object per line; this is the prediction file handed to the scorer.
pred_path = f"output/{model_name}/{language}_{model_name}_pred.jsonl"
# The per-model output directory may not exist yet and open(..., "w") will not
# create it, so make sure it is there before writing.
os.makedirs(os.path.dirname(pred_path), exist_ok=True)
with open(pred_path, "w", encoding="utf-8") as f:
    for item in tmp_list:
        f.write(json.dumps(item) + "\n")

In [11]:
import subprocess

# Run evaluation
# Reference file, the prediction file written above, and the score report path.
ref_file, pred_file, output_file = (
    f"input_data/{language}.jsonl",
    f"output/{model_name}/{language}_{model_name}_pred.jsonl",
    f"output/{model_name}/{language}_{model_name}_scores.txt",
)

def run_evaluation(ref_file: str, pred_file: str, output_file: str):
    """Run ``util/score.py`` on a reference/prediction pair.

    Args:
        ref_file: Path to the reference JSONL file.
        pred_file: Path to the prediction JSONL file.
        output_file: Path passed to the scorer as its third argument.

    Raises:
        subprocess.CalledProcessError: If the scoring script exits with a
            non-zero status, so a failed evaluation is not silently ignored.
    """
    # Pass the arguments as a list instead of an interpolated shell string:
    # paths containing spaces or shell metacharacters are then handed to the
    # script unchanged and cannot be interpreted by a shell.
    command = ["python3", "util/score.py", ref_file, pred_file, output_file]
    subprocess.run(command, check=True)

# Score the predictions saved above against the reference file.
run_evaluation(ref_file=ref_file, pred_file=pred_file, output_file=output_file)
