In [1]:
from llm import qwen, kimi, gpt
import util.data_processing as dp
from util import mark as mk

model_list = {
    "qwen": qwen.Qwen(), 
    "kimi": kimi.Kimi(), 
    "gpt": gpt.GPT()
    }


In [2]:
SYS_PROMPT = """Given a question and a passage (which is the LLM's response to the question), identify any <entity>, <relation>, <contradictory>, <invented>, <subjective>, and <unverifiable> hallucinations in the passage. Mark each erroneous segment by enclosing it within the corresponding <error_type></error_type> tags. Ensure that you only tag the specific words or phrases that are hallucinating, without marking entire sentences unless the entire sentence is hallucinating. If there are no errors, return the passage with no tags. Any identified errors should be highlighted using the specified tags without altering the original text. Below are the error definitions followed by an example of the required format.
*Definitions:
1. **Contradictory Entity Error (`<entity>`):** An entity  (e.g., location name) in a sentence is incorrect and changing that single entity can make the entire sentence
factually correct. Entity errors often involve noun phrases or nouns (usually 1-3 words).
2. **Contradictory Relational Error (`<relation>`):** A sentence is semantically incorrect due to a small part. Relational errors refer to incorrect semantic relationships (e.g., verbs, prepositions, or adjectives).
3. **Contradictory Sentence Error (`<contradictory>`):** A full sentence entirely contradicts the known information, and cannot be solved via word or phrase-level edits.
4. **Invented Info Error (`<invented>`):** Errors referring to statements that are not known or do not exist. This does not include fictional characters in books or movies. Invented errors include phrases or sentences with unknown entities or misleading information.
5. **Subjective Error (`<subjective>`):** An entire sentence or phrase that is subjective and cannot be verified, so it should not be included.
6. **Unverifiable Info Error (`<unverifiable>`):** A sentence where the whole sentence or phrase is unlikely to be factually grounded. Although it can be true, the sentence cannot be confirmed nor denied using available information or internet search (e.g., private details).
*Example:
```
Question:
"When was the restoration of the Sándor Palace completed?"
Response:
"The restoration of Sándor Palace, also known as the Buda Castle, was completed in several phases. The most significant restoration took place between 1950 and 1961 under the supervision of Hungarian architects Győző Csaplár and Lajos Késmárki. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, it is not accurate to pinpoint an exact completion date for the entire restoration project."
Marked:
"The restoration of Sándor Palace, also known as the <entity>Buda Castle</entity>, was completed in several phases. The most significant restoration took place between <entity>1950 and 1961</entity> <relation>under the supervision of Hungarian architects</relation> <invented>Győző Csaplár and Lajos Késmárki</invented>. However, it's important to note that various parts of the palace continued to be restored and renovated throughout the decades following this period. Therefore, <invented>it is not accurate to pinpoint an exact completion date for the entire restoration project</invented>."
```
"""




In [3]:
def load(file_name):
    #load file
    input_dir_path = "./input_data/" 
    suffix = ".jsonl"
    full_file_name = input_dir_path + file_name + suffix
    input_lst = dp.load_file_jsonl(full_file_name)
    
    return input_lst

In [4]:
file_list = ["ar", "de", "en", "es", "fi", "fr", "hi", "it", "sv", "zh"]
input_lst = load('en')


In [5]:
prompt_user_lst = []
for input in input_lst:
    question = input["model_input"]
    response = input["model_output_text"]
    prompt_input = {"Question": question, "Response": response}
    #print(prompt_user_lst[10])
    prompt_user_lst.append(prompt_input)

# print(prompt_user_lst)

In [None]:
resp_list = []

for ele in prompt_user_lst:
    USER_PROMPT = f"""
Extra Instructions:
Now, given a question and the LLM's response (passage), detect errors and include tags in the passage as demonstrated in the example above.
Focus on marking ONLY the specific erroneous words or phrases, NOT entire sentences.
If entire sentences is not correct, make sure the words or phrases with the most probability of errors are MARKED instead of the whole sentence.
The content in '''{ele["Question"]}''' is correct at most of the time, which means they should not be marked.
Use <error></error> tags to replace all the <error_type></error_type> around each identified error segment.
Please ONLY output the marked response, here is the output format:
'''{ele["Response"]}'''
"""
    # chaneg the model name
    # print(USER_PROMPT)
    response = model_list["qwen"].ask(USER_PROMPT, SYS_PROMPT)
    resp_list.append(response)

# with open("final_ans_gpt4o.txt", "w", encoding="utf-8") as f:
#     for ele in resp_list:
#         f.write(ele + '\n')


In [6]:
import re

# Remove soft_labels and hard_labels from input_lst
tmp_list = []
for item in input_lst:
    new_item = {k: v for k, v in item.items() if k not in ["soft_labels", "hard_labels"]}
    tmp_list.append(new_item)


with open("final_ans_gpt4o.txt", "r", encoding="utf-8") as f:
    resp_list = f.readlines()

soft_lst = []
# resp_list
for i, ele in enumerate(resp_list):
    soft_labels = []
    model_text = re.findall(r"'''(.*?)'''", ele, re.DOTALL)
    if model_text:
        resp_list[i] = model_text[0]
    
    # Find all soft labels
    matches = list(re.finditer(r"<error>(.*?)</error>", resp_list[i]))
    # Remove <error> tags and calculate new start and end positions
    clean_text = re.sub(r"</?error>", "", resp_list[i])
    offset = 0
    for match in matches:
        start = match.start() - offset
        end = match.end() - offset - len("<error></error>")
        soft_labels.append({"start": start, "end": end, "prob": float(1.0)})
        offset += len("<error></error>")
    # print(soft_labels)
    soft_lst.append(soft_labels)

for ele, sl in zip(tmp_list, soft_lst):
    ele["soft_labels"] = sl

# for i in range(len(tmp_list)):
#     print(tmp_list[i])

In [7]:
# Convert soft_labels to hard_labels and append to each item
for item in tmp_list:
    hard_labels = [[label['start'], label['end']] for label in item['soft_labels']]
    item['hard_labels'] = hard_labels

# Check the result
# for item in tmp_list:
#     print(item)


In [8]:
import json

with open("final_ans_gpt4o.jsonl", "w", encoding="utf-8") as f:
    for item in tmp_list:
        f.write(json.dumps(item) + "\n")

In [12]:
import subprocess

ref_file = "input_data/en.jsonl"
pred_file = "final_ans_gpt4o.jsonl"
output_file = "gpt4_scores.txt"

def run_evaluation(ref_file: str, pred_file: str, output_file: str):
    command = f"python3 util/score.py {ref_file} {pred_file} {output_file}"
    subprocess.run(command, shell=True)

run_evaluation(ref_file, pred_file, output_file)


10 0.2
11 0.2
12 0.30000000000000004
13 0.2
14 0.2
15 0.2
16 0.2
17 0.2
25 0.9
26 0.9
27 0.9
28 0.9
29 0.9
30 0.9
31 0.1
32 0.1
33 0.1
34 0.1
35 0.1
36 0.1
45 1.0
46 1.0
47 1.0
48 1.0
49 0.30000000000000004
50 0.30000000000000004
51 0.30000000000000004
52 0.30000000000000004
53 0.30000000000000004
54 0.30000000000000004
55 0.30000000000000004
56 0.30000000000000004
57 0.30000000000000004
58 0.30000000000000004
59 0.30000000000000004
60 0.30000000000000004
61 0.30000000000000004
62 0.30000000000000004
63 0.30000000000000004
64 0.30000000000000004
65 0.2
66 0.2
67 0.2
68 0.2
69 0.9
70 0.9
71 0.9
72 0.9
73 0.9
74 0.9
75 0.9
76 0.9
77 0.9
78 0.9
79 0.9
80 0.9
81 0.9
82 0.9
25 1.0
26 1.0
27 1.0
28 1.0
29 1.0
30 1.0
0 0.0909090909
1 0.0909090909
2 0.0909090909
3 0.0909090909
4 0.0909090909
5 0.0909090909
6 0.0909090909
7 0.0909090909
8 0.0909090909
9 0.0909090909
10 0.0909090909
11 0.0909090909
12 0.0909090909
13 0.0909090909
14 0.0909090909
15 0.0909090909
16 0.0909090909
17 0.0909090909
18

Traceback (most recent call last):
  File "/Users/franky_mac/nlp_team/util/score.py", line 95, in <module>
    _ = main(a.ref_file, a.pred_file, a.output_file)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/franky_mac/nlp_team/util/score.py", line 82, in main
    cors = np.array([score_cor(r, d) for r, d in zip(ref_dicts, pred_dicts)])
                     ^^^^^^^^^^^^^^^
  File "/Users/franky_mac/nlp_team/util/score.py", line 72, in score_cor
    pred_vec[idx] = span['prob']
    ~~~~~~~~^^^^^
IndexError: list assignment index out of range
