In [4]:
import re
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import os
import pandas as pd
from gpt4o import Openai, API_INFOS
from datasets import load_dataset, Dataset
from tqdm import tqdm
from utils import read_jsonl

# System prompt for the pairwise judge: it describes the five aesthetic
# criteria, the evaluation steps, and the five verdict labels ([[A>>B]] ...
# [[B>>A]]) the model is asked to end its answer with.
system_template = """
You are an impartial judge tasked with evaluating the textual aesthetics of responses provided by two AI assistants to the user prompt displayed below. The responses are presented as images. Your goal is to determine which response is more aesthetically pleasing and easier to read and understand, considering both textual and visual factors.

Evaluate each response based on the following criteria:

1. **Readability**: Is the text easy to read and understand? Are the sentences of appropriate length and complexity?
2. **Visual Organization**: Is the text visually organized in a logical manner? Are there appropriate headings, subheadings, lists, and other formatting elements?
3. **Consistency**: Does the text maintain a consistent style and format throughout?
4. **Overall Structure**: Are the paragraphs well-structured and logically connected? Is there appropriate spacing between paragraphs?
5. **Visual Clarity**: Is the text in the image clear and legible? Are the fonts and spacing visually appealing and easy on the eyes?

Follow these steps for your evaluation:
1. **Analyze each response**: Carefully examine both images based on the criteria provided.
2. **Compare both responses**: Determine which response excels in textual and visual aesthetics considering all aspects.
3. **Make a final decision**: Choose the response that is better in terms of textual and visual aesthetics and justify your choice.

Output your final verdict with one of the following labels:
1. Assistant A is significantly better: [[A>>B]]
2. Assistant A is slightly better: [[A>B]]
3. Tie, relatively the same: [[A=B]]
4. Assistant B is slightly better: [[B>A]]
5. Assistant B is significantly better: [[B>>A]]

Example output: "My final verdict is Assistant A is slightly better: [[A>B]]."
"""

# User-turn prompt. The `{instruction}` placeholder is filled via
# `str.format(instruction=...)`; the text tells the judge that the first image
# is Assistant A's response and the second is Assistant B's.
user_template = """
<|User Prompt|>{instruction}
Below are two images: the first one is Assistant A's response, and the second one is Assistant B's response. Please evaluate them based on the criteria provided and give the final verdict answer.
"""

# def extract_final_verdict(llm_output):  
#     """  
#     Extracts the final verdict from the LLM output.  
  
#     Parameters:  
#     llm_output (str): The output string from the LLM.  
  
#     Returns:  
#     str: The final verdict in the format [[A>>B]], [[A>B]], [[A=B]], [[B>A]], or [[B>>A]].  
#     """  
#     # Define the regex pattern to match the final verdict  
#     pattern = r'\[\[A>>B\]\]|\[\[A>B\]\]|\[\[A=B\]\]|\[\[B>A\]\]|\[\[B>>A\]\]'  
  
#     # Search for the pattern in the LLM output  
#     match = re.search(pattern, llm_output)  
  
#     if match:  
#         return match.group(0)  
#     else:  
#         return None 
    
# def get_judged_answer(client, instruction, answer_1, answer_2, user_template, system_template, max_tokens=2048):  
#     # Format the user template, inserting the instruction and the completion text
#     content = user_template.format(instruction=instruction, answer_1=answer_1, answer_2=answer_2)  
      
#     # Get the response from the client
#     gpt_answer = client.get_response(content=content, system=system_template, max_tokens=max_tokens)  
      
#     if gpt_answer is None:  
#         gpt_answer = ""  
#     gpt_answer = gpt_answer.strip()  
    
#     score = extract_final_verdict(gpt_answer)

#     return score, gpt_answer  

# def get_judge(client, row, user_template, system_template, max_tokens=2048, output_file="judges.jsonl"):  
#     # prompt = row['prompt']  
#     # response = row['response']  
#     # need_modification, revised_text, gpt_answer = get_revised_text(client, prompt, response, user_template, system_template, max_tokens=max_tokens)  
#     # print(f"index {index}")
#     prompt = row['prompt'] 
#     answer_1 = row['response']
#     answer_2 = row['revised_text']
#     score, judgment = get_judged_answer(client, prompt, answer_1, answer_2, user_template, system_template, max_tokens=2048) 
#     result = row
#     result['judge'] = judgment
#     result['score'] = score    
#     with open(output_file, 'a') as f:  
#         f.write(json.dumps(result) + "\n")  
#     return result 

# def main():  
#     cur_dir = os.path.dirname(os.path.abspath(__file__))
#     clients = [Openai(apis=[API_INFOS[i]]) for i in range(len(API_INFOS))]  
#     print(f"clients number: {len(clients)}")
#     # Initialize multiple clients  
#     revised_data = read_jsonl(os.path.join(cur_dir, "revised_data/output_sorted.jsonl"))
#     sample_data = revised_data[:10]
#     # sample_data = export_data # all
#     # user_template = "User: {instruction}\nCompletion: {completion}"  
#     # system_template = "You are a helpful assistant."  
#     max_tokens = 2048  

#     # data_path = os.path.join(cur_dir, "revised_data/output_sorted.jsonl")
#     output_file = "revised_data/output_judge.jsonl"  
#     output_file = os.path.join(cur_dir, output_file)
  
#     # Clear the output file before starting  
#     if os.path.exists(output_file):  
#         os.remove(output_file)  
  
#     revised_data = []  
  
#     with ThreadPoolExecutor(max_workers=len(clients)) as executor:  
#         # Create a future for each row in the dataset  
#         futures = [executor.submit(get_judge, clients[i % len(clients)], row, user_template, system_template, max_tokens, output_file) for i, row in enumerate(sample_data)]  
  
#         # Collect the results as they complete  
#         for future in tqdm(as_completed(futures), total=len(futures)):  
#             revised_data.append(future.result())  

  
#     # Load results from JSONL file and ensure the order is preserved  
#     with open(output_file, 'r') as f:  
#         revised_data = [json.loads(line) for line in f]  
  
#     # Sort by the original index  
#     revised_dataset = revised_data.sort(key=lambda x: x['index'])  
  
#     # Create a new Dataset  
#     revised_dataset = Dataset.from_pandas(pd.DataFrame(revised_data))  
#     sorted_output_path = os.path.join(cur_dir, "revised_data/output_judge_sorted.jsonl")
#     revised_dataset.to_json(sorted_output_path) 


'data_process/output/revised_response/output_17.png'

In [2]:
# Load the records from the sorted JSONL file; later cells index into this
# object by position and read each record's 'prompt' field.
revised_data = read_jsonl(
    "revised_data/output_sorted.jsonl",
)

# Leave the first ten records as the cell's displayed value for a quick look.
revised_data[:10]

[{'index': 0,
  'prompt': 'how can i develop a habit of drawing daily',
  'response': "Developing a daily habit of drawing can be challenging but with consistent practice and a few tips, it can become an enjoyable and rewarding part of your daily routine. Here are some strategies to help you develop the habit of drawing daily:\n\n1. Set a specific time: Allocate a specific time of the day to draw. It could be in the morning, afternoon, or evening. Make drawing a part of your daily routine.\n2. Set a specific duration: Determine the amount of time you want to spend on drawing each day. It can be as little as 10 minutes or as long as an hour. Be consistent with the duration to help build the habit.\n3. Start small and simple: Don't try to create a masterpiece every day, start with simple and easy-to-do sketches. Focus on improving your skills gradually.\n4. Use a variety of tools and mediums: Experiment with different tools like pencils, pens, markers, and different mediums like paper, c

In [6]:
# Judge one rendered response pair with the image-capable client.
image_name = 17

# image1 is the revised rendering and image2 the original one. The user
# template calls the first image "Assistant A" and the second "Assistant B"
# (presumably get_image_response sends them in this order — TODO confirm).
image1 = f'output/revised_response/output_{image_name}.png'
image2 = f'output/original_response/output_original_{image_name}.png'

# The prompt is looked up at position image_name - 1.
# NOTE(review): this assumes image numbering is 1-based relative to the list
# position — confirm against how the images were generated.
temp_instruct = revised_data[image_name - 1]['prompt']
print(temp_instruct)

# Fill the user-turn template with the prompt and echo it for inspection.
content = user_template.format(instruction=temp_instruct)
print(content)

# Build the client from every configured API entry and request the verdict.
oai_clients = Openai(apis=API_INFOS)
res = oai_clients.get_image_response(
    content=content,
    image1=image1,
    image2=image2,
    system=system_template,
)
print(res)

act as the project manager for the Calderdale Year of Culture 2024. Identify market failures and opportunities providing numerical data where possible that could be addressed by a cultural programme?

<|User Prompt|>act as the project manager for the Calderdale Year of Culture 2024. Identify market failures and opportunities providing numerical data where possible that could be addressed by a cultural programme?
Below are two images: the first one is Assistant A's response, and the second one is Assistant B's response. Please evaluate them based on the criteria provided and give the final verdict answer.



My final verdict is Assistant A is significantly better: [[A>>B]].

### Justification:

1. **Readability**:
   - **Assistant A**: The text is concise and to the point. The use of bullet points makes it easy to read and understand.
   - **Assistant B**: The text is more verbose and less concise, making it harder to follow.

2. **Visual Organization**:
   - **Assistant A**: The use of bullet points and clear separation of examples makes the information easy to digest.
   - **Assistant B**: The text is presented in a paragraph format, which is less visually organized and harder to scan quickly.

3. **Consistency**:
   - **Assistant A**: Maintains a consistent style and format throughout the response.
   - **Assistant B**: The style is consistent but the format is less effective for quick comprehension.

4. **Overall Structure**:
   - **Assistant A**: The structure is clear with well-defined sections and appropriate spacing.
   - **Assistant B**: The structure is less clear due to the para

In [17]:
# Spot-check the prompt stored at list position 16 — the same position the
# image-17 judging cell reads via `image_name - 1`.
revised_data[16]['prompt']

'act as the project manager for the Calderdale Year of Culture 2024. Identify market failures and opportunities providing numerical data where possible that could be addressed by a cultural programme?'

In [5]:
# Send the prompt stored in a scratch file to the model and save the reply
# to temp_output.txt.
oai_clients = Openai(
    apis=API_INFOS
)
# Disabled sanity probe that asks the model which GPT version it is:
# res = oai_clients.call("你是gpt几？")

# NOTE(review): "temp_prompy.txt" looks like a typo for "temp_prompt.txt"; the
# path is kept unchanged because the file on disk may carry this name — confirm.
# Read and write as UTF-8 explicitly: without `encoding`, open() uses the
# locale-dependent default, which can fail on non-ASCII prompt/response text.
with open("temp_prompy.txt", 'r', encoding="utf-8") as f:
    prompt = f.read()

res = oai_clients.get_response(prompt, max_tokens=4096)

# Validate the reply BEFORE opening the output in 'w' mode. Opening first
# would truncate any previously saved result and then fail inside f.write()
# with an opaque TypeError. (The disabled judge helper earlier in this
# notebook guards against a None reply from get_response, so it is treated
# as possible here.)
if res is None:
    raise RuntimeError("get_response returned None; temp_output.txt was left untouched")

with open("temp_output.txt", 'w', encoding="utf-8") as f:
    f.write(res)

In [10]:
# Run a second scratch prompt through the model. This cell relies on
# `oai_clients` having been created by an earlier cell, and it overwrites
# temp_output.txt.
# Read and write as UTF-8 explicitly: without `encoding`, open() uses the
# locale-dependent default, which can fail on non-ASCII prompt/response text.
with open("temp_prompt2.txt", 'r', encoding="utf-8") as f:
    prompt = f.read()

res = oai_clients.get_response(prompt, max_tokens=4096)

# Validate the reply BEFORE opening the output in 'w' mode. Opening first
# would truncate the previously saved result and then fail inside f.write()
# with an opaque TypeError. (The disabled judge helper earlier in this
# notebook guards against a None reply from get_response, so it is treated
# as possible here.)
if res is None:
    raise RuntimeError("get_response returned None; temp_output.txt was left untouched")

with open("temp_output.txt", 'w', encoding="utf-8") as f:
    f.write(res)