In [1]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import os
import pandas as pd
from gpt4o import Openai, API_INFOS
from datasets import load_dataset, Dataset
from tqdm import tqdm


## System prompt for the pairwise "textual aesthetics" judge.
## It asks the model to compare Assistant A and Assistant B and to finish with
## exactly one verdict label ([[A>>B]], [[A>B]], [[A=B]], [[B>A]] or [[B>>A]]).
## The text (including trailing whitespace) is sent to the model as-is.
system_template = \
"""
You are an impartial judge tasked with evaluating the textual aesthetics of responses provided by two AI assistants to the user prompt displayed below. Your goal is to determine which response is more aesthetically pleasing and easier to read and understand.  
  
Begin your evaluation by considering the following aspects for each response:  
  
1. **Readability**: Is the text easy to read and understand? Are the sentences of appropriate length and complexity?  
2. **Visual Organization**: Is the text visually organized in a logical manner? Are there appropriate headings, subheadings, lists, and other formatting elements?  
3. **Consistency**: Does the text maintain a consistent style and format throughout?  
4. **Overall Structure**: Are the paragraphs well-structured and logically connected? Is there appropriate spacing between paragraphs?  
  
Follow these steps for your evaluation:  
1. **Analyze each response**: Carefully read and analyze both responses based on the criteria provided.  
2. **Compare both responses**: Determine which response excels in textual aesthetics considering all aspects.  
3. **Make a final decision**: Choose the response that is better in terms of textual aesthetics and justify your choice.  
  
You must output only one of the following choices as your final verdict with a label:  
1. Assistant A is significantly better: [[A>>B]]  
2. Assistant A is slightly better: [[A>B]]  
3. Tie, relatively the same: [[A=B]]  
4. Assistant B is slightly better: [[B>A]]  
5. Assistant B is significantly better: [[B>>A]]  
  
Example output: "My final verdict is Assistant A is slightly better: [[A>B]]."  
"""

## User prompt for the pairwise judge.
## Placeholders filled via str.format: {instruction} (the original user prompt),
## {answer_1} (shown as Assistant A) and {answer_2} (shown as Assistant B).
## A stray double quote that used to follow the closing Assistant B marker has
## been removed so it no longer leaks into the text sent to the model.
user_template  = \
"""<|User Prompt|>{instruction}
<|The Start of Assistant A's Answer|>
{answer_1}
<|The End of Assistant A's Answer|>

<|The Start of Assistant B's Answer|>
{answer_2}
<|The End of Assistant B's Answer|>
"""

def get_data():
    """Load the UltraFeedback preference split as prompt/response pairs.

    Downloads (or reads from the local cache) the ``train_prefs`` split of
    ``HuggingFaceH4/ultrafeedback_binarized``, adds a ``response`` column and
    returns a dataset restricted to the ``prompt`` and ``response`` columns.

    Returns:
        The dataset object produced by ``select_columns(["prompt", "response"])``.
    """
    def attach_chosen_response(example):
        # The second message of the "chosen" conversation supplies the response
        # text (presumably the assistant turn -- confirm against the dataset card).
        example["response"] = example['chosen'][1]["content"]
        return example

    prefs = load_dataset("HuggingFaceH4/ultrafeedback_binarized", split="train_prefs")
    prefs = prefs.map(attach_chosen_response, batch_size=1024, num_proc=8)
    return prefs.select_columns(["prompt", "response"])

def get_revised_text(client, instruction, completion, user_template, system_template, max_tokens=2048):
    """Ask the model whether a completion needs revising and extract the revision.

    Args:
        client: Object exposing ``get_response(content=..., system=..., max_tokens=...)``.
        instruction: Value substituted for ``{instruction}`` in ``user_template``.
        completion: Value substituted for ``{completion}`` in ``user_template``.
        user_template: ``str.format`` template for the user message.
        system_template: System prompt passed through to the client unchanged.
        max_tokens: Generation limit passed through to the client.

    Returns:
        Tuple ``(need_modification, revised_text, gpt_answer)``:
        ``need_modification`` is ``"Y"`` when the answer contains
        ``**Does it need modification**: [[Y]]``, otherwise ``"N"``;
        ``revised_text`` is the stripped text between the revised-content
        markers (``""`` when not applicable or when the start marker is
        missing); ``gpt_answer`` is the stripped raw answer (``""`` when the
        client returned ``None``).
    """
    start_marker = "<|Revised Content Start|>"
    end_marker = "<|Revised Content End|>"

    # Fill the user template with the instruction and the completion.
    content = user_template.format(instruction=instruction, completion=completion)

    # Query the client; a None reply is treated as an empty answer.
    gpt_answer = client.get_response(content=content, system=system_template, max_tokens=max_tokens)
    if gpt_answer is None:
        gpt_answer = ""
    gpt_answer = gpt_answer.strip()

    # Decide whether the model asked for a modification.
    need_modification = "Y" if "**Does it need modification**: [[Y]]" in gpt_answer else "N"

    # Extract the revised text, guarding against str.find returning -1.
    revised_text = ""
    if need_modification == "Y":
        marker_pos = gpt_answer.find(start_marker)
        if marker_pos != -1:
            text_start = marker_pos + len(start_marker)
            text_end = gpt_answer.find(end_marker, text_start)
            if text_end == -1:
                # End marker missing (e.g. a truncated answer): keep everything
                # after the start marker instead of dropping the last character.
                text_end = len(gpt_answer)
            revised_text = gpt_answer[text_start:text_end].strip()

    return need_modification, revised_text, gpt_answer
 
def process_row(index, client, row, user_template, system_template, max_tokens=2048, output_file="output.jsonl"):
    """Revise one dataset row and append the outcome to a JSONL file.

    Args:
        index: Position of the row in the input; stored under ``'index'``.
        client: Client object forwarded to ``get_revised_text``.
        row: Mapping providing ``'prompt'`` and ``'response'``.
        user_template: User-message template forwarded to ``get_revised_text``.
        system_template: System prompt forwarded to ``get_revised_text``.
        max_tokens: Generation limit forwarded to ``get_revised_text``.
        output_file: Path of the JSONL file the record is appended to.

    Returns:
        The record dict that was written.
    """
    prompt, response = row['prompt'], row['response']
    need_modification, revised_text, gpt_answer = get_revised_text(
        client,
        prompt,
        response,
        user_template,
        system_template,
        max_tokens=max_tokens,
    )

    record = {}
    record['index'] = index
    record['prompt'] = prompt
    record['response'] = response
    record['does_it_need_modification'] = need_modification
    record['revised_text'] = revised_text
    record['gpt_answer'] = gpt_answer

    # One JSON document per line, appended with a single write call.
    serialized = json.dumps(record) + "\n"
    with open(output_file, 'a') as sink:
        sink.write(serialized)
    return record
def main():
    """Revise a sample of UltraFeedback responses in parallel and save them.

    One client is created per entry of ``API_INFOS`` and rows are distributed
    round-robin over them. Each processed row is appended to
    ``revised_data/output_v2.jsonl`` by ``process_row``; once all rows are
    done the file is re-read, sorted by ``'index'`` and written to
    ``revised_data/output_sorted_v2.jsonl``.

    NOTE(review): ``process_row`` -> ``get_revised_text`` formats the template
    with ``instruction``/``completion``, while the module-level
    ``user_template`` in this cell uses ``{answer_1}``/``{answer_2}``; with
    that template ``str.format`` raises ``KeyError``. Confirm which templates
    this entry point is meant to use.

    Raises:
        ValueError: If ``API_INFOS`` is empty.
    """
    # Initialize one client per API configuration.
    clients = [Openai(apis=[API_INFOS[i]]) for i in range(len(API_INFOS))]
    print(f"clients number: {len(clients)}")
    if not clients:
        raise ValueError("API_INFOS is empty; at least one client is required")

    export_data = get_data()
    # Work on at most the first 100 rows; use export_data directly for a full run.
    sample_data = export_data.select(range(min(100, len(export_data))))
    max_tokens = 2048

    try:
        cur_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined when this runs inside a notebook kernel.
        cur_dir = os.getcwd()
    output_file = os.path.join(cur_dir, "revised_data/output_v2.jsonl")
    sorted_output_path = os.path.join(cur_dir, "revised_data/output_sorted_v2.jsonl")

    # The workers open the file in append mode, which does not create directories.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Clear the output file before starting so stale rows are not mixed in.
    if os.path.exists(output_file):
        os.remove(output_file)

    with ThreadPoolExecutor(max_workers=len(clients)) as executor:
        # Create a future for each row in the dataset.
        futures = [
            executor.submit(
                process_row,
                i,
                clients[i % len(clients)],
                row,
                user_template,
                system_template,
                max_tokens,
                output_file,
            )
            for i, row in enumerate(sample_data)
        ]

        # Wait for completion; result() re-raises any worker exception.
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

    # Rows were appended in completion order; reload and restore input order.
    with open(output_file, 'r') as f:
        revised_data = [json.loads(line) for line in f]
    revised_data.sort(key=lambda x: x['index'])  # list.sort works in place and returns None

    # Create a new Dataset and persist it as JSON lines.
    revised_dataset = Dataset.from_pandas(pd.DataFrame(revised_data))
    revised_dataset.to_json(sorted_output_path)
# Entry point: when this cell is executed in a notebook kernel (or the code is
# run as a script) __name__ is "__main__", so main() starts immediately.
if __name__ == "__main__":
    main()
    # Alternative entry point, currently disabled:
    # from generate_res import generate_res
    # generate_res()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import re
from utils import read_jsonl
def extract_final_verdict(llm_output):
    """Return the first verdict label found in a judge answer.

    Parameters:
    llm_output (str): Text produced by the judge model.

    Returns:
    str or None: The first occurrence of one of ``[[A>>B]]``, ``[[A>B]]``,
    ``[[A=B]]``, ``[[B>A]]`` or ``[[B>>A]]``; ``None`` when the text contains
    none of them.
    """
    # Alternation over the five labels the judge is allowed to emit.
    verdict_pattern = r'\[\[A>>B\]\]|\[\[A>B\]\]|\[\[A=B\]\]|\[\[B>A\]\]|\[\[B>>A\]\]'

    found = re.search(verdict_pattern, llm_output)
    return found.group(0) if found else None
    
def get_judged_answer(client, instruction, answer_1, answer_2, user_template, system_template, max_tokens=2048):
    """Ask the judge model to compare two answers and parse its verdict.

    Args:
        client: Object exposing ``get_response(content=..., system=..., max_tokens=...)``.
        instruction: Value substituted for ``{instruction}`` in ``user_template``.
        answer_1: Value substituted for ``{answer_1}``.
        answer_2: Value substituted for ``{answer_2}``.
        user_template: ``str.format`` template for the user message.
        system_template: System prompt passed through to the client unchanged.
        max_tokens: Generation limit passed through to the client.

    Returns:
        Tuple ``(score, gpt_answer)``: the label returned by
        ``extract_final_verdict`` (``None`` when no label is present) and the
        stripped judge answer (``""`` when the client returned ``None``).
    """
    # Fill the user template with the instruction and both candidate answers.
    judge_request = user_template.format(
        instruction=instruction,
        answer_1=answer_1,
        answer_2=answer_2,
    )

    # Query the client; a None reply is treated as an empty answer.
    raw_reply = client.get_response(content=judge_request, system=system_template, max_tokens=max_tokens)
    judgment = "" if raw_reply is None else raw_reply.strip()

    return extract_final_verdict(judgment), judgment

def get_judge(client, row, user_template, system_template, max_tokens=2048, output_file="judges.jsonl"):
    """Judge a row's original response against its revision and log the result.

    The original ``'response'`` is presented to the judge as Assistant A and
    ``'revised_text'`` as Assistant B.

    Args:
        client: Client object forwarded to ``get_judged_answer``.
        row: Mapping providing ``'prompt'``, ``'response'`` and ``'revised_text'``.
            It is not modified.
        user_template: User-message template forwarded to ``get_judged_answer``.
        system_template: System prompt forwarded to ``get_judged_answer``.
        max_tokens: Generation limit forwarded to ``get_judged_answer``.
        output_file: Path of the JSONL file the record is appended to.

    Returns:
        A new dict holding every key of ``row`` plus ``'judge'`` (the judge's
        full answer) and ``'score'`` (the extracted verdict label or ``None``).
    """
    prompt = row['prompt']
    answer_1 = row['response']
    answer_2 = row['revised_text']
    # Honour the caller's max_tokens instead of a hard-coded limit.
    score, judgment = get_judged_answer(
        client, prompt, answer_1, answer_2, user_template, system_template, max_tokens=max_tokens
    )

    # Copy so the caller's row is not mutated through aliasing.
    result = dict(row)
    result['judge'] = judgment
    result['score'] = score

    # Append the record as one JSON document per line.
    with open(output_file, 'a') as f:
        f.write(json.dumps(result) + "\n")
    return result
def main():
    """Judge revised responses against the originals in parallel and save them.

    Reads ``revised_data/output_sorted.jsonl``, sends up to the first 100 rows
    through ``get_judge`` (one client per ``API_INFOS`` entry, assigned
    round-robin), and writes the judged rows sorted by ``'index'``.

    NOTE(review): the sorted output path is the same
    ``revised_data/output_sorted_v2.jsonl`` written by the revision ``main``
    in the first cell -- confirm the overwrite is intended.
    NOTE(review): the input path is relative to the working directory while
    the outputs are resolved against ``cur_dir`` -- confirm.

    Raises:
        ValueError: If ``API_INFOS`` is empty.
    """
    # Initialize one client per API configuration.
    clients = [Openai(apis=[API_INFOS[i]]) for i in range(len(API_INFOS))]
    print(f"clients number: {len(clients)}")
    if not clients:
        raise ValueError("API_INFOS is empty; at least one client is required")

    revised_rows = read_jsonl("revised_data/output_sorted.jsonl")
    # read_jsonl output is displayed as a plain list of dicts elsewhere in this
    # notebook, so slice it; a list has no Dataset-style .select().
    sample_data = revised_rows[:100]
    max_tokens = 2048

    try:
        cur_dir = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        # __file__ is not defined when this runs inside a notebook kernel.
        cur_dir = os.getcwd()
    output_file = os.path.join(cur_dir, "revised_data/output_judge.jsonl")
    sorted_output_path = os.path.join(cur_dir, "revised_data/output_sorted_v2.jsonl")

    # The workers open the file in append mode, which does not create directories.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    # Clear the output file before starting so stale rows are not mixed in.
    if os.path.exists(output_file):
        os.remove(output_file)

    with ThreadPoolExecutor(max_workers=len(clients)) as executor:
        # Submit the judging function (not the revision worker) for every row.
        futures = [
            executor.submit(
                get_judge,
                clients[i % len(clients)],
                row,
                user_template,
                system_template,
                max_tokens,
                output_file,
            )
            for i, row in enumerate(sample_data)
        ]

        # Wait for completion; result() re-raises any worker exception.
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

    # Rows were appended in completion order; reload and restore input order
    # using the 'index' key carried over from the input rows.
    with open(output_file, 'r') as f:
        judged_rows = [json.loads(line) for line in f]
    judged_rows.sort(key=lambda x: x['index'])  # list.sort works in place and returns None

    # Create a new Dataset and persist it as JSON lines.
    judged_dataset = Dataset.from_pandas(pd.DataFrame(judged_rows))
    judged_dataset.to_json(sorted_output_path)

In [7]:
from utils import read_jsonl
# Load the sorted revision results and preview the first two records
# (the bare last expression is rendered as the cell output).
revised_data = read_jsonl("revised_data/output_sorted.jsonl")
revised_data[:2]

[{'index': 0,
  'prompt': 'how can i develop a habit of drawing daily',
  'response': "Developing a daily habit of drawing can be challenging but with consistent practice and a few tips, it can become an enjoyable and rewarding part of your daily routine. Here are some strategies to help you develop the habit of drawing daily:\n\n1. Set a specific time: Allocate a specific time of the day to draw. It could be in the morning, afternoon, or evening. Make drawing a part of your daily routine.\n2. Set a specific duration: Determine the amount of time you want to spend on drawing each day. It can be as little as 10 minutes or as long as an hour. Be consistent with the duration to help build the habit.\n3. Start small and simple: Don't try to create a masterpiece every day, start with simple and easy-to-do sketches. Focus on improving your skills gradually.\n4. Use a variety of tools and mediums: Experiment with different tools like pencils, pens, markers, and different mediums like paper, c