In [2]:
import json  
  
def read_jsonl(file_path):
    """
    Read a JSONL file and return its records as a list.

    Blank or whitespace-only lines (for example an extra newline at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path (str): Path to the JSONL file.

    Returns:
        list: One parsed JSON value per non-empty line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    data = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            record = line.strip()
            if not record:
                # An empty line carries no record; json.loads("") would raise.
                continue
            # Parse the JSON value on this line and keep it.
            data.append(json.loads(record))

    return data

In [22]:
# Preview the first two records held in `data`.
# NOTE(review): `data` is assigned by a cell that appears further down in this
# notebook, so this cell fails with NameError on a fresh top-to-bottom run.
data[:2]

[{'question_id': '4',
  'model': 'Meta-Llama-3.1-8B-Instruct',
  'judge': 'gpt-4o',
  'games': [{'user_prompt': "\n<|User Prompt|>How do I wrap a present neatly?\nBelow are two images: the first one is Assistant A's response, and the second one is Assistant B's response. Please evaluate them based on the criteria provided and give the final verdict answer.\n",
    'judgment': "\nMy final verdict is Assistant A is slightly better: [[A>B]].\n\n### Justification:\n\n1. **Readability**:\n   - **Assistant A**: The text is clear and easy to read. The instructions are concise and straightforward.\n   - **Assistant B**: The text is also clear, but the instructions are slightly more verbose, which can make it a bit harder to follow.\n\n2. **Visual Organization**:\n   - **Assistant A**: The use of bullet points and numbered steps is effective. The headings and subheadings are clear and well-organized.\n   - **Assistant B**: Also uses bullet points and numbered steps, but the additional tips and 

In [20]:
# Load the judge records for one model plus the two answer files being compared.
# data = read_jsonl("data/alpaca/model_judgment/gpt-4o_images/tulu_v2_8b_2048_default_template_dpo.jsonl")
data = read_jsonl("data/alpaca/model_judgment/gpt-4o_images/Meta-Llama-3.1-8B-Instruct.jsonl")
model_answer1 = read_jsonl("data/alpaca/model_answer/gpt4_1106_preview.jsonl") # baseline answer
model_answer2 = read_jsonl("data/alpaca/model_answer/Meta-Llama-3.1-8B-Instruct.jsonl")
# data = read_jsonl("data/alpaca/model_judgment/gpt-4o/Meta-Llama-3.1-8B-Instruct.jsonl")
# data = read_jsonl("data/arena-hard-v0.1/model_judgment/gpt-4o/Meta-Llama-3.1-8B-Instruct.jsonl")
# Pair up the score labels of game 0 and game 1 for every record.
score_A = [item['games'][0]['score'] for item in data]
score_B = [item['games'][1]['score'] for item in data]
score_all = [[x1, x2] for x1, x2 in zip(score_A, score_B)]
# Bare expression: not the last statement of the cell, so nothing is displayed.
score_all
# Fine-grained 5-level mapping of the verdict labels; 0 marks a missing score.
score_map = {
    "A>>B": 5,
    "A>B": 4,
    "A=B": 3,
    "B>A": 2,
    "B>>A": 1,
    None:0
}
# Coarse mapping that ignores the strength of the verdict:
# 3 = A preferred, 2 = tie, 1 = B preferred, 0 = missing score.
score_map_generall = {
    "A>>B": 3,
    "A>B": 3,
    "A=B": 2,
    "B>A": 1,
    "B>>A": 1,
    None:0
}
# Any label outside the six keys above raises KeyError here.
score_all_quant = [[score_map[x1], score_map[x2]] for x1, x2 in score_all]
score_generall = [[score_map_generall[x1], score_map_generall[x2]] for x1, x2 in score_all]
# Records where at least one of the two games has no score.
null_judges = []
for i, (x1, x2) in enumerate(score_all_quant):
    if x1 == 0 or x2 == 0:
        null_judges.append(i)
print(f"null_judges: {len(null_judges)}")
# `same_judges`: the two games carry opposite labels, or both are ties.
# `different_judges`: the two games carry the same non-tie coarse value.
# NOTE(review): presumably game 1 is judged with the A/B positions swapped, so
# opposite labels mean a consistent verdict -- confirm against the judge setup.
# NOTE(review): a pair where both scores are missing (0, 0) also satisfies the
# second condition and is counted in `different_judges`.
same_judges = []
different_judges = []
for i, (x1, x2) in enumerate(score_generall):
    if (x1 == 1 and x2 == 3) or (x1 == 3 and x2 == 1) or (x1 == 2 and x2 == 2):
        same_judges.append(i)
    if (x1 == x2 and x1 != 2):
        different_judges.append(i)
print(f"same_judges: {len(same_judges)}")
print(f"different_judges: {len(different_judges)}")
# "Good" samples: game 0 says B is preferred and game 1 says A is preferred.
good_samples = []
for i, (x1, x2) in enumerate(score_generall):
    if x1 == 1 and x2 == 3:
        good_samples.append(i)
print(f"good_samples: {len(good_samples)}")
# Lookup tables keyed by question_id.
# judge_dict value: [game0 score, game0 judgment, game1 score, game1 judgment].
# model_answerN_dict value: [first-turn content, its token_len, model_id].
judge_dict = {item['question_id']:[item['games'][0]['score'], item['games'][0]['judgment'], item['games'][1]['score'], item['games'][1]['judgment']]  for item in data}
model_answer1_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len'], item['model_id']]  for item in model_answer1}
model_answer2_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len'], item['model_id']]  for item in model_answer2}
# Build one flat record per good sample; `index` is its position within
# `good_samples`, not within `data`.
res = []
# This initial value is immediately overwritten by the enumerate() loop below.
index = 0
for index, idx in enumerate(good_samples):
    question_id = data[idx]['question_id']
    # if score != judge_dict[question_id][0]:
    res.append({
        "question_id":question_id, 
        "index": index,
        "score_1":judge_dict[question_id][0], 
        "score_2":judge_dict[question_id][2], 
        "judgment_1":judge_dict[question_id][1], 
        "judgment_2":judge_dict[question_id][3],
        "model_answer1": model_answer1_dict[question_id][0],
        "token_len1": model_answer1_dict[question_id][1],
        "model_id_1":model_answer1_dict[question_id][2],
        "model_answer2": model_answer2_dict[question_id][0],
        "token_len2": model_answer2_dict[question_id][1],
        "model_id_2":model_answer2_dict[question_id][2],
        })
    # index += 1

null_judges: 0
same_judges: 507
different_judges: 269
good_samples: 306


In [3]:
def get_judge_answer(model_name, baseline_name="gpt4_1106_preview",
                     judgment_dir="data/alpaca/model_judgment/gpt-4o_images",
                     answer_dir="data/alpaca/model_answer"):
    """
    Collect the records whose two games were scored "B preferred" then "A preferred".

    Reads the judge file of ``model_name`` and the answer files of the baseline
    and of ``model_name``, prints four counts (null, same, different, good) and
    returns one flat record per "good" sample.

    Args:
        model_name (str): File stem of the judged model; used for both the
            judgment file and the second answer file.
        baseline_name (str): File stem of the baseline answer file.
        judgment_dir (str): Directory holding ``{model_name}.jsonl`` judgments.
        answer_dir (str): Directory holding the ``*.jsonl`` answer files.

    Returns:
        list[dict]: One dict per good sample with the keys ``question_id``,
        ``index`` (position among the good samples), ``score_1``/``score_2``,
        ``judgment_1``/``judgment_2`` (game 0 / game 1), and
        ``model_answer*``, ``token_len*``, ``model_id_*`` for both answer files.

    Raises:
        KeyError: If a score label is not one of the known labels, or a
            question_id is missing from one of the answer files.
        IndexError: If a record has fewer than two games.
    """
    data = read_jsonl(f"{judgment_dir}/{model_name}.jsonl")
    model_answer1 = read_jsonl(f"{answer_dir}/{baseline_name}.jsonl")  # baseline answer
    model_answer2 = read_jsonl(f"{answer_dir}/{model_name}.jsonl")

    # Coarse verdict that ignores strength:
    # 3 = A preferred, 2 = tie, 1 = B preferred, 0 = missing score.
    score_map_generall = {
        "A>>B": 3,
        "A>B": 3,
        "A=B": 2,
        "B>A": 1,
        "B>>A": 1,
        None: 0
    }
    score_generall = [
        [score_map_generall[item['games'][0]['score']],
         score_map_generall[item['games'][1]['score']]]
        for item in data
    ]

    # Records where at least one of the two games has no score.
    null_judges = [i for i, (x1, x2) in enumerate(score_generall) if x1 == 0 or x2 == 0]
    print(f"null_judges: {len(null_judges)}")

    # NOTE(review): presumably game 1 is judged with the A/B positions swapped,
    # so opposite labels mean a consistent verdict -- confirm against the judge setup.
    same_judges = []
    different_judges = []
    for i, (x1, x2) in enumerate(score_generall):
        if (x1, x2) in ((1, 3), (3, 1), (2, 2)):
            same_judges.append(i)
        # Same non-tie label in both games. A pair with two missing scores
        # (0, 0) is a null judgment, not a disagreement, so it is excluded.
        if x1 == x2 and x1 not in (0, 2):
            different_judges.append(i)
    print(f"same_judges: {len(same_judges)}")
    print(f"different_judges: {len(different_judges)}")

    # Good samples: game 0 prefers B and game 1 prefers A.
    good_samples = [i for i, (x1, x2) in enumerate(score_generall) if x1 == 1 and x2 == 3]
    print(f"good_samples: {len(good_samples)}")

    def _index_answers(answers):
        # question_id -> [first-turn content, its token_len, model_id]
        return {
            entry['question_id']: [
                entry['choices'][0]['turns'][0]['content'],
                entry['choices'][0]['turns'][0]['token_len'],
                entry['model_id'],
            ]
            for entry in answers
        }

    # question_id -> [game0 score, game0 judgment, game1 score, game1 judgment]
    judge_dict = {
        item['question_id']: [
            item['games'][0]['score'], item['games'][0]['judgment'],
            item['games'][1]['score'], item['games'][1]['judgment'],
        ]
        for item in data
    }
    model_answer1_dict = _index_answers(model_answer1)
    model_answer2_dict = _index_answers(model_answer2)

    res = []
    for index, idx in enumerate(good_samples):
        question_id = data[idx]['question_id']
        res.append({
            "question_id": question_id,
            "index": index,
            "score_1": judge_dict[question_id][0],
            "score_2": judge_dict[question_id][2],
            "judgment_1": judge_dict[question_id][1],
            "judgment_2": judge_dict[question_id][3],
            "model_answer1": model_answer1_dict[question_id][0],
            "token_len1": model_answer1_dict[question_id][1],
            "model_id_1": model_answer1_dict[question_id][2],
            "model_answer2": model_answer2_dict[question_id][0],
            "token_len2": model_answer2_dict[question_id][1],
            "model_id_2": model_answer2_dict[question_id][2],
            })
    return res
# Build the good-sample records for one model and preview the first two.
# This rebinds the notebook-level names `model_name` and `res`.
model_name = "Meta-Llama-3.1-8B-Instruct"
res = get_judge_answer(model_name)
res[:2]

null_judges: 0
same_judges: 444
different_judges: 216
good_samples: 264


[{'question_id': '3',
  'index': 0,
  'score_1': 'B>A',
  'score_2': 'A>B',
  'judgment_1': "\n1. **Analysis of Assistant A's response**:\n   - **Readability**: The text is clear and easy to read. Sentences are of appropriate length and complexity.\n   - **Visual Organization**: The response is well-organized with numbered lists and clear headings for each artist.\n   - **Consistency**: The style and format are consistent throughout the response.\n   - **Overall Structure**: The paragraphs are well-structured and logically connected. There is appropriate spacing between paragraphs.\n   - **Visual Clarity**: The text is clear and legible. The font and spacing are visually appealing and easy on the eyes.\n\n2. **Analysis of Assistant B's response**:\n   - **Readability**: The text is clear and easy to read. Sentences are of appropriate length and complexity.\n   - **Visual Organization**: The response is well-organized with numbered lists, bullet points, and clear headings for each genre

In [4]:
import os
import concurrent.futures  
from text2img import text_to_image, text_to_image_with_timeout  
def process_diff(item, model_name2):
    """
    Pass the four texts of one comparison record to text_to_image_with_timeout.

    Every call uses the same output file name, derived from item["index"], but
    a different save_dir / temp_dir pair under data_dir "data/ta_judges".

    Args:
        item (dict): Record providing "index", "model_answer1",
            "model_answer2", "judgment_1" and "judgment_2".
        model_name2 (str): Used as the directory prefix for the second answer
            and for both judgments; the first answer always goes to
            "gpt4_1106_preview".
    """
    baseline_dir = "gpt4_1106_preview"
    image_name = f"output_original_{item['index']}.png"
    root_dir = "data/ta_judges"
    # (record key, save_dir) in the order the conversions are issued.
    jobs = (
        ("model_answer1", baseline_dir),
        ("model_answer2", model_name2),
        ("judgment_1", f"{model_name2}_game0_judge"),
        ("judgment_2", f"{model_name2}_game1_judge"),
    )
    for key, target_dir in jobs:
        text_to_image_with_timeout(
            item[key],
            image_name,
            save_dir=target_dir,
            temp_dir=f"{target_dir}_temp",
            data_dir=root_dir,
        )
# def process_judge(item):  
#     index = item["question_id"]  
#     output_img_original = f"output_original_{index}.png"  
#     judge_data[0]['games'][0]['user_prompt'], judge_data[0]['games'][0]['judgment']
#     # print(item['choices'][0]['turns'][0]['content'])
#     text_to_image(item['games'][0]['user_prompt'], output_img_original, save_dir="user_prompt", temp_dir="user_prompt_temp")  
#     text_to_image(item['games'][0]['judgment'], output_img_original, save_dir="judge_response", temp_dir="judge_temp")  
#     # text_to_image("here", output_img_original, save_dir="original_response", temp_dir="original_temp")  
    # print("here")

def generate_res(data, model_name2, max_workers=20):
    """
    Run process_diff(item, model_name2) for every item on a thread pool.

    Exceptions raised by individual items are printed and do not stop the
    remaining items.

    Args:
        data (iterable): Records to hand to process_diff.
        model_name2 (str): Forwarded unchanged to process_diff.
        max_workers (int): Size of the thread pool (previously fixed at 20).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_diff, item, model_name2) for item in data]
        for future in concurrent.futures.as_completed(futures):
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                print(f'Generated an exception: {exc}')
# Submit every record in `res` to the thread pool defined above.
generate_res(res, model_name)

2024-09-08 20:32:18,406 - INFO - Starting the conversion process.
2024-09-08 20:32:18,436 - INFO - Starting the conversion process.
2024-09-08 20:32:18,411 - INFO - Starting the conversion process.
2024-09-08 20:32:18,423 - INFO - Starting the conversion process.
2024-09-08 20:32:18,430 - INFO - Starting the conversion process.
2024-09-08 20:32:18,443 - INFO - Starting the conversion process.
2024-09-08 20:32:18,448 - INFO - Starting the conversion process.
2024-09-08 20:32:18,456 - INFO - Starting the conversion process.
2024-09-08 20:32:18,475 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/data/ta_judges/gpt4_1106_preview_temp/output_original_8.html
2024-09-08 20:32:18,477 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/data/ta_judges/gpt4_1106_preview_temp/output_original_0.html
  Please specify either 'title' or 'pagetitle' in the metadata,
  e.g. by using --metadata pagetitle="..." on the command line.
  Falling back to 'output_original_8'
  Please specify eit

In [17]:
# Despite its name, `other_indices` is the union of the same/different buckets;
# `other_score` keeps the score pairs of the records that fall in neither bucket.
# Relies on the notebook-level `same_judges`, `different_judges`, `score_all`
# and `data` left behind by an earlier cell.
other_indices = set(same_judges) | set(different_judges)
other_score = [score_all[i] for i in range(len(data)) if i not in other_indices]
other_score[:10]

[['A=B', 'B>>A'],
 ['A>>B', 'A=B'],
 ['B>A', 'A=B'],
 [None, 'A>B'],
 ['A=B', 'B>A'],
 ['B>A', 'A=B'],
 ['A=B', 'A>B'],
 ['A>B', 'A=B'],
 ['A=B', 'B>A'],
 ['A>B', 'A=B']]

In [None]:

# Load the answers and the gpt-4o judgments of one model from arena-hard-v0.1.
# NOTE(review): this rebinds `model_name` and `data`; here `data` holds answer
# records, whereas the alpaca cell in this notebook binds it to judgment records.
model_name = 'ta_chosen_tuluv2_dpo_2048_default_template'
file_path = f'data/arena-hard-v0.1/model_answer/{model_name}.jsonl'
data = read_jsonl(file_path)
judge_data_path = f'data/arena-hard-v0.1/model_judgment/gpt-4o/{model_name}.jsonl'
judge_data = read_jsonl(judge_data_path)
# from utils import read_jsonl  
import os
import concurrent.futures  
from text2img import text_to_image  
def process_item(item):
    """
    Pass the first turn of the first choice of one answer record to text_to_image.

    The output file name is derived from item["question_id"]; the call uses
    save_dir "original_response" and temp_dir "original_temp".

    Args:
        item (dict): Answer record with "question_id" and
            item['choices'][0]['turns'][0]['content'].
    """
    index = item["question_id"]
    output_img_original = f"output_original_{index}.png"
    # The leftover debug print("here") was removed: it emitted one line per
    # item from every worker thread and carried no information.
    text_to_image(item['choices'][0]['turns'][0]['content'], output_img_original, save_dir="original_response", temp_dir="original_temp")
def process_judge(item):
    """
    Pass the prompt and the judgment of game 0 of one judge record to text_to_image.

    Both calls use the same file name, derived from item["question_id"], but
    different directories ("user_prompt" and "judge_response").

    Args:
        item (dict): Judge record with "question_id" and
            item['games'][0]['user_prompt'] / item['games'][0]['judgment'].
    """
    index = item["question_id"]
    output_img_original = f"output_original_{index}.png"
    # A stray tuple expression that read the notebook-level `judge_data[0]` was
    # removed: its value was discarded, yet it could raise NameError/IndexError.
    game = item['games'][0]
    text_to_image(game['user_prompt'], output_img_original, save_dir="user_prompt", temp_dir="user_prompt_temp")
    text_to_image(game['judgment'], output_img_original, save_dir="judge_response", temp_dir="judge_temp")

def generate_res(data, judge_data, max_workers=20, limit=100):
    """
    Run process_item over ``data`` and then process_judge over ``judge_data``.

    Each batch runs on its own thread pool; the second batch starts only after
    the first one has finished. Exceptions raised by individual items are
    printed and do not stop the remaining items.

    Args:
        data (list): Answer records for process_item.
        judge_data (list): Judge records for process_judge.
        max_workers (int): Size of each thread pool (previously fixed at 20).
        limit (int | None): Only the first ``limit`` records of each list are
            processed (previously fixed at 100); ``None`` processes all.
    """
    def _run_batch(worker, items):
        # Submit one task per item and report failures as they complete.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(worker, item) for item in items[:limit]]
            for future in concurrent.futures.as_completed(futures):
                try:
                    # result() re-raises whatever the worker raised.
                    future.result()
                except Exception as exc:
                    print(f'Generated an exception: {exc}')

    _run_batch(process_item, data)
    _run_batch(process_judge, judge_data)
# generate_res(data, judge_data)

In [None]:
# from utils import read_jsonl  
import os
import concurrent.futures  
from text2img import text_to_image  
def process_item(item):
    """
    Pass the first turn of the first choice of one answer record to text_to_image.

    The output file name is derived from item["question_id"]; the call uses
    save_dir "original_response" and temp_dir "original_temp".

    Args:
        item (dict): Answer record with "question_id" and
            item['choices'][0]['turns'][0]['content'].
    """
    index = item["question_id"]
    output_img_original = f"output_original_{index}.png"
    # The leftover debug print("here") was removed: it emitted one line per
    # item from every worker thread and carried no information.
    text_to_image(item['choices'][0]['turns'][0]['content'], output_img_original, save_dir="original_response", temp_dir="original_temp")

def generate_res(data, max_workers=20, limit=2000):
    """
    Run process_item for the first ``limit`` records of ``data`` on a thread pool.

    Exceptions raised by individual items are printed and do not stop the
    remaining items.

    Args:
        data (list): Answer records for process_item.
        max_workers (int): Size of the thread pool (previously fixed at 20).
        limit (int | None): Number of leading records to process (previously
            fixed at 2000); ``None`` processes all.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_item, item) for item in data[:limit]]
        for future in concurrent.futures.as_completed(futures):
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                print(f'Generated an exception: {exc}')
# Render the answer records currently bound to the notebook-level `data`.
generate_res(data)

In [None]:
# Load one model's gpt-4o judgments and display the prompt and judgment text
# of game 0 of the first record.
judge_data = read_jsonl("data/arena-hard-v0.1/model_judgment/gpt-4o/ta_chosen_tuluv2_dpo_2048_default_template.jsonl")
judge_data[0]['games'][0]['user_prompt'], judge_data[0]['games'][0]['judgment']

In [None]:
# Load the gpt-4o judgments of two models and extract the game-0 score labels
# of each into `judges` and `judges2` (same order as the files).
judge_data = read_jsonl("data/arena-hard-v0.1/model_judgment/gpt-4o/ta_chosen_tuluv2_dpo_2048_default_template.jsonl")
# The next two bare expressions are not the last statement, so nothing is shown.
judge_data[0]['games'][0]['user_prompt'], judge_data[0]['games'][0]['judgment']
judges = [item['games'][0]['score'] for item in judge_data]
judges
judge_data2 = read_jsonl("data/arena-hard-v0.1/model_judgment/gpt-4o/tulu_v2_8b_2048_default_template_dpo.jsonl")
judges2 = [item['games'][0]['score'] for item in judge_data2]

In [7]:
def get_diffoutputs(model_name1, model_name2):
    """
    Collect the questions whose game-0 score differs between two models.

    Reads both models' gpt-4o judgments and answers from
    ``data/arena-hard-v0.1``, builds one flat record per differing question,
    writes the list to
    ``data/diff_data/judges_{model_name1}_{model_name2}_diffs.json`` and
    returns it.

    Args:
        model_name1 (str): File stem of the first model (drives iteration order).
        model_name2 (str): File stem of the second model.

    Returns:
        list[dict]: Records with ``question_id``, ``index`` (running position
        in the result), ``score_1``/``score_2``, ``judgment_1``/``judgment_2``
        and ``model_answer*``, ``token_len*``, ``model_id_*`` for both models.

    Raises:
        KeyError: If a question_id of the first judgment file is missing from
            the second judgment file or, for a differing question, from one
            of the answer files.
    """
    import os  # local import so the function works on a fresh kernel

    judgment_dir = "data/arena-hard-v0.1/model_judgment/gpt-4o"
    answer_dir = "data/arena-hard-v0.1/model_answer"

    judge_data = read_jsonl(f"{judgment_dir}/{model_name1}.jsonl")
    judge_data2 = read_jsonl(f"{judgment_dir}/{model_name2}.jsonl")
    model_answer1 = read_jsonl(f"{answer_dir}/{model_name1}.jsonl")
    model_answer2 = read_jsonl(f"{answer_dir}/{model_name2}.jsonl")

    def _index_answers(answers):
        # question_id -> [first-turn content, its token_len, model_id]
        return {
            entry['question_id']: [
                entry['choices'][0]['turns'][0]['content'],
                entry['choices'][0]['turns'][0]['token_len'],
                entry['model_id'],
            ]
            for entry in answers
        }

    # question_id -> [game-0 score, game-0 judgment] of the second model.
    judge_dict = {
        item['question_id']: [item['games'][0]['score'], item['games'][0]['judgment']]
        for item in judge_data2
    }
    model_answer1_dict = _index_answers(model_answer1)
    model_answer2_dict = _index_answers(model_answer2)

    res = []
    for item in judge_data:
        question_id = item['question_id']
        score = item['games'][0]['score']
        other_score, other_judgment = judge_dict[question_id]
        if score != other_score:
            res.append({
                "question_id": question_id,
                # Evaluated before the append, so this is the record's position.
                "index": len(res),
                "score_1": score,
                "score_2": other_score,
                "judgment_1": item['games'][0]['judgment'],
                "judgment_2": other_judgment,
                "model_answer1": model_answer1_dict[question_id][0],
                "token_len1": model_answer1_dict[question_id][1],
                "model_id_1": model_answer1_dict[question_id][2],
                "model_answer2": model_answer2_dict[question_id][0],
                "token_len2": model_answer2_dict[question_id][1],
                "model_id_2": model_answer2_dict[question_id][2],
                })

    out_path = f"data/diff_data/judges_{model_name1}_{model_name2}_diffs.json"
    # Create the output directory so a first run does not fail on open().
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(res, f, indent=4)
    return res
# Compute the differing judgments for one model pair; rebinds the
# notebook-level `res` that later cells read.
res = get_diffoutputs("ta_chosen_tuluv2_dpo_2048_default_template", "tulu_v2_8b_2048_default_template_dpo")

In [8]:
# Preview the first two records of the notebook-level `res`.
res[:2]

[{'question_id': '90b29911b57848ec89fc7d8c15f27c88',
  'index': 0,
  'score_1': 'B>A',
  'score_2': 'A>>B',
  'judgment_1': "\n**My Answer:**\n\nIncorporating AI in the private equity deal sourcing process can significantly enhance efficiency, accuracy, and the ability to identify high-potential investment opportunities. Here are the steps to effectively integrate AI into this process:\n\n1. **Define Objectives and Criteria:**\n   - Establish clear investment objectives and criteria, such as target industries, company size, growth potential, and financial performance metrics. These will guide the AI algorithms in identifying relevant deals.\n\n2. **Data Collection and Integration:**\n   - Gather data from various sources, including financial databases, company websites, news articles, and social media. Ensure the data is clean, accurate, and up-to-date. Integrate this data into a centralized system for easy access and analysis.\n\n3. **Develop or Select AI Algorithms:**\n   - Choose ap

In [28]:
# Bare expression: not the last statement of the cell, so nothing is displayed.
res[0]['score_1']
# Notebook-level mapping from verdict label to a 5-level number; 0 marks a
# missing score. Later cells read this global.
score_map = {
    "A>>B": 5,
    "A>B": 4,
    "A=B": 3,
    "B>A": 2,
    "B>>A": 1,
    None:0
}

In [37]:
def get_imporv_and_weaken(model_name1, model_name2, diffs=None, score_lookup=None,
                          out_dir="data/diff_data"):
    """
    Split differing-judgment records into improve / weaken buckets and dump them.

    A record is an "improve" case when the numeric value of ``score_1`` is
    larger than that of ``score_2`` and a "weaken" case when it is smaller;
    the "great" variants require a gap of more than one level. Each bucket is
    written to ``{out_dir}/judges_{model_name1}_{model_name2}_<bucket>.json``
    and the four bucket sizes are printed.

    Args:
        model_name1 (str): Only used in the output file names.
        model_name2 (str): Only used in the output file names.
        diffs (list[dict] | None): Records with "score_1" and "score_2" labels.
            Defaults to the notebook-level ``res`` (the original behaviour).
        score_lookup (dict | None): Label -> number mapping. Defaults to the
            notebook-level ``score_map`` (the original behaviour).
        out_dir (str): Directory for the four JSON files; created if missing.

    Returns:
        None

    Raises:
        KeyError: If a score label is missing from the lookup.
    """
    import os  # local import so the function works on a fresh kernel

    if diffs is None:
        diffs = res  # notebook-level global, as before
    if score_lookup is None:
        score_lookup = score_map  # notebook-level global, as before

    improve_cases = []
    weaken_cases = []
    great_improve_cases = []
    great_weaken_cases = []
    for item in diffs:
        # NOTE(review): a missing score maps to 0 in the default lookup, so a
        # null judgment on either side is bucketed like a real verdict.
        gap = score_lookup[item['score_1']] - score_lookup[item['score_2']]
        if gap > 0:
            improve_cases.append(item)
        elif gap < 0:
            weaken_cases.append(item)
        if gap > 1:
            great_improve_cases.append(item)
        elif gap < -1:
            great_weaken_cases.append(item)

    os.makedirs(out_dir, exist_ok=True)
    buckets = (
        ("improves", improve_cases),
        ("weaken", weaken_cases),
        ("great_improves", great_improve_cases),
        ("great_weaken", great_weaken_cases),
    )
    for suffix, cases in buckets:
        with open(f"{out_dir}/judges_{model_name1}_{model_name2}_{suffix}.json", 'w') as f:
            json.dump(cases, f, indent=4)

    print(f"improve cases:{len(improve_cases)} weaken cases {len(weaken_cases)}")
    print(f"great improve cases:{len(great_improve_cases)} great weaken cases {len(great_weaken_cases)}")
# Bucket the records of the notebook-level `res`; the two model names are only
# used to build the output file names.
get_imporv_and_weaken("ta_chosen_tuluv2_dpo_2048_default_template", "tulu_v2_8b_2048_default_template_dpo")

improve cases:115 weaken cases 86
great improve cases:40 great weaken cases 25


In [34]:
# Bucket the records of the notebook-level `res` whose two scores differ by
# more than one level, using the notebook-level `score_map`.
great_improve_cases = []
great_weaken_cases = []
for item in res:
    score_1 = score_map[item['score_1']]
    score_2 = score_map[item['score_2']]
    if score_1 - score_2 > 1:
        great_improve_cases.append(item)
    elif score_1 + 1 < score_2:
        great_weaken_cases.append(item)
# Report the lists built in this cell. The earlier version printed
# `improve_cases` / `weaken_cases`, which this cell never defines.
print(len(great_improve_cases))
print(len(great_weaken_cases))

40
25


In [9]:
import os
import concurrent.futures  
from text2img import text_to_image  
def process_diff(item, model_name1, model_name2):
    """
    Pass the four texts of one comparison record to text_to_image.

    Every call uses the same output file name, derived from item["index"], but
    a different save_dir / temp_dir pair built from the two model names.

    Args:
        item (dict): Record providing "index", "model_answer1",
            "model_answer2", "judgment_1" and "judgment_2".
        model_name1 (str): Directory prefix for the first answer and judgment.
        model_name2 (str): Directory prefix for the second answer and judgment.
    """
    image_name = f"output_original_{item['index']}.png"
    # (record key, save_dir) in the order the conversions are issued.
    jobs = (
        ("model_answer1", model_name1),
        ("model_answer2", model_name2),
        ("judgment_1", f"{model_name1}_judge"),
        ("judgment_2", f"{model_name2}_judge"),
    )
    for key, target_dir in jobs:
        text_to_image(
            item[key],
            image_name,
            save_dir=target_dir,
            temp_dir=f"{target_dir}_temp",
        )
# def process_judge(item):  
#     index = item["question_id"]  
#     output_img_original = f"output_original_{index}.png"  
#     judge_data[0]['games'][0]['user_prompt'], judge_data[0]['games'][0]['judgment']
#     # print(item['choices'][0]['turns'][0]['content'])
#     text_to_image(item['games'][0]['user_prompt'], output_img_original, save_dir="user_prompt", temp_dir="user_prompt_temp")  
#     text_to_image(item['games'][0]['judgment'], output_img_original, save_dir="judge_response", temp_dir="judge_temp")  
#     # text_to_image("here", output_img_original, save_dir="original_response", temp_dir="original_temp")  
    # print("here")

def generate_res(data, model_name1, model_name2, max_workers=20):
    """
    Run process_diff(item, model_name1, model_name2) for every item on a thread pool.

    Exceptions raised by individual items are printed and do not stop the
    remaining items.

    Args:
        data (iterable): Records to hand to process_diff.
        model_name1 (str): Forwarded unchanged to process_diff.
        model_name2 (str): Forwarded unchanged to process_diff.
        max_workers (int): Size of the thread pool (previously fixed at 20).
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_diff, item, model_name1, model_name2) for item in data]
        for future in concurrent.futures.as_completed(futures):
            try:
                # result() re-raises whatever the worker raised.
                future.result()
            except Exception as exc:
                print(f'Generated an exception: {exc}')
# Submit every record of the notebook-level `res` for this model pair.
generate_res(res, "ta_chosen_tuluv2_dpo_2048_default_template", "tulu_v2_8b_2048_default_template_dpo")

2024-09-01 07:45:08,090 - INFO - Starting the conversion process.
2024-09-01 07:45:08,091 - INFO - Starting the conversion process.
2024-09-01 07:45:08,094 - INFO - Starting the conversion process.
2024-09-01 07:45:08,099 - INFO - Starting the conversion process.
2024-09-01 07:45:08,100 - INFO - Starting the conversion process.
2024-09-01 07:45:08,101 - INFO - Starting the conversion process.
2024-09-01 07:45:08,106 - INFO - Starting the conversion process.
2024-09-01 07:45:08,110 - INFO - Starting the conversion process.
2024-09-01 07:45:08,111 - INFO - Starting the conversion process.
2024-09-01 07:45:08,114 - INFO - Starting the conversion process.
2024-09-01 07:45:08,120 - INFO - Starting the conversion process.
2024-09-01 07:45:08,124 - INFO - Starting the conversion process.
  Please specify either 'title' or 'pagetitle' in the metadata,
  e.g. by using --metadata pagetitle="..." on the command line.
  Falling back to 'output_original_1'
2024-09-01 07:45:08,126 - INFO - Starting 

  Please specify either 'title' or 'pagetitle' in the metadata,
  e.g. by using --metadata pagetitle="..." on the command line.
  Falling back to 'output_original_19'
2024-09-01 07:45:08,230 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/output/ta_chosen_tuluv2_dpo_2048_default_template_temp/output_original_17.html
2024-09-01 07:45:08,238 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/output/ta_chosen_tuluv2_dpo_2048_default_template_temp/output_original_16.html
  Please specify either 'title' or 'pagetitle' in the metadata,
  e.g. by using --metadata pagetitle="..." on the command line.
  Falling back to 'output_original_14'
2024-09-01 07:45:08,247 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/output/ta_chosen_tuluv2_dpo_2048_default_template_temp/output_original_3.html
2024-09-01 07:45:08,247 - INFO - 转换成功: /home/lidong1/jianglingjie/arena-hard-auto/output/ta_chosen_tuluv2_dpo_2048_default_template_temp/output_original_18.html
2024-09-01 07:45:08,249

In [None]:
# Compare the game-0 score of every record in `judge_data` with the score the
# same question received in `judge_data2`, keeping the ones that differ.
# Relies on the notebook-level `judge_data` and `judge_data2` from earlier cells.
judge_dict = {item['question_id']:[item['games'][0]['score'], item['games'][0]['judgment']]  for item in judge_data2}
model_answer1 = read_jsonl('data/arena-hard-v0.1/model_answer/tulu_v2_8b_2048_default_template_dpo.jsonl')
model_answer2 = read_jsonl('data/arena-hard-v0.1/model_answer/tulu_v2_8b_bsz64_default_template_dpo.jsonl')
# model_answer1
# Same dict as built at the top of this cell (rebuilt unchanged).
judge_dict = {item['question_id']:[item['games'][0]['score'], item['games'][0]['judgment']]  for item in judge_data2}
# question_id -> [first-turn content, its token_len, model_id]
model_answer1_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len'], item['model_id']]  for item in model_answer1}
model_answer2_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len'], item['model_id']]  for item in model_answer2}
# judge_dict
res = []
for item in judge_data:
    question_id = item['question_id']
    score = item['games'][0]['score']
    # KeyError here means the question is absent from `judge_data2`.
    if score != judge_dict[question_id][0]:
        res.append({
            "question_id":question_id, 
            "score_1":score, 
            "score_2":judge_dict[question_id][0], 
            "judgment_1":item['games'][0]['judgment'], 
            "judgment_2":judge_dict[question_id][1],
            })
res

In [None]:
# Rebuild `res` with the answer text, token length and model id of both models
# attached, then dump it to judges_diffs.json in the working directory.
# Relies on `judge_data`, `judge_dict`, `model_answer1_dict` and
# `model_answer2_dict` left behind by earlier cells.
res = []
for item in judge_data:
    question_id = item['question_id']
    score = item['games'][0]['score']
    if score != judge_dict[question_id][0]:
        res.append({
            "question_id":question_id, 
            "score_1":score, 
            "score_2":judge_dict[question_id][0], 
            "judgment_1":item['games'][0]['judgment'], 
            "judgment_2":judge_dict[question_id][1],
            "model_answer1": model_answer1_dict[question_id][0],
            "token_len1": model_answer1_dict[question_id][1],
            # Index 2 requires the three-element variant of the answer dicts.
            "model_id_1":model_answer1_dict[question_id][2],
            "model_answer2": model_answer2_dict[question_id][0],
            "token_len2": model_answer2_dict[question_id][1],
            "model_id_2":model_answer2_dict[question_id][2]
            })
# res
with open(f"judges_diffs.json", 'w') as f:
    json.dump(res, f, indent=4)

In [None]:
# Display the list of turns of the first choice of the first answer record.
model_answer1[0]['choices'][0]['turns']

In [None]:
# Load the answers of two models and index them, plus the second judgment set,
# by question_id. Relies on the notebook-level `judge_data2`.
model_answer1 = read_jsonl('data/arena-hard-v0.1/model_answer/tulu_v2_8b_2048_default_template_dpo.jsonl')
model_answer2 = read_jsonl('data/arena-hard-v0.1/model_answer/tulu_v2_8b_bsz64_default_template_dpo.jsonl')
# model_answer1
judge_dict = {item['question_id']:[item['games'][0]['score'], item['games'][0]['judgment']]  for item in judge_data2}
# NOTE(review): these two dicts hold only [content, token_len]. Cells that read
# element [2] (the model id) raise IndexError if this cell was the last to run.
model_answer1_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len']]  for item in model_answer1}
model_answer2_dict = {item['question_id']:[item['choices'][0]['turns'][0]['content'], item['choices'][0]['turns'][0]['token_len']]  for item in model_answer2}

In [None]:
# Display the whole question_id -> answer lookup (unsliced, so the output
# contains every answer).
model_answer1_dict

In [None]:
# Positions at which the two score lists disagree. The comparison is purely
# positional, so it assumes `judges` and `judges2` list the questions in the
# same order -- TODO confirm. Raises IndexError if `judges2` is shorter.
indexs = []
for i, _ in enumerate(judges):
    if judges[i] != judges2[i]:
        indexs.append(i)
print(indexs)

In [None]:
# Display the judgment text of game 0 of the first judge record.
judge_data[0]['games'][0]['judgment']

In [None]:
# Distinct score labels that occur in `judges`.
set(judges)

In [None]:
# One row per position: both score labels next to the question_id found at that
# position in each of the three lists, for checking by eye whether the files
# are aligned. `judge_A` and `judge_B` are unused; the row indexes the lists by `i`.
[[i, judges[i], judges2[i], judge_data[i]['question_id'], judge_data2[i]['question_id'], model_answer1[i]['question_id']] for i, (judge_A, judge_B) in enumerate(zip(judges, judges2))]