## Print the evaluation results

In [1]:

import os
import json
import pandas as pd


def model_file_path(base_model: str, loss_type: str, model_output_path: str) -> tuple[str, str]:
    """Resolve the mapped model name and results directory for one run.

    Args:
        base_model: Short base-model key such as ``"llama-3-8b"``. Keys that
            are not in the name map are used verbatim.
        loss_type: Training objective or custom run tag. The known objectives
            (``sft``, ``dpo``, ``simpo``, ...) are matched case-insensitively;
            any other value is treated as a custom tag and kept as given.
        model_output_path: Root directory that holds one folder per model.

    Returns:
        A ``(base_model_name, model_output_file)`` tuple: the mapped
        base-model name and the path of the model's folder joined onto
        ``model_output_path``.
    """
    # Base model name mapping
    base_model_name_map = {
        "llama-3-8b": "Llama-3-Base-8B-SFT",
        "mistral-7b": "Mistral-7B-Base-SFT",
        "qwen-2.5-7b": "Qwen2.5-7B-sft-ultrachat",
    }
    # Folder names of the SFT checkpoints. These are listed separately because
    # the mistral folder does not follow the name map above.
    sft_folder_map = {
        "llama-3-8b": "Llama-3-Base-8B-SFT",
        "mistral-7b": "mistral-7b-sft-beta",
        "qwen-2.5-7b": "Qwen2.5-7B-sft-ultrachat",
    }
    # Objectives whose folder suffix is the upper-cased objective name.
    upper_case_losses = {"ipo", "kto", "cpo", "rdpo", "orpo", "slic-hf", "dpo"}

    base_model_name = base_model_name_map.get(base_model, base_model)

    # Normalize once so every known objective, including "sft", is matched
    # regardless of the caller's casing.
    loss_key = loss_type.lower()

    # Construct model output path
    if loss_key == "sft":
        folder = sft_folder_map.get(base_model, f"{base_model}-sft")
    elif loss_key in upper_case_losses:
        folder = f"{base_model_name}-{loss_type.upper()}"
    elif loss_key == "simpo":
        folder = f"{base_model_name}-SimPO"
    else:
        # Custom run tags are appended to the short key exactly as given.
        folder = f"{base_model}-{loss_type}"

    return base_model_name, os.path.join(model_output_path, folder)


#####################################
# Base model whose results are reported. The recorded output of this cell
# lists the mistral runs, so "mistral-7b" is selected here; set it to
# "llama-3-8b" to report the Llama runs instead.
base_model = "mistral-7b"

# Objectives evaluated for every base model, in the row order of the table.
COMMON_LOSS_TYPES = ["sft", "dpo", "cpo", "kto", "simpo"]

# Model-specific runs appended after the common objectives.
EXTRA_LOSS_TYPES = {
    "llama-3-8b": [
        "dpo-sorted-llama-full-ckpt-191",
        "ours4-6-sorted-score-diff-full",
    ],
    "mistral-7b": [
        "dpo-sorted-mistral-full",
        "ours4-6-sorted-score-diff-new-base-full-lr5",
    ],
}

# Fail early with a clear message instead of leaving loss_type_list undefined.
if base_model not in EXTRA_LOSS_TYPES:
    raise ValueError(
        f"Unsupported base_model {base_model!r}; expected one of {sorted(EXTRA_LOSS_TYPES)}"
    )

loss_type_list = COMMON_LOSS_TYPES + EXTRA_LOSS_TYPES[base_model]

# Task list
TASK_LISTS = [
    'mmlu', 'bbh', 'gsm8k', 'truthfulqa_mc2', 'arc_challenge', 'piqa',
    'hellaswag', 'openbookqa', 'triviaqa', 'sciq', 'arc_easy', 'logiqa',
    'boolq', 'winogrande'
]

# Metric key read from each task's entry under the JSON's top-level
# "results" object. Tasks without an entry here are skipped.
TASK_METRICS = {
    'mmlu': 'acc,none',
    'truthfulqa_mc2': 'acc,none',
    'arc_challenge': 'acc,none',
    'piqa': 'acc,none',
    'hellaswag': 'acc,none',
    'openbookqa': 'acc,none',
    'sciq': 'acc,none',
    'arc_easy': 'acc,none',
    'logiqa': 'acc,none',
    'boolq': 'acc,none',
    'winogrande': 'acc,none',
    'gsm8k': 'exact_match,strict-match',
    'triviaqa': 'exact_match,remove_whitespace',
    'bbh': 'exact_match,get-answer',
}

# Root directory that holds one folder per evaluated model.
data_root = "./downstream_task_results"

# Aggregate results: {loss_type: {task: metric value}}
results_all = {}
for loss_type in loss_type_list:
    base_model_name, model_path = model_file_path(base_model, loss_type, data_root)

    try:
        # Expecting one subfolder inside each model directory. os.listdir()
        # returns entries in arbitrary order and may include stray files, so
        # keep directories only and sort to make the choice deterministic.
        subfolders = sorted(
            entry for entry in os.listdir(model_path)
            if os.path.isdir(os.path.join(model_path, entry))
        )
        subfolder = subfolders[0]
        data_path = os.path.join(model_path, subfolder)
        # Sorted so that, when several files report the same task, the file
        # that wins (the last one read) is the same on every run.
        json_files = sorted(f for f in os.listdir(data_path) if f.endswith(".json"))
    except (OSError, IndexError) as e:
        # Missing directory or no subfolder: report and move on to the next run.
        print(f"[Warning] Failed to locate result path for {loss_type}: {e}")
        continue

    results = {}
    for file in json_files:
        file_path = os.path.join(data_path, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            temp = json.load(f)

        task_results = temp.get('results', {})
        for task in TASK_LISTS:
            metric = TASK_METRICS.get(task)
            if metric is None or task not in task_results:
                continue

            value = task_results[task].get(metric)
            if value is not None:
                results[task] = value

    results_all[loss_type] = results

# Convert to DataFrame (one row per loss type, one column per task)
results_df = pd.DataFrame.from_dict(results_all, orient='index')

# Select major task list and convert to percentage
display_tasks = ['mmlu', "truthfulqa_mc2", "hellaswag", "arc_challenge", "gsm8k", 'winogrande']
# reindex() rather than [] so a task with no loaded result becomes a NaN
# column instead of raising KeyError; astype(float) keeps such columns numeric
# so the row mean below is still computed.
results_df = results_df.reindex(columns=display_tasks).astype(float)
results_df = results_df.map(lambda x: round(100 * x, 2) if pd.notnull(x) else x)
# Row mean over the displayed tasks; NaN entries are skipped by mean().
results_df['Average'] = results_df.mean(axis=1).round(1)

# Order and print (loss types that failed to load appear as all-NaN rows)
results_df = results_df.reindex(loss_type_list)
results_df.index = results_df.index.str.replace('_', '-', regex=False)

print("\nResults DataFrame (Reordered with Average, Percentage Format):\n")
print(results_df.to_string(line_width=1000))

# Export LaTeX. escape=True is required so the underscores in column names
# such as "truthfulqa_mc2" are emitted as valid LaTeX text.
# The caption literal is Chinese for "Model evaluation results".
latex_table = results_df.to_latex(
    index=True,
    caption="模型评估结果",
    label="tab:results",
    float_format="%.2f",
    escape=True,
)
print("\n" + "#" * 80 + "\nLaTeX Form:\n" + "#" * 80)
print(latex_table)



Results DataFrame (Reordered with Average, Percentage Format):

                                              mmlu  truthfulqa_mc2  hellaswag  arc_challenge  gsm8k  winogrande  Average
sft                                          59.77           42.86      61.91          54.95   38.5       76.89     55.8
dpo                                          57.57           53.14      64.34          57.19   30.5       78.33     56.8
cpo                                          58.12           46.93      60.33          52.28   35.5       77.29     55.1
kto                                          59.73           56.51      65.18          59.43   39.0       78.09     59.7
simpo                                        58.49           50.68      63.89          59.26   35.5       78.41     57.7
dpo-sorted-mistral-full                      59.08           45.97      65.12          60.38   28.5       77.37     56.1
ours4-6-sorted-score-diff-new-base-full-lr5  59.73           52.07      65.78          6