In [None]:
import pandas as pd
# Run configuration for this cell.
DATASET = 'usmle'  # one of bio, usmle, cmcqrd
FP = True

# FP toggles the "fp_" infix that appears in the CSV file names below.
if FP:
    fp = "fp_"
else:
    fp = ""

# Locate the train and test result files for the chosen dataset.
train_csv = f'../data/{DATASET}/preprocessed/combined_results_{fp}train_set.csv'
test_csv = f'../data/{DATASET}/preprocessed/combined_results_{fp}test_set.csv'

# Read both splits and stack them into a single frame with a fresh 0..n-1 index.
df1 = pd.read_csv(train_csv)
df2 = pd.read_csv(test_csv)
all_data = pd.concat([df1, df2], ignore_index=True)

# Models to report, in the order their rows appear in the table.
MODEL_NAMES = [
    'phi3_5-chat',
    'Llama3_2-3b-chat',
    'Qwen2_5-3b-chat',
    'Llama3_1-8b-chat',
    'Qwen2_5-14b-chat',
    'Qwen2_5-32b-chat',
    'Yi-34b-chat',
    'Llama3_1-70b-chat',
    'Qwen2_5-72b-chat',
]

# Names of the uncertainty metrics.
uncertainty_metrics = [
    'first_token_probability',
    'order_probability',
]

# The "_selected_choice" counterparts of the metric names above.
false_choice_uncertainty_metrics = [
    'first_token_probability_selected_choice',
    'order_probability_selected_choice',
]


# Build the LaTeX rows for one dataset's section of the results table.
# Each model gets one row with: overall accuracy, then the mean first-token
# probability and mean order probability, each reported as
# "<mean over correctly answered rows> / <mean over incorrectly answered rows>".
# The incorrect-side means are taken from the "*_selected_choice_*" columns.

# Label shown in the leading \multirow cell for each supported dataset key.
DATASET_LABELS = {
    'bio': 'Biopsychology',
    'usmle': 'USMLE',
    'cmcqrd': 'CMCQRD',
}
if DATASET not in DATASET_LABELS:
    # Fail loudly: an unknown key would otherwise leave the \multirow brace
    # unclosed and emit LaTeX that does not compile.
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected one of {sorted(DATASET_LABELS)}"
    )

# The first row follows the \multirow cell directly; later rows are padded so
# the '&' separators line up in the printed source (whitespace only).
FIRST_ROW_LEAD = " "
CONTINUATION_LEAD = "                               "

# The \multirow cell spans one table row per model.
latex_header = (
    f"\\multirow{{{len(MODEL_NAMES)}}}{{*}}{{{DATASET_LABELS[DATASET]}}}"
)

latex_rows = []
for idx, model in enumerate(MODEL_NAMES):
    # Escape underscores for LaTeX and drop the "-chat" suffix for display.
    model_latex = model.replace('_', '\\_').replace('-chat', '')

    correctness = all_data[f'model_is_correct_{model}']
    # Explicit comparisons (rather than truthiness / `~`) so that rows holding
    # neither True nor False (e.g. missing values) land in neither group.
    answered_correctly = correctness == True
    answered_incorrectly = correctness == False

    mean_first_token_correct = all_data.loc[
        answered_correctly, f'first_token_probability_{model}'
    ].mean()
    mean_order_correct = all_data.loc[
        answered_correctly, f'order_probability_{model}'
    ].mean()

    mean_first_token_incorrect = all_data.loc[
        answered_incorrectly, f'first_token_probability_selected_choice_{model}'
    ].mean()
    mean_order_incorrect = all_data.loc[
        answered_incorrectly, f'order_probability_selected_choice_{model}'
    ].mean()

    # Fraction of all rows (train + test) the model answered correctly.
    overall_correctness = correctness.sum() / len(all_data)

    lead = FIRST_ROW_LEAD if idx == 0 else CONTINUATION_LEAD
    # ':.3f' keeps trailing zeros (0.900, not 0.9) so every cell has the same
    # number of decimals.
    row = (
        f"{lead}& {model_latex} & {overall_correctness:.3f}"
        f" & \\textcolor{{ForestGreen}}{{{mean_first_token_correct:.3f}}}"
        f" / \\textcolor{{BrickRed}}{{{mean_first_token_incorrect:.3f}}}"
        f" & \\textcolor{{ForestGreen}}{{{mean_order_correct:.3f}}}"
        f" / \\textcolor{{BrickRed}}{{{mean_order_incorrect:.3f}}} \\\\\n"
    )
    latex_rows.append(row)

latex_str = latex_header + "".join(latex_rows)

# Print the LaTeX table code
print(latex_str)
