In [None]:
import os
import pandas as pd
from tqdm.auto import tqdm

# Point the Hugging Face cache directory at the account's scratch space. This is set
# ahead of the sentence_transformers import that follows (presumably so the library
# sees HF_HOME when it is first imported - confirm before moving this line).
with open('../tokens/HPC_ACCOUNT_ID.txt', 'r') as account_id_file:
    # strip() removes the trailing newline most editors append; without it the
    # newline would become part of the cache path.
    os.environ['HF_HOME'] = '/scratch/' + account_id_file.read().strip()
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

In [None]:
# MAIN_DF_NAME selects the model whose result file is used as the base dataframe. It
# also dictates the values of "full_answer" and "full_prompt", as we don't store the
# other models' full answers.
MAIN_DF_NAME = "Qwen2_5-14b-chat"
# Every model whose uncertainty columns are merged into the combined dataframe.
MODEL_NAMES = ['phi3_5-chat', 'Llama3_2-3b-chat', 'Qwen2_5-3b-chat', 'Llama3_1-8b-chat', 'Qwen2_5-14b-chat', 'Qwen2_5-32b-chat', 'Yi-34b-chat', 'Llama3_1-70b-chat', 'Qwen2_5-72b-chat']

# Read the Hugging Face access token from disk. The context manager closes the file
# handle, and strip() drops a trailing newline that would otherwise be sent as part
# of the token.
with open('../tokens/HF_TOKEN.txt', 'r') as token_file:
    HF_TOKEN = token_file.read().strip()

In [None]:
def combine_uncertainties(df, dataset_name, split_name, full_precision):
    """Combine the uncertainty columns of all models into one dataframe.

    The main model's columns in ``df`` are renamed to ``<column>_<MAIN_DF_NAME>``.
    For every other entry of MODEL_NAMES the matching result CSV is read from
    ``../data/<dataset_name>/with_uncertainty/`` and its uncertainty columns are
    added as ``<column>_<model name>``.

    Args:
        df: Result dataframe of the MAIN_DF_NAME model.
        dataset_name: Name of the dataset directory below ``../data``.
        split_name: Split part of the result file name (the caller in this file
            passes "train" or "test").
        full_precision: File-name infix placed before the split name, including
            its own trailing underscore (the caller passes "fp_" or "").

    Returns:
        A new dataframe containing the renamed main-model columns plus one set of
        suffixed uncertainty columns per additional model. ``df`` itself is not
        modified, because ``rename`` returns a copy.
    """
    # Columns that exist for every model and are copied over with a model suffix.
    uncertainty_columns = [
        "first_token_probability",
        "order_probability",
        "first_token_probability_selected_choice",
        "order_probability_selected_choice",
        "model_is_correct",
    ]

    # Suffix the main model's columns in a single rename instead of copying the
    # frame once per column. "full_answer" is only renamed here; it is never
    # copied from the other models' files.
    main_columns = uncertainty_columns + ["full_answer"]
    df = df.rename(columns={col: col + "_" + MAIN_DF_NAME for col in main_columns})

    # For each of the other dfs, we now add the uncertainty columns to the main df
    for extra_model_df_name in MODEL_NAMES:
        if extra_model_df_name == MAIN_DF_NAME:
            # The main model's columns are already present through the rename
            # above; re-reading its file would only overwrite them.
            continue

        df_extra = pd.read_csv("../data/" + dataset_name + "/with_uncertainty/" + extra_model_df_name + "_" + full_precision + split_name + "_set.csv")
        # Column assignment aligns on the index, so rows are matched by their
        # position in the two CSV files (assumes all files list the questions in
        # the same order - TODO confirm).
        for col in uncertainty_columns:
            df[col + "_" + extra_model_df_name] = df_extra[col]

    return df

In [None]:
def create_question_with_options_string(df):
    """Add a ``question_with_options`` column with the question and its options.

    Each value is the row's ``Question`` followed by a newline and then one line
    of the form ``<letter>) <choice>`` (newline-terminated) for every choice in
    ``Answer_A`` ... ``Answer_J`` that is neither missing nor an empty string.

    Args:
        df: Dataframe with a ``Question`` column and all ten ``Answer_A`` ...
            ``Answer_J`` columns. The new column is added to this object.

    Returns:
        The same dataframe object, now containing ``question_with_options``.

    Raises:
        KeyError: If ``Question`` or one of the ten answer columns is missing.
    """
    choice_letters = [chr(ord('A') + i) for i in range(10)]  # A ... J
    choice_columns = ['Answer_' + letter for letter in choice_letters]  # Answer_A ... Answer_J

    def format_row(question, choice_values):
        option_lines = []
        for letter, value in zip(choice_letters, choice_values):
            # Skip only missing values and empty strings. A plain truthiness test
            # would also drop legitimate numeric options such as 0.
            if pd.isna(value) or (isinstance(value, str) and value == ""):
                continue
            option_lines.append(letter + ") " + str(value) + "\n")
        return question + '\n' + "".join(option_lines)

    # Build all strings first and assign the column once. This also creates the
    # column for an empty dataframe and does not depend on unique index labels.
    df['question_with_options'] = [
        format_row(question, choice_values)
        for question, *choice_values in zip(df['Question'], *(df[col] for col in choice_columns))
    ]

    return df

In [None]:
def add_choice_similarity(df):
    """Add the average similarity between the answer text and the other choices.

    For every row the ``Answer_Text`` value is embedded together with all
    non-missing choices from ``Answer_A`` ... ``Answer_J`` that differ from it,
    and the mean cosine similarity between the answer and those choices is
    stored. This is done twice, once per embedding model:

    * ``choices_similarity``: "sentence-transformers/all-MiniLM-L6-v2"
    * ``choices_similarity_clinical``: 'pritamdeka/S-PubMedBert-MS-MARCO'

    Args:
        df: Dataframe with an ``Answer_Text`` column and all ten ``Answer_A`` ...
            ``Answer_J`` columns. The two new columns are added to this object.

    Returns:
        The same dataframe object with both similarity columns added. Rows that
        have no choice other than the answer itself get NaN.
    """
    choice_columns = ['Answer_' + chr(ord('A') + i) for i in range(10)]  # Answer_A ... Answer_J

    # Function to compute the average similarity score of one row. The model is
    # passed in explicitly rather than picked up from a variable that is rebound
    # between the two passes.
    def compute_average_similarity(row, model):
        answer_text = str(row['Answer_Text'])
        # Cast the choices to str as well, so they are compared and encoded in the
        # same form as the answer text (CSV parsing can yield numeric choices).
        other_choices = [str(row[col]) for col in choice_columns if not pd.isna(row[col])]
        other_choices = [choice for choice in other_choices if choice != answer_text]
        if not other_choices:
            # Nothing to compare against; there is no meaningful mean.
            return float('nan')

        # Generate embeddings
        answer_text_embedding = model.encode([answer_text])
        choices_embeddings = model.encode(other_choices)

        # Compute cosine similarity between answer text and each choice
        similarities = cosine_similarity(
            answer_text_embedding, choices_embeddings).flatten()
        # Compute average similarity
        return similarities.mean().item()

    # Output column -> pre-trained embedding model used to fill it
    embedding_models = {
        'choices_similarity': "sentence-transformers/all-MiniLM-L6-v2",  # efficient model
        'choices_similarity_clinical': 'pritamdeka/S-PubMedBert-MS-MARCO',  # bio clinical model
    }
    for column_name, model_name in embedding_models.items():
        model = SentenceTransformer(model_name, token = HF_TOKEN)
        df[column_name] = [compute_average_similarity(row, model) for _, row in df.iterrows()]

    return df

In [None]:
# Build one combined, preprocessed CSV per dataset / split / precision variant.
for dataset_name in ['usmle', 'bio', 'cmcqrd']:
    for split in ['train', 'test']:
        # "fp_" and "" are the two file-name infixes of the result files.
        for full_precision in ["fp_", ""]:
            print("Processing " + dataset_name + " " + split)
            input_file_name = "../data/" + dataset_name + "/with_uncertainty/" + MAIN_DF_NAME + "_" + full_precision  + split + "_set.csv"
            output_file_name = "../data/" + dataset_name + "/preprocessed/combined_results_" + full_precision  + split +  "_set.csv"

            # to_csv does not create missing parent directories. Create the target
            # folder up front so a missing "preprocessed" directory cannot discard
            # the results after the embedding pass has already run.
            os.makedirs(os.path.dirname(output_file_name), exist_ok=True)

            # Merge all models' uncertainties, then add the derived text and
            # similarity columns.
            combined_df = combine_uncertainties(pd.read_csv(input_file_name), dataset_name, split, full_precision)
            combined_df = create_question_with_options_string(combined_df)
            combined_df = add_choice_similarity(combined_df)

            combined_df.to_csv(output_file_name, index=False)