Classification of the results as "true" or "false"

In [5]:
# --- Imports ---
import os
import json
import random
import pandas as pd
import time
import re
import shutil
import logging
from openai import OpenAI
import signal
from datetime import datetime


#Utility functions

In [64]:
import regex as re

def check_answers_regex(answer_a, answer_b):
    """Tell whether two answers match once casing, symbols and filler words are ignored.

    Each input is stringified (``None`` becomes the empty string), stripped of
    the phrases "answer" / "the solution is" and of every character that is not
    an ASCII letter or digit, then lower-cased. In addition, a text starting
    with "yes" is accepted as matching a bare "true", and a text starting with
    "no" as matching a bare "false".

    Args:
        answer_a (str): an answer
        answer_b (str): another answer to be compared to answer_a

    Returns:
        bool: True if both answers are considered the same, False otherwise
    """

    def _normalize(raw):
        text = "" if raw is None else str(raw)
        return re.sub(r'(?i)answer|the solution is|[^a-zA-Z0-9]', '', text).lower()

    pair = (_normalize(answer_a), _normalize(answer_b))

    # The prefix test runs on the normalized text, so any word beginning with
    # "yes"/"no" qualifies, not only the standalone words.
    affirmative = "true" in pair and any(text.startswith("yes") for text in pair)
    negative = "false" in pair and any(text.startswith("no") for text in pair)
    if affirmative or negative:
        return True

    # Otherwise the normalized forms must be identical.
    return pair[0] == pair[1]


def determine_type(item):
    """Classify a value with a coarse type label.

    Strings are inspected after stripping surrounding whitespace: digits only
    give "int", digits with a single "." give "float", anything else gives
    "str". Non-string values give "bool" for booleans, "num" for ints and
    floats, and "other" for everything else.

    Args:
        item: the value to classify

    Returns:
        str: one of "int", "float", "str", "bool", "num", "other"
    """
    if isinstance(item, str):
        text = item.strip()  # Remove leading and trailing whitespace
        if text.isdigit():
            return "int"
        # Dropping at most one dot lets "1.5", "5." and ".5" count as floats.
        if text.replace('.', '', 1).isdigit():
            return "float"
        return "str"
    # bool must be tested before int/float: bool is a subclass of int, so the
    # numeric test would otherwise swallow True/False.
    if isinstance(item, bool):
        return "bool"
    if isinstance(item, (int, float)):
        return "num"
    return "other"


In [65]:
check_answers_regex("yes, this is correct", "True")

True

In [27]:


# Prefer the OPENAI_API_KEY environment variable so the secret does not have to
# be written into the notebook; the placeholder is kept as the fallback value.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "YOUR_API_KEY"

# Timestamped INFO-level logging for the whole notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)


def create_response_GPT_base(question_prompt, basemodel, api_key=OPENAI_API_KEY):
    """Send a single-turn prompt to an OpenAI chat model and return the reply text.

    The request is attempted up to three times. When every attempt fails no
    exception is raised; the string "Error occurred: <exception>" describing
    the last failure is returned instead.

    Args:
        question_prompt (str): The question prompt
        basemodel (str): The model to use (e.g., "gpt-4o")
        api_key (str, optional): OpenAI API key. Defaults to OPENAI_API_KEY.

    Returns:
        str: The model's response, or an "Error occurred: ..." message
    """
    logger.info(f"--- create_response base : {basemodel} --- ")

    answer = None
    for _attempt in range(3):
        try:
            completion = OpenAI(api_key=api_key).chat.completions.create(
                model=basemodel,
                temperature=0,
                max_tokens=1000,
                seed=1,
                messages=[{"role": "user", "content": question_prompt}],
            )
            print("tokens : ", completion.usage.total_tokens)
            print("model used : ", completion.model)
            answer = completion.choices[0].message.content
        except Exception as e:
            # Remember the failure as the provisional result, pause briefly, retry.
            print(f"Error occurred: {e}")
            print(question_prompt)
            time.sleep(0.100)
            answer = f"Error occurred: {e}"
        else:
            break

    return answer

def create_response_GPT4o(question_prompt, api_key=OPENAI_API_KEY):
    """Query the "gpt-4o" model through create_response_GPT_base."""
    return create_response_GPT_base(question_prompt, basemodel="gpt-4o", api_key=api_key)

def create_response_GPT3(question_prompt, api_key=OPENAI_API_KEY):
    """Query the "gpt-3.5-turbo" model through create_response_GPT_base."""
    return create_response_GPT_base(question_prompt, basemodel="gpt-3.5-turbo", api_key=api_key)

# create_response_GPT4o("what is 2+2?")

# Registry mapping a short model label to the function that queries that model.
LLM_dict = {
    "GPT3": {"LLM_function": create_response_GPT3},
    "GPT4o": {"LLM_function": create_response_GPT4o},
}

def get_llm_function(LLM_name, LLM_dict=LLM_dict):
    """Look up the query function registered for an LLM label.

    Args:
        LLM_name (str): The label to look up (a key of LLM_dict).
        LLM_dict (dict, optional): Registry mapping labels to dicts holding an
            "LLM_function" entry. Defaults to the module-level LLM_dict.

    Returns:
        function: The corresponding LLM function.

    Raises:
        ValueError: If LLM_name is not a key of LLM_dict.
    """
    if LLM_name not in LLM_dict:
        raise ValueError(f"No LLM function found for LLM '{LLM_name}'")
    return LLM_dict[LLM_name]["LLM_function"]


In [None]:
def generate_choice_dict(choices):
    """Label each choice with consecutive capital letters starting at 'A'.

    Args:
        choices (iterable): the answer options, in display order

    Returns:
        dict: letter -> choice, in the same order as the input
    """
    first_letter = ord('A')
    return {chr(first_letter + position): option for position, option in enumerate(choices)}

#print(generate_choice_dict(['simile', 'alliteration']))

In [26]:
#returns the file names already in the results folder to add them to the list of skipped files (to not take them into account when running functions)
def get_skipped_files(folder_path, list_skippedfiles=None):
    """Collect the paths of all ``.json`` files under a folder (recursively).

    Args:
        folder_path (str): root folder to walk
        list_skippedfiles (list, optional): existing list to extend in place.
            When omitted, a fresh list is created on every call, so results
            from earlier calls no longer leak into later ones.

    Returns:
        list: ``list_skippedfiles`` (or the new list) with the found paths appended
    """
    if list_skippedfiles is None:
        list_skippedfiles = []
    for dirpath, _dirnames, filenames in os.walk(folder_path):
        list_skippedfiles.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.endswith('.json')
        )
    return list_skippedfiles

#list_skippedfiles = get_skipped_files(results_folder_llms)

#Verification functions

In [33]:

def check_answer_LLM_prompt(prompt, question, right_answer, llm_answer, choices = None, LLM_function = create_response_GPT4o):
    """Ask an LLM to classify an answer against the ground truth, using a caller-supplied prompt.

    The ground truth, the answer to classify and (when available) the lettered
    list of choices are appended to ``prompt`` before it is sent.

    Args:
        prompt (str): instruction text placed at the start of the request
        question (str): question prompt (accepted for signature compatibility;
            not included in the request)
        right_answer (str): the right answer extracted from the benchmark
        llm_answer (str): the LLM's answer
        choices (list, optional): answer options of the question; ``None`` or
            an empty list means no choices paragraph is added
        LLM_function (callable): function taking the full prompt and returning
            the model's reply

    Returns:
        str: the LLM's assessment on whether llm_answer is correct
    """
    choices_prompt = ""
    # Truthiness instead of "!= None": an empty list must not yield a
    # "The choices for this question are: {}" paragraph.
    if choices:
        choices_prompt = "\nThe choices for this question are: " + str(generate_choice_dict(choices)) + "\nThe given response is one of the choices. It is either the right choice, which is the ground truth, or one of the wrong ones. Select the answer that is the closest amongst these choices. "

    prompt_addition = "\n\nThis is the ground truth: " + str(right_answer) + "\nThis is the response you need to classify in comparison to the ground truth: ' " + str(llm_answer) + "'\n\n" + choices_prompt

    return LLM_function(prompt + prompt_addition)


def check_answer_LLM(question, right_answer, llm_answer, choices = None, LLM_function = create_response_GPT4o):
    """Ask an LLM whether llm_answer is correct with respect to right_answer.

    Several prompt variants are assembled below, but only
    ``verification_prompt_neutral`` (question + given response + correct
    response + classification rules) is actually sent to ``LLM_function``.

    Args:
        question (str): question prompt
        right_answer (str): the right answer extracted from the benchmark
        llm_answer (str): the LLM's answer
        choices (list, optional): answer options; only used by prompt variants
            that are not the one currently sent
        LLM_function (callable): function taking the prompt and returning the
            model's reply

    Returns:
        str: an LLM's assessment on whether llm_answer is correct
    """

    # Earliest prompt variant, kept for reference:
    #prompt = "Someone was asked to answer the following question : " + str(question) + "\n The person answered : " + str(llm_answer) +" . The right answer is :" + str(right_answer) + " \n\n Did the person answer correctly ? \nAnswer by 'yes' or 'no' . "
    #for case sensitive answers:
    # NOTE(review): choices_prompt only feeds prompt3 and prompt7, neither of
    # which is the prompt sent at the end of this function.
    choices_prompt = ""
    if choices != None:
        choices_prompt = "\nThe choices for this question are: " + str(generate_choice_dict(choices)) + "\nThe given response is correct if the case of the given response matches the case of the correct response. "

    # Variant with the question, asking for step-by-step thinking (not sent).
    prompt1 = ("You are skilled at interpreting texts. Your task is to classify text snippets, specifically assessing whether they are true or false.\nThis is the question: " 
        + str(question) + 
        "\n\nThis is the correct answer to the question: " + str(right_answer) + 
        "\n\nThis is the response you need to classify: " + str(llm_answer) + 
        """\n\nAdhere to the following instructions when categorizing the response:\n 
            - The classification result should be a boolean value, represented as either TRUE or FALSE.\n
            - Avoid including comments in your answer.\n
            - Think step by step before providing your final answer.
            \n\nOutput either TRUE or FALSE""")

    # prompt without the question (not sent):
    prompt2 = ("You are skilled at interpreting texts. Your task is to compare two text snippets, specifically assessing whether they have the same meaning. These text snippets are responses to a question, the goal is to assess whether they give the same answer to the question and respond TRUE if they do.\n"
        "\n\nThis is the first answer to the question: " + str(right_answer) + 
        "\nThis is the second answer to the question: " + str(llm_answer) + 
        """\n\nAdhere to the following instructions:\n 
            - The result should be a boolean value, represented as either TRUE or FALSE.\n
            - Avoid including comments in your answer.\n
            - Think step by step before providing your final answer.
            \n\nOutput either TRUE or FALSE
        \n\nExamples: -'The correct option is San Francisco.' compared to 'san francisco' : TRUE \n
        -'John would need 8 dollars' compared to '$8' : TRUE \n
        -'It would take 5 minutes' compared to '8 minutes' : FALSE\n
        -'The correct option is candle' compared to 'lightbulb' : FALSE \n""")

    # prompt without the question but with the choices (not sent):
    prompt3 = ( "You are skilled at classifying texts. Your task is to compare two text snippets. These text snippets are responses to a question, one is the correct answer, and one is an individual's answer. The goal is to classify whether the individual gave the correct answer to the question and respond TRUE if they did, FALSE otherwise.\n\n" +
        "\nA question was given to an individual.\nThis is the right answer to the question: " + str(right_answer) + 
        "\nThis is the individual's answer to the question: " + str(llm_answer) + choices_prompt +
        "\nIs this individual correct? " +
        """\n\nAdhere to the following instructions:\n 
            - The result should be a boolean value, represented as either TRUE or FALSE.\n
            - Avoid including comments in your answer.\n
            \n\nOutput either TRUE or FALSE
        \n\nExamples: - The individual's answer is 'The correct option is San Francisco.',the right answer is 'san francisco' : TRUE \n
        - The individual's answer is 'John would need 8 dollars', the right answer is '$8' : TRUE \n
        - The individual's answer is 'It would take 5 minutes', the right answer is '8 minutes' : FALSE\n
        - The individual's answer is 'The correct option is candle', the right answer is 'lightbulb' : FALSE \n""")

    # Ground-truth comparison with few-shot examples and the choices (not sent).
    prompt7 = ( "Your task is to classify responses to questions, specifically assessing whether they are true or false in comparison to a ground truth." +  
    """\n\nAdhere to the following instructions when categorizing the response:\n 
    - The classification result should be a boolean value, represented as either TRUE or FALSE.\n
    - Only respond by saying 'TRUE' or 'FALSE'\n
    - Avoid including comments in your answer.\n
    - If the given response includes a response followed by an explanation, only take into account the response.\n
    \n\nExamples: - The given response is 'The correct option is San Francisco.', the ground truth is 'san francisco' : TRUE \n
        - The given response is 'John would need 8 dollars', the ground truth is '$8' : TRUE \n
        - The given response is 'The garden will be lit for 5 hours', the ground truth is '5' : TRUE \n
        - The given response is 'It would take 5 minutes', the ground truth is '8 minutes' : FALSE\n
        - The given response is 'The correct option is candle', the ground truth is 'lightbulb' : FALSE \n
        - The given response is 'True. Donna will successfully become an artist because she has all the skills for it.', the ground truth is 'True' : TRUE \n"""
    + "\n\nThis is the ground truth: " + str(right_answer) + "\nThis is the response you need to classify in comparison to the ground truth: ' " + str(llm_answer) + "'\n\n" + choices_prompt )


    # Active variant: question, given response, correct response and rules.
    verification_prompt_neutral= ("Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response."
                                  + "\n\nThis is the question:" + str(question) + "\n\nThis is the given response: " + str(llm_answer) +" \n\nThis is the correct response: " + str(right_answer) +
                                  """\n\nAdhere to the following rules when classifying the given response:\n
    - The classification result should be a boolean value, represented as either TRUE or FALSE.\n
    - Only output 'TRUE' or 'FALSE'\n
    - Never include any other comments or strings other than TRUE or FALSE in your output.
    - Do not answer the question.\n
    - The given response can be phrased differently from the correct response. This does not mean it should be classified as FALSE.\n
    - Only assess if the given response contains the information that matches with the information provided in the correct response.\n
    - Check whether the correct response is in essence occurring somewhere in the given response. If so, classify it as TRUE.\n
    """)

    # Same rules without the question, plus a "focus on the end" rule (not sent).
    # NOTE(review): the literal below spells "TURE" in its third rule; left
    # untouched here because it is prompt text.
    verification_prompt_neutral_noquestion= ("Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response."
                                  + "\n\nThis is the given response: " + str(llm_answer) +" \n\nThis is the correct response: " + str(right_answer) +
                                  """\n\nAdhere to the following rules when classifying the given response:\n
    - The classification result should be a boolean value, represented as either TRUE or FALSE.\n
    - Only output 'TRUE' or 'FALSE'\n
    - Never include any other comments or strings other than TURE or FALSE in your output.
    - Do not answer the question.\n
    - The given response can be phrased differently from the correct response. This does not mean it should be classified as FALSE.\n
    - Only assess if the given response contains the information that matches with the information provided in the correct response.\n
    - Check whether the correct response is in essence occurring somewhere in the given response. If so, classify it as TRUE.\n
    - Focus on the relevant information in the given response, which is often at the end of it. Do not consider non-essential information such as greetings, small talk, and comments about hobbies or skills when classifying the given response.\n
    """)


    # Select the variant that is sent; prompt_num is assigned but not read
    # anywhere in this function.
    prompt = verification_prompt_neutral
    prompt_num = 8



    return LLM_function(prompt)

In [43]:
import os
import json

def classify_answers_in_file(prompt, file_path, new_file_path, LLM_name, list_skippedfiles):
    """Verify every answer stored in one results file and write the annotated copy.

    Each entry gets two extra keys, ``iscorrect_<LLM_name>`` and
    ``verif_<LLM_name>``. Nothing is written when the target path is listed in
    ``list_skippedfiles`` and entry "1" already carries the verification key.

    Relies on ``load_data`` and ``verify_llm_answer``, which are defined
    outside this cell.

    Args:
        prompt (str): verification prompt forwarded to verify_llm_answer
        file_path (str): file to read
        new_file_path (str): JSON file to write
        LLM_name (str): label used in the two new key names
        list_skippedfiles (list): paths considered already processed
    """
    # deception classification
    print("entering function classify_answers_in_file")
    filename = os.path.basename(file_path)
    target_path = new_file_path.replace("//", "/")
    print("Classifying answers for ", filename, target_path, " with ", LLM_name)

    records = load_data(file_path)
    correct_key = f"iscorrect_{LLM_name}"
    verif_key = f"verif_{LLM_name}"

    if target_path in list_skippedfiles and verif_key in records.get("1", {}):
        print("Skipped verification : ", verif_key , "already filled in. \n")
        return

    annotated = {}
    for record_id, record in records.items():
        verdict, note = verify_llm_answer(
            LLM_name,
            prompt,
            record.get("llm_answer"),
            record.get("right_answer"),
            record.get("choices"),
            record,
        )
        annotated[record_id] = {**record, correct_key: verdict, verif_key: note}

    # Write the updated data back to a new or the same JSON file
    with open(new_file_path, 'w') as out_file:
        json.dump(annotated, out_file, indent=2)


def verify_all_files(prompt, path, LLM_name="GPT4o", list_skippedfiles=None):
    """Run verify_answers_in_file on one JSON file or on every JSON file of a directory.

    Files are verified in place (the same path is read and written). A
    directory is scanned non-recursively, in sorted name order.

    Args:
        prompt (str): verification prompt forwarded to verify_answers_in_file
        path (str): a ``.json`` file or a directory containing ``.json`` files
        LLM_name (str): label of the verifying model
        list_skippedfiles (list, optional): paths considered already processed;
            defaults to an empty list
    """
    skipped = [] if list_skippedfiles is None else list_skippedfiles

    if os.path.isfile(path) and path.endswith('.json'):
        # If it's a single JSON file, process it directly
        print(f"--- Verifying answers for {os.path.basename(path)} with {LLM_name} ---")
        verify_answers_in_file(prompt, path, path, LLM_name, skipped)
    elif os.path.isdir(path):
        # Select the regular *.json files first so that the "File i/n" counter
        # counts processed files only and can never exceed the total.
        json_paths = [
            candidate
            for candidate in (
                os.path.join(path, name)
                for name in sorted(os.listdir(path))
                if name.endswith('.json')
            )
            if os.path.isfile(candidate)
        ]
        num_files = len(json_paths)

        for i, file_path in enumerate(json_paths, 1):
            print(f" --- Verification : File {i}/{num_files} ---")
            verify_answers_in_file(prompt, file_path, file_path, LLM_name, skipped)
    else:
        print(f"The path provided does not point to a valid JSON file or directory: {path}")



"""# Example usage
path = "./simplified_folder/prefixes_experiment/prefixes-results150questions/prefixes_answers_short.xlsx"
#prompt = "Please verify the following answers:"
LLM_name = "Claude3"
list_skippedfiles = []
#add_new_verify_all_files_prompt(prompt, path, LLM_name, list_skippedfiles)
new_verify_all_files_in_folder_prompt(path, LLM_name, list_skippedfiles)"""


'# Example usage\npath = "./simplified_folder/prefixes_experiment/prefixes-results150questions/prefixes_answers_short.xlsx"\n#prompt = "Please verify the following answers:"\nLLM_name = "Claude3"\nlist_skippedfiles = []\n#add_new_verify_all_files_prompt(prompt, path, LLM_name, list_skippedfiles)\nnew_verify_all_files_in_folder_prompt(path, LLM_name, list_skippedfiles)'

In [21]:
import os
import json

def verify_answers_in_file(prompt, file_path, new_file_path, LLM_name, list_skippedfiles):
    """Verify every answer stored in one results file and write the annotated copy.

    Each entry gets two extra keys, ``iscorrect_<LLM_name>`` and
    ``verif_<LLM_name>``. Nothing is written when the target path is listed in
    ``list_skippedfiles`` and entry "1" already carries the verification key.

    Relies on ``load_data`` and ``verify_llm_answer``, which are defined
    outside this cell.

    Args:
        prompt (str): verification prompt forwarded to verify_llm_answer
        file_path (str): file to read
        new_file_path (str): JSON file to write
        LLM_name (str): label used in the two new key names
        list_skippedfiles (list): paths considered already processed
    """
    print("entering function add_new_verify_answers_prompt")
    filename = os.path.basename(file_path)
    target_path = new_file_path.replace("//", "/")
    print("Verifying answers for ", filename, target_path, " with ", LLM_name)

    records = load_data(file_path)
    correct_key = f"iscorrect_{LLM_name}"
    verif_key = f"verif_{LLM_name}"

    if target_path in list_skippedfiles and verif_key in records.get("1", {}):
        print("Skipped verification : ", verif_key , "already filled in. \n")
        return

    annotated = {}
    for record_id, record in records.items():
        verdict, note = verify_llm_answer(
            LLM_name,
            prompt,
            record.get("llm_answer"),
            record.get("right_answer"),
            record.get("choices"),
            record,
        )
        annotated[record_id] = {**record, correct_key: verdict, verif_key: note}

    # Write the updated data back to a new or the same JSON file
    with open(new_file_path, 'w') as out_file:
        json.dump(annotated, out_file, indent=2)


def verify_all_files(prompt, path, LLM_name="GPT4o", list_skippedfiles=None):
    """Run verify_answers_in_file on one JSON file or on every JSON file of a directory.

    Files are verified in place (the same path is read and written). A
    directory is scanned non-recursively, in sorted name order.

    Args:
        prompt (str): verification prompt forwarded to verify_answers_in_file
        path (str): a ``.json`` file or a directory containing ``.json`` files
        LLM_name (str): label of the verifying model
        list_skippedfiles (list, optional): paths considered already processed;
            defaults to an empty list
    """
    skipped = [] if list_skippedfiles is None else list_skippedfiles

    if os.path.isfile(path) and path.endswith('.json'):
        # If it's a single JSON file, process it directly
        print(f"--- Verifying answers for {os.path.basename(path)} with {LLM_name} ---")
        verify_answers_in_file(prompt, path, path, LLM_name, skipped)
    elif os.path.isdir(path):
        # Select the regular *.json files first so that the "File i/n" counter
        # counts processed files only and can never exceed the total.
        json_paths = [
            candidate
            for candidate in (
                os.path.join(path, name)
                for name in sorted(os.listdir(path))
                if name.endswith('.json')
            )
            if os.path.isfile(candidate)
        ]
        num_files = len(json_paths)

        for i, file_path in enumerate(json_paths, 1):
            print(f" --- Verification : File {i}/{num_files} ---")
            verify_answers_in_file(prompt, file_path, file_path, LLM_name, skipped)
    else:
        print(f"The path provided does not point to a valid JSON file or directory: {path}")



"""# Example usage
path = "./simplified_folder/prefixes_experiment/prefixes-results150questions/prefixes_answers_short.xlsx"
#prompt = "Please verify the following answers:"
LLM_name = "Claude3"
list_skippedfiles = []
#add_new_verify_all_files_prompt(prompt, path, LLM_name, list_skippedfiles)
new_verify_all_files_in_folder_prompt(path, LLM_name, list_skippedfiles)"""


'# Example usage\npath = "./simplified_folder/prefixes_experiment/prefixes-results150questions/prefixes_answers_short.xlsx"\n#prompt = "Please verify the following answers:"\nLLM_name = "Claude3"\nlist_skippedfiles = []\n#add_new_verify_all_files_prompt(prompt, path, LLM_name, list_skippedfiles)\nnew_verify_all_files_in_folder_prompt(path, LLM_name, list_skippedfiles)'

In [49]:
def verify_all_files_in_folder(base_path, benchmark_dict, LLM_name="GPT4", list_skippedfiles=[]):
    """
    Walk a folder tree and verify every JSON file whose name mentions a known benchmark.

    For each ``.json`` file, the first key of ``benchmark_dict`` (in dict
    order) that occurs in the file name selects the verification prompt, and
    the file is handed to ``verify_all_files``. Files matching no benchmark
    are ignored.

    Args:
    - base_path (str): The path to the base folder to start the search from.
    - benchmark_dict (dict): A dictionary with benchmark names as keys; each value
                             must provide a 'verifprompt' entry.
    - LLM_name (str): The name of the language model to use for verification.
    - list_skippedfiles (list): A list of files to skip during the verification process.
    """
    for root, _dirs, files in os.walk(base_path):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            # Only the first matching benchmark is used for a given file.
            matched = next((name for name in benchmark_dict if name in file_name), None)
            if matched is None:
                continue
            file_path = os.path.join(root, file_name)
            verification_prompt = benchmark_dict[matched]['verifprompt']
            verify_all_files(verification_prompt, file_path, LLM_name, list_skippedfiles)



In [48]:
import os
import pandas as pd
import ast  

def extract_last_part(input_string):
    """Return the text following the last "####" marker, and whether a marker was found.

    When the marker is absent, or when nothing but whitespace follows it, the
    input string is returned unchanged.

    Args:
        input_string (str): raw answer text

    Returns:
        tuple: (extracted text or the original string, True if "####" occurs
        in the input else False)
    """
    marker = "####"
    if marker not in input_string:
        return input_string, False

    tail = input_string.split(marker)[-1].strip()
    # An empty tail means the marker ended the text: fall back to the full input.
    return (tail or input_string), True


def verify_llm_answers_in_excel_extracted_answer(excel_file_path, output_excel_file_path, LLM_name, benchmark_dict, randomly_selected_value="1", save_interval=10, backup_interval=100, selected_benchmarks=None, selected_prompt_characteristics=None):
    """
    Verifies LLM answers in an Excel file and updates the `is_correct` column for each row. Saves the file periodically and creates backup files.

    Only rows whose `randomly_selected` value equals `randomly_selected_value`
    and whose benchmark / prompt characteristic are selected are considered,
    and among those only rows whose `<LLM_name>_is_correct` cell is empty,
    "To be determined", "undetermined" or starts with "Error ". The part of
    the answer after the last "####" marker (see `extract_last_part`) is what
    gets verified.

    Args:
        excel_file_path (str): The path to the input Excel file.
        output_excel_file_path (str): The path to save the updated Excel file.
        LLM_name (str): The name of the LLM whose answers are verified; used to build the
            `<LLM_name>_answer`, `_is_correct`, `_verif` and `_extracted_answer` column names.
        benchmark_dict (dict): Dictionary containing benchmark information ('verifprompt' per benchmark).
        randomly_selected_value (str): Value of the `randomly_selected` column to keep (compared as a float). Default is "1".
        save_interval (int): The number of rows to process before saving the file. Default is 10.
        backup_interval (int): The number of rows to process before saving a backup file. Default is 100.
        selected_benchmarks (list): Benchmarks to process. Default (None or empty) is all benchmarks in the file.
        selected_prompt_characteristics (list): Prompt characteristics to process. Default (None or empty) is all of them.
    """
    # Load the Excel file into a DataFrame
    df = pd.read_excel(excel_file_path)

    # Define the column names based on the LLM name
    answer_column = f'{LLM_name}_answer'
    is_correct_column = f'{LLM_name}_is_correct'
    verif_column = f'{LLM_name}_verif'
    extracted_answer_column = f'{LLM_name}_extracted_answer'

    # Ensure the result columns exist
    for column in (is_correct_column, verif_column, extracted_answer_column):
        if column not in df.columns:
            df[column] = None

    # If selected_benchmarks is empty, select all unique benchmarks
    if not selected_benchmarks:
        selected_benchmarks = df['benchmark'].unique().tolist()

    # If selected_prompt_characteristics is empty, select all unique user characteristics
    if not selected_prompt_characteristics:
        selected_prompt_characteristics = df['prompt_characteristic'].unique().tolist()

    # Filter rows on the `randomly_selected` flag, benchmark and prompt characteristic
    selected_rows = df[
        (df['randomly_selected'] == float(randomly_selected_value)) &
        (df['benchmark'].isin(selected_benchmarks)) &
        (df['prompt_characteristic'].isin(selected_prompt_characteristics))
    ]
    total_rows = len(selected_rows)

    # Process each selected row
    for i, (index, row) in enumerate(selected_rows.iterrows()):
        current_status = row[is_correct_column]
        needs_verification = (
            pd.isna(current_status)
            or current_status == "To be determined"
            or current_status == "undetermined"
            or (isinstance(current_status, str) and current_status.startswith("Error "))
        )
        if not needs_verification:
            continue

        benchmark = row['benchmark']

        print(f"Processing row {i + 1}/{total_rows}")
        if str(current_status).startswith("Error "):
            print("error")
        question_prompt = row['question']  # Assuming there's a column 'question' with the prompt
        llm_answer = str(row[answer_column]) if not pd.isna(row[answer_column]) else ""
        right_answer = row['right_answer']  # Assuming there's a column 'right_answer' with the correct answer
        choices = ast.literal_eval(row['choices']) if 'choices' in row and pd.notna(row['choices']) else []

        # Determine the benchmark and corresponding verification prompt
        prompt = benchmark_dict.get(benchmark, {}).get('verifprompt', "")

        extracted_answer, delimiter_exists = extract_last_part(llm_answer)
        if not delimiter_exists:
            # Report which row lacks the marker; .get tolerates files without
            # these identification columns.
            print("no delimiter", row.get('question_id'), row.get('persona'))

        values = {
            "question": question_prompt,
            "llm_answer": llm_answer,
            "right_answer": right_answer,
            "choices": choices,
            "extracted_answer": extracted_answer
        }

        # The verifier label is fixed to "GPT4o" here, independently of the
        # LLM_name whose answers are being checked.
        is_correct, verification = verify_llm_answer2(
            LLM_name="GPT4o",
            prompt=prompt,
            llm_answer=extracted_answer,
            right_answer=right_answer,
            choices=choices,
            values=values
        )

        # Update the DataFrame with the verification result
        df.at[index, is_correct_column] = is_correct
        df.at[index, verif_column] = verification
        df.at[index, extracted_answer_column] = extracted_answer if delimiter_exists else ""

        # Save the DataFrame every `save_interval` rows
        if (i + 1) % save_interval == 0:
            df.to_excel(output_excel_file_path, index=False)
            print(f"Saved progress to {output_excel_file_path} after processing {i + 1} rows.")

        # Save a backup file every `backup_interval` rows
        if (i + 1) % backup_interval == 0:
            backup_file_path = output_excel_file_path.replace(".xlsx", f"_backup_{i + 1}.xlsx")
            df.to_excel(backup_file_path, index=False)
            print(f"Saved backup to {backup_file_path} after processing {i + 1} rows.")

    # Final save to ensure all remaining rows are saved
    df.to_excel(output_excel_file_path, index=False)
    print(f"Final save completed. Updated Excel file saved to {output_excel_file_path}")


def _boolean_word(text):
    """Return "true"/"false" when text is that word once symbols and case are ignored, else None."""
    word = re.sub(r'[^a-zA-Z0-9]', '', str(text)).lower()
    return word if word in ("true", "false") else None


def verify_llm_answer2(LLM_name, prompt, llm_answer = "", right_answer = "", choices = "", values = ""):
    """Rule-based verification of an answer, with an LLM judge as fallback.

    Checks, in order:
      1. empty answer, empty right answer, or an answer that is itself an
         "Error occurred" message -> an "Error..." verdict;
      2. choices that differ only by case/symbols -> exact string comparison,
         otherwise the LLM judge decides ("case sensitive");
      3. normalized equality of answer and right answer -> "True";
      4. boolean right answer -> the answer's first 5 characters are compared
         with it ("correct boolean" / "wrong boolean");
      5. answer equal to one of the (other) choices -> "False";
      6. anything still undetermined -> LLM judge (check_answer_LLM).

    Args:
        LLM_name (str): label reported in the verification note of step 6
        prompt (str): verification prompt (currently not used by any step)
        llm_answer (str): the answer to verify
        right_answer: the expected answer (stringified before comparison)
        choices (list): answer options, may be empty
        values (dict): row data with "question", "right_answer", "llm_answer"
            and "choices"; these are what is sent to the LLM judge

    Returns:
        tuple: (is_correct, verification) where is_correct is "True", "False"
        or an "Error..." string, and verification names the rule that decided
    """
    # rule based verification function + check with LLM if rules fail
    right_answer = str(right_answer)

    # if None:
    if not llm_answer:
        return "Error - Nonetype", "Error occurred - Nonetype"

    # if None for right_answer:
    if not right_answer:
        return "Error - No right answer", "Error occurred - no right answer"

    # Non-string answers would break the string checks below.
    llm_answer = str(llm_answer)

    # if error:
    if llm_answer.startswith("Error occurred"):
        return "Error", "Error occurred"

    is_correct = "undetermined"
    verification = ""

    # if the llm_answer is the right_answer, with case sensitivity:
    if choices:
        for other_choice in choices[1:]:
            if check_answers_regex(other_choice, choices[0]):  # choices differ only by case/symbols
                print("--- case sensitivity ---", other_choice, choices[0])

                if right_answer == llm_answer:
                    is_correct = "True"
                    verification = "correct answer"
                else:
                    is_correct = str(check_answers_regex("True", check_answer_LLM(values["question"], values["right_answer"], values["llm_answer"], values["choices"], LLM_function = create_response_GPT4o)))
                    return is_correct, "case sensitive"

    right_boolean = _boolean_word(right_answer)

    # check if llm_answer is the right_answer, with no case sensitivity
    if check_answers_regex(right_answer, llm_answer):
        is_correct = "True"
        verification = "correct answer"

    # if the right_answer is a boolean, check if this boolean or the opposite appears in the first 5 characters of the llm_answer
    # (for answers with a justification, such as "false. John had 4 dollars left")
    # right_answer is a string at this point, so it is recognized by its text;
    # a type check on the stringified value could never report a boolean.
    elif right_boolean is not None:
        leading_boolean = _boolean_word(llm_answer.lstrip()[0:5])
        if leading_boolean == right_boolean:
            is_correct = "True"
            verification = "correct boolean"
        elif leading_boolean is not None:
            is_correct = "False"
            verification = "wrong boolean"
        # Anything else (e.g. "yes"/"no" phrasing) is left to the LLM judge.

    # if llm_answer is in the choices but it isn't the same answer as right_answer
    elif choices:
        for choice in choices:
            if check_answers_regex(choice, llm_answer):
                is_correct = "False"
                verification = "The answer was the wrong choice"

    # --- end general rule based verification ---

    if is_correct == "undetermined":
        # NOTE(review): the judge is always create_response_GPT4o and receives
        # values["llm_answer"], while the note below reports LLM_name.
        is_correct = str(check_answers_regex("True", check_answer_LLM(values["question"], values["right_answer"], values["llm_answer"], LLM_function = create_response_GPT4o)))
        verification = "went through check_answer_LLM with " + LLM_name

    return is_correct, verification


# Per-benchmark configuration: name of the verification prompt plus the JSON
# files holding the restructured / small (and, where present, rephrased) data.
# Every benchmark currently shares the same verification prompt.
benchmark_dict = {
    "commonsenseQA": {
        "verifprompt": "verification_prompt_neutral",
        "restructured_path": "./simplified_folder/restructured_benchmarks/restructured_commonsenseQA.json",
        "small_path": "./simplified_folder/benchmarks/small-commonsenseQA.json",
        "rephrased_path": "./simplified_folder/rephrased_benchmarks/rephrased_commonsenseQA.json",
    },
    "numGLUE": {
        "verifprompt": "verification_prompt_neutral",
        "restructured_path": "./simplified_folder/restructured_benchmarks/restructured_numglue.json",
        "small_path": "./simplified_folder/benchmarks/small-numglue.json",
    },
    "scienceQA": {
        "verifprompt": "verification_prompt_neutral",
        "restructured_path": "./simplified_folder/restructured_benchmarks/restructured_scienceQA.json",
        "small_path": "./simplified_folder/benchmarks/small-scienceQA.json",
    },
    "strategyQA": {
        "verifprompt": "verification_prompt_neutral",
        "restructured_path": "./simplified_folder/restructured_benchmarks/restructured_strategyQA.json",
        "small_path": "./simplified_folder/benchmarks/small-strategyQA.json",
    },
    "CRT": {
        "verifprompt": "verification_prompt_neutral",
        "restructured_path": "./simplified_folder/restructured_benchmarks/restructured_CRT.json",
        "small_path": "./simplified_folder/benchmarks/CRT.json",
    },
}


In [None]:

#OPENAI_API_KEY = "YOUR_API_KEY"

import os  # re-imported here so the cell runs on its own

# Root folder of the experiment data (placeholder: replace with a real path).
folder_path = "YOU_FOLDER_PATH"

# Paths are assembled with os.path.join so they are valid whether or not
# folder_path ends with a separator. The trailing "" component keeps a
# trailing separator on the two folder variables, so any code that appends a
# file name to them by plain concatenation keeps working.
folder_modified_benchmarks_step2 = os.path.join(folder_path, "step2_modified-benchmarks", "")
folder_answered_benchmarks_step3 = os.path.join(folder_path, "step3_answered-benchmarks", "")
modified_benchmark_path = os.path.join(folder_modified_benchmarks_step2, "strategyQA_base_base.json")
merged_benchmarks_path = os.path.join(folder_modified_benchmarks_step2, "merged_benchmarks.xlsx")
verified_benchmarks_path = os.path.join(folder_answered_benchmarks_step3, "verified_benchmarks.xlsx")

# Maps an LLM name to the function used to query it.
LLM_dict = {
    "GPT3": {"LLM_function": create_response_GPT3},
    "GPT4o": {"LLM_function": create_response_GPT4o},
}

# Launch the verification for the chosen LLM.
# NOTE(review): presumably reads merged_benchmarks_path and writes
# verified_benchmarks_path, saving every `save_interval` rows and backing up
# every `backup_interval` rows -- confirm against the function definition.
LLM_to_verify = 'GPT4o'
verify_llm_answers_in_excel_extracted_answer(
    merged_benchmarks_path,
    verified_benchmarks_path,
    LLM_to_verify,
    benchmark_dict,
    save_interval=50,
    backup_interval=500,
)


2025-10-31 16:35:52,397 - --- create_response base : gpt-4o --- 


Processing row 1/50
Processing row 2/50
Processing row 3/50


2025-10-31 16:35:53,066 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's east coast across Southeast Asia and the Sundaic region to northern Australia and Micronesia, Memphis is a city in the United States.
Read the question again: Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's 

2025-10-31 16:35:53,584 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's east coast across Southeast Asia and the Sundaic region to northern Australia and Micronesia, Memphis is a city in the United States.
Read the question again: Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's 

2025-10-31 16:35:54,123 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"
2025-10-31 16:35:54,231 - --- create_response base : gpt-4o --- 


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's east coast across Southeast Asia and the Sundaic region to northern Australia and Micronesia, Memphis is a city in the United States.
Read the question again: Question: Would you take a photo of a Saltwater crocodile in Memphis?
Facts: The saltwater crocodile is native to saltwater habitats and brackish wetlands from India's 

2025-10-31 16:35:54,511 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Was Harry Potter a better investment than The Matrix for Warner Bros.?
Facts: Warner Bros. distributes several movie franchises including The Matrix, Harry Potter, and The Dark Knight, The Matrix had 2 sequels, Harry Potter had 7 sequels and several spin-offs, Harry Potter and the Deathly Hallows – Part 2 is Warner Bros. highest grossing film worldwide with a box office gross of $1,342,932,398.
Read the question again: Question: Was Harry Potter a better investment than The Matrix

2025-10-31 16:35:54,915 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Was Harry Potter a better investment than The Matrix for Warner Bros.?
Facts: Warner Bros. distributes several movie franchises including The Matrix, Harry Potter, and The Dark Knight, The Matrix had 2 sequels, Harry Potter had 7 sequels and several spin-offs, Harry Potter and the Deathly Hallows – Part 2 is Warner Bros. highest grossing film worldwide with a box office gross of $1,342,932,398.
Read the question again: Question: Was Harry Potter a better investment than The Matrix

2025-10-31 16:35:55,259 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"
2025-10-31 16:35:55,368 - --- create_response base : gpt-4o --- 


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Solve the task below. Importantly, write your final answer after "####".

Question: Was Harry Potter a better investment than The Matrix for Warner Bros.?
Facts: Warner Bros. distributes several movie franchises including The Matrix, Harry Potter, and The Dark Knight, The Matrix had 2 sequels, Harry Potter had 7 sequels and several spin-offs, Harry Potter and the Deathly Hallows – Part 2 is Warner Bros. highest grossing film worldwide with a box office gross of $1,342,932,398.
Read the question again: Question: Was Harry Potter a better investment than The Matrix

2025-10-31 16:35:55,588 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Rephrase the following instruction while maintaining its meaning:

Input: 
Query: In the event of a severe injury in Bangladesh, is it likely that you would call a Fibonacci number?
Information: The emergency services number in Bangladesh is 999, which is not part of the Fibonacci sequence.
Respond with 'True' or 'False'.

This is the given response: Rephrase the following instruction while keeping its meaning intact:

Input: 
Query: If a serious injury occurs in Bangladesh, would you probably dial a Fibonacci number?
Information: The emergency services number in

2025-10-31 16:35:55,931 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Rephrase the following instruction while maintaining its meaning:

Input: 
Query: In the event of a severe injury in Bangladesh, is it likely that you would call a Fibonacci number?
Information: The emergency services number in Bangladesh is 999, which is not part of the Fibonacci sequence.
Respond with 'True' or 'False'.

This is the given response: Rephrase the following instruction while keeping its meaning intact:

Input: 
Query: If a serious injury occurs in Bangladesh, would you probably dial a Fibonacci number?
Information: The emergency services number in

2025-10-31 16:35:56,366 - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 401 Unauthorized"


Error occurred: Error code: 401 - {'error': {'message': 'Incorrect API key provided: YOUR_API_KEY. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}
Your task is to classify a response to a question, specifically assessing whether the given response is true or false in comparison to the correct response.

This is the question:Rephrase the following instruction while maintaining its meaning:

Input: 
Query: In the event of a severe injury in Bangladesh, is it likely that you would call a Fibonacci number?
Information: The emergency services number in Bangladesh is 999, which is not part of the Fibonacci sequence.
Respond with 'True' or 'False'.

This is the given response: Rephrase the following instruction while keeping its meaning intact:

Input: 
Query: If a serious injury occurs in Bangladesh, would you probably dial a Fibonacci number?
Information: The emergency services number in

In [122]:
import pandas as pd
from datetime import datetime

class TimeoutException(Exception):
    """Plain ``Exception`` subclass; by its name, meant to signal a timed-out operation."""

def _needs_reverification(value):
    """Return True when a stored verification verdict has to be recomputed.

    A cell qualifies when it is empty (NaN/None), when it holds one of the
    placeholder verdicts 'To be determined' / 'undetermined', or when it
    contains the word "error". The match is case-insensitive so that both
    "Error occurred" and the misspelt "Error occured" are caught.
    """
    if pd.isna(value):
        return True
    text = str(value)
    return text in ('To be determined', 'undetermined') or 'error' in text.lower()


def relaunch_verification_errors(file_path, output_excel_file_path,
                                 verification_suffix='_verification',
                                 extracted_answer_suffix='_extracted_answer'):
    """Re-run the answer verification for rows whose verdict is missing or failed.

    Every column ending in '_is_correct' (except those starting with 'llama')
    is scanned. For each cell that is empty, still a placeholder
    ('To be determined' / 'undetermined') or contains an error message, the
    model answer stored in '<model>_answer' is verified again with
    ``verify_llm_answer2`` and the verdict, the verification note and the
    extracted answer are written back. The updated table is saved to
    ``output_excel_file_path`` and a summary of processed / still failing
    rows is printed.

    Relies on ``benchmark_dict``, ``extract_last_part`` and
    ``verify_llm_answer2`` being defined elsewhere in the notebook, and on the
    sheet having 'task_id', 'question' and 'right_answer' columns.

    Args:
        file_path (str): Path to the input Excel file.
        output_excel_file_path (str): Path where the updated Excel file is written.
        verification_suffix (str): Suffix of the column receiving the
            verification note for a model.
        extracted_answer_suffix (str): Suffix of the column receiving the
            extracted answer for a model.

    NOTE(review): the default suffixes are assumptions; confirm they match the
    column names written by the main verification function.
    """
    import ast  # local import: only needed to parse the stringified 'choices' cells

    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)

    # Verdict columns to scan: '<model>_is_correct', llama models excluded
    verdict_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith('_is_correct') and not col.startswith('llama')
    ]
    errors_list = []  # task_id - model combinations that were reprocessed

    for is_correct_column in verdict_columns:
        model_name = is_correct_column[:-len('_is_correct')]
        answer_column = f'{model_name}_answer'
        verif_column = f'{model_name}{verification_suffix}'
        extracted_answer_column = f'{model_name}{extracted_answer_suffix}'

        # Collect the rows to redo up front so progress can be reported as i/total
        pending = [
            index for index, value in df[is_correct_column].items()
            if _needs_reverification(value)
        ]
        if not pending:
            continue

        # Make sure the result columns exist and can hold strings: a column
        # that is entirely empty is loaded as float64 by pandas.
        for target in (is_correct_column, verif_column, extracted_answer_column):
            if target not in df.columns:
                df[target] = None
            df[target] = df[target].astype(object)

        for position, index in enumerate(pending, start=1):
            row = df.loc[index]
            task_id = row['task_id']
            print(f"Empty or error row detected at index {index}, model: {model_name}, task_id: {task_id}")
            print(f"Processing row {position}/{len(pending)}")

            question_prompt = row['question']
            if answer_column in df.columns and not pd.isna(row[answer_column]):
                llm_answer = str(row[answer_column])
            else:
                llm_answer = ""
            right_answer = row['right_answer']
            # 'choices' is stored as the repr of a Python list in the sheet
            choices = ast.literal_eval(row['choices']) if 'choices' in row and pd.notna(row['choices']) else []

            # Determine the benchmark and corresponding verification prompt
            benchmark = row.get('benchmark')
            prompt = benchmark_dict.get(benchmark, {}).get('verifprompt', "")

            extracted_answer, delimiter_exists = extract_last_part(llm_answer)
            if not delimiter_exists:
                print("no delimiter", task_id)

            values = {
                "question": question_prompt,
                "llm_answer": llm_answer,
                "right_answer": right_answer,
                "choices": choices,
                "extracted_answer": extracted_answer
            }

            # The verifier model is fixed to GPT4o, whichever model produced the answer
            is_correct, verification = verify_llm_answer2(
                LLM_name="GPT4o",
                prompt=prompt,
                llm_answer=extracted_answer,
                right_answer=right_answer,
                choices=choices,
                values=values
            )

            # Update the DataFrame with the verification result
            df.at[index, is_correct_column] = is_correct
            df.at[index, verif_column] = verification
            df.at[index, extracted_answer_column] = extracted_answer if delimiter_exists else ""

            errors_list.append(f"{task_id} - {model_name}")

    # Print the total number of errors and the task_id - model combinations
    total_errors = len(errors_list)
    print(f"\nTotal number of questions with errors or empty answers: {total_errors}")
    print("Task ID - Model combinations with errors or empty answers:")
    for error in errors_list:
        print(error)

    # Save the modified DataFrame to a new Excel file
    df.to_excel(output_excel_file_path, index=False)
    print(f"\nModified Excel file saved to {output_excel_file_path}")

    # Check which verdicts are still missing or failed after processing,
    # using the same criterion as the first pass
    updated_errors_list = []
    for is_correct_column in verdict_columns:
        model_name = is_correct_column[:-len('_is_correct')]
        for index, value in df[is_correct_column].items():
            if _needs_reverification(value):
                task_id = df.at[index, 'task_id']
                updated_errors_list.append(f"{task_id} - {model_name}")

    # Print final error count and task_id-model combinations
    total_updated_errors = len(updated_errors_list)
    print(f"\nTotal number of questions still with errors or empty answers after processing: {total_updated_errors}")
    print("Task ID - Model combinations still with errors or empty answers:")
    for updated_error in updated_errors_list:
        print(updated_error)

# Example usage
# excel_file_path = 'path_to_your_excel_file.xlsx'
# output_excel_file_path = 'path_to_save_updated_excel_file.xlsx'
# relaunch_verification_errors(excel_file_path, output_excel_file_path)
