This code segment adapts the selected usable benchmarks by modifying their prompts: either by adding a prefix or a suffix to the tasks, or by asking an LLM to rewrite the prompts according to a specific instruction.


## Imports and installations

In [3]:
# --- Imports ---
import os
import json
import random
import pandas as pd
import time
import re
import shutil
import logging
from openai import OpenAI


## Loading and saving data files from and into JSON or Excel files

Functions:
- load_data(path) : Loads data from an Excel or JSON file.
    - Args:
        path (str): the file path
    - Returns:
        the loaded data: the parsed content for a JSON file (typically a dictionary), or a list of row dictionaries for an Excel file; None if the file could not be loaded
<br>
<br>


- save_data(data, path): Saves data (dictionary or DataFrame) to a JSON or Excel file based on the file extension in the path
    - Args:
        data (dict or DataFrame): data to be saved
        path (str): file path where data will be saved
    - Returns:
        bool: True if data is successfully saved, False otherwise.

In [4]:

def load_data(path):
    """
    Loads data from an Excel or JSON file.

    Args:
        path (str): the file path; must end with '.xlsx' or '.json'

    Returns:
        The loaded data, or None when the format is unsupported or loading
        failed (an error message is printed in both cases):
        - '.json': the parsed JSON content (callers in this file expect a dictionary)
        - '.xlsx': a list of dictionaries, one per row (pandas 'records' orientation)
    """
    if path.endswith('.xlsx'):
        try:
            return pd.read_excel(path).to_dict(orient='records')
        except Exception as e:
            print(f"Error loading data from Excel file: {str(e)}")
            return None

    if path.endswith('.json'):
        try:
            # JSON files are UTF-8; do not depend on the platform default encoding
            with open(path, 'r', encoding='utf-8') as json_file:
                return json.load(json_file)
        except Exception as e:
            print(f"Error loading data from JSON file: {str(e)}")
            return None

    print("Unsupported file format. Please provide an Excel (.xlsx) or JSON (.json) file.")
    return None


def save_data(data, path):
    """
    Saves data to a JSON or Excel file based on the file extension in the path

    Args:
        data (dict, list or DataFrame): data to be saved. For JSON output a
            DataFrame is converted to a {column: [values]} dictionary; for Excel
            output a dictionary or list is converted to a DataFrame first.
        path (str): file path where data will be saved ('.json' or '.xlsx')

    Returns:
        bool: True if data is successfully saved, False otherwise
        (unsupported extension, unsupported data type, or any error while writing;
        a message is printed in each of these cases).
    """

    try:
        if path.endswith('.json'):
            if isinstance(data, pd.DataFrame):
                payload = data.to_dict(orient='list')
            elif isinstance(data, (dict, list)):
                payload = data
            else:
                # Previously nothing was written here but True was still returned
                print(f"Unsupported data type for JSON export: {type(data).__name__}")
                return False

            with open(path, 'w', encoding='utf-8') as json_file:
                json.dump(payload, json_file, indent=2)

        elif path.endswith('.xlsx'):
            if isinstance(data, dict):
                # Convert dictionary to DataFrame if it's a dictionary
                data = pd.DataFrame.from_dict(data)
            elif isinstance(data, list):
                # e.g. the list of row records returned by load_data for Excel files
                data = pd.DataFrame(data)
            data.to_excel(path, index=False)
        else:
            print("Unsupported file format. Please provide a JSON (.json) or Excel (.xlsx) file path.")
            return False

        return True
    except Exception as e:
        print(f"Error saving data: {str(e)}")
        return False

## Adding prompt engineering pre- and suffixes in the benchmarks

This section modifies benchmarks with prompt engineering techniques based on adding some instructions or description before and/or after the question, such as zero-shot chain-of-thought prompting ("Answer step-by-step") or ExpertPrompting ("You are an expert in ...").

Functions :
- questions_with_intro(benchmark_path, folder_benchmark_with_intros, benchmark_name, intro_prompt_dict):
    """Adds introduction prompts to the benchmark questions and saves one modified benchmark file per persona

    Args:
        benchmark_path (str): path of the benchmark to be updated
        folder_benchmark_with_intros (str): folder in which the updated benchmarks are saved
        benchmark_name (str): benchmark name, used as the prefix of the output file names
        intro_prompt_dict (dict): dictionary containing all introduction prompts.
        Each instance should be in the following format: {"FirstName" : ["Introduction prompt", " Additional prompt added at the end of the question", "prompt characteristic: either 'educated', 'uneducated', or 'neutral' "]}
    """

In [5]:
def questions_with_intro(benchmark_path, folder_benchmark_with_intros, benchmark_name, intro_prompt_dict=None):
    """Adds introduction prompts to the benchmark questions

    One output file is written per entry of intro_prompt_dict, named
    <benchmark_name>_<persona>_<prompt_characteristic>.json inside
    folder_benchmark_with_intros.

    Args:
        benchmark_path (str): path of the benchmark to be updated
        folder_benchmark_with_intros (str): folder where the updated benchmarks are saved
            (created if it does not exist)
        benchmark_name (str): benchmark name used as prefix of the output file names
        intro_prompt_dict (dict, optional): dictionary containing all introduction prompts.
            Each instance should be in the following format: {"FirstName" : ["Introduction prompt", " Additional prompt added at the end of the question", "prompt_characteristic: either 'educated', 'uneducated', or 'neutral' "]}
            When omitted or empty, only the output folder is created.

    Raises:
        ValueError: if the benchmark file could not be loaded.
    """
    # creating a folder to put in the rewritten benchmarks, if it does not already exist
    os.makedirs(folder_benchmark_with_intros, exist_ok=True)

    if not intro_prompt_dict:
        return

    # The benchmark is the same for every persona: load it once, outside the loop
    data = load_data(benchmark_path)
    if data is None:
        raise ValueError(f"Could not load benchmark from {benchmark_path}")

    for persona, promptlist in intro_prompt_dict.items():
        prompt = promptlist[0]
        suffix = promptlist[1]
        prompt_characteristic = promptlist[2]

        benchmark_with_intro = {}
        for key, value in data.items():
            new_question = prompt + "\n" + "Answer this question: " + "\n" + value["question"] + "\n" + suffix

            benchmark_with_intro[key] = {
                "question": new_question,
                "original_question": value["original_question"],
                "choices": value["choices"],
                "right_answer": value["right_answer"],
                "benchmark": value["benchmark"],
                "question_id": value["question_id"],
                "persona": persona,
                "prompt_characteristic": prompt_characteristic,
            }

        # Defining a name for the created files
        # careful: some later functions are based on the systems of underscores here to retrieve values,
        # they will not work if the name is not under the format benchmarkname_persona_usercharacteristic_llm.json
        # (no extra underscore should be in the names)
        # os.path.join keeps the file inside the folder whether or not the folder ends with a slash
        personalised_benchmark_path = os.path.join(
            folder_benchmark_with_intros,
            f"{benchmark_name}_{persona}_{prompt_characteristic}.json",
        )

        save_data(benchmark_with_intro, personalised_benchmark_path)


## Rephrasing the benchmarks following a specific prompt

This section modifies benchmarks with prompt engineering techniques based on asking LLMs to rephrase the question.

Functions :
- create_response_GPT_base(question_prompt, basemodel, api_key=OPENAI_API_KEY):
    Function for getting responses from GPT models.

    Args:
        question_prompt (str): The question prompt
        basemodel (str): The model to use (e.g., "gpt-4o")
        api_key (str, optional): OpenAI API key. Defaults to OPENAI_API_KEY.

    Returns:
        str: The model's response
    
- rephrase_questions(additional_prompt, input_path, output_reformated_path, persona, LLM_function = create_response_GPT4o):
    Rephrases all questions in a benchmark JSON file using a given LLM function,
    adds persona and prompt_characteristic fields, and saves the modified data.

    Args:
        additional_prompt (str): Text prompt with the rephrasing instructions
        input_path (str): Path to the input JSON benchmark file.
        output_reformated_path (str): Path where the rephrased JSON file will be saved.
        persona (str): Label describing the persona/prompt characteristic to associate with the questions.
        LLM_function (callable, optional): Function that takes a string input and returns
            a rephrased version of the question (default: create_response_GPT4o).

    Output:
        Saves a formatted JSON file with the same structure as
        the input, but with rephrased questions and added persona metadata.
    
- rephrase_all_prompts(benchmark_name, benchmark_path, rephrasing_dict={}, folder_new_benchmark = "./rephrased_benchmarks/"):
    Applies the rephrase_questions function on all prompts in rephrasing_dict

- add_base(input_path, benchmarkname, folder_new_benchmark = "./modified-benchmarks/"):
    Adds 'persona' and 'prompt_characteristic' = 'base' to each entry for the control dataset.
    


    



In [None]:
# if rephrasing benchmarks with LLMs:

# Timestamped INFO-level logging for the LLM calls
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)

# Prefer the OPENAI_API_KEY environment variable so the secret does not have to be
# written into the notebook; the placeholder remains as a fallback for manual editing.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

def create_response_GPT_base(question_prompt, basemodel, api_key=None, max_retries=3, retry_delay=0.1):
    """Base function for getting responses from GPT models.

    The request is sent as a single user message with temperature 0, seed 1 and
    at most 1000 completion tokens.

    Args:
        question_prompt (str): The question prompt
        basemodel (str): The model to use (e.g., "gpt-4o")
        api_key (str, optional): OpenAI API key. Defaults to the current value of
            OPENAI_API_KEY, looked up when the function is called.
        max_retries (int, optional): Maximum number of attempts. Defaults to 3.
        retry_delay (float, optional): Seconds to wait after a failed attempt. Defaults to 0.1.

    Returns:
        str: The model's response. If every attempt fails, a string of the form
        "Error occurred: <exception>" is returned instead of raising, so callers
        that store the result should check for that prefix.
    """
    if api_key is None:
        # Resolved at call time so that a key assigned to OPENAI_API_KEY after this
        # function was defined is still picked up.
        api_key = OPENAI_API_KEY

    logger.info("--- create_response base : %s --- ", basemodel)

    response = None
    client = None

    for _ in range(max_retries):
        try:
            # Built inside the try so that construction errors are retried/reported
            # like request errors, but only once across attempts.
            if client is None:
                client = OpenAI(api_key=api_key)

            resp = client.chat.completions.create(
                model=basemodel,
                temperature=0,
                max_tokens=1000,
                seed=1,
                messages=[
                    {"role": "user", "content": question_prompt}
                ]
            )
            print("tokens : ", resp.usage.total_tokens)
            print("model used : ", resp.model)
            response = resp.choices[0].message.content
            break
        except Exception as e:
            print(f"Error occurred: {e}")
            print(question_prompt)
            time.sleep(retry_delay)
            response = f"Error occurred: {e}"

    return response

def create_response_GPT4o(question_prompt, api_key=None):
    """Gets a response from the "gpt-4o" model via create_response_GPT_base.

    Args:
        question_prompt (str): The question prompt
        api_key (str, optional): OpenAI API key. Defaults to the current value of
            OPENAI_API_KEY, looked up when the function is called.

    Returns:
        str: The value returned by create_response_GPT_base
    """
    if api_key is None:
        # Late lookup: a default bound in the signature would freeze the key at definition time
        api_key = OPENAI_API_KEY
    return create_response_GPT_base(question_prompt, "gpt-4o", api_key)

# create_response_GPT4o("what is 2+2?")

In [None]:

def rephrase_questions(additional_prompt, input_path, output_reformated_path, persona, LLM_function = create_response_GPT4o):
    """
    Rephrases all questions in a benchmark JSON file using a given LLM function,
    adds persona and prompt_characteristic fields, and saves the modified data.

    Args:
        additional_prompt (str): Text prompt with the rephrasing instructions; it is
            prepended directly to each question before calling the LLM.
        input_path (str): Path to the input JSON benchmark file.
        output_reformated_path (str): Path where the rephrased JSON file will be saved.
        persona (str): Label describing the persona/prompt characteristic to associate with the questions;
            stored in both the 'persona' and 'prompt_characteristic' fields.
        LLM_function (callable, optional): Function that takes a string input and returns
            a rephrased version of the question (default: create_response_GPT4o).

    Output:
        Saves a formatted JSON file at `output_reformated_path` with the same instance
        fields as the input, but with rephrased questions and added persona metadata.
        Instances are keyed by a counter starting at 1 (in input order), not by the
        keys of the input file.
    """
    # Read the whole benchmark first so the file is not kept open during the LLM calls
    with open(input_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    new_data = {}
    for questions_count, instance in enumerate(data.values(), start=1):
        rephrased_question = LLM_function(additional_prompt + instance.get("question", ""))

        new_data[questions_count] = {
            "question": rephrased_question,
            "original_question": instance.get("original_question"),
            "choices": instance.get("choices"),
            "right_answer": instance.get("right_answer"),
            "question_id": instance.get("question_id"),
            "benchmark": instance.get("benchmark"),
            "persona": persona,
            "prompt_characteristic": persona,
        }

    with open(output_reformated_path, 'w', encoding='utf-8') as new_json_file:
        json.dump(new_data, new_json_file, indent=2)

def rephrase_all_prompts(benchmark_name, benchmark_path, rephrasing_dict=None, folder_new_benchmark = "./rephrased_benchmarks/"):
    """
    Applies the rephrase_questions function on all prompts in rephrasing_dict

    Args:
        benchmark_name (str): benchmark name used as prefix of the output file names
        benchmark_path (str): path to the benchmark JSON file to rephrase
        rephrasing_dict (dict, optional): maps a label to the rephrasing instruction text;
            one file <benchmark_name>_<label>.json is produced per entry.
            When omitted or empty, only the output folder is created.
        folder_new_benchmark (str, optional): output folder (created if it does not exist)
    """
    # creating a folder to put in the rewritten benchmarks, if it does not already exist
    os.makedirs(folder_new_benchmark, exist_ok=True)

    for key, value in (rephrasing_dict or {}).items():
        print(" Rephrasing ", key)
        # os.path.join works whether or not the folder ends with a slash
        rephrased_path = os.path.join(folder_new_benchmark, f"{benchmark_name}_{key}.json")
        rephrase_questions(value, benchmark_path, rephrased_path, key)
        print("---")


def add_base(input_path, benchmarkname, folder_new_benchmark = "./modified-benchmarks/"):
    """
    Adds 'persona' and 'prompt_characteristic' = 'base' to each entry for the control dataset.

    Args:
        input_path (str): path to the benchmark JSON file (a dictionary of instances)
        benchmarkname (str): benchmark name; the output file is named
            <benchmarkname>_base_base.json
        folder_new_benchmark (str, optional): output folder (created if it does not exist)

    Output:
        Saves the annotated benchmark as UTF-8 JSON in `folder_new_benchmark`
        and prints the output path.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for instance in data.values():
        instance["persona"] = "base"
        instance["prompt_characteristic"] = "base"

    os.makedirs(folder_new_benchmark, exist_ok=True)
    output_path = os.path.join(folder_new_benchmark, f"{benchmarkname}_base_base.json")

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f"✅ File saved to {output_path}")



## Re-Reading Prompt Engineering technique

This function is specific to the Re-Reading prompt engineering technique, as it is based on a different structure than most of the other prefix- or suffix-based prompt engineering techniques.

In [55]:
def rereading_benchmark(input_path, output_path):
    """
    Builds the Re-Reading variant of a benchmark: each question is stated, then
    repeated after "Read the question again: ".

    The trailing instruction " \\nAnswer by only giving the correct option." is removed
    from 'original_question' before the new prompt is built, and the 'benchmark'
    field is truncated at its first '-'.

    Args:
        input_path (str): path to the input benchmark JSON file (a dictionary of instances
            with 'original_question', 'right_answer', 'benchmark' and 'question_id';
            'choices' is copied when present)
        output_path (str): path where the modified benchmark is saved

    Output:
        Saves the modified benchmark as JSON at `output_path`, with 'persona' and
        'prompt_characteristic' set to 'rereading', and prints the output path.
    """
    # JSON files are UTF-8; do not depend on the platform default encoding
    with open(input_path, 'r', encoding='utf-8') as infile:
        input_json = json.load(infile)

    updated_json = {}

    for key, value in input_json.items():
        original_question_text = value["original_question"].replace(" \nAnswer by only giving the correct option.", "")

        new_question = (
            "Solve the task below. Importantly, write your final answer after \"####\".\n\n"
            + original_question_text
            + "\nRead the question again: "
            + original_question_text
            + "\n"
        )

        entry = {
            "question": new_question,
            "original_question": original_question_text,
            "right_answer": value["right_answer"],
            "prompt_characteristic": "rereading",
            "persona": "rereading",
            # keep only the part of the benchmark label before the first '-'
            "benchmark": value["benchmark"].split('-')[0],
            "question_id": value["question_id"],
        }

        if "choices" in value:
            entry["choices"] = value["choices"]

        updated_json[key] = entry

    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(updated_json, outfile, indent=2)
    print(f"Updated JSON has been saved to {output_path}")


# Generate all benchmark variants

In [None]:

def create_modified_benchmarks(benchmarkname, benchmarkpath, bioprefix_dict, rephrasing_dict, newfolder):
    """
    Generates a complete set of modified benchmark variants by sequentially applying
    all transformation functions (intro addition, LLM-based rephrasing, and base version creation).

    Specifically, this function:
      1. Calls `questions_with_intro()` to prepend custom introductions defined
         in `bioprefix_dict` to the benchmark questions, producing multiple prefixed versions.
      2. Calls `rephrase_all_prompts()` to rephrase the same benchmark using GPT-based
         prompts from `rephrasing_dict`, generating multiple rephrased variants.
      3. Calls `add_base()` to create a baseline version of the benchmark with
         neutral persona fields ('base') for reference or comparison.

    Args:
        benchmarkname (str): Name identifier for the benchmark (used in output filenames).
        benchmarkpath (str): Path to the original benchmark JSON file.
        bioprefix_dict (dict): Mapping of persona names to
            [introduction prompt, suffix prompt, prompt characteristic] lists,
            as expected by `questions_with_intro()`.
        rephrasing_dict (dict): Mapping of persona names to prompt rephrasing instructions
            for GPT-based question generation.
        newfolder (str): Output directory where all generated benchmark variants will be saved.

    Output:
        Creates and saves:
          - One version per prefix in `bioprefix_dict`
          - One version per persona in `rephrasing_dict`
          - One baseline version with 'persona' and 'prompt_characteristic' set to 'base'
        All files are stored in the specified `newfolder`.
    """

    # all N prefixes from the dictionary added, creation of N new versions of the benchmark
    questions_with_intro(benchmarkpath, newfolder, benchmarkname, bioprefix_dict)

    # an LLM rephrases the benchmark according to all M prompts in the rephrasing dictionary,
    # creation of M rephrased versions of the benchmark in the selected folder
    rephrase_all_prompts(benchmarkname, benchmarkpath, rephrasing_dict, newfolder)

    # control version; saved in the same output folder as the other variants
    add_base(benchmarkpath, benchmarkname, newfolder)

# Step 2 Main

In [None]:
# Example with StrategyQA:

# Take the key from the environment when available instead of hard-coding it here
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY")

folder_path = "YOUR_FOLDER_PATH"
# The trailing slash is kept on purpose: the output folder is also used as a plain string prefix
folder_modified_benchmarks_step2 = os.path.join(folder_path, "step2_modified-benchmarks/")
cleaned_benchmark_path = os.path.join(folder_path, "strategyqa_cleaned_10questions.json")

# examples of dictionaries for the prompt engineering techniques
# format: {persona: [introduction prompt, suffix prompt, prompt characteristic]}
bioprefix_dict = {
    "educatedpersona": [
        "Hello, my name is Dr. Mark Davis. I am a globally recognized cardiologist from Los Angeles, California. I completed my studies at Yale University where I was awarded my PhD in Cardiology. My research primarily investigates the role of lifestyle in heart diseases. During my free time, I like to cycle, play chess, and read biographies. I also devote time to mentor young doctors in community health centers.",
        " Please reply succintly",
        "educated",
    ],
    "uneducatedpersona": [
        "Hello, my name is Billy Thompson. Hailing from Phoenix, Arizona, I never had much of a taste for schoolin', never graduated past fifth grade. I follow my own curiosities. Fishing, working on my motorcycle, and listening to classic rock records are how I like to spend my days. I'm also very good with my hands, building different things from spare parts lying around. You'll always find me helping folks fix things in the neighborhood.",
        " Reply with few words",
        "uneducated",
    ],
}
# format: {label: rephrasing instruction prepended to each question}
rephrasing_dict = {
    "promptvariation": "Generate a variation of the following instruction while keeping the semantic meaning. \n\nInput : \n"
}

# All variants are written to the step-2 output folder defined above
create_modified_benchmarks("strategyQA", cleaned_benchmark_path, bioprefix_dict, rephrasing_dict, folder_modified_benchmarks_step2)
rereading_benchmark(cleaned_benchmark_path, os.path.join(folder_modified_benchmarks_step2, "strategyQA_rereading_rereading.json"))


