This code segment creates new usable benchmarks from the raw initial benchmarks.
After running it, the 5 benchmarks share the same structure, only the questions that contain a question mark have been kept, and common punctuation typos have been fixed.

## Imports and installations

In [15]:
# --- Imports ---

import os
import json
import random
import pandas as pd
import time
import regex as re
import shutil
import itertools




# Restructuring benchmarks

## Loading and saving data files from and into JSON or Excel files

Functions:
- load_data(path) : Loads data from an Excel or JSON file and returns it as a dictionary.
    - Args:
        path (str): the file path
    - Returns:
        dict: data loaded as a dictionary
<br>
<br>


- save_data(data, path): Saves data (dictionary or DataFrame) to a JSON or Excel file based on the file extension in the path
    - Args:
        data (dict or DataFrame): data to be saved
        path (str): file path where data will be saved
    - Returns:
        bool: True if data is successfully saved, False otherwise.

In [3]:

def load_data(path):
    """
    Read an Excel (.xlsx) or JSON (.json) file and return its content.

    The format is chosen from the file extension. Any error raised while
    reading is caught and reported with a printed message; the function then
    returns None instead of raising.

    Args:
        path (str): the file path

    Returns:
        The parsed content: a list of row records (one dict per row) for an
        Excel file, or whatever json.load produces for a JSON file.
        None when the extension is unsupported or the file could not be read.
    """
    if path.endswith('.xlsx'):
        try:
            return pd.read_excel(path).to_dict(orient='records')
        except Exception as e:
            print(f"Error loading data from Excel file: {str(e)}")
            return None

    if path.endswith('.json'):
        try:
            with open(path, 'r') as json_file:
                return json.load(json_file)
        except Exception as e:
            print(f"Error loading data from JSON file: {str(e)}")
            return None

    # Neither of the two supported extensions.
    print("Unsupported file format. Please provide an Excel (.xlsx) or JSON (.json) file.")
    return None


def save_data(data, path):
    """
    Saves data to a JSON or Excel file based on the file extension in the path.

    For a JSON path, a dict or a list is dumped as is, and a DataFrame is
    first converted to a {column: [values]} dictionary. For an Excel path, a
    dict or a list of records is first converted to a DataFrame.

    Args:
        data (dict, list or DataFrame): data to be saved
        path (str): file path where data will be saved

    Returns:
        bool: True if data is successfully saved, False otherwise (unsupported
        extension, unsupported data type, or any error raised while writing).
    """
    try:
        if path.endswith('.json'):
            if isinstance(data, pd.DataFrame):
                payload = data.to_dict(orient='list')
            elif isinstance(data, (dict, list)):
                payload = data
            else:
                # Previously nothing was written here and True was still
                # returned; report the failure before touching the file.
                print(f"Unsupported data type for JSON export: {type(data).__name__}")
                return False

            with open(path, 'w') as json_file:
                json.dump(payload, json_file, indent=2)

        elif path.endswith('.xlsx'):
            if isinstance(data, dict):
                data = pd.DataFrame.from_dict(data)
            elif isinstance(data, list):
                # List of row records, i.e. the shape load_data returns for Excel files.
                data = pd.DataFrame(data)
            data.to_excel(path, index=False)
        else:
            print("Unsupported file format. Please provide a JSON (.json) or Excel (.xlsx) file path.")
            return False

        return True
    except Exception as e:
        print(f"Error saving data: {str(e)}")
        return False

## From the original benchmark to a common json structure

structure of "id": {
    "question": "...?",
    "choices": null, OR "choices": [
                        "choice 1",
                        "choice 2",
                        ...
                        "choice X"
                        ],
    "right_answer": "answer"}


Functions:
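- reformat_benchmark(input_path, output_path, structure_mapping): generic reformatting, driven by a structure_mapping that gives one function per output field
- reformat_benchmark_numglue (input_path, output_reformated_path): for the NumGLUE benchmark (only the "Type_1" questions are kept)
- reformat_CRT(excel_file_path, reformated_path): for the CRT benchmark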

In [None]:
def reformat_benchmark(input_path, output_path, structure_mapping):
    """ Reformats the benchmark from the original structure (input_path) to a uniformized json structure (output_path)
    The uniformized structure (defined by the structure_mapping) is the following :
    "id": {
        "question": "...?",
        "choices": null, OR "choices": [
                        "choice 1",
                        "choice 2",
                        ...
                        "choice X",
                        ],
        "right_answer": "answer"
    }

    This is the structure that will be used for the rest of the pipeline, so each benchmark should be reformatted to this structure for the following steps.

    The original benchmark can be either a list of instances (the ids of the
    output are then the positions 0, 1, 2, ...) or a dictionary of instances
    (the original keys are then kept as ids). If the file cannot be loaded or
    has another shape, an empty JSON object is written.

    Args:
        input_path (str): path of the original benchmark
        output_path (str): path where the uniformized json should be written in
        structure_mapping (dict): maps each field of the uniformized structure to a
            function that extracts its value from an original instance; fields
            mapped to None are skipped
    Returns:
        None

    """
    data = load_data(input_path)

    def build_instance(original_instance):
        # Apply every extraction function of the mapping to one original instance.
        return {
            field: mapping(original_instance)
            for field, mapping in structure_mapping.items()
            if mapping is not None
        }

    row_ids = None  # None lets pandas number the rows 0..n-1 (list input)
    if isinstance(data, list):
        rows = [build_instance(item) for item in data]
    elif isinstance(data, dict):
        # The rows used to be assigned by key into a list, which raised a
        # TypeError; keep the keys aside and use them as the row index instead.
        row_ids = list(data.keys())
        rows = [build_instance(value) for value in data.values()]
    else:
        rows = []

    df = pd.DataFrame(rows, index=row_ids)
    df.to_json(output_path, orient="index", indent=2)


# Examples of structure_mapping for the reformat_benchmark function:

# Each mapping associates a field of the common structure with a function that
# extracts the value of that field from one instance of the original benchmark.

# --- ScienceQA: the fields are read directly from the instance ---
def _scienceqa_question(value):
    return value["question"]


def _scienceqa_choices(value):
    return value["choices"]


def _scienceqa_right_answer(value):
    return str(value["answer"])


scienceQA_structure = {
    "question": _scienceqa_question,
    "choices": _scienceqa_choices,
    "right_answer": _scienceqa_right_answer,
}


# --- CommonsenseQA: the question text and the labelled choices are nested under "question" ---
def _commonsenseqa_question(value):
    return value["question"]["stem"]


def _commonsenseqa_choices(value):
    return [choice["text"] for choice in value["question"]["choices"]]


def _commonsenseqa_right_answer(value):
    # Text of the first choice whose label equals the answer key.
    return next(
        choice["text"]
        for choice in value["question"]["choices"]
        if choice["label"] == value["answerKey"]
    )


commonsenseQA_structure = {
    "question": _commonsenseqa_question,
    "choices": _commonsenseqa_choices,
    "right_answer": _commonsenseqa_right_answer,
}


# --- StrategyQA: the facts are appended to the question, there are no choices ---
def _strategyqa_question(value):
    # Facts are joined with commas; the final period of each fact is removed
    # and a single period closes the list.
    joined_facts = ', '.join(fact.rstrip('.') for fact in value['facts'])
    return f"Question: {value['question']}\nFacts: {joined_facts}."


def _strategyqa_choices(value):
    return None


def _strategyqa_right_answer(value):
    return str(value["answer"])


strategyQA_structure = {
    "question": _strategyqa_question,
    "choices": _strategyqa_choices,
    "right_answer": _strategyqa_right_answer,
}


In [5]:
# reformatting functions specific to some benchmarks
def reformat_benchmark_numglue (input_path, output_reformated_path):
    """Reformats a benchmark stored as one JSON object per line to the common structure.

    Only the instances whose "type" is "Type_1" are kept. They are numbered
    from 1 in reading order, have no choices, and their answer is converted
    to a string.

    Args:
        input_path (str): path of the original file (one JSON object per line)
        output_reformated_path (str): path where the reformatted benchmark is saved
    """
    reformatted = {}

    with open(input_path, 'r') as jsonl_file:
        for raw_line in jsonl_file:
            record = json.loads(raw_line)

            if record["type"] != "Type_1":
                continue

            # The next id is the number of questions kept so far, plus one.
            reformatted[len(reformatted) + 1] = {
                "question": record["question"],
                "choices": None,
                "right_answer": str(record["answer"])
            }

    save_data(reformatted, output_reformated_path)



def reformat_CRT(excel_file_path, reformated_path):
    """Reformats an Excel benchmark (CRT) to the common structure.

    Each row becomes one question without choices: the "tasks" column gives
    the question and the "correct_number" column gives the right answer.
    The ids are the row labels of the sheet plus one, as strings.

    Args:
        excel_file_path (str): path of the original Excel file
        reformated_path (str): path where the reformatted benchmark is saved
    """
    sheet = pd.read_excel(excel_file_path)

    reformatted = {
        str(row_label + 1): {
            "question": row["tasks"],
            "choices": None,
            "right_answer": row["correct_number"]
        }
        for row_label, row in sheet.iterrows()
    }

    save_data(reformatted, reformated_path)


## File processing functions

Merge, update question ids, shuffle choices, filter duplicate questions, select X random questions

Functions :
- merge_json_files(folder_path, output_path): Reads all of the files in folder_path, merges them into a json file in output_path
    - Args:
        - folder_path (str): path of the folder in which all of the files will be merged
        - output_path (str): path where the output should be written
        
<br>

- update_quest_ids(file_path, updated_file_path): Opens the json file at file_path and renumbers all the question IDs so that they start from 1 and increment by 1 for each question
    - Args:
        - file_path (str): path of the json file to be updated
        - updated_file_path (str): path where the json file with updated IDs should be written
<br>

- shuffle_choices(file_path, updated_file_path): Opens the json file at file_path and shuffles the choices of each question so that their order is random
    - Args:
        - file_path (str): path of the json file to be updated
        - updated_file_path (str): path where the json file with shuffled choices should be written


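- filter_duplicate_questions(input_file, output_file): Keeps only the first occurrence of each question text found in input_file and writes the result to output_file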

- random_selection (file_path, number_selected, new_file_path): Opens the json file at file_path, randomly selects number_selected questions from this file, and writes them in the same json format in new_file_path
    - Args:
        - file_path (str): path of the json file to select from
        - number_selected (int): number of questions to keep. Should be smaller than the total number of questions included in the file, otherwise all questions will be selected
        - new_file_path (str): path where the json file with the selected questions should be written

- fix_regex_questions (input_path, output_reformated_path): corrects some easy punctuation issues

- add_question_id(json_file_path, benchmark=None): adds to every question a "benchmark" field and a "question_id" field made of the benchmark name, an underscore and the question id


In [7]:
def merge_json_files(folder_path, output_path):
    """Reads all of the json files in folder_path, merges them into a json file in output_path
    Position in pipeline: Before step 1 in Benchmark processing, only for benchmarks that are divided in different json files

    Every key is prefixed with the name of the file it comes from
    ("<filename>_<key>") so that identical keys from different files do not
    overwrite each other. Files are processed in alphabetical order so that
    the merged file is the same on every run and platform. A file that cannot
    be loaded, or whose content is not a JSON object, is reported and skipped.

    Args:
        folder_path (str): path of the folder in which all of the files will be merged
        output_path (str): path where the output should be written
    """

    merged_data = {}

    # os.listdir gives no ordering guarantee, hence the explicit sort.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue

        data = load_data(os.path.join(folder_path, filename))

        if not isinstance(data, dict):
            # load_data returns None on failure; a JSON list cannot be merged by key.
            print(f"Error processing {filename}: expected a JSON object, got {type(data).__name__}")
            continue

        merged_data.update({f"{filename}_{key}": value for key, value in data.items()})

    save_data(merged_data, output_path)


In [8]:

# Selection of random questions from the datasets

def random_selection (file_path, number_selected, new_file_path):
    """Randomly keeps number_selected questions of the json file at file_path and saves them, in the same json format, in new_file_path

    The question ids are shuffled and the first number_selected of them are
    kept, so the saved questions appear in that shuffled order.

    Args:
        file_path (str): path of the json file to select from
        number_selected (int): number of questions to keep. Should be smaller than the total number of questions included in the file, otherwise all questions will be selected
        new_file_path (str): path where the json file with the selected questions should be written in
    """

    data = load_data(file_path)

    shuffled_ids = list(data.keys())
    random.shuffle(shuffled_ids)
    available = len(shuffled_ids)

    if number_selected > available:
        # Not enough questions: keep them all (still in shuffled order).
        print("Number in argument bigger than the number of questions,", available, " questions selected")
        kept_ids = shuffled_ids
    else:
        kept_ids = shuffled_ids[:number_selected]

    save_data({kept_id: data[kept_id] for kept_id in kept_ids}, new_file_path)

def filter_duplicate_questions(input_file, output_file):
    """Removes the instances whose question text has already been met in the file

    The first instance of each question text is kept, with its original id
    and in the original order. Every dropped question is printed. An instance
    without a "question" field is treated as having an empty question.

    Args:
        input_file (str): path of the json file to filter
        output_file (str): path where the filtered json file is written
    """
    with open(input_file, 'r') as json_file:
        instances = json.load(json_file)

    already_seen = set()
    unique_instances = {}

    for instance_id, instance in instances.items():
        question = instance.get("question", "")
        if question in already_seen:
            #section to comment out
            print ("Question not taken : ", question)
            continue
        already_seen.add(question)
        unique_instances[instance_id] = instance

    with open(output_file, 'w') as output_json:
        json.dump(unique_instances, output_json, indent=2)
        
def shuffle_choices(file_path, updated_file_path):
    """Randomly reorders the choices of every question of the json file at file_path

    Questions without a "choices" field, or whose choices are null, are
    copied unchanged.

    Args:
        file_path (str): path of the json file to be updated
        updated_file_path (str): path where the json file with shuffled choices should be written in
    """

    benchmark = load_data(file_path)
    shuffled_benchmark = {}

    for instance_id, instance in benchmark.items():
        has_choices = "choices" in instance and instance["choices"] is not None
        if has_choices:
            # In-place shuffle of the loaded list; the copy below shares it.
            random.shuffle(instance["choices"])
        shuffled_benchmark[instance_id] = instance.copy()

    save_data(shuffled_benchmark, updated_file_path)
    
def update_quest_ids(file_path, updated_file_path):
    """Renumbers the questions of the json file at file_path: the ids become "1", "2", "3", ... in the order of the file

    Args:
        file_path (str): path of the json file to be updated
        updated_file_path (str): path where the json file with updated IDs should be written in
    """

    data = load_data(file_path)

    # The former ids are dropped; only the position in the file matters.
    renumbered = {
        str(position): instance_data.copy()
        for position, instance_data in enumerate(data.values(), start=1)
    }

    save_data(renumbered, updated_file_path)



In [97]:

# Spacing and punctuation fixes, as (text to find, replacement) pairs.
# careful : the order is significant, the pairs are applied from first to last
_PUNCTUATION_FIXES = (
    ('  ', ' '),
    ('\n\n.', '\n\n'),
    (': , ', ': '),
    (' ,', ','),
    (' .', '.'),
    ('..', '.'),
    ('?.', '?'),
    (':,', ': '),
)


def _clean_question_punctuation(text):
    """Applies the punctuation fixes to text until a full pass changes nothing."""
    previous = None
    # Repeat because a replacement can create a new occurrence of an earlier
    # pattern (e.g. three spaces become two after a single pass).
    while text != previous:
        previous = text
        for pattern, replacement in _PUNCTUATION_FIXES:
            text = text.replace(pattern, replacement)
    return text


def fix_regex_questions (input_path, output_reformated_path):
    """Cleans the punctuation of the questions and only keeps the questions containing a question mark

    The spacing and punctuation fixes are applied repeatedly until the
    question no longer changes. The questions that are kept are renumbered
    from 1 and written with the fields "question", "original_question",
    "choices", "right_answer" (the last three copied from the input, null
    when absent) and "benchmark" (empty string).

    Args:
        input_path (str): path of the json file to clean
        output_reformated_path (str): path where the cleaned json file is written
    """

    with open(input_path, 'r') as json_file:
        data = json.load(json_file)

    new_data = {}
    questions_count = 1

    for instance in data.values():
        question = _clean_question_punctuation(instance.get("question", ""))

        # only select questions with interrogation points
        if "?" not in question:
            continue

        new_data[questions_count] = {
            "question": question,
            "original_question": instance.get("original_question"),
            "choices": instance.get("choices"),
            "right_answer": instance.get("right_answer"),
            "benchmark": ""
        }
        questions_count += 1

    with open(output_reformated_path, 'w') as new_json_file:
        json.dump(new_data, new_json_file, indent=2)


In [None]:
# adding the question id as part of the attributes
def add_question_id(json_file_path, benchmark = None):
    """Adds a "benchmark" and a "question_id" field to every question of the json file (updated in place)

    The benchmark name of a question is, in order of priority:
    1. the benchmark argument, when it is given;
    2. the non-empty "benchmark" field already present in the question;
    3. the part of the file name located before the first underscore.
    The question_id is the benchmark name, an underscore and the id of the question.

    Args:
        json_file_path (str): path of the json file to update
        benchmark (str, optional): name of the benchmark, applied to all the questions
    """
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Fallback name, e.g. "strategyqa" for ".../strategyqa_cleaned.json"
    name_from_file = os.path.basename(json_file_path).split("_")[0]

    for question_id, question_data in data.items():
        if benchmark is not None:
            # An explicit name used to be overwritten by the file-derived one.
            benchmark_name = benchmark
        else:
            # An empty field falls back to the file name.
            benchmark_name = question_data.get("benchmark") or name_from_file

        question_data["benchmark"] = benchmark_name
        question_data["question_id"] = f"{benchmark_name}_{question_id}"

    with open(json_file_path, 'w') as file:
        json.dump(data, file, indent=2)


## From the common structure to a smaller useable benchmark

The new benchmark has number_selected questions, choices are shuffled and included in the question, the questions are cleaned (no double spaces, etc...)

Functions:
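- cleaned_final_benchmark(file_path, new_file_path, number_selected = 150, name_benchmark = "no_name", lettered_choices = False, temp_path = ...): runs the whole cleaning pipeline (random selection, duplicate filtering, choice shuffling, choices added to the question, punctuation fixes, renumbering, question ids)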
- add_choices_in_question(benchmark_path, new_benchmark_path):
    """Adds "the choices are ..." to the benchmark questions

    Args:
        benchmark_path (str): path of the benchmark to be updated
        new_benchmark_path (str): path of the updated benchmark with choices in the question
    """


In [94]:
def check_answers_regex(answer_a, answer_b):
    """Tells whether answer_a and answer_b are the same answer once capitalization and symbols are ignored

    Before the comparison, both answers are converted to strings (None
    becomes the empty string), the expressions "answer" and "the solution is"
    are removed whatever their case, as well as every character that is not
    an unaccented letter or a digit, and the result is lowercased.

    Args:
        answer_a (str): an answer
        answer_b (str): another answer to be compared to answer_a

    Returns:
        bool: True if both answers are the same, False otherwise
    """

    def normalize(answer):
        # Ensure the answer is a string, then strip common variations
        text = "" if answer is None else str(answer)
        stripped = re.sub(r'(?i)answer|the solution is|[^a-zA-Z0-9]', '', text)
        return stripped.lower()

    return normalize(answer_a) == normalize(answer_b)


def add_choices_in_question(benchmark_path, new_benchmark_path, lettered_choices = True):
    """Completes every question of the benchmark with the instruction describing the expected answer

    - True/False question (the right answer is "True" or "False" according to
      check_answers_regex): the instruction to answer by 'True' or 'False' is appended.
    - Multiple choice question, when lettered_choices is True: the choices
      are appended, together with the instruction to answer with a single letter.
    - Any other case (open question, or lettered_choices set to False): the
      question is only stripped of its leading and trailing whitespace.

    The stripped initial question is stored in "original_question".

    Args:
        benchmark_path (str): path of the benchmark to be updated
        new_benchmark_path (str): path of the updated benchmark with choices in the question
        lettered_choices (bool): whether the choices are appended to multiple choice questions
    """

    source = load_data(benchmark_path)
    completed = {}

    for key, value in source.items():
        original_text = str(value.get("question", "")).strip()
        prompt = original_text

        right_answer = value["right_answer"]
        is_boolean = check_answers_regex(right_answer, "True") or check_answers_regex(right_answer, "False")

        if is_boolean:
            prompt += ". \nAnswer by 'True' or 'False'. "
        elif value["choices"] is not None and lettered_choices:
            # generate_choice_dict is defined outside of this section;
            # presumably it associates a letter with each choice -- to confirm
            lettered = generate_choice_dict(value["choices"])
            prompt += "\n\nThe choices are the following: " + json.dumps(lettered) + "\nOnly select the letter associated with the choice. \n Your response should be a single letter, with no other text whatsoever."

        completed[key] = {
            "original_question": original_text,
            "question": prompt,
            "choices": value["choices"],
            "right_answer": value["right_answer"]
        }

    save_data(completed, new_benchmark_path)


In [99]:
#Creation of a smaller benchmark from the common-structure benchmark

def cleaned_final_benchmark(file_path, new_file_path, number_selected = 150, name_benchmark="no_name", lettered_choices = False, temp_path = "./temp_file_replicatingPET.json"):
    """Creates a smaller, cleaned benchmark from a benchmark in the common structure

    Input: the benchmark already under the common format, such as:
        "quest_nb_X": {
            "question": "This is a very interesting question, don't you think?",
            "choices": ["True", "yes", "of course"],
            "right_answer": "of course"
        },

    Output: a json file with incrementing integers as question ids, an
    "original_question" element which will not change (to keep a trace of the
    original question at all stages), and a "question_id" element. Example:
        "X": {
            "question": "This is a very interesting question, don't you think?",
            "original_question": "This is a very interesting question, don't you think?",
            "choices": ["True", "yes", "of course"],
            "right_answer": "of course",
            "question_id": "nameofbenchmark_1"
        },

    The output has at most number_selected questions, no duplicates, and
    filtered questions (no punctuation issues like double spaces, all
    questions have an interrogation point). The text added to the questions
    is the one produced by add_choices_in_question.

    Args:
        file_path (str): path of the benchmark in the common structure
        new_file_path (str): path where the final benchmark is written
        number_selected (int): number of questions wanted in the final benchmark
        name_benchmark (str): benchmark name passed to add_question_id
        lettered_choices (bool): passed to add_choices_in_question
        temp_path (str): path of the working file used between the steps; it is not deleted afterwards
    """

    # 10% more questions than requested are drawn first -- presumably to make
    # up for the questions removed by the filters below
    oversampled_count = round(number_selected*1.10)

    random_selection(file_path, oversampled_count, temp_path)
    filter_duplicate_questions(temp_path, temp_path)
    shuffle_choices(temp_path, temp_path)
    add_choices_in_question(temp_path, temp_path, lettered_choices)
    fix_regex_questions (temp_path, temp_path)

    # Back to the requested size, then final numbering and ids
    random_selection(temp_path, number_selected, temp_path)
    update_quest_ids(temp_path, new_file_path)
    add_question_id(new_file_path, name_benchmark)
    


# Complete a benchmark :

If we use this function on a benchmark that has fewer questions than the number in argument, it completes it to reach the correct number of questions.
If we use it on a benchmark with a higher number of questions, it only selects the requested number of questions.

In [None]:
def complete_benchmark(full_structured_benchmark, small_benchmark_path, benchmark_name, newJSON_path, added_quest_json_path, number_of_questions=150, temp_path = "./simplified_folder/temporary_json.json"):
    """Brings a small benchmark to number_of_questions questions, using the full benchmark as a reserve

    The full benchmark is first cleaned in temp_path (duplicates removed,
    choices shuffled, ids renumbered, choices added to the questions,
    punctuation fixed). If the small benchmark has more questions than
    requested, only the first ones are kept. Otherwise, questions drawn at
    random from the cleaned full benchmark are appended, skipping those whose
    question text is already selected, until the requested number is reached
    or the reserve is empty.

    Args:
        full_structured_benchmark (str): path of the full benchmark, in the common structure
        small_benchmark_path (str): path of the benchmark to complete
        benchmark_name (str): benchmark name passed to add_question_id
        newJSON_path (str): path where the completed benchmark is written
        added_quest_json_path (str): path where the questions located after the
            initial questions of the small benchmark are written (renumbered from 1)
        number_of_questions (int): number of questions wanted in the completed benchmark
        temp_path (str): path of the working file used to clean the full benchmark
    """

    # fix questions from the restructured benchmark
    filter_duplicate_questions(full_structured_benchmark, temp_path)
    for cleaning_step in (shuffle_choices, update_quest_ids, add_choices_in_question, fix_regex_questions):
        cleaning_step(temp_path, temp_path)

    with open(temp_path, 'r') as reserve_file, open(small_benchmark_path, 'r') as small_file:
        reserve_data = json.load(reserve_file)
        small_data = json.load(small_file)

    # Questions of the benchmark to complete
    existing_questions = list(small_data.values())
    initial_length = len(existing_questions)

    if initial_length > number_of_questions:
        # Too many questions: only the first ones are kept
        print("The benchmark already has more questions than required.")
        selected_questions = existing_questions[:number_of_questions]
        print("Benchmark shortened to ", number_of_questions, " questions.")
    else:
        selected_questions = existing_questions

    # Reserve questions, in random order
    candidates = list(reserve_data.values())
    random.shuffle(candidates)

    while len(selected_questions) < number_of_questions and candidates:
        candidate = candidates.pop()
        # A candidate is skipped when its question text is already selected
        if any(candidate['question'] == kept['question'] for kept in selected_questions):
            continue
        selected_questions.append(candidate)

    # Completed benchmark, numbered from 1
    numbered_questions = {}
    for position, question in enumerate(selected_questions, start=1):
        numbered_questions[str(position)] = question

    with open(newJSON_path, 'w') as new_file:
        json.dump(numbered_questions, new_file, indent=2)

    update_quest_ids(newJSON_path, newJSON_path)
    add_question_id(newJSON_path, benchmark_name)

    # Separate file with the questions located after the initial ones
    try:
        with open(newJSON_path, 'r') as json_file:
            completed = json.load(json_file)

        if isinstance(completed, dict):
            completed = list(completed.values())  # only the questions, in file order

        added_questions = {}
        for position, item in enumerate(completed[initial_length:], start=1):
            added_questions[str(position)] = item

        with open(added_quest_json_path, 'w') as output_file:
            json.dump(added_questions, output_file, indent=2)
    except Exception as e:
        print("Error creating the added_questions file:", str(e))



# STEP 1 Main

In [None]:
# Example with StrategyQA:
# the original benchmark is reformatted to the common structure, then a cleaned
# benchmark of 150 questions is created from it.

folder_path = "YOUR_FOLDER_PATH"

# os.path.join adds the separator when needed, so folder_path can be given
# with or without a trailing slash.
unedited_benchmark_path = os.path.join(folder_path, "strategyqa.json")
reformated_benchmark_path = os.path.join(folder_path, "strategyqa_reformated.json")
cleaned_benchmark_path = os.path.join(folder_path, "strategyqa_cleaned.json")
temporary_path = os.path.join(folder_path, "temporary_strategyqa.json")

# The facts are appended to the question (final periods removed, separated by
# commas); there are no choices and the answer is converted to a string.
strategyQA_structure = {
    "question": lambda value: f"Question: {value['question']}\nFacts: {', '.join(fact.rstrip('.') for fact in value['facts'])}.",
    "choices": lambda value: None,
    "right_answer": lambda value: str(value["answer"])
}

reformat_benchmark(unedited_benchmark_path, reformated_benchmark_path, strategyQA_structure)
cleaned_final_benchmark(reformated_benchmark_path, cleaned_benchmark_path, 150, "strategyqa", lettered_choices = False, temp_path = temporary_path)