In [7]:
import json
import os
from dotenv import load_dotenv
from openai import OpenAI
import string
import random

from datasets import load_dataset, Dataset, DatasetDict

load_dotenv()
client = OpenAI()


def add_prompt_variations(prompt_template, variation_count, original_fields):
    prompt = f"""
    Create {variation_count} variations for prompt template:
    ----- Prompt Template -----
    '{prompt_template}'
    ---------------------------
    Make sure to use exaclty the same fields as in the original prompt template.
    Keep the intent of the original prompt template, but vary the wording.
    Also change the order of the fields if possible.
    Output a JSON object with a single key 'prompts' that contains a list of strings.
    """

    response = client.chat.completions.create(
        model="gpt-4o",
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": prompt},
        ],
    )
    prompt_variations = json.loads(response.choices[0].message.content)["prompts"]
    # check if the variations are valid i.e. contain same fields as original prompt template
    for variation in prompt_variations:
        if not all(
            [
                ("{" + field + "}") in variation
                for _, field, _, _ in original_fields
                if field
            ]
        ):
            raise ValueError(
                f"Variation '{variation}' does not contain all fields of the original prompt template '{prompt_template}'."
            )
    return prompt_variations + [prompt_template]

In [8]:
# create variations for prompt template
prompt_template = "Alice has one brother and she also has {num_sisters} sisters. How many sisters does Alice's brother have?"

formatter = string.Formatter()
fields = list(formatter.parse(prompt_template))


variations_1brother = add_prompt_variations(prompt_template, 10, fields)
variations_1brother

['Alice has {num_sisters} sisters and one brother. How many sisters does her brother have?',
 "With {num_sisters} sisters and one brother, how many sisters does Alice's brother have?",
 'Alice has a single brother and {num_sisters} sisters. How many sisters does her brother have?',
 'If Alice has {num_sisters} sisters and a brother, how many sisters does her brother have?',
 "Alice's family includes {num_sisters} sisters and a brother. How many sisters does the brother have?",
 "There are {num_sisters} sisters and one brother in Alice's family. How many sisters does her brother have?",
 "Counting {num_sisters} sisters and a brother, how many sisters does Alice's brother count?",
 '{num_sisters} sisters and one brother belong to Alice’s family. How many sisters does her brother have?',
 'Alice, who has {num_sisters} sisters and one brother, needs to know how many sisters her brother has. What is the answer?',
 "Alice's sibling group consists of {num_sisters} sisters and a brother. How m

In [9]:
# create variations for prompt template
prompt_template = "Alice has {num_brothers} brothers and she also has {num_sisters} sisters. How many sisters does one of Alice's brother have?"

formatter = string.Formatter()
fields = list(formatter.parse(prompt_template))


variations_many_brother = add_prompt_variations(prompt_template, 10, fields)
variations_many_brother

['Alice has {num_sisters} sisters and {num_brothers} brothers. How many sisters does one brother of Alice have?',
 'If Alice has {num_brothers} brothers and {num_sisters} sisters, how many sisters does each of her brothers have?',
 'Consider Alice who has {num_sisters} sisters and {num_brothers} brothers. How many sisters does one of her brothers have?',
 'Alice has a total of {num_brothers} brothers and {num_sisters} sisters. How many sisters does each brother of Alice have?',
 "With {num_brothers} brothers and {num_sisters} sisters in Alice's family, how many sisters does one of her brothers have?",
 'Alice has {num_sisters} sisters along with {num_brothers} brothers. How many sisters does any one of her brothers have?',
 'Given that Alice has {num_brothers} brothers and {num_sisters} sisters, compute the number of sisters for one of her brothers.',
 "If Alice's family includes {num_sisters} sisters and {num_brothers} brothers, how many sisters does a brother of Alice have?",
 'Imagi

In [33]:
# create list of simple problems
# each problem is a dictionary with the following keys: question, answer
# sample 100 problems with random numbers from 1 to 5 for each field and random prompt template
# if num_sisters or num_brothers is 1, replace "brothers" or "sisters" with "brother" or "sister" respectively
# solution is number of sisters plus 1
simple_problems = []
for _ in range(100):
    num_sisters = random.randint(1, 6)
    prompt_template = random.choice(variations_1brother)
    question = prompt_template.format(num_sisters=num_sisters)
    if num_sisters == 1:
        question = question.replace("sisters", "sister")
    answer = num_sisters + 1
    simple_problems.append({"question": question, "answer": answer})

for _ in range(100):
    num_brothers = random.randint(2, 7)
    num_sisters = random.randint(1, 6)
    prompt_template = random.choice(variations_many_brother)
    question = prompt_template.format(num_brothers=num_brothers, num_sisters=num_sisters)
    if num_sisters == 1:
        question = question.replace("sisters", "sister")
    answer = num_sisters + 1
    simple_problems.append({"question": question, "answer": answer})


In [16]:
# create variations for prompt template
prompt_template = """
 "Alice has {num_sisters} sisters.
 Her mother has 1 sister who does not have children -
 she (the sister) has {num_aunt_nephews} nephews and nieces and also {num_mom_brothers} brothers.
 Alice's father has a brother who has {num_uncle_nephews} nephews and nieces in total, and who has also {num_unclue_sons} sons.
 How many cousins does Alice's sister have?
"""


# answer: num_aunt_nephews - num_sisters - 1 + num_uncle_nephews - num_sisters - 1 + num_uncle_sons
# = num_aunt_nephews + num_uncle_nephews + num_uncle_sons - 2*num_sisters - 2


formatter = string.Formatter()
fields = list(formatter.parse(prompt_template))


variations_hard = add_prompt_variations(prompt_template, 10, fields)
variations_hard

["Alice's mother has 1 sister who does not have children. This aunt has {num_aunt_nephews} nephews and nieces, and {num_mom_brothers} brothers. Alice has {num_sisters} sisters. Her father has a brother with {num_uncle_nephews} nephews and nieces, and {num_unclue_sons} sons. How many cousins does Alice’s sister have?",
 "{num_sisters} sisters are part of Alice's family. Alice's mother has a sister with {num_aunt_nephews} nephews and nieces, and also {num_mom_brothers} brothers. Additionally, Alice's father has a brother. This uncle has {num_uncle_nephews} nephews and nieces, and {num_unclue_sons} sons. How many cousins does Alice's sister have?",
 "Alice has {num_sisters} sisters. Her mother's sister has {num_aunt_nephews} nephews and nieces and {num_mom_brothers} brothers, but no children of her own. Alice's father's brother has {num_uncle_nephews} nephews and nieces, and also {num_unclue_sons} sons. Can you calculate how many cousins Alice’s sister has?",
 "With {num_sisters} sisters,

In [55]:
index = 10
variations_hard[index]

"Alice has {num_sisters} sisters. Her mother has 1 sister who does not have children - she (the sister) has {num_aunt_nephews} nephews and nieces and also {num_mom_brothers} brothers. Alice's father has a brother who has {num_uncle_nephews} nephews and nieces in total, and who has also {num_unclue_sons} sons. How many cousins does Alice's sister have?"

In [56]:
variations_hard[index] = "Alice has {num_sisters} sisters. Her mother has 1 sister who does not have children. This sister has {num_aunt_nephews} nephews and nieces and also {num_mom_brothers} brothers. Alice's father has a brother who has {num_uncle_nephews} nephews and nieces in total, and who has also {num_unclue_sons} sons. How many cousins does Alice's sister have?"

In [57]:
variations_hard

["Alice's mother has 1 sister who does not have children. This aunt has {num_aunt_nephews} nephews and nieces, and {num_mom_brothers} brothers. Alice has {num_sisters} sisters. Her father has a brother with {num_uncle_nephews} nephews and nieces, and {num_unclue_sons} sons. How many cousins does Alice’s sister have?",
 "{num_sisters} sisters - not including Alice - are part of Alice's family. Alice's mother has a sister with {num_aunt_nephews} nephews and nieces, and also {num_mom_brothers} brothers. Additionally, Alice's father has a brother. This uncle has {num_uncle_nephews} nephews and nieces, and {num_unclue_sons} sons. How many cousins does Alice's sister have?",
 "Alice has {num_sisters} sisters. Her mother's sister has {num_aunt_nephews} nephews and nieces and {num_mom_brothers} brothers, but no children of her own. Alice's father's brother has {num_uncle_nephews} nephews and nieces, and also {num_unclue_sons} sons. Can you calculate how many cousins Alice’s sister has?",
 "Wit

In [58]:
# create list of hard problems
hard_problems = []
for _ in range(100000):
    num_sisters = random.randint(2, 7)
    
    num_aunt_nephews = random.randint(num_sisters + 1, num_sisters + 6)
    num_uncle_nephews = random.randint(num_sisters + 1, num_sisters + 6)

    num_mom_brothers = random.randint(2, 5)
    num_unclue_sons = random.randint(2, 7)
    prompt_template = random.choice(variations_hard)
    question = prompt_template.format(
        num_sisters=num_sisters,
        num_aunt_nephews=num_aunt_nephews,
        num_mom_brothers=num_mom_brothers,
        num_uncle_nephews=num_uncle_nephews,
        num_unclue_sons=num_unclue_sons,
    )
    answer = num_aunt_nephews + num_uncle_nephews + num_unclue_sons - 2 * num_sisters - 2
    hard_problems.append({"question": question, "answer": answer})

In [59]:
hard_problems[0]

{'question': "Alice has 4 sisters. Her mother's sister has 9 nephews and nieces and 4 brothers, but no children of her own. Alice's father's brother has 9 nephews and nieces, and also 6 sons. Can you calculate how many cousins Alice’s sister has?",
 'answer': 14}

In [60]:
# find all unique answers for hard problems
unique_answers = set()
for problem in hard_problems:
    unique_answers.add(problem["answer"])

# sample set of size 100 so that every answer has same number of problems
sampled_problems = []
for answer in unique_answers:
    problems = [problem for problem in hard_problems if problem["answer"] == answer]
    sampled_problems += random.sample(problems, 200//len(unique_answers))

hard_problems = sampled_problems

In [61]:
# create datasets from simple and hard problems
simple_dataset = Dataset.from_dict({"question": [problem["question"] for problem in simple_problems], "answer": [problem["answer"] for problem in simple_problems]})
hard_dataset = Dataset.from_dict({"question": [problem["question"] for problem in hard_problems], "answer": [problem["answer"] for problem in hard_problems]})

# push to hub
simple_dataset.push_to_hub("jeggers/AIW", "easy")
hard_dataset.push_to_hub("jeggers/AIW", "hard")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1001.03ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<?, ?ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/jeggers/AIW/commit/b40e903377662be3d7351a8839b61ce05d712731', commit_message='Upload dataset', commit_description='', oid='b40e903377662be3d7351a8839b61ce05d712731', pr_url=None, pr_revision=None, pr_num=None)

In [62]:
cache = _PROMPT_VARIATIONS_CACHE_FILE = "prompt_variations_cache.json"
# read cache
with open(cache, "r") as f:
    cache_obj = json.load(f)


new_cache_obj = {}
# update cache
# iterate entries in cache
for key, value in cache_obj.items():
    # remove name from value 
    name = value.pop("name")
    value.update({"prompt_template": key})
    new_cache_obj[name] = value

# write cache
with open("_" + cache, "w") as f:
    json.dump(new_cache_obj, f, indent=2)