# Generate Query Generation Prompt for Math

In [None]:
# User-defined assistant goal for automatic query generation.
ASSISTANT_GOAL = "Get really good at math."

# Natural-language contract the generated task must satisfy; interpolated into the user prompt below.
TASK_GENERATION_SUCCESS_CRITERIA = "The task and final answer must be a dictionary with 'prompt', 'target', and 'solution' keys. The 'prompt' should be a math problem, the 'target' should be the expected answer to the math problem, and the 'solution' must be a solution to the math problem that clearly and unambiguously leads to the expected answer 'target'."

# One-shot example of the expected output format.
# The payload between the <answer> tags uses double quotes consistently so that it is
# well-formed (parseable both as JSON and as a Python dict literal). The previous version
# opened the prompt value with a double quote and closed it with a single quote, which
# showed the model a malformed example.
EXAMPLE_FINAL_ANSWER = '''<answer>
{
    "prompt": "What is 1 + 1?",
    "target": 2,
    "solution": "To solve 1 + 1, we add the two numbers together. Starting with the number 1, adding another 1 gives us 2. Therefore, the answer is 2."
}
</answer>'''


# Custom system prompt. It is currently NOT sent (see the commented-out entry in `messages`),
# so the chat template falls back to the model's own default system prompt.
query_system_prompt = '''You are a helpful assistant. You first thinks about the reasoning process in the mind and then provides the user with the answer.'''

# User turn: task-generation instructions followed by the goal, criteria, and example above.
query_user_prompt = f'''Given the assistant goal, task generation success criteria, and example final answer, generate a task that fulfills the task generation success criteria. The generated task should be:

1) maximizing user benefit and helpfulness and minimizing user risk, harm, and unwanted outcomes

2) novel - avoid designing a task that is drawn from the same linguistic templates or differ only in superficial details

3) difficult yet tractable - the task should be challenging for the assistant to complete, without being completely intractable

4) clear - the task should be specific enough to not require further user input to understand what is being asked of the assistant and should provide all the context and detail necessary

Show your work in <think> </think> tags. And return the task and final answer in <answer> </answer> tags.


Assistant Goal: {ASSISTANT_GOAL}

Task Generation Success Criteria: {TASK_GENERATION_SUCCESS_CRITERIA}

Example Final Answer: {EXAMPLE_FINAL_ANSWER}'''


# Conversation handed to the chat template. The trailing assistant message is a prefill:
# it ends with an opening <think> tag so the model's completion starts inside its reasoning.
messages = [
    # {"role": "system", "content": query_system_prompt},
    {"role": "user", "content": query_user_prompt},
    {"role": "assistant", "content": "<think>\n"}
]

In [6]:
# Load the tokenizer (no model weights are loaded in this cell) and render the
# chat messages into a single prompt string.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize=False               -> return the formatted string instead of token ids
# continue_final_message=True  -> keep the last (assistant) message open so that
#                                 generation continues from its content
chat_template_kwargs = dict(tokenize=False, continue_final_message=True)
text = tokenizer.apply_chat_template(messages, **chat_template_kwargs)
print(text)

<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Given the assistant goal, task generation success criteria, and example final answer, generate a task that fulfills the task generation success criteria. The generated task should be:

1) maximizing user benefit and helpfulness and minimizing user risk, harm, and unwanted outcomes

2) novel - avoid designing a task that is drawn from the same linguistic templates or differ only in superficial details

3) difficult yet tractable - the task should be challenging for the assistant to complete, without being completely intractable

4) clear - the task should be specific enough to not require further user input to understand what is being asked of the assistant and should provide all the context and detail necessary

Show your work in <think> </think> tags. And return the task and final answer in <answer> </answer> tags.


Assistant Goal: Get really good at math.

Task Generati

In [7]:
# Authenticate with the Hugging Face Hub.
import os

from huggingface_hub import login

# Never hardcode access tokens in a notebook: they leak through version control and
# shared copies. The token is read from the HF_TOKEN environment variable instead;
# when it is unset, token=None makes `login` prompt for the token interactively.
# NOTE: any token that was ever pasted into this notebook should be revoked at
# https://huggingface.co/settings/tokens and replaced with a fresh one.
login(token=os.environ.get("HF_TOKEN"), add_to_git_credential=True)

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


In [8]:
# Publish the rendered prompt to the Hugging Face Hub as a dataset.
from datasets import Dataset, DatasetDict
from huggingface_hub import HfApi

# Destination repository on the Hub (change this to push elsewhere)
dataset_id = "khuang2/math-query-gen-prompts-w-solution"

# Number of rows to emit; every row carries the same prompt string
num_rows = 50000

# Confirm the current Hub identity before building and uploading anything
api = HfApi()
whoami = api.whoami()
print(f"Logged in as: {whoami['name']}")

# `text` comes from the chat-template cell; repeat it once per row in a single "prompt" column
data = {"prompt": [text for _ in range(num_rows)]}
dataset = Dataset.from_dict(data)

# Expose the rows under a single 'train' split
dataset_dict = DatasetDict({"train": dataset})

# Upload; the returned commit info is displayed as the cell output
dataset_dict.push_to_hub(dataset_id)

Logged in as: khuang2


Creating parquet from Arrow format: 100%|██████████| 50/50 [00:00<00:00, 2350.94ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/khuang2/math-query-gen-prompts-w-solution/commit/b488bc1bb66446e5daeb15bfd8d6a7ef98bdbe1b', commit_message='Upload dataset', commit_description='', oid='b488bc1bb66446e5daeb15bfd8d6a7ef98bdbe1b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/khuang2/math-query-gen-prompts-w-solution', endpoint='https://huggingface.co', repo_type='dataset', repo_id='khuang2/math-query-gen-prompts-w-solution'), pr_revision=None, pr_num=None)