In [None]:
from datasets import load_dataset
import datasets
ds = load_dataset("openai/gsm8k", "main")
train: datasets.Dataset = ds["train"]
prompt_templ = """A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: {0}
Assistant: <think>"""
print(prompt_templ)

In [None]:
from cs336_alignment.math_baseline import extract_answer
prompts = []
ground_truths = []
for t, data in enumerate(train):
    question = data["question"]
    answer_text = data["answer"]
    answer = extract_answer(answer_text)
    assert answer is not None, f"Could not extract answer from: {answer_text}"
    full_prompt = prompt_templ.format(question)
    prompts.append(full_prompt)
    ground_truths.append(answer)

In [None]:
import random
index = random.randint(0, len(prompts) - 1)
print("Question:")
print(train[index]["question"])
print("-" * 20)
print("Ground truth answer:")
print(ground_truths[index])
print("-" * 20)
print("Answer Text:")
print(train[index]["answer"])

In [1]:
import pickle
from cs336_alignment.math_baseline import EvalEntry
with open("../data/math_baseline_eval_results_remote.pkl", "rb") as f:
    results: list[EvalEntry] = pickle.load(f)

INFO 11-05 14:18:30 __init__.py:190] Automatically detected platform cuda.


In [2]:
import random
from cs336_alignment.math_baseline import EvalEntry
total_samples = len(results)
index = random.randint(0, total_samples - 1)
eval_entry: EvalEntry = results[index]
if eval_entry.reward <= 0 and eval_entry.format_reward <= 0:
    print("Prompt:")
    print(eval_entry.prompt)
    print("-" * 20)
    print("Response:")
    print(eval_entry.response)
    print("-" * 20)
    print("Ground Truth:")
    print(eval_entry.ground_truth)
    print("-" * 20)
    print("Reward:")
    print(eval_entry.reward)

"""Format Reward looks correct, only one issue is that therre are some cases 
where <think/> something <answer> answer </answer> is not satisfied. 
We can improve the format reward by checking for this condition."""

Prompt:
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: Calculate $\sqrt{10p} \cdot \sqrt{5p^2} \cdot \sqrt{6p^4}$ . Express your answer in simplest radical form in terms of $p$.

Note: When entering a square root with more than one character, you must use parentheses or brackets.  For example, you should enter $\sqrt{14}$ as "sqrt(14)" or "sqrt{14}".
Assistant: <think>
--------------------
Response:
 set $x = \sqrt{10p} * \sqrt{5p^2} * \sqrt{6p^4}$ </think> <think> here $10p = (10p)(1)(1)$ </think> <think> here $= \sqrt{(10p)(1))((5p^2)(1)(1))((6p^4)(1)(1))$ </think> <think> now you saw use <code>$x2 = \sqrt{x}\\ x2` = 

'Format Reward looks correct, only one issue is that therre are some cases \nwhere <think/> something <answer> answer </answer> is not satisfied. \nWe can improve the format reward by checking for this condition.'

In [2]:
correct_count = 0
format_correct_count = 0
wrong_count = 0
total_count = len(results)
for t, entry in enumerate(results):
    if entry.reward > 0:
        correct_count += 1
    elif entry.format_reward > 0:
        format_correct_count += 1
    else:
        wrong_count += 1
print(f"Total samples: {len(results)}")
print(f"Correct samples: {correct_count}, percent: {correct_count / total_count:.4f}")
print(f"Format correct samples: {format_correct_count}, percent: {format_correct_count / total_count:.4f}")
print(f"Wrong samples: {wrong_count}, percent: {wrong_count / total_count:.4f}")

Total samples: 512
Correct samples: 376, percent: 0.7344
Format correct samples: 134, percent: 0.2617
Wrong samples: 2, percent: 0.0039


In [7]:
import random
from math_verify import parse
from cs336_alignment.extract import extract_ans
all_indexes = []
for t, entry in enumerate(results):
    if entry.format_reward > 0 and entry.reward > 0:
        all_indexes.append(t)

print(f"Total such samples: {len(all_indexes)}")
index = random.choice(all_indexes)
entry = results[index]
format_rewards = entry.format_reward
print("Prompt:")
print(entry.prompt)
print("-" * 20)
print("Response:")
print(entry.response)
print(parse(entry.response))
print("-" * 20)
print("Ground Truth:")
print(entry.ground_truth)
print(f"answer={extract_ans(entry.ground_truth, True)}")
print("-" * 20)
print("Reward:")
print(f"reward: {entry.reward}, format_reward: {entry.format_reward}, answer_reward_v2: {entry.answer_reward_v2}")

Total such samples: 376
Prompt:
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: A right triangle with integer leg lengths is called "cool'' if the number of square units in its area is equal to twice the number of units in the sum of the lengths of its legs. What is the sum of all the different possible areas of cool right triangles?
Assistant: <think>
--------------------
Response:
Let the legs of the right triangle be of lengths $a$ and $b$.
The area of the triangle is $\frac{ab}{2}$.
The sum of the lengths of the legs is $a+b$.
The condition for the triangle to be "cool" is therefore
\[\frac{ab}{2}=2(a+b).\]
Multiplyi

In [6]:
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn, _normalize
from math_verify import parse, verify, LatexExtractionConfig, ExprExtractionConfig

resp_answer = parse(entry.response)
gt_answer = parse(entry.ground_truth)
is_correct = verify(resp_answer, gt_answer)
print(f"resp_answer: {resp_answer}, ground truth: {gt_answer}, is_correct: {1.0 if is_correct else 0.0}")

reward = r1_zero_reward_fn(entry.response, entry.ground_truth, False)
print(f"Reward: {reward}")

resp_answer: [8, '8'], ground truth: [9, '9'], is_correct: 0.0
Reward: {'format_reward': 1.0, 'answer_reward': 0.0, 'reward': 0.0}


In [2]:
from datasets import load_dataset
import datasets
ds = load_dataset("hkust-nlp/dart-math-uniform")
train: datasets.Dataset = ds["train"]
prompt_templ = """A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: {0}
Assistant: <think>"""

In [None]:
from typing import List
from cs336_alignment.extract import extract_ans
prompts: List[str] = []
responses: List[str] = []
for t, data in enumerate(train):
    question: str = data["query"] # type: ignore
    answer_text: str = data["response"] # type: ignore
    answer = extract_ans(answer_text, True)
    if answer is None:
        print(f"Skipping sample {t} due to no extractable answer.")
        break
    full_prompt = prompt_templ.format(question)
    prompts.append(full_prompt)
    responses.append(f"{answer_text} </think> <answer> {answer} </answer>")

In [35]:
from math_verify import parse, verify
import random
index = random.randint(0, len(train) - 1)
data = train[index]

response = data["response"]

resp_answer = parse(response)
print(f"resp_answer: {resp_answer}")

resp_answer: [11/850, '\\frac{11}{850}']


In [30]:
from math_baseline import get_evaluation_samples
tr_prompts, _ = get_evaluation_samples(1024, 0)
print(f"Number of training prompts: {len(tr_prompts)}")
val_prompts, _ = get_evaluation_samples(512, 8192)
print(f"Number of validation prompts: {len(val_prompts)}")

                                                                                                 

Number of training prompts: 1024


                                                                                                 

Number of validation prompts: 512


In [None]:
count = 0
for p0 in tr_prompts:
    tokens_p0 = set(p0.split())
    for p1 in val_prompts:
        tokens_p1 = set(p1.split())
        if p0 == p1:
            sim = 1
        else:
            union_tokens = tokens_p0 | tokens_p1
            inserct_tokens = tokens_p0 & tokens_p1
            sim = len(inserct_tokens) / len(union_tokens)
        if sim > 0.9:
            print(f"Found similar prompts with similarity {sim:.4f}:")
            print("-" * 80)
            print(p0)
            print("-" * 80)
            print(p1)
            count += 1
print(f"Total similar prompts found: {count}")

Found similar prompts with similarity 0.6452:
--------------------------------------------------------------------------------
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: What is the degree of the polynomial $(4 +5x^3 +100 +2\pi x^4 + \sqrt{10}x^4 +9)$?
Assistant: <think>
--------------------------------------------------------------------------------
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and

KeyboardInterrupt: 