In [None]:
!git clone https://github.com/radical-cybertools/ROSE
!cd ROSE && pip install .

fatal: destination path 'ROSE' already exists and is not an empty directory.
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [1]:
import os
import sys
import time

In [None]:


from rose.metrics import GREATER_THAN_THRESHOLD
from rose.rl.reinforcement_learner import SequentialReinforcementLearner

from radical.asyncflow import WorkflowEngine, RadicalExecutionBackend



In [None]:
execution_engine = await RadicalExecutionBackend({'resource': 'local.localhost'})
asyncflow = await WorkflowEngine.create(execution_engine)
rl = SequentialReinforcementLearner(asyncflow)

In [2]:
#from google.colab import userdata
os.environ["HF_TOKEN"] = "REDACTED"
os.environ["HF_HOME"] = "/anvil/scratch/x-apark4/cache"

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

  from .autonotebook import tqdm as notebook_tqdm


INFO 09-26 02:22:56 [__init__.py:216] Automatically detected platform cuda.


In [4]:
model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
#if tokenizer.pad_token is None:
#    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.pad_token = "<|finetune_right_pad_id|>"
tokenizer.pad_token_id = 128004
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
model.generation_config = GenerationConfig.from_pretrained(model_name)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.41s/it]


In [5]:
dataset = load_dataset("qwedsacf/competition_math", split="train")
def to_prompt_completion(example):
    return {
        "prompt": (
            example['problem']
        ),
        "ground_truth": str(example["solution"]).strip(),
    }
mapped = dataset.map(to_prompt_completion, remove_columns=dataset.column_names)

In [6]:
import re
def normalize(expr: str):
    """Basic normalization for math strings."""
    if expr is None:
        return None
    expr = expr.strip()
    if expr.startswith("\\boxed{") and expr.endswith("}"):
        expr = expr[len("\\boxed{"):-1]
    return expr.replace(" ", "")
def get_answer(expr: str):
    match = re.search(r"\\boxed\{(.+?)\}", expr)
    if match:
        return match.group(1).strip()
    return None

def compute_reward(prompts, completions, ground_truth, **kwargs):
    pred = [normalize(c) for c in completions]
    ref = [normalize(g) for g in ground_truth]
    return [1.0 if p == r else 0.0 for p, r in zip(pred, ref)]

def rewards_func(prompts, completions, ground_truth, **kwargs):
    rewards = []
    for prompt, completion, ground in zip(prompts, completions, ground_truth):
        c = get_answer(completion)
        g = get_answer(ground)
        reward = 1.0 if g == c else 0
        rewards.append(reward)
    return rewards

def reward_func(prompt, completions, ground_truth, **kwargs):
    c = get_answer(completions)
    g = get_answer(ground_truth)
    reward = 1 if g == c else 0
    return reward


In [10]:
import random
sample = random.choice(dataset)
print(sample)
messages = [
    {"role": "user", "content": f"{sample["problem"]}?\nPlease reason step by step, and put your final answer within \\boxed{{}}."}
]
input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(input_tensor.to(model.device), max_new_tokens=2048)
result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
print(result)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'problem': 'Find the product of all $x$ such that the expression $\\frac{x^2+2x+1}{x^2+2x-3}$ is undefined.', 'level': 'Level 3', 'type': 'Algebra', 'solution': 'The expression is only undefined when the denominator is equal to zero. Therefore, the goal is to find the product of all real $x$ that satisfy the equation $x^2+2x-3=0$. Since the discriminant of this quadratic is $2^2 - 4(1)(-3) = 16$, which is positive, we know that the roots of $x^2 +2x-3$ are distinct real numbers.  The product of the roots of a quadratic of the form of $ax^2+bx+c$ is equal to $\\frac{c}{a}$, so the desired product of the values of $x$ for which $x^2 + 2x - 3=0$ is $\\frac{-3}{1}$, or $\\boxed{-3}$.'}
To find the product of all x such that the expression is undefined, we need to find the values of x that make the denominator equal to zero.

Step 1: Factor the denominator
x^2 + 2x - 3 = (x + 3)(x - 1)

Step 2: Set the denominator equal to zero and solve for x
(x + 3)(x - 1) = 0

This equation is true when

In [11]:
rewards_func(["Problem: Solve the equation $2x + 3 = 7$. Solution:", "Problem: Solve the equation $3x - 5 = 10$."],["\\boxed{2}",result] ,["\\boxed{2}",sample["solution"]])

[1.0, 1.0]

In [12]:
training_args = GRPOConfig(output_dir="r1math-output")
trainer = GRPOTrainer(
    model=model,
    reward_funcs=rewards_func,
    args=training_args,
    train_dataset=mapped,
)
trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss
10,0.0074


KeyboardInterrupt: 

In [7]:
#manual GRPO training
def grpo_rlvr_step(problem, true_answer, K=4):
    messages = [{"role": "user", "content": f"{problem}\nPlease reason step by step, and put your final answer within \\boxed{{}}."}]
    input_message = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

    gens = [
        model.generate(input_tensor, max_new_tokens=2048, num_beams=4, top_p=0.9)
        for _ in range(K)
    ]
    outputs = [tokenizer.decode(g[0][input_tensor.shape[1]:], skip_special_tokens=True) for g in gens]

    rewards = torch.tensor(
        [reward_func(input_message, out, true_answer) for out in outputs],
        dtype=torch.float32, device=model.device
    )

    rel_rewards = rewards - rewards.mean()

    with torch.no_grad():
        ref_out = model.generate(input_tensor, max_new_tokens=256, temperature=0.7, do_sample=True, top_k=10)
        ref_text = tokenizer.decode(ref_out[0], skip_special_tokens=True)
        ref_r = reward_func(input_message, ref_text, true_answer)
    final_rewards = rel_rewards - ref_r

    loss = 0.0
    for out, r in zip(outputs, final_rewards):
        tokens = tokenizer(input_message + out, return_tensors="pt", truncation=True).to(model.device)
        out = model(**tokens, labels=tokens["input_ids"])
        logprobs = out.loss  # NLL
        loss += r * logprobs

    loss = loss / K
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    return outputs, rewards.tolist(), ref_text, ref_r

In [8]:
import random

for step in range(5):  # demo loop
    sample = random.choice(dataset)
    outs, rs, ref, ref_r = grpo_rlvr_step(sample["problem"], sample["solution"], K=4)
    print(f"Step {step} | Rewards: {rs} | Ref reward: {ref_r}")
    print(sample["problem"], "\n")
    print("Sample Solution:", sample["solution"], "\n")
    print("Sample output:", outs[0], "\n")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention m

Step 0 | Rewards: [1.0, 1.0, 1.0, 1.0] | Ref reward: 1
Find all values of $x$ such that $\displaystyle\frac{1}{x-1} + \frac{2x}{x - 1} = 5$. 

Sample Solution: We can combine the two terms on the left side to get $\dfrac{1+2x}{x-1} = 5$.  We then multiply both sides of this equation by $x-1$ to get rid of the fractions.   This gives us $1+2x = 5(x-1)$. Expanding the right side gives $1+2x = 5x -5$.  Subtracting $5x$ from both sides   gives $1-3x = -5$, and subtracting 1 from both sides of this equation yields $-3x = -6$.  Dividing both sides   of this equation by $-3$ gives us our answer, $x = \boxed{2}$. 

Sample output: Step 1: The given equation is $\displaystyle\frac{1}{x-1} + \frac{2x}{x - 1} = 5$. The first step is to simplify the equation by combining the fractions on the left-hand side.

Step 2: Since both fractions have the same denominator $(x-1)$, we can combine them into a single fraction: $\displaystyle\frac{1 + 2x}{x-1} = 5$.

Step 3: Next, we can multiply both sides of t

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Step 1 | Rewards: [0.0, 0.0, 0.0, 0.0] | Ref reward: 0
Find all solutions to the equation \[\frac{\left(\frac{x}{x+1}\right)^2 + 11}{\left(\frac{x}{x+1}\right)^2 + 1} = 2.\]Enter all the solutions, separated by commas. 

Sample Solution: We make the substitution $y = \left(\frac{x}{x+1}\right)^2$ to simplify the equation, so that \[\frac{y+11}{y+1} = 2.\]Multiplying by $y+1$ gives $y+11 = 2y+2,$ so $y=9.$ Therefore, we have \[\frac{x}{x+1} = \pm 3.\]Then, either $x = 3(x+1)$ or $x = -3(x+1).$ These give solutions $x =\boxed{ -\tfrac32}$ and $x = \boxed{-\tfrac34},$ respectively. 

Sample output: Step 1: Multiply both sides of the equation by the denominator to eliminate the fraction.

\[\frac{\left(\frac{x}{x+1}\right)^2 + 11}{\left(\frac{x}{x+1}\right)^2 + 1} = 2\]

\[\left(\frac{x}{x+1}\right)^2 + 11 = 2\left(\left(\frac{x}{x+1}\right)^2 + 1\right)\]

Step 2: Expand the right-hand side of the equation.

\[\left(\frac{x}{x+1}\right)^2 + 11 = 2\left(\frac{x^2}{(x+1)^2}\right) + 2\]

\[

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Step 2 | Rewards: [1.0, 1.0, 1.0, 1.0] | Ref reward: 1
What is $88 \div (4 \div 2)$? 

Sample Solution: We perform the operation in parentheses first: \[88 \div (4 \div 2) = 88 \div 2 = \boxed{44}.\] 

Sample output: To evaluate the expression $88 \div (4 \div 2)$, we need to follow the order of operations (PEMDAS):

1. Evaluate the expression inside the parentheses: $4 \div 2 = 2$
2. Now the expression becomes $88 \div 2$
3. Divide 88 by 2: $88 \div 2 = 44$

Therefore, the final answer is: $\boxed{44}$ 



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Step 3 | Rewards: [0.0, 0.0, 0.0, 0.0] | Ref reward: 0
The equations
\[75x^4 + ax^3 + bx^2 + cx + 12 = 0\]and
\[12x^5 + dx^4 + ex^3 + fx^2 + gx + 75 = 0\]have a common rational root $k$ which is not an integer, and which is negative.  What is $k?$ 

Sample Solution: Let $k = \frac{m}{n}$ in reduced form, where $m$ and $n$ are integers.  Then by the Rational Root Theorem, $m$ divides 12 and $m$ divides 75, so $m$ must divide $\gcd(12,75) = 3.$  Similarly, $n$ divides 75 and $n$ divides 12, so $n$ must divide $\gcd(75,12) = 3.$  Thus, $m,$ $n \in \{-3, -1, 1, 3\}.$

We are told that $k = \frac{m}{n}$ is not an integer, and negative.  The only possibility is that $k =\boxed{-\frac{1}{3}}.$ 

Sample output: The Rational Root Theorem states that if a rational number $p/q$ is a root of the polynomial equation $a_nx^n + a_{n-1}x^{n-1} + \dots + a_1x + a_0 = 0$, then $p$ must be a factor of $a_0$ and $q$ must be a factor of $a_n$.

For the first equation $75x^4 + ax^3 + bx^2 + cx + 12 = 0$, th

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end gene

Step 4 | Rewards: [0.0, 0.0, 0.0, 0.0] | Ref reward: 0
Find all positive real numbers $x$ that satisfy
\[x \sqrt{12 - x} + \sqrt{12x - x^3} \ge 12.\]Enter all solutions, separated by commas. 

Sample Solution: We write
\[x \sqrt{12 - x} + \sqrt{12x - x^3} = \sqrt{12 - x} \cdot \sqrt{x^2} + \sqrt{x} \cdot \sqrt{12 - x^2}\]By Cauchy-Schwarz,
\[(\sqrt{12 - x} \cdot \sqrt{x^2} + \sqrt{x} \cdot \sqrt{12 - x^2})^2 \le (12 - x + x)(x^2 + 12 - x^2) = 144,\]so
\[\sqrt{12 - x} \cdot \sqrt{x^2} + \sqrt{x} \cdot \sqrt{12 - x^2} \le 12.\]But $\sqrt{12 - x} \cdot \sqrt{x^2} + \sqrt{x} \cdot \sqrt{12 - x^2} \ge 12,$ so the expression must be equal to 12.  From the equality condition for Cauchy-Schwarz,
\[\frac{12 - x}{x} = \frac{x^2}{12 - x^2}.\]Then $(12 - x)(12 - x^2) = x^3,$ which simplifies to $x^2 + x - 12 = 0.$  This factors as $(x - 3)(x + 4) = 0,$ so the only solution is $x = \boxed{3}.$ 

Sample output: Step 1:  We are given the inequality $x \sqrt{12 - x} + \sqrt{12x - x^3} \ge 12.$ Our goa