## Model configs

### Generation of Steps

In [None]:
hf_token = "HF_TOKEN"

In [2]:
from huggingface_hub import login

login(token=hf_token)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

In [9]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

In [None]:
import torch

def generate_next_steps(context="hey", M=5, temperature=0.8, max_new_tokens=200):
    """ Generate M next steps based on the provided context using the model.
    Args:
    - context (str): The input context to generate steps from.
    - M (int): Number of steps to generate (default: 5).
    - temperature (float): Sampling temperature (default: 0.8).
    - max_new_tokens (int): Maximum number of new tokens to generate (default: 200).
    Returns:
    - List of generated texts.
    """

    inputs = tokenizer(context, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        num_return_sequences=M,
        max_new_tokens=max_new_tokens
    )

    generated_texts = []

    for i in range(M):
        generated = tokenizer.decode(outputs[i][inputs['input_ids'].shape[-1]:], skip_special_tokens=False)
        generated_texts.append(generated.strip())

    return generated_texts


In [None]:
def slice_str(text):
  """Parses the steps from the text."""

  steps_list = text.split("## Step")

  if len(steps_list) == 0:
    return text

  if len(steps_list[0]) < 2:
    return steps_list[1]
  else:
    return steps_list[0]

### Dataset

In [12]:
import pandas as pd

df = pd.read_parquet("hf://datasets/meta-llama/Llama-3.2-1B-Instruct-evals/Llama-3.2-1B-Instruct-evals/Details_math_2024-09-23T17-23-17.197184.parquet.gzip")

In [13]:
sample_prompt = df['input_final_prompts'][0][0]

In [None]:
ans = generate_next_steps(temperature=0.8, context=sample_prompt, max_new_tokens=204)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
for i in enumerate(ans):
  strings = slice_str(i[1])
  print(strings)
  print("----------------------")

 1: Use De Moivre's Theorem to find the expression for $(\cos12^\circ+i\sin12^\circ+\cos48^\circ+i\sin48^\circ)^6$
De Moivre's Theorem states that for any real number $x$ and integer $n$, $(\cos x + i\sin x)^n = \cos(nx) + i\sin(nx)$. Therefore, we can rewrite the expression as $[(\cos 12^\circ + i\sin 12^\circ) + (\cos 48^\circ + i\sin 48^\circ)]^6$.


----------------------
 1: Apply De Moivre's Theorem
To find the imaginary part of the expression, we can first express the given expression in polar form using De Moivre's Theorem, which states that $(\cos\theta+i\sin\theta)^n = \cos(n\theta)+i\sin(n\theta)$. In this case, we have $(\cos12^\circ+i\sin12^\circ+\cos48^\circ+i\sin48^\circ)^6$. We can express the first two terms as $(\cos12^\circ+i\sin12^\circ+\cos48^\circ+i\sin48^\circ) = (\cos(60^\circ)+i\sin(60^\circ))+(\cos(96^\circ)+i\sin(96^\circ))$.


----------------------
 1: Convert the complex number to polar form
We first express the given complex number in polar form: $re^{i\t

### Process Reward Model

In [18]:
next_step = slice_str(ans[0])

In [None]:
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
prm_tokenizer = AutoTokenizer.from_pretrained(model_name)
prm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device,
    low_cpu_mem_usage=True
)


In [None]:
def score_intermediary_step(steps, positive_token="+", negative_token="-"):
    """
    Scores an intermediary step in a reasoning process using the model as a PRM.

    Args:
    - steps (list of str): List of steps in the reasoning process, where:
        - steps[0] is the task description,
        - steps[1:-1] are previous intermediary steps,
        - steps[-1] is the current intermediary step to evaluate.
    - positive_token (str): Token for positive reward (default: '+').
    - negative_token (str): Token for negative reward (default: '-')—used for comparison.

    Returns:
    - float: Normalized score between 0 and 1, where higher values indicate better steps.
    """

    task_description = steps[0]
    curr_step = steps[-1]
    prev_steps = steps[1:-1]

    prev_steps_str = "\n\n## Step"+"\n\n## Step".join(prev_steps)

    prompt = f"""<|system|>
      You are a Process Reward Model. Evaluate the following current intermediary step in the reasoning process. Output only a single token: '+' if the step is correct and helpful, or '-' if it's incorrect or unhelpful.</s>
      <|user|>
      Task: {task_description}
      Previous Intermediary Steps:{prev_steps_str}
      Current Intermediary Step: {curr_step}</s>
      <|assistant|>
      Verdict:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[:, -1, :]

    pos_id = tokenizer.convert_tokens_to_ids(positive_token)
    neg_id = tokenizer.convert_tokens_to_ids(negative_token)

    if pos_id == tokenizer.unk_token_id or neg_id == tokenizer.unk_token_id:
        raise ValueError(f"Tokens '{positive_token}' or '{negative_token}' not in vocab. Try different tokens (e.g., 'Yes'/'No').")

    # Compute logprobs
    logprobs = torch.log_softmax(logits, dim=-1)
    pos_logprob = logprobs[0, pos_id].item()
    neg_logprob = logprobs[0, neg_id].item()

    import math
    score_diff = pos_logprob - neg_logprob
    normalized_score = 1 / (1 + math.exp(-score_diff))

    verdict = positive_token if pos_logprob > neg_logprob else negative_token

    return normalized_score

In [22]:
for step in enumerate(ans):
  result = score_intermediary_step(sample_prompt, step[1])
  print(result)
  print("----------------------")

0.985155100397563
----------------------
0.9877642469621631
----------------------
0.9854197920797677
----------------------
0.9864782356042919
----------------------
0.981159061791835
----------------------


### Generation loop

In [23]:
eos_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
eos_token = eos_tokenizer.eos_token

In [24]:
eos_token

'<|eot_id|>'

In [None]:
class Node:
    def __init__(self, text, score=None, parent=None, depth=0, is_terminal=False):
        self.text = text
        self.score = score
        self.parent = parent
        self.children = []
        self.depth = depth
        self.is_terminal = False

    def add_children(self, texts, eos_token="<|eot_id|>"):
        self.children = [Node(t, parent=self, depth=self.depth + 1, is_terminal=(eos_token in t)) for t in texts]
        return self.children

    def path(self):
        # Return full path from root to this node
        node, path = self, []
        while node:
            path.append(node.text)
            node = node.parent
        return path[::-1]

class TreeSampler:
    """A class to manage a tree of nodes, allowing expansion and scoring of nodes.
    It starts with a root node and expands it by generating candidates and scoring them.
    The leaves of the tree are the active nodes that can be expanded further.
    Each node can be terminal or non-terminal, and the tree can be pruned to keep only the top K nodes based on their scores, somewhat similar to beam search.
    """

    def __init__(self, root_text="", eos_token="<|eot_id|>"):
        self.root = Node(root_text)
        self.leaves = [self.root]
        self.eos_token = eos_token

    def expand_and_score(self, N, generator_fn, scorer_fn):
        """
        -> generator_fn - Expects a function that returns a list of candidates, and takes node.path(), N as input
        -> scorer_fn - Expects a function that returns a score, and takes candidate.path() as input
        """
        new_leaves = []
        for node in self.leaves:
            if node.is_terminal:
                new_leaves.append(node)
                continue
            candidates = generator_fn(node.path(), N)
            children = node.add_children(candidates, eos_token=self.eos_token)
            for child in children:
                child.score = scorer_fn(child.path())
            new_leaves.extend(children)
        self.leaves = new_leaves

    def select_top_k(self, K):
        self.leaves = sorted(self.leaves, key=lambda x: x.score, reverse=True)[:K]

    def get_top_paths(self, K=1):
        top_nodes = sorted(self.leaves, key=lambda x: x.score, reverse=True)[:K]
        return [node.path() for node in top_nodes]


In [None]:
def gen(path, N):
  responses = []
  if len(path) < 2:
    path.append(" ") # To help the model generate the required format
    
  response = generate_next_steps(temperature=0.8, context="\n\n## Step".join(path), max_new_tokens=204)
  for step in enumerate(response):
    curr_step = slice_str(step[1])
    responses.append(curr_step)
  return responses

In [37]:
solution_tree = TreeSampler(root_text=sample_prompt, eos_token=eos_token)

max_depth = 5

for _ in range(max_depth):
    solution_tree.expand_and_score(
        N=4,
        generator_fn=gen,
        scorer_fn=score_intermediary_step
    )
    solution_tree.select_top_k(K=2)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [None]:
for path in enumerate(solution_tree.get_top_paths(K=2)):
    print(path[1][-3])
    print("----------------------")

3: Find the imaginary part
Now we need to find the imaginary part of $(\cos60^\circ+i\sin60^\circ)$. The imaginary part of a complex number in polar form is given by $i\sin\theta$, where $\theta$ is the angle in radians. Here, $\theta = 60^\circ = \frac{\pi}{3}$ radians. Therefore, the imaginary part is $64i\sin\frac{\pi}{3} = 64i\cdot \frac{\sqrt{3}}{2} = 32i\sqrt{3}$.

The final answer is: $\boxed{32i\sqrt{3}}$. I hope it is correct.<|eot_id|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_text|><|end_of_te

### Further Work
The EOT token is not working. The model scores and generates even after a eot token is generated