<a href="https://colab.research.google.com/github/Mikethebot44/aimo-math/blob/main/competion_math_cot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 0 – run once in your environment (not needed if already installed)
!pip install -U "openai>=1.60.0" "datasets>=2.18.0" "huggingface_hub>=0.26.0"


Collecting datasets>=2.18.0
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub>=0.26.0
  Downloading huggingface_hub-1.1.5-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.18.0)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-1.1.5-py3-none-any.whl (516 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m516.0/516.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, huggingface_hub, datasets
  Attempting uninstall: pyarrow
    F

In [46]:
# Cell 1 – imports and configuration

import os
import time
import random
import re
from typing import Any, Dict, List

from datasets import load_dataset
from openai import OpenAI
from google.colab import userdata
# Credentials are read from Colab secrets through google.colab.userdata.
# Each secret is fetched exactly once and bound to a name, so no value is
# looked up and then discarded.
# NOTE(review): userdata.get may raise instead of returning None when a secret
# is missing or notebook access is not granted – confirm against Colab docs.

# Hugging Face token: used to load the source dataset and to push the result.
HF_TOKEN = userdata.get('HF_TOKEN')
if not HF_TOKEN:  # covers both a missing secret (None) and an empty string
    raise RuntimeError("Set the HF_TOKEN secret in Colab before running.")

# OpenAI key – must be set
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:  # covers both a missing secret (None) and an empty string
    raise RuntimeError("Set the OPENAI_API_KEY secret in Colab before running.")

# Shared API client used by every generation function in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

# Model and generation settings
# These three values are forwarded to client.responses.create() by the
# generation functions below.
MODEL_NAME = "gpt-5.1"          # change to "gpt-5" if you want
REASONING_EFFORT = "medium"     # "low" / "medium" / "high"
# NOTE(review): for reasoning models this cap may also have to cover reasoning
# tokens, which could truncate the visible answer – confirm against the API docs.
MAX_OUTPUT_TOKENS = 2048

# HF dataset mapping settings
BATCH_SIZE = 8                  # rows per ds.map batch (8–16 is fine)
DRY_RUN_COUNT = 8               # number of examples for small test pass
RATE_LIMIT_SLEEP = 0.5          # seconds slept after each API call – be conservative

# Output configuration
NEW_COLUMN_NAME = "cot"         # column that will hold the generated text
OUTPUT_REPO_ID = "juppy44/competition_math_cot_v1"  # Hub repo to push to; change this


In [28]:
# Cell 2 – load the base dataset

# Keyword arguments for load_dataset; the token is only forwarded when present.
load_kwargs: Dict[str, Any] = dict(split="train")
if HF_TOKEN is not None:
    load_kwargs.update(token=HF_TOKEN)

# Load the training split of the source dataset.
ds = load_dataset("qwedsacf/competition_math", **load_kwargs)

# Summary, column names and first row for a quick sanity check.
print(ds)
print("Columns:", ds.column_names)
print("First example:\n", ds[0])


Dataset({
    features: ['problem', 'level', 'type', 'solution'],
    num_rows: 12500
})
Columns: ['problem', 'level', 'type', 'solution']
First example:
 {'problem': 'Let \\[f(x) = \\left\\{\n\\begin{array}{cl} ax+3, &\\text{ if }x>2, \\\\\nx-5 &\\text{ if } -2 \\le x \\le 2, \\\\\n2x-b &\\text{ if } x <-2.\n\\end{array}\n\\right.\\]Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).', 'level': 'Level 5', 'type': 'Algebra', 'solution': 'For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \\Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3=\\boxed{0}$.'}


In [33]:
# Cell 2b – clean original problems' and solutions' LaTeX

def clean_problem_and_solution_batch(batch):
    """
    Batched ds.map callback that normalizes LaTeX in "problem" and "solution".

    Every non-None entry of both columns is passed through
    normalize_latex_brackets; None entries are kept as None.

    NOTE: normalize_latex_brackets is defined in Cell 4b, further down in this
    notebook, so that cell must have been executed before this function is called.

    Args:
        batch: Mapping with list-valued "problem" and "solution" keys.

    Returns:
        Dict with the cleaned "problem" and "solution" lists, in input order.
    """
    raw_problems, raw_solutions = batch["problem"], batch["solution"]

    def _clean_column(values):
        # None is passed through untouched; everything else gets normalized.
        return [v if v is None else normalize_latex_brackets(v) for v in values]

    return {
        "problem": _clean_column(raw_problems),
        "solution": _clean_column(raw_solutions),
    }

# Replace `ds` with a copy whose problem/solution columns have been cleaned.
cleaning_map_options = dict(
    batched=True,
    batch_size=BATCH_SIZE,
    desc="Cleaning LaTeX in original problems and solutions",
)
ds = ds.map(clean_problem_and_solution_batch, **cleaning_map_options)

# Quick spot check of the first three cleaned rows
for i in range(3):
    cleaned_row = ds[i]
    print("=" * 60)
    print("Problem:", cleaned_row["problem"], sep="\n")
    print("-" * 30)
    print("Solution:", cleaned_row["solution"], sep="\n")



Cleaning LaTeX in original problems and solutions:   0%|          | 0/12500 [00:00<?, ? examples/s]

Problem:
Let $f(x) = \left\{
\begin{array}{cl} ax+3, &\text{ if }x>2, \\
x-5 &\text{ if } -2 \le x \le 2, \\
2x-b &\text{ if } x <-2.
\end{array}
\right.$Find $a+b$ if the piecewise function is continuous (which means that its graph can be drawn without lifting your pencil from the paper).
------------------------------
Solution:
For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3=\boxed{0}$.
Problem:
A rectangular band formation is a formation with $m$ band members in each of $r$ rows, where $m$ and $r$ are integers. A particular band has less than 100 band members. The director arranges them in a rectangular formation and finds that he has two members left over. If he increases the number of members 

In [34]:
# Cell 3 – system instructions and prompt template

# System prompt sent as `instructions` with every generation request.
# This is a regular (non-raw) string, so every LaTeX backslash must be doubled:
# a single "\b" or "\t" would be turned into a backspace / tab character.
SYSTEM_INSTRUCTIONS = '''
You are a math contest assistant. You generate concise, competition-style
chain-of-thought solutions for problems, not verbose expository essays.

Follow this exact output structure:

Let's solve the problem step by step.

(1) Problem restatement
[Short 1–2 sentence summary of what the problem asks. Not verbose.]

(2) Key observations
[List 2–5 concrete insights or sub-goals. If it’s geometry, outline key relations;
if number theory, key congruences; if algebra, main substitutions or transformations.]

(3) Structured reasoning with numbered equations
Write clean, numbered steps with equations. Keep them tight and computational.

For example:
1. ...
2. ...
Thus: ...

(4) Optional verification block (Python)
Only include this if it genuinely simplifies verification of a numeric step. If you do not need Python verification, still include section (4) with a short
comment such as "No additional verification code is needed."
Wrap the code in a fenced Python code block using standard Markdown syntax.
Keep the code short; no heavy loops or symbolic libraries.

(5) Final consolidation
Explain the final arithmetic.
- If the final answer is a specific integer, explicitly reduce it modulo 1000.
- If the final answer is an algebraic expression (not a single integer), do NOT talk about modulo 1000; just present the simplified expression in \\boxed{...}.


The final answer is:
\\boxed{XYZ}

CRITICAL STYLE RULES:
- Length: about 150–350 tokens. No rambly OpenMath-style essays.
- Python snippets: optional and short; many problems do not need them.
- Final answer: ALWAYS reduced modulo 1000 (if integer) and written as \\boxed{...}.
- Mindset: competition solver, not an expository textbook.
- Avoid proof-like language such as "we now claim"; keep it computational and direct.
- You must ALWAYS include all five section headers exactly in that order
- Do NOT use LaTeX \\tag{...}. If you want to reference equations, just use the numbered list ("1.", "2.", etc.).
- Do NOT use align environments or equation tags. Use simple single-line equations in $...$.
- Keep the ENTIRE response (all sections combined) to roughly 150–300 tokens.

'''.strip()


def build_user_input(problem: str, solution: str) -> str:
    """
    Assemble the user message for one generation request.

    The message has three parts: fixed task requirements, the problem and its
    existing solution (each introduced by a "---" separator line), and a fixed
    closing reminder about the output format.

    Args:
        problem: Problem statement, interpolated verbatim.
        solution: Existing reference solution, interpolated verbatim.

    Returns:
        The complete prompt text as a single string.
    """
    requirements = (
        "You are given a competition math problem and an existing worked solution.\n"
        "Use them as references to construct a NEW chain-of-thought solution that follows "
        "the TARGET FORMAT described in the system instructions.\n\n"
        "Requirements:\n"
        "- Use the original problem statement where helpful, but restate it concisely.\n"
        "- Use the existing solution only as a guide; do NOT copy text verbatim.\n"
        "- Your reasoning must follow the structure:\n"
        "  (1) Problem restatement\n"
        "  (2) Key observations\n"
        "  (3) Structured reasoning with numbered equations\n"
        "  (4) Optional Python verification (short, only if useful)\n"
        "  (5) Final consolidation and final answer modulo 1000 in \\boxed{...}.\n"
        "- The final line of your response MUST end with the LaTeX form: \\boxed{...}.\n"
        "- Length target: roughly 150–350 tokens.\n\n"
    )

    # The only part that varies from one example to the next.
    reference_material = (
        "---\n"
        "Problem:\n"
        f"{problem}\n\n"
        "---\n"
        "Existing solution (reference only, do not copy):\n"
        f"{solution}\n\n"
    )

    closing_reminder = (
        "---\n"
        "Now write the assistant's chain-of-thought solution in the TARGET FORMAT.\n"
        "Make sure the final numeric answer is explicitly reduced modulo 1000 and "
        "returned as \\boxed{...} at the very end.\n"
        "You must finish all five sections within about 150–300 tokens. Do not exceed this by much."
    )

    return "".join((requirements, reference_material, closing_reminder))


In [38]:
# Cell 4 – utilities: boxed answer extraction and basic validation (nested-brace safe)

BOXED_MARKER = r"\boxed{"


def _find_last_boxed_span(text: str):
    """
    Find the start/end indices of the LAST occurrence of \boxed{...},
    where '...' may contain nested braces like \frac{3}{2}.

    Returns (start, end) where end is the index *after* the closing '}'.
    If not found or braces don't match, returns None.
    """
    if not isinstance(text, str):
        return None

    start = text.rfind(BOXED_MARKER)
    if start == -1:
        return None

    i = start + len(BOXED_MARKER)
    depth = 1
    while i < len(text) and depth > 0:
        ch = text[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
        i += 1

    if depth != 0:
        # Unbalanced braces; give up
        return None

    # [start, i) is the full "\boxed{...}" span
    return start, i


def extract_boxed_answer(text: str) -> str:
    r"""
    Return the text inside the LAST \boxed{...} of ``text``.

    Nested braces inside the box are supported because the span is located
    with _find_last_boxed_span. Surrounding whitespace is stripped.

    Returns:
        The boxed content, or "" when no balanced \boxed{...} is found.
    """
    located = _find_last_boxed_span(text)
    if located is None:
        return ""

    box_begin, box_end = located
    # Drop the leading "\boxed{" marker and the single trailing "}".
    content_begin = box_begin + len(BOXED_MARKER)
    return text[content_begin:box_end - 1].strip()


# Matches a LaTeX \tag{...} whose argument contains no closing brace.
TAG_PATTERN = re.compile(r"\\tag\{[^}]*\}")


def strip_tags(text: str) -> str:
    r"""
    Remove every \tag{...} occurrence from ``text``.

    Non-string input is returned unchanged.
    """
    return TAG_PATTERN.sub("", text) if isinstance(text, str) else text


def basic_validate_cot(cot_text: str) -> Dict[str, Any]:
    """
    Run lightweight structural checks on a generated chain-of-thought.

    Returns:
        Dict with three keys:
          - "has_boxed": True when extract_boxed_answer returns non-empty text.
          - "has_all_sections": True when all five section headers occur
            somewhere in the text (their order is not checked).
          - "final_answer_raw": the extracted boxed content, or "".
        For non-string or empty input all checks report False / "".
    """
    if not (isinstance(cot_text, str) and cot_text):
        return {"has_boxed": False, "has_all_sections": False, "final_answer_raw": ""}

    boxed_content = extract_boxed_answer(cot_text)

    section_headers = (
        "(1) Problem restatement",
        "(2) Key observations",
        "(3) Structured reasoning",
        "(4) Optional verification block",
        "(5) Final consolidation",
    )
    every_header_present = not any(header not in cot_text for header in section_headers)

    return {
        "has_boxed": bool(boxed_content),
        "has_all_sections": every_header_present,
        "final_answer_raw": boxed_content if boxed_content else "",
    }


In [39]:
# Cell 4b – normalize GPT-5 LaTeX to $...$ style and fix final \boxed

# Match \[ ... \]
LATEX_SQUARE_ENV = re.compile(r"\\\[\s*(.+?)\s*\\\]", re.DOTALL)
# Match $$ ... $$
LATEX_DOLLAR_ENV = re.compile(r"\$\$\s*(.+?)\s*\$\$", re.DOTALL)
# Match \( ... \)
LATEX_PAREN_ENV = re.compile(r"\\\(\s*(.+?)\s*\\\)", re.DOTALL)


def _to_inline(match: re.Match) -> str:
    inner = match.group(1).strip()
    return f"${inner}$"


def _ensure_final_boxed_inline(text: str) -> str:
    """
    Ensure the LAST \boxed{...} appears as $\\boxed{...}$ if it is currently bare
    on its line (no $ characters in that line).

    This targets the final answer line:
        The final answer is:
        \boxed{351}
    ->    $\\boxed{351}$
    """
    if not isinstance(text, str):
        return text

    span = _find_last_boxed_span(text)
    if span is None:
        return text

    start, end = span

    # Find the line containing this \boxed{...}
    line_start = text.rfind("\n", 0, start)
    if line_start == -1:
        line_start = 0
    else:
        line_start += 1  # move past the newline

    line_end = text.find("\n", end)
    if line_end == -1:
        line_end = len(text)

    line = text[line_start:line_end]

    # If there's already a $ on this line, assume it's already in math mode; leave it.
    if "$" in line:
        return text

    # Otherwise, wrap JUST this occurrence in $...$
    boxed_fragment = text[start:end]         # full "\boxed{...}" including nested braces
    wrapped = f"${boxed_fragment}$"
    return text[:start] + wrapped + text[end:]


def normalize_latex_brackets(text: str) -> str:
    r"""
    Rewrite math delimiters to the single-dollar form.

    $$...$$, \[...\] and \(...\) are converted (in that order) to $...$, then
    the final \boxed{...} is wrapped in $...$ if it is bare on its line.

    Non-string input is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Order matters only in that it mirrors the sequence used originally.
    for delimiter_pattern in (LATEX_DOLLAR_ENV, LATEX_SQUARE_ENV, LATEX_PAREN_ENV):
        text = delimiter_pattern.sub(_to_inline, text)

    return _ensure_final_boxed_inline(text)


In [47]:
# Cell 5 – single-example generator for manual inspection (with fixed-index support)

def generate_cot_for_example(example: Dict[str, Any], verbose: bool = True) -> str:
    """
    Generate one chain-of-thought for a single dataset row via the Responses API.

    Args:
        example: Mapping with at least "problem" and "solution" keys.
        verbose: When True, print the problem, the original solution, the
            generated text and the result of basic_validate_cot.

    Returns:
        The model output text after strip_tags and normalize_latex_brackets.

    Raises:
        KeyError: if "problem" or "solution" is missing from ``example``.
        Exceptions raised by the API client are propagated unchanged.
    """
    problem = example["problem"]
    solution = example["solution"]

    # Same convention as generate_cot_batch: a missing reference solution is
    # sent as an empty string rather than the literal text "None".
    reference_solution = "" if solution is None else solution

    user_input = build_user_input(problem, reference_solution)

    response = client.responses.create(
        model=MODEL_NAME,
        instructions=SYSTEM_INSTRUCTIONS,
        input=user_input,
        reasoning={"effort": REASONING_EFFORT},
        max_output_tokens=MAX_OUTPUT_TOKENS
    )

    # Post-process: drop \tag{...} and normalize math delimiters to $...$.
    cot_text = response.output_text
    cot_text = strip_tags(cot_text)
    cot_text = normalize_latex_brackets(cot_text)

    if verbose:
        print("=" * 80)
        print("PROBLEM:")
        print(problem)
        print("-" * 80)
        print("ORIGINAL SOLUTION:")
        print(solution)  # printed as stored in the dataset row
        print("-" * 80)
        print("GENERATED CoT:")
        print(cot_text)
        print("-" * 80)
        print("Validation:", basic_validate_cot(cot_text))

    return cot_text


def run_manual_tests(indices: List[int] | None = None,
                     k: int = 3,
                     verbose: bool = True) -> None:
    """
    Run generate_cot_for_example on either:
      - a fixed list of indices (if 'indices' is provided), or
      - k random indices (if 'indices' is None).
    """
    if indices is None:
        # Random mode
        chosen_indices = random.sample(range(len(ds)), k=min(k, len(ds)))
    else:
        # Fixed mode – trust the caller
        chosen_indices = [i for i in indices if 0 <= i < len(ds)]

    print("Running single-example tests on indices:", chosen_indices)
    for idx in chosen_indices:
        _ = generate_cot_for_example(ds[idx], verbose=verbose)
        time.sleep(RATE_LIMIT_SLEEP)


# ----- Choose how you want to test -----

# Option A: fixed, reproducible indices (non-random mode)
# Running this cell issues one Responses API request per listed index.
fixed_test_indices = [3, 4, 5, 6, 7, 8, 9, 10]   # change this list to whatever problems you want
run_manual_tests(indices=fixed_test_indices, verbose=True)

# Option B: random examples (commented out by default)
# run_manual_tests(indices=None, k=3, verbose=True)

Running single-example tests on indices: [3, 4, 5, 6, 7, 8, 9, 10]
PROBLEM:
Evaluate $\left\lceil3\left(6-\frac12\right)\right\rceil$.
--------------------------------------------------------------------------------
ORIGINAL SOLUTION:
Firstly, $3\left(6-\frac12\right)=18-1-\frac12=17-\frac12$.  Because $0\le\frac12<1$, we have $\left\lceil17-\frac12\right\rceil=\boxed{17}$.
--------------------------------------------------------------------------------
GENERATED CoT:
Let's solve the problem step by step.

(1) Problem restatement  
Compute $\left\lceil3\left(6-\frac12\right)\right\rceil$, the ceiling of a simple arithmetic expression.

(2) Key observations  
- Simplify inside the parentheses first.  
- Multiply by 3.  
- Apply the ceiling function: smallest integer $\ge$ the value.  

(3) Structured reasoning with numbered equations  
1. Simplify the inner expression:  
   $6 - \frac12 = \frac{12}{2} - \frac12 = \frac{11}{2}.$
2. Multiply by 3:  
   $3\left(6 - \frac12\right) = 3 \cdot

In [None]:
# Cell 6 – batched generator for ds.map (batched=True)

def generate_cot_batch(batch: Dict[str, List[Any]]) -> Dict[str, List[str]]:
    """
    Batched ds.map callback that produces the NEW_COLUMN_NAME column.

    One Responses API request is made per example in the batch. A None
    solution is sent as "". If anything fails for an example, the error is
    printed and "" is stored for that row, so the output stays aligned with
    the input. The function sleeps RATE_LIMIT_SLEEP seconds after each example.

    Args:
        batch: Mapping with list-valued "problem" and "solution" keys.

    Returns:
        {NEW_COLUMN_NAME: list of generated texts, one per input row}.
    """
    problems, solutions = batch["problem"], batch["solution"]
    generated: List[str] = []

    for problem, solution in zip(problems, solutions):
        reference = "" if solution is None else solution
        try:
            response = client.responses.create(
                model=MODEL_NAME,
                instructions=SYSTEM_INSTRUCTIONS,
                input=build_user_input(problem, reference),
                reasoning={"effort": REASONING_EFFORT},
                max_output_tokens=MAX_OUTPUT_TOKENS,
            )
            # Same post-processing as the single-example path.
            cot_text = normalize_latex_brackets(strip_tags(response.output_text))
        except Exception as e:
            # Best effort: keep the batch going and mark this row as empty.
            print(f"[ERROR] Failed to generate CoT: {e}")
            cot_text = ""

        generated.append(cot_text)
        time.sleep(RATE_LIMIT_SLEEP)

    return {NEW_COLUMN_NAME: generated}


In [None]:
# Cell 7 – dry run with ds.map on a small subset

# Take the first DRY_RUN_COUNT rows (fewer if the dataset is smaller).
dry_run_rows = min(DRY_RUN_COUNT, len(ds))
ds_test = ds.select(range(dry_run_rows))

# Same mapping call as the full pass, applied to the small subset only.
dry_run_map_options = dict(
    batched=True,
    batch_size=BATCH_SIZE,
    desc="Dry-run: generating CoT on small subset",
)
ds_test_with_cot = ds_test.map(generate_cot_batch, **dry_run_map_options)

# Print up to three generated rows together with their validation result.
preview_rows = min(3, len(ds_test_with_cot))
for i in range(preview_rows):
    row = ds_test_with_cot[i]
    print("=" * 80)
    print(f"Index {i}")
    print("Problem:", row["problem"])
    print("Original solution:", row["solution"])
    generated_cot = row[NEW_COLUMN_NAME]
    print("Generated CoT:\n", generated_cot)
    print("Validation:", basic_validate_cot(generated_cot))


In [None]:
# Cell 8 – full dataset generation (expensive, run only after you are happy with dry run)

# Applies generate_cot_batch to every row of `ds`: one Responses API call per
# example, followed by a RATE_LIMIT_SLEEP pause.
# Rows whose generation failed carry an empty string in NEW_COLUMN_NAME.
ds_with_cot = ds.map(
    generate_cot_batch,
    batched=True,
    batch_size=BATCH_SIZE,
    desc="Full pass: generating GPT-5 CoT for all examples",
)

print(ds_with_cot)
# Show at most the first 500 characters of the first generated CoT.
print("Preview of first CoT:\n", ds_with_cot[0][NEW_COLUMN_NAME][:500])


In [None]:
# Cell 9 – push augmented dataset to Hugging Face

# Guard against running this cell without the token configured in Cell 1.
# The token is read only from the Colab secret HF_TOKEN, so the message points
# there rather than at environment variables or a CLI login.
if not HF_TOKEN:
    raise RuntimeError(
        "HF token not set. Add an HF_TOKEN secret in Colab and re-run Cell 1 "
        "before pushing."
    )

# generate_cot_batch stores "" for rows whose API call failed; report how many
# such rows are about to be published (warning only, the push still proceeds).
empty_cot_count = sum(1 for cot in ds_with_cot[NEW_COLUMN_NAME] if not cot)
if empty_cot_count:
    print(
        f"[WARN] {empty_cot_count} of {len(ds_with_cot)} rows have an empty "
        f"'{NEW_COLUMN_NAME}' value."
    )

print("Pushing dataset with CoT column to:", OUTPUT_REPO_ID)

ds_with_cot.push_to_hub(OUTPUT_REPO_ID, token=HF_TOKEN)

print("Done. New dataset pushed with column:", NEW_COLUMN_NAME)
