<a href="https://colab.research.google.com/github/Mikethebot44/aimo-math/blob/main/numina-math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
# If needed, install the datasets library:
# !pip install -q datasets

from datasets import load_dataset
import re


In [39]:
# Identifier of the source dataset handed to `load_dataset`.
dataset_name = "AI-MO/NuminaMath-CoT"

# Load only the "train" split. The bare `ds` on the last line makes the
# notebook display the Dataset summary (feature names and row count).
ds = load_dataset(dataset_name, split="train")
ds


Dataset({
    features: ['source', 'problem', 'solution', 'messages'],
    num_rows: 859494
})

In [40]:
# Match $$ ... $$ including newlines
double_dollar_pattern = re.compile(r"\$\$(.*?)\$\$", re.DOTALL)

# Match lines that consist of exactly ONE $ ... $ span (possibly padded with
# spaces / tabs). The body may contain escaped characters such as `\$`, but
# never a bare `$`; otherwise a line like "$a$ and $b$" (two inline spans)
# would be swallowed as a single display formula.
standalone_dollar_pattern = re.compile(
    r"^[ \t]*\$((?:\\.|[^$\\\n])+)\$[ \t]*$", re.MULTILINE
)

# Matches an existing \[ ... \] block. Currently unused by
# `normalize_to_bracket_display`; kept for optional spacing canonicalization.
display_bracket_pattern = re.compile(r"\\\[(.*?)\\\]", re.DOTALL)


def normalize_to_bracket_display(text: str) -> str:
    r"""
    Convert display math environments to the \[ ... \] style GPT-OSS-20B prefers.

    Rewrites, in this order:
    - $$ ... $$ (may span several lines)             -> \[ ... \]
    - a line containing nothing but one $ ... $ span  -> \[ ... \]

    Inline $...$ inside sentences is left unchanged, including lines that
    merely start and end with inline math (e.g. "$a$ and $b$").
    Existing \[ ... \] is preserved as-is.

    Args:
        text: LaTeX-flavoured text, or None.

    Returns:
        The rewritten text; None is passed through unchanged.
    """
    if text is None:
        return text

    # A callable replacement is used (rather than a template string) so that
    # backslashes in the LaTeX body are never interpreted as regex escapes.
    def _as_bracket_display(m):
        return r"\[" + m.group(1).strip() + r"\]"

    # 1) $$ ... $$  ->  \[ ... \]
    # Must run first so that `$$x$$` is not seen as single-dollar math.
    text = double_dollar_pattern.sub(_as_bracket_display, text)

    # 2) Lines that are just $ ... $ -> \[ ... \]
    text = standalone_dollar_pattern.sub(_as_bracket_display, text)

    return text


In [41]:
def extract_last_boxed_inner(text: str):
    r"""
    Return the contents of the last ``\boxed{...}`` occurrence in ``text``.

    The content between the outermost braces is returned with surrounding
    whitespace stripped; nested braces (e.g. ``\boxed{\frac{1}{2}}``) are
    handled by tracking brace depth.

    Args:
        text: String to search, or None.

    Returns:
        The inner content as a string, or None when ``text`` is None, when
        there is no ``\boxed{``, when the braces after the last ``\boxed{``
        never balance, or when the content is empty/whitespace only.
    """
    if text is None:
        return None

    marker = r"\boxed{"
    start = text.rfind(marker)
    if start == -1:
        return None

    content_start = start + len(marker)
    depth = 1  # the opening brace of \boxed{ is already consumed

    for pos in range(content_start, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                # `pos` is the brace closing \boxed{...}; exclude it.
                inner = text[content_start:pos].strip()
                return inner if inner else None

    # Ran off the end of the text: unbalanced braces, bail.
    return None


In [42]:
def process_example(example):
    """
    Row-level transform: normalize the solution and derive an `answer` field.

    The row is updated in place and also returned:
    - `solution` becomes the output of `normalize_to_bracket_display`.
    - `answer` becomes the last boxed expression wrapped in `$...$`, or None
      when no boxed expression can be extracted.

    A row whose `solution` is missing or None gets both fields set to None.
    """
    raw_solution = example.get("solution", None)

    # Nothing to normalize or extract from.
    if raw_solution is None:
        example["solution"] = None
        example["answer"] = None
        return example

    cleaned_solution = normalize_to_bracket_display(raw_solution)

    # Prefer the normalized text; fall back to the raw text if that yields nothing.
    boxed = extract_last_boxed_inner(cleaned_solution)
    if boxed is None:
        boxed = extract_last_boxed_inner(raw_solution)

    example["solution"] = cleaned_solution
    example["answer"] = None if boxed is None else f"${boxed}$"
    return example


In [43]:
# Run `process_example` over every row of `ds`; per that function, each row's
# `solution` is replaced by its normalized form and an `answer` field is added.
ds_processed = ds.map(
    process_example,
    desc="Normalizing LaTeX to \\[...\\] and extracting \\boxed answers",
)

# Drop the messages column if it exists
# NOTE(review): `process_example` never touches `messages`, so it presumably
# still carries the un-normalized text — confirm that is why it is removed.
if "messages" in ds_processed.column_names:
    ds_processed = ds_processed.remove_columns(["messages"])

# Bare expression: display the processed Dataset summary.
ds_processed


Normalizing LaTeX to \[...\] and extracting \boxed answers:   0%|          | 0/859494 [00:00<?, ? examples/s]

Dataset({
    features: ['source', 'problem', 'solution', 'answer'],
    num_rows: 859494
})

In [44]:
# Sanity check: display the columns of the processed dataset.
ds_processed.column_names
# Expected: ['source', 'problem', 'solution', 'answer']


['source', 'problem', 'solution', 'answer']

In [45]:
# Find an example containing the "arithmetic sequence" wording
def find_example_containing(substring, ds, column="problem", max_hits=3):
    """
    Return the row indices whose `column` text contains `substring`.

    Rows are scanned in order and the scan stops as soon as `max_hits`
    matches have been collected.

    Args:
        substring: Text to look for (plain substring match, case-sensitive).
        ds: Object indexable by column name, where `ds[column]` is an
            iterable of cell values.
        column: Name of the column to search.
        max_hits: Maximum number of indices to return. Zero or a negative
            value yields an empty list; None means no limit.

    Returns:
        List of matching row indices in ascending order.
    """
    hits = []
    # Without this guard the limit is only checked after the first append,
    # so max_hits=0 would still return one hit.
    if max_hits is not None and max_hits <= 0:
        return hits

    for i, text in enumerate(ds[column]):
        # Skip missing cells instead of raising TypeError on `in None`.
        if text is None or substring not in text:
            continue
        hits.append(i)
        if max_hits is not None and len(hits) >= max_hits:
            break
    return hits

# Row indices of the first matches (at most 3, the default `max_hits`) whose
# problem text mentions "arithmetic sequence".
indices = find_example_containing("arithmetic sequence", ds_processed, column="problem")
indices


[0, 103, 182]

In [47]:
# Look at the second matching example (indices[1], not the first hit)
idx = indices[1]
# Passing the value as a separate print argument inserts a space after "\n",
# which is why each printed field starts with a leading space.
print("PROBLEM:\n", ds_processed[idx]["problem"])
print("\nSOLUTION (normalized):\n", ds_processed[idx]["solution"])
print("\nANSWER column:\n", ds_processed[idx]["answer"])


PROBLEM:
 In the arithmetic sequence $\{a_n\}$, $a_2 = -5$ and $d = 3$. Find $a_1$.

SOLUTION (normalized):
 Since in the arithmetic sequence $\{a_n\}$, $a_2 = -5$ and $d = 3$,  
we have $a_1 + d = a_2$. Substituting the given values, we get $a_1 + 3 = -5$,  
solving this equation, we find $a_1 = -8$.  
Therefore, the answer is $\boxed{-8}$.  

**Analysis:** The solution is obtained by using the general formula of an arithmetic sequence and the given data.

ANSWER column:
 $-8$


In [48]:
from huggingface_hub import HfApi, login
import getpass

# ---- Step 1: Login to Hugging Face ----
# The token is read interactively so it never appears in the notebook source
# or its saved outputs. Do not paste a token into this cell.
hf_token = getpass.getpass("Enter your Hugging Face token: ").strip()
if not hf_token:
    raise ValueError("No Hugging Face token provided.")

# `login` is the documented entry point for storing the token locally; it
# replaces the legacy `HfFolder.save_token` helper.
login(token=hf_token)
api = HfApi()

# ---- Step 2: Define repo details ----
# The target namespace is the account that owns the token.
username = api.whoami(token=hf_token)["name"]
repo_name = "NuminaMath-CoT-Processed"   # change if you want
repo_id = f"{username}/{repo_name}"

# ---- Step 3: Create the dataset repo (if not exists) ----
# exist_ok=True makes this cell safe to re-run.
api.create_repo(repo_id, repo_type="dataset", exist_ok=True, token=hf_token)

# ---- Step 4: Push the dataset to HuggingFace ----
# `ds_processed` must be a Dataset object (from datasets library)
ds_processed.push_to_hub(repo_id, token=hf_token)

print(f"Uploaded dataset to https://huggingface.co/{repo_id}")


Enter your Hugging Face token: ··········


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/287 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|          | 1.24MB /  212MB            

Creating parquet from Arrow format:   0%|          | 0/287 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|          | 1.10MB /  212MB            

Creating parquet from Arrow format:   0%|          | 0/287 [00:00<?, ?ba/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

                              :   1%|1         | 2.26MB /  212MB            

Uploaded dataset to https://huggingface.co/juppy44/NuminaMath-CoT-Processed
