# Imports

In [1]:
import json
import re
import torch
from datasets import load_dataset
from vllm import LLM, SamplingParams
from sympy import sympify, simplify 
from typing import Callable, List, Dict
from pathlib import Path
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn

INFO 12-10 18:33:14 __init__.py:190] Automatically detected platform cuda.


In [2]:
# Device setup (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Output directory for serialized results
OUTPUT_DIR = Path("zero_shot_results")
OUTPUT_DIR.mkdir(exist_ok=True)

# Setup

## Prompt

In [4]:
R1_ZERO_PROMPT = """
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.
User: {question}
Assistant: <think>
"""

## Loading the data

In [5]:
# Cell 2: Load the Dataset
# Load GSM8K test split (1339 examples)
dataset = load_dataset("gsm8k", "main", split="test")

In [6]:
# Prepare list of prompts and ground truths
prompts = []
ground_truths = []
for example in dataset:
    question = example["question"].strip()
    prompt = R1_ZERO_PROMPT.format(question=question)
    prompts.append(prompt)
    # GSM8K ground truth is in "answer" field, e.g., "Natalia sold clips... ### 72"
    # Extract the final answer (after "###")
    gt = example["answer"].split("###")[-1].strip()
    ground_truths.append(gt)

print(f"Loaded {len(prompts)} examples from GSM8K test set.")

Loaded 1319 examples from GSM8K test set.


## Loading the model

In [7]:
# Cell 3: Load the Model with vLLM
# Load Qwen2.5-Math-1.5B with vLLM (downloads automatically if not cached)
model_path = "Qwen/Qwen2.5-Math-1.5B"
vllm_model = LLM(
    model=model_path,
    dtype="float16",  # As recommended in PDF for memory efficiency
    gpu_memory_utilization=0.9,  # Adjust if OOM errors
    tensor_parallel_size=1,  # Single GPU
)

# Sampling params (from PDF: temp=1.0, top_p=1.0, max=1024, stop on </answer>)
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    max_tokens=1024,
    stop=["</answer>"],
    include_stop_str_in_output=True  # To include the stop token in output if generated
)

print("Model loaded.")

INFO 12-10 18:33:24 config.py:542] This model supports multiple tasks: {'classify', 'score', 'embed', 'generate', 'reward'}. Defaulting to 'generate'.
INFO 12-10 18:33:24 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='Qwen/Qwen2.5-Math-1.5B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Math-1.5B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-Math-1.5B, num_scheduler_ste

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-10 18:33:28 model_runner.py:1115] Loading model weights took 2.8797 GB
INFO 12-10 18:33:30 worker.py:267] Memory profiling takes 0.94 seconds
INFO 12-10 18:33:30 worker.py:267] the current vLLM instance can use total_gpu_memory (14.56GiB) x gpu_memory_utilization (0.90) = 13.11GiB
INFO 12-10 18:33:30 worker.py:267] model weights take 2.88GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.40GiB; the rest of the memory reserved for KV Cache is 8.78GiB.
INFO 12-10 18:33:30 executor_base.py:110] # CUDA blocks: 20557, # CPU blocks: 9362
INFO 12-10 18:33:30 executor_base.py:115] Maximum concurrency for 4096 tokens per request: 80.30x
INFO 12-10 18:33:33 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_uti

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:18<00:00,  1.90it/s]

INFO 12-10 18:33:52 model_runner.py:1562] Graph capturing finished in 18 secs, took 0.17 GiB
INFO 12-10 18:33:52 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 23.52 seconds





Model loaded.


# Zero-shot benchmark

In [8]:
# Cell 5: Evaluation Function
def evaluate_vllm(
    vllm_model: LLM,
    reward_fn: Callable[[str, str], Dict[str, float]],
    prompts: List[str],
    ground_truths: List[str],
    eval_sampling_params: SamplingParams,
    output_path: str = "results.json"
) -> List[Dict]:
    """
    Evaluate model on prompts, compute metrics, serialize to disk.
    Returns list of dicts with example, generation, scores.
    """
    outputs = vllm_model.generate(prompts, eval_sampling_params)
    
    results = []
    for idx, output in enumerate(outputs):
        generated_text = output.outputs[0].text.strip()  # vLLM generated continuation
        full_response = output.prompt + generated_text  # Full for analysis (prompt + generation)
        
        # Pass full_response or generated_text to reward_fn – test which one works with the official fn
        # From PDF, parsing the "model’s output" likely means the generation after <think>
        # Try generated_text first; if not, switch to full_response
        rewards = reward_fn(generated_text, ground_truths[idx])  # <-- Updated to use official fn; adjust arg if needed
        
        result = {
            "prompt": prompts[idx],
            "generation": generated_text,
            "full_response": full_response,
            "ground_truth": ground_truths[idx],
            "format_reward": rewards["format_reward"],
            "answer_reward": rewards["answer_reward"]
        }
        results.append(result)
    
    # Serialize to disk
    with open(OUTPUT_DIR / output_path, "w") as f:
        json.dump(results, f, indent=4)
    
    print(f"Results saved to {OUTPUT_DIR / output_path}")
    return results

In [9]:
# The ground truths contain a '# ' before every number which causes the r1_zero_reward_fn function to fail, we remove it here. 
ground_truths_v2 = [x[2:] for x in ground_truths]

In [10]:
# Cell 6: Run Evaluation
# Run evaluation on full dataset (or slice for testing: prompts[:100])
results = evaluate_vllm(
    vllm_model=vllm_model,
    reward_fn=r1_zero_reward_fn,  # <-- Now using the official function
    prompts=prompts,
    ground_truths=ground_truths_v2,
    eval_sampling_params=sampling_params,
    output_path="gsm8k_zero_shot.json"
)

Processed prompts: 100%|██████████| 1319/1319 [02:43<00:00,  8.09it/s, est. speed input: 1253.69 toks/s, output: 2232.35 toks/s]


Results saved to zero_shot_results/gsm8k_zero_shot.json


In [12]:
# Categorize generations
category_counts = {
    "correct_both": 0,  # format=1, answer=1
    "format_ok_answer_wrong": 0,  # format=1, answer=0
    "both_wrong": 0  # format=0, answer=0
}

examples = {
    "correct_both": [],
    "format_ok_answer_wrong": [],
    "both_wrong": []
}

for res in results:
    fr = res["format_reward"]
    ar = res["answer_reward"]
    if fr == 1 and ar == 1:
        category_counts["correct_both"] += 1
        if len(examples["correct_both"]) < 10:
            examples["correct_both"].append(res)
    elif fr == 1 and ar == 0:
        category_counts["format_ok_answer_wrong"] += 1
        if len(examples["format_ok_answer_wrong"]) < 10:
            examples["format_ok_answer_wrong"].append(res)
    else:
        category_counts["both_wrong"] += 1
        if len(examples["both_wrong"]) < 10:
            examples["both_wrong"].append(res)

print("Category Counts:")
print(category_counts)

# Overall metrics
total = len(results)
accuracy = (category_counts["correct_both"] / total) * 100 if total > 0 else 0
format_rate = ((category_counts["correct_both"] + category_counts["format_ok_answer_wrong"]) / total) * 100
print(f"\nOverall Accuracy (correct both): {accuracy:.2f}%")
print(f"Format Success Rate: {format_rate:.2f}%")

# For writeup: Inspect examples (observe at least 10 per category if available)
print("\nExamples where both correct:")
for ex in examples["correct_both"]:
    print(f"Prompt: {ex['prompt'][:100]}...")
    print(f"Generation: {ex['generation']}")
    print(f"Ground Truth: {ex['ground_truth']}")
    print("---")

print("\nExamples where format OK but answer wrong:")
for ex in examples["format_ok_answer_wrong"]:
    print(f"Prompt: {ex['prompt'][:100]}...")
    print(f"Generation: {ex['generation']}")
    print(f"Ground Truth: {ex['ground_truth']}")
    print("---")

print("\nExamples where both wrong (format failed):")
for ex in examples["both_wrong"]:
    print(f"Prompt: {ex['prompt'][:100]}...")
    print(f"Generation: {ex['generation']}")
    print(f"Ground Truth: {ex['ground_truth']}")
    print("---")

Category Counts:
{'correct_both': 6, 'format_ok_answer_wrong': 22, 'both_wrong': 1291}

Overall Accuracy (correct both): 0.45%
Format Success Rate: 2.12%

Examples where both correct:
Prompt: 
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. T...
Generation: The bagel cost $4.
  The soup cost 25% more than the bagel, so the soup cost $4 x 1.25 = $5.
  The cake cost half of the price of the bagel, so the cake cost $4 / 2 = $2.
  The total cost of the dinner is $4 (bagel) + $5 (soup) + $2 (cake) = $11.
</think> <answer> $11 </answer>
Ground Truth: 11
---
Prompt: 
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. T...
Generation: The starting value is 20. Let's call the starting value \( S \). So, \( S = 20 \).

The calculation is:
\[ S + \frac{S}{2} \div 5 \]
First, calculate half of the starting value:
\[ \frac{S}{2} = \frac{20}{2} = 10 \]
Now, add this to the starting value:
\[ S + 10 = 20 

# Supervised fine-tuning

In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import json
from pathlib import Path

MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B"
OUTPUT_DIR = Path("sft_checkpoints")
OUTPUT_DIR.mkdir(exist_ok=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [2]:
# Load the high-quality math SFT dataset (220k verified traces from DeepSeek-R1)
sft_dataset = load_dataset("open-r1/OpenR1-Math-220k", split="train")

print(f"Loaded {len(sft_dataset)} high-quality math SFT examples")
print("Features:", sft_dataset.features)

# Inspect first example
print("\nFirst example:")
print(json.dumps(sft_dataset[0], indent=2)[:1000] + "..." if len(str(sft_dataset[0])) > 1000 else json.dumps(sft_dataset[0], indent=2))

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Loaded 93733 high-quality math SFT examples
Features: {'problem': Value('string'), 'solution': Value('string'), 'answer': Value('string'), 'problem_type': Value('string'), 'question_type': Value('string'), 'source': Value('string'), 'uuid': Value('string'), 'is_reasoning_complete': List(Value('bool')), 'generations': List(Value('string')), 'correctness_math_verify': List(Value('bool')), 'correctness_llama': List(Value('bool')), 'finish_reasons': List(Value('string')), 'correctness_count': Value('int64'), 'messages': List({'content': Value('string'), 'role': Value('string')})}

First example:
{
  "problem": "## Task B-1.3.\n\nA ship traveling along a river has covered $24 \\mathrm{~km}$ upstream and $28 \\mathrm{~km}$ downstream. For this journey, it took half an hour less than for traveling $30 \\mathrm{~km}$ upstream and $21 \\mathrm{~km}$ downstream, or half an hour more than for traveling $15 \\mathrm{~km}$ upstream and $42 \\mathrm{~km}$ downstream, assuming that both the ship and 

In [3]:
from datasets import load_dataset

# You already loaded it — reuse the variable
# sft_dataset = load_dataset("open-r1/OpenR1-Math-220k", split="train")

R1_ZERO_SYSTEM = """A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>."""

def format_example(ex):
    problem = ex["problem"].strip()
    solution = ex["solution"].strip()
    answer = ex["answer"].strip()
    
    # Clean up solution: remove final boxed answer if present (avoid duplication)
    import re
    cleaned_solution = re.sub(r"\\boxed\{.*?\}", "", solution, flags=re.DOTALL).strip()
    
    full_text = f"{R1_ZERO_SYSTEM}\n\nUser: {problem}\nAssistant: <think>{cleaned_solution}</think><answer>{answer}</answer>"
    return {"text": full_text}

# Format 8000 examples (fast & sufficient)
train_data = []
for ex in sft_dataset.select(range(10000)):  # 10k → ~9k valid
    formatted = format_example(ex)
    train_data.append(formatted)
    if len(train_data) >= 8000:
        break

print(f"Successfully formatted {len(train_data)} examples!")

# Show one — should be PERFECT
print("\n--- Sample Formatted Example ---")
print(train_data[0]["text"][:1500] + "\n...\n" + train_data[0]["text"][-300:])

Successfully formatted 8000 examples!

--- Sample Formatted Example ---
A conversation between User and Assistant. The User asks a question, and the Assistant solves it. The Assistant first thinks about the reasoning process in the mind and then provides the User with the answer. The reasoning process is enclosed within <think> </think> and answer is enclosed within <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.

User: ## Task B-1.3.

A ship traveling along a river has covered $24 \mathrm{~km}$ upstream and $28 \mathrm{~km}$ downstream. For this journey, it took half an hour less than for traveling $30 \mathrm{~km}$ upstream and $21 \mathrm{~km}$ downstream, or half an hour more than for traveling $15 \mathrm{~km}$ upstream and $42 \mathrm{~km}$ downstream, assuming that both the ship and the river move uniformly.

Determine the speed of the ship in still water and the speed of the river.
Assistant: <think>## Solutio

In [4]:
# SFT training run

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import torch
from pathlib import Path
from datasets import Dataset   # ← This is the missing import!

MODEL_NAME = "Qwen/Qwen2.5-Math-1.5B"
OUTPUT_DIR = Path("sft_final_r1_style")
OUTPUT_DIR.mkdir(exist_ok=True)

# Tokenizer (already loaded, but re-run if needed)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # ← This handles the 4-bit loading
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Model (REMOVED load_in_4bit=True from here — fixed!)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # load_in_4bit=True,  # ← DELETE THIS LINE — it's redundant!
    quantization_config=bnb_config,  # ← This is enough
    trust_remote_code=True
)

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # ~40M trainable params

#train_dataset = Dataset.from_list(train_data)   # ← CONVERT LIST → HF DATASET
train_dataset = Dataset.from_list(train_data[:64])

print(f"Converted to Dataset: {len(train_dataset)} examples")
print("Columns:", train_dataset.column_names)
print("First example keys:", train_dataset[0].keys())

# 2. Let SFTTrainer tokenize automatically (this is the easiest & safest way)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=1,           # ← Fits in 15GB
        gradient_accumulation_steps=32,          # ← Effective batch = 32
        learning_rate=2e-4,
        num_train_epochs=1,
        logging_steps=20,
        save_strategy="epoch",
        bf16=True,
        report_to="none",
        gradient_checkpointing=True,             # ← Huge memory saver
        gradient_checkpointing_kwargs={"use_reentrant": False},  # ← Required for Qwen
        optim="paged_adamw_8bit",                # ← Extra memory saving
        fp16=False,
    ),
    train_dataset=train_dataset,
    dataset_text_field="text",        # ← SFTTrainer will tokenize this field automatically
    max_seq_length=2048,
    packing=False,                    # ← Keep False (packing + gradient_checkpointing can conflict)
)

print("Starting FINAL SFT — this WILL work and finish in ~60–90 min (or ~15 min if you use 2000 examples)")
trainer.train()
trainer.save_model(OUTPUT_DIR)
print("SFT DONE! Your LoRA adapter is saved in:", OUTPUT_DIR)

In [None]:
from vllm import LLM, SamplingParams

sft_model = LLM(
    model="Qwen/Qwen2.5-Math-1.5B",
    dtype="bfloat16",
    load_format="bitsandbytes",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,
)

sft_model.load_lora_weights("sft_final_r1_style")  # ← your trained adapter

# Test on 10 random GSM8K problems
test_prompts = prompts[::130]  # every 130th → 10 diverse examples
sampling_params = SamplingParams(
    temperature=0.0,      # greedy for clean format
    max_tokens=1024,
    stop=["</answer>"]
)

print("Generating from your SFT model...\n" + "="*60)
outputs = sft_model.generate(test_prompts, sampling_params)

for i, out in enumerate(outputs):
    gen = out.outputs[0].text
    question = out.prompt.split("User:")[-1].split("Assistant:")[0].strip()[:120]
    print(f"\nExample {i+1} | Question: {question}...")
    print(gen.strip())
    print("Format correct?", "<think>" in gen and "</think>" in gen and "<answer>" in gen and "</answer>" in gen)
    print("-" * 60)

# Expert iteration

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from cs336_alignment.drgrpo_grader import r1_zero_reward_fn
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import torch
from pathlib import Path

# PATHS
SFT_CHECKPOINT = "sft_final_r1_style"   # Your trained model
EXPERT_DIR = Path("expert_iteration")
EXPERT_DIR.mkdir(exist_ok=True)

print("Loading your SFT model (merged checkpoint)...")
tokenizer = AutoTokenizer.from_pretrained(SFT_CHECKPOINT)
tokenizer.pad_token = tokenizer.eos_token

# Load your trained model directly (no PEFT, no bitsandbytes, no triton)
model = AutoModelForCausalLM.from_pretrained(
    SFT_CHECKPOINT,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Generation pipeline
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16,
    max_new_tokens=1024,
    temperature=0.8,
    top_p=0.95,
    do_sample=True,
    num_return_sequences=4,
)

# Use 100 GSM8K problems
questions = [ex["question"] for ex in dataset.select(range(100))]
prompts = [R1_ZERO_PROMPT.format(question=q) for q in questions]

print("Generating expert traces...")
expert_traces = []

for i, prompt in enumerate(prompts):
    print(f"Problem {i+1}/100")
    outputs = generator(prompt, num_return_sequences=4)
    
    gt = ground_truths[i]
    for out in outputs:
        gen = out['generated_text'][len(prompt):]
        full = prompt + gen
        
        rewards = r1_zero_reward_fn(full, gt)
        if rewards["format_reward"] == 1.0 and rewards["answer_reward"] == 1.0:
            expert_traces.append({"text": full})
            print(f"  Found expert trace! Total: {len(expert_traces)}")
            break

print(f"\nFOUND {len(expert_traces)} EXPERT TRACES!")

if expert_traces:
    expert_dataset = Dataset.from_list(expert_traces)
    
    # Train new model on expert traces
    new_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-Math-1.5B",
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    
    trainer = SFTTrainer(
        model=new_model,
        tokenizer=tokenizer,
        args=TrainingArguments(
            output_dir=EXPERT_DIR / "iter1",
            per_device_train_batch_size=1,
            gradient_accumulation_steps=16,
            learning_rate=2e-4,
            num_train_epochs=1,
            fp16=True,
            gradient_checkpointing=True,
            report_to="none",
        ),
        train_dataset=expert_dataset,
        dataset_text_field="text",
        max_seq_length=2048,
    )
    trainer.train()
    trainer.save_model(EXPERT_DIR / "iter1")
    print("EXPERT ITERATION COMPLETE!")