# About

This notebook attempts to reproduce the OpenAI paper "Training Verifiers to Solve Math Word Problems" (https://arxiv.org/abs/2110.14168).

So far, only the generator training part is implemented. Verifier training is the next step.

# Setup

In [None]:
# Mount Google Drive at /content/drive so the notebook can write to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Autosave the notebook every 60 seconds.
%autosave 60

# Install the Hugging Face libraries used below and upgrade bitsandbytes.
!pip install datasets transformers
!pip install -U bitsandbytes

In [None]:
import math
import re
import time

import torch
import torch.nn.functional as F
import wandb
from datasets import Dataset, concatenate_datasets, load_from_disk
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftConfig, PeftModel
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, \
    DataCollatorWithPadding, DataCollatorForLanguageModeling, default_data_collator, DataCollatorForSeq2Seq, \
    EarlyStoppingCallback, AutoModelForSequenceClassification, TrainerCallback
from transformers import trainer_utils

import numpy as np

import pandas as pd

from collections import Counter

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

SEED = 123  # fixed seed for reproducible results

In [None]:
# The tokens below are placeholders: substitute real credentials at run time
# and avoid saving them in the notebook.
hf_token = "<hf_token>"

from huggingface_hub import login

# Authenticate against the Hugging Face Hub
# (presumably needed to download the Llama checkpoint — TODO confirm).
login(token=hf_token)

# Authenticate with Weights & Biases, which the Trainer below reports to.
wandb.login(key="<wandb_token>")

In [None]:
# Load the "Justelioo/gsm8k-cleaned" dataset from the Hub and display its splits.
dataset = load_dataset("Justelioo/gsm8k-cleaned")
dataset

In [None]:
# Show the first training example to see the available fields.
dataset["train"][0]

In [None]:
def extract_final_answer(model_output):
    """Parse the number that follows the last '####' marker in a solution.

    The number may carry a leading '$', thousands separators (',') and a
    minus sign placed either before or after the '$'.

    Args:
        model_output: Solution text (gold or generated) to scan.

    Returns:
        The answer as a float, or None when there is no '####' marker
        followed by a number, or the captured text is not a valid number
        (for example '1.2.3').
    """
    # Group 1: optional minus before the currency sign ("-$5").
    # Group 2: the digits, optionally preceded by a minus ("$-5" or "-5").
    matches = re.findall(r"####\s*(-?)\$?(-?[0-9,.]+)", model_output)
    if not matches:
        return None

    # Only the last marker counts: it holds the final answer.
    leading_sign, number = matches[-1]
    is_negative = bool(leading_sign) or number.startswith("-")
    digits = number.lstrip("-").replace(",", "")
    try:
        value = float(digits)
    except ValueError:
        return None
    return -value if is_negative else value

# Load model

In [None]:
# Hub id of the base model to fine-tune.
MODEL_CHECKPOINT = "meta-llama/Llama-3.2-3B"
# Directory where the fine-tuned generator is saved at the end of the notebook
# (a relative path; presumably resolved against the mounted Drive — TODO confirm).
MODEL_SAVE_PATH = "drive/MyDrive/gsm8k-generators/llama-3.2-3B"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left, so that in the batched generation below every prompt ends
# where the new tokens begin.
tokenizer.padding_side = "left"

In [None]:
# Load the base model on the GPU in bfloat16.
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT, device_map="cuda", dtype=torch.bfloat16)
# The attribute the library reads is `pad_token_id` (an integer id), not
# `pad_token` (a string), so set the id to keep the model in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
# `generate()` takes its padding id from the generation config, so set it there too.
model.generation_config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Sanity check: print the dtype of the first parameter and the model's device.
print(next(model.parameters()).dtype)
print(model.device)

In [None]:
# Instruction placed in front of every question, for training and evaluation alike.
SYSTEM_PROMPT = (
    "You will be given a math question.\n"
    "You have to solve the question and provide the solution and the result.\n"
    "The result MUST be given after the symbols '####'."
)


def q_prompt(question):
  """Return the prompt for `question`, ending right where the answer should start."""
  return "{}\nQuestion:\n{}\nAnswer:\n".format(SYSTEM_PROMPT, question)


def qa_prompt(question, answer):
  """Return the full training text: the question prompt, then `answer` and a newline."""
  return q_prompt(question) + "{}\n".format(answer)

In [None]:
# Preview the prompt layout with dummy content.
print(qa_prompt("this is some question", "this is the answer"))

# Prepare training data

In [None]:
def prepare_training_text(example):
  """Build the training text for a batch of examples.

  `example` holds parallel "question" and "answer" sequences (the function is
  mapped with batched=True). Returns a dict whose "text" list contains one
  formatted prompt-plus-answer string per pair.
  """
  question_answer_pairs = zip(example["question"], example["answer"])
  return {"text": [qa_prompt(question, answer) for question, answer in question_answer_pairs]}

In [None]:
# Add the "text" column to every split, then display the dataset.
dataset = dataset.map(prepare_training_text, batched=True)
dataset

In [None]:
# Print the formatted text of the first training example.
print(dataset["train"]["text"][0])

In [None]:
# Keep a fixed random subset of 128 test problems (32 * 4) to bound evaluation cost.
# Use the shared SEED constant instead of repeating the literal, so the subset
# follows the notebook-wide seed.
dataset["test"] = dataset["test"].shuffle(seed=SEED).select(range(32 * 4))
dataset

In [None]:
def tokenize(example):
  """Tokenize one example and mask the prompt in the labels.

  The full text (prompt + answer + EOS token) becomes `input_ids`. `labels`
  are a copy in which the first `n` positions are set to -100, where `n` is
  the token count of the question-only prompt tokenized separately.

  NOTE(review): this assumes the separately tokenized prompt matches the
  prefix of the full tokenization token for token — confirm for this tokenizer.
  """
  full_encoding = tokenizer(example["text"] + tokenizer.eos_token, add_special_tokens=True)
  prompt_encoding = tokenizer(q_prompt(example["question"]), add_special_tokens=True)

  n_prompt_tokens = len(prompt_encoding["input_ids"])
  answer_token_ids = full_encoding["input_ids"][n_prompt_tokens:]

  # -100 marks the prompt positions as masked; only answer tokens keep their ids.
  full_encoding["labels"] = [-100] * n_prompt_tokens + answer_token_ids
  return full_encoding

In [None]:
# Tokenize one example at a time (`tokenize` expects a single example, not a batch).
dataset = dataset.map(tokenize, batched=False)
dataset

In [None]:
# Sanity check: print token id, label and decoded token side by side for one
# training example, to verify that the prompt tokens carry the -100 label.
idx = 0
for input_id, label in zip(dataset["train"]["input_ids"][idx], dataset["train"]["labels"][idx]):
  print(input_id, label, tokenizer.decode(input_id))

# Load model for QLoRA

In [None]:
# LoRA adapter configuration.
peft_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    lora_dropout=0.05,  # dropout applied inside the adapter layers
    bias="none",  # bias terms are not trained
    # Modules that receive adapters, selected by name.
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    task_type="CAUSAL_LM",
)

In [None]:
# Wrap the base model with the LoRA adapters and report how many parameters are trainable.
# NOTE(review): the base model is loaded in bf16 without a BitsAndBytesConfig,
# so this is plain LoRA rather than quantized QLoRA.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Functions for evaluation metrics

pass@$1$ and pass@$n$ are reported: they measure whether a problem was solved at least once within $1$ and $n$ attempts, respectively.

pass@$1$ uses greedy decoding (no sampling, i.e. temperature 0).

pass@$n$ uses sampling with temperature 0.7.

pass@$n$ is run with $n=16$ (the original paper uses $n=100$; a lower $n$ is chosen here due to computational restrictions).

pass@$n$ is evaluated twice and both runs are reported, to show the run-to-run variation caused by sampling at a non-zero temperature.

The evaluation set consists of only 128 problems, again due to computational restrictions.

Evaluation is done in batches for efficiency.

In [None]:
@torch.no_grad()
def generate_without_sampling(model, tokenized):
  """Generate one continuation per prompt without sampling.

  `tokenized` is unpacked as keyword arguments to `model.generate`. Up to 400
  new tokens are produced with gradient tracking disabled, and whatever
  `model.generate` returns is returned unchanged.
  """
  generation_options = {
      "max_new_tokens": 400,
      "do_sample": False,
      "use_cache": True,
  }
  return model.generate(**tokenized, **generation_options)

In [None]:
@torch.no_grad()
def generate_with_sampling(model, tokenized, temperature, n_solutions):
  """Sample `n_solutions` continuations per prompt at the given temperature.

  `tokenized` is unpacked as keyword arguments to `model.generate`. Up to 400
  new tokens are produced per sequence with gradient tracking disabled, and
  whatever `model.generate` returns is returned unchanged.
  """
  sampling_options = {
      "max_new_tokens": 400,
      "do_sample": True,
      "temperature": temperature,
      "use_cache": True,
      "num_return_sequences": n_solutions,
  }
  return model.generate(**tokenized, **sampling_options)

In [None]:
def eval_pass_at_1(model, questions, true_answers, problems_per_batch=32):
  """Count the problems solved with a single non-sampled generation each.

  Uses the module-level `tokenizer` and moves the inputs to "cuda".

  Args:
    model: Model passed to `generate_without_sampling`.
    questions: Sequence of question strings.
    true_answers: Gold solution texts, aligned with `questions`.
    problems_per_batch: Number of prompts generated per batch.

  Returns:
    The number of problems whose generated final answer matches the gold
    answer; 0 when `questions` is empty.
  """
  generated_answers = []
  for start in tqdm(range(0, len(questions), problems_per_batch)):
    prompts = [q_prompt(q) for q in questions[start:start + problems_per_batch]]

    tok_q = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
    ).to("cuda")

    results = generate_without_sampling(model, tok_q)

    generated_answers.extend(
        tokenizer.batch_decode(results, skip_special_tokens=True)
    )

  # Score once after all batches: scoring inside the loop repeated the same
  # work every batch and left the result undefined for an empty question list.
  return count_correct_answer_count(generated_answers, true_answers)


In [None]:
def is_correct(generated_answer, true_answer):
  """Return True when both texts yield the same final numeric answer.

  Both texts are parsed with `extract_final_answer`. If either has no
  parseable answer the result is False, so two unparseable texts
  (None == None) are never counted as a solved problem.
  """
  extracted_true_answer = extract_final_answer(true_answer)
  extracted_generated_answer = extract_final_answer(generated_answer)
  if extracted_true_answer is None or extracted_generated_answer is None:
    return False
  return extracted_generated_answer == extracted_true_answer

In [None]:
def count_correct_answer_count(generated_answers, true_answers):
  """Count the positions where the generated answer is judged correct.

  The two sequences are paired element-wise; pairing stops at the end of the
  shorter one.
  """
  answer_pairs = zip(generated_answers, true_answers)
  return sum(1 for generated, gold in answer_pairs if is_correct(generated, gold))

In [None]:
def eval_pass_at_n(questions, answers, model, tokenizer, n_solutions=16, problems_per_batch=4, temperature=0.7):
  """Count the problems solved by at least one of `n_solutions` sampled attempts.

  Inputs are moved to "cuda" before generation.

  Args:
    questions: Sequence of question strings.
    answers: Gold solution texts, aligned with `questions`.
    model: Model passed to `generate_with_sampling`.
    tokenizer: Tokenizer used to encode prompts and decode generations.
    n_solutions: Number of samples drawn per problem.
    problems_per_batch: Number of problems per generation batch (each batch
      yields `problems_per_batch * n_solutions` sequences).
    temperature: Sampling temperature.

  Returns:
    The number of problems for which at least one sample's final answer
    equals the gold final answer.
  """
  solved_count = 0

  for i in tqdm(range(0, len(questions), problems_per_batch)):
    question_batch = questions[i: i + problems_per_batch]
    answer_batch = answers[i: i + problems_per_batch]

    gold_answers = [extract_final_answer(answer) for answer in answer_batch]
    prompts = [q_prompt(question) for question in question_batch]

    tokenized_batch = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
    outputs = generate_with_sampling(model, tokenized_batch, temperature, n_solutions)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    sampled_answers = [extract_final_answer(text) for text in decoded]

    # The samples for one prompt are assumed to be contiguous in the output
    # (n_solutions rows per prompt, in prompt order) — the same layout the
    # earlier reshape(-1, n_solutions) relied on.
    for problem_idx, gold in enumerate(gold_answers):
      if gold is None:
        # An unparseable gold answer must not match an unparseable sample.
        continue
      first = problem_idx * n_solutions
      if gold in sampled_answers[first:first + n_solutions]:
        solved_count += 1

  return solved_count

In [None]:
class PrinterCallback(TrainerCallback):
    """Trainer callback that adds solve-rate metrics to every evaluation.

    Reads the module-level `dataset`, `tokenizer` and `trainer` objects.
    """

    def on_evaluate(self, args, state, control, metrics, model, eval_dataloader, **kwargs):
        """Compute the solve rates, log them and merge them into `metrics`.

        When `state.epoch` is None only the incoming metrics are printed.
        """
        print(f"epoch={state.epoch}")

        if state.epoch is None:
            print(metrics)
            return

        test_split = dataset["test"]
        questions = test_split["question"]
        true_answers = test_split["answer"]

        print("evaluating score@1...")
        solved_at_1 = eval_pass_at_1(model, questions, true_answers)

        n_solutions = 16

        # The sampled evaluation is run twice to expose run-to-run variation.
        print(f"evaluating score@{n_solutions}...")
        solved_at_n_first = eval_pass_at_n(
            questions, true_answers, model, tokenizer, n_solutions=n_solutions
        )

        print(f"evaluating score@{n_solutions}...2")
        solved_at_n_second = eval_pass_at_n(
            questions, true_answers, model, tokenizer, n_solutions=n_solutions
        )

        # Each "_%" entry is the corresponding count divided by the number of problems.
        custom_metrics = {
            "eval_solve_rate@1": solved_at_1,
            "eval_solve_rate@1_%": solved_at_1 / len(questions),
            "eval_solve_rate@n": solved_at_n_first,
            "eval_solve_rate@n_%": solved_at_n_first / len(questions),
            "eval_solve_rate@n_2": solved_at_n_second,
            "eval_solve_rate@n_2_%": solved_at_n_second / len(questions),
        }

        trainer.log(custom_metrics)
        metrics.update(custom_metrics)

        print(metrics)

# Training the model

In [None]:
# Collator that pads each batch dynamically; DataCollatorForSeq2Seq is used
# because the examples carry their own "labels" column
# (presumably it pads labels as well as inputs — TODO confirm).
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

In [None]:
# Hyperparameters for the fine-tuning run.
args = TrainingArguments(
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    # Batching and precision.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    group_by_length=True,
    bf16=True,
    # Logging and evaluation cadence. A `logging_steps` value below 1 is
    # presumably read as a fraction of the total training steps, and with no
    # explicit `eval_steps` the evaluation presumably follows the same
    # cadence — confirm against the TrainingArguments documentation.
    logging_strategy="steps",
    logging_steps=0.05,
    eval_strategy="steps",
    report_to="wandb",
)

# Trainer wired to the tokenized splits, the padding collator and the
# solve-rate callback.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=collator,
    processing_class=tokenizer,
    callbacks=[PrinterCallback],
)

In [None]:
# Get the training dataloader the Trainer will use, for inspection.
train_dataloader = trainer.get_train_dataloader()
train_dataloader

In [None]:
# Pull one collated batch to inspect its contents.
batch = next(iter(train_dataloader))
batch

In [None]:
# Sanity check: print token id, label and decoded token for the first sequence of the batch.
for input_id, label in zip(batch["input_ids"][0], batch["labels"][0]):
  print(input_id.item(), label.item(), tokenizer.decode(input_id))

In [None]:
# Evaluate once before training.
# NOTE(review): PrinterCallback skips the solve-rate metrics when state.epoch is None,
# which is presumably the case before training starts — confirm.
trainer.evaluate()

In [None]:
# Settings for this run:
# - 128 test samples
# - 16 solutions per problem
# - 0.05 eval rate
# - T = 0.7
training_results = trainer.train()
training_results

In [None]:
# Collect everything the Trainer logged into a DataFrame for plotting.
logs = trainer.state.log_history
df = pd.DataFrame(logs)

In [None]:
import matplotlib.pyplot as plt

# Plot training and evaluation loss against the training step on shared axes.
plt.figure(figsize=(8, 5))
loss_axes = plt.gca()

for loss_column, legend_label in (("loss", "train loss"), ("eval_loss", "eval loss")):
  # Each log row holds either a train or an eval entry; keep the rows that have this column.
  rows_with_value = df[df[loss_column].notna()]
  rows_with_value.plot(x="step", y=loss_column, ax=loss_axes, label=legend_label)

plt.ylabel("Loss")
plt.title("Training vs Evaluation Loss")
plt.show()

In [None]:
# Keep only the log rows that contain the custom solve-rate metrics.
custom_df = df[df["eval_solve_rate@1"].notna()]

plt.figure(figsize=(8, 5))

# One curve per metric: the single-attempt rate and the two sampled runs.
for rate_column, legend_label in (
    ("eval_solve_rate@1_%", "solve rate @1"),
    ("eval_solve_rate@n_%", "solve rate @n"),
    ("eval_solve_rate@n_2_%", "solve rate @n (run 2)"),
):
  plt.plot(
      custom_df["step"],
      custom_df[rate_column],
      marker="o",
      label=legend_label,
  )

plt.xlabel("Step")
plt.ylabel("Solve Rate")
plt.title("Solve Rate vs Training Step")
plt.legend()
plt.grid(True)
# Rates are fractions in [0, 1]; leave a small margin above and below.
plt.ylim(-0.1, 1.1)
plt.tight_layout()
plt.show()


In [None]:
def save_trained(save_directory, trainer, tokenizer):
  """Save the trained model, its config and the tokenizer to `save_directory`.

  Args:
    save_directory: Target directory for all artifacts.
    trainer: Trainer whose model (and that model's config) is saved.
    tokenizer: Tokenizer saved alongside the model.
  """
  trainer.save_model(save_directory)
  # Take the config from the trainer's own model rather than from a global
  # `model`, so the function depends only on its arguments.
  trainer.model.config.save_pretrained(save_directory)
  tokenizer.save_pretrained(save_directory)

In [None]:
# Save the fine-tuned model, its config and the tokenizer under MODEL_SAVE_PATH.
save_trained(MODEL_SAVE_PATH, trainer, tokenizer)