In [2]:
from datasets import load_dataset, Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, HfArgumentParser
import torch
import torch.nn as nn
import numpy as np
import random
import os
import json
from dataclasses import dataclass, field
from typing import Optional
from vllm import LLM, SamplingParams
from tqdm import tqdm
import argparse
import sys
sys.path.append('/scratch/jiarui14/EM-CoT/Online-DPO-R1')
import reward_labeling

@dataclass
class ScriptArguments:
    """Configuration shared by the cells of this notebook.

    Each field has a default value and a ``help`` string stored in its
    dataclass metadata.
    """

    # Reproducibility, generation length and model selection.
    seed: Optional[int] = field(default=42, metadata={"help": "Random seed"})
    max_length: Optional[int] = field(default=2048, metadata={"help": "Max length of newly generated tokens"})
    model_name_or_path: Optional[str] = field(default='Qwen/Qwen2.5-Math-7B', metadata={"help": "Model name or path"})

    # Optimisation-style hyper-parameters; alpha and beta are read by calc_sample_ratio.
    epochs: Optional[int] = field(default=1, metadata={"help": "Number of epochs"})
    alpha: Optional[float] = field(default=0.5, metadata={"help": "Penalty weight alpha"})
    beta: Optional[float] = field(default=2.0, metadata={"help": "Penalty weight beta"})
    lr: Optional[float] = field(default=0.5, metadata={"help": "Learning rate"})

    # Slice [start, end) of the dataset to process.
    start: Optional[int] = field(default=0, metadata={"help": "Start index"})
    end: Optional[int] = field(default=5, metadata={"help": "End index"})

    # Sampling budgets for the two stages.
    stage_1_samples: Optional[int] = field(default=8, metadata={"help": "Number of samples for stage 1 per example"})
    stage_2_samples: Optional[int] = field(default=8, metadata={"help": "Number of samples for stage 2 per example"})

    # Index used to pick the per-shard JSON files on disk.
    local_index: Optional[int] = field(default=0, metadata={"help": "Local index"})

# parser = argparse.ArgumentParser()
# parser.add_argument('--seed', type=int, default=42, help='Random seed')
# parser.add_argument('--max_length', type=int, default=2028, help='Max length of newly generated tokens')
# parser.add_argument('--model_name_or_path', type=str, default='Qwen/Qwen2.5-Math-7B', help='Model name or path')
# parser.add_argument('--epochs', type=int, default=1, help='Number of epochs')
# parser.add_argument('--alpha', type=float, default=0.5, help='Penalty weight alpha')
# parser.add_argument('--beta', type=float, default=2.0, help='Penalty weight beta')
# parser.add_argument('--lr', type=float, default=0.5, help='Learning rate')
# script_args = parser.parse_args()

# Build the configuration from the dataclass defaults only; the argparse-based
# alternative above is commented out, so nothing is read from the command line.
script_args = ScriptArguments()

def set_seed(seed):
    """Seed the ``random``, NumPy and PyTorch generators with ``seed``.

    The CUDA generators are seeded too, but only when
    ``torch.cuda.is_available()`` reports a usable device.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Apply the configured seed before any data loading or sampling happens.
set_seed(script_args.seed)

In [4]:
# Load, for the shard selected by ``script_args.local_index``: the stage-1
# selections, the per-problem sample sizes and the stage-2 generations.
with open(f'data/stage_1_collected_data_{script_args.local_index}.json', 'r') as f:
    ds = json.load(f)
ds = Dataset.from_list(ds)

with open(f'/scratch/jiarui14/EM-CoT/EM-CoT/data/sample_sizes_{script_args.local_index}.json', 'r') as f:
    sample_sizes = json.load(f)

with open(f'/scratch/jiarui14/EM-CoT/EM-CoT/data/stage_2_allOutputs_{script_args.local_index}.json') as f:
    stage_2_allOutputs = json.load(f)

# Keep, per problem, only the stage-2 generations judged correct.
# The loop reads `stage_2_allOutputs` (loaded just above). It previously read
# `stage_2_outputs`, which this cell never defines, so it only worked when that
# name leaked in from another cell.
# Assumes stage_2_allOutputs[i] is the list of generated texts for ds[i] - TODO confirm.
# NOTE(review): `utils` is not imported by the import cell at the top of the
# notebook; this cell needs `utils.check_correct` to be importable to run.
stage_2_collected_data = []
corrects_2 = []
total_samples = 0
for i, item in enumerate(ds):
    collected_data = {
        'problem': item['problem'],
        'answer': item['answer'],
        'outputs': []
    }
    problem_corrects = []
    for j, output in enumerate(stage_2_allOutputs[i]):
        correct = utils.check_correct(output, item['answer'], i)
        if correct:
            problem_corrects.append(j)
            collected_data['outputs'].append(output)
    corrects_2.append(problem_corrects)
    stage_2_collected_data.append(collected_data)
    # Running count of accepted generations across all problems.
    total_samples += len(collected_data['outputs'])

22
21
17
19
20
18
16
18


: 

In [3]:
# Scan every (problem, output) pair of one stage-1 shard and record the longest
# tokenized prompt+response together with its (problem index, output index).
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)
max_len = -1
max_len_id = -1
with open('/scratch/jiarui14/EM-CoT/EM-CoT/data/stage_1_collected_data_2.json', 'r') as f:
    data = json.load(f)
for i, item in enumerate(tqdm(data)):
    conv = [
        # Plain (non-f) string: braces must NOT be doubled here, otherwise the
        # model is shown a literal "\boxed{{}}".
        {'role': 'system', 'content': 'Please reason step by step, and put your final answer within \\boxed{}.'},
        {'role': 'user', 'content': item['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}
    ]
    # The rendered prompt depends only on the problem, so build it once per
    # problem instead of once per output.
    prompt_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
    for j, output in enumerate(item['outputs']):
        conv_chat = prompt_chat + output
        input_ids = tokenizer(conv_chat, return_tensors='pt').input_ids
        if input_ids.shape[1] > max_len:
            max_len = input_ids.shape[1]
            max_len_id = (i, j)

100%|██████████| 1246/1246 [00:20<00:00, 62.22it/s]


In [14]:
# get the first 10000 training samples from numina prompt
ds = load_dataset('dsrtrain/numia_prompt')['train']
ds2 = load_dataset('FlippyDora/raft_train_numia_prompt')['train']

# Recover the bare problem text by cutting the instruction suffix off the first
# conversation turn, keeping first-seen order and dropping duplicates.
# A companion set makes the membership test O(1); checking `in problems` on the
# list itself was quadratic in the number of rows.
prompt_suffix = ' Let\'s think step by step and output the final answer within \\boxed{}'
problems = []
seen_problems = set()
for item in tqdm(ds2):
    problem = item['conversations'][0]['content'].split(prompt_suffix)[0]
    if problem not in seen_problems:
        seen_problems.add(problem)
        problems.append(problem)

# Map each problem to the ground truth of its first occurrence in `ds`.
qas = {}
for item in tqdm(ds):
    if item['problem'] not in qas:
        qas[item['problem']] = item['reward_model']['ground_truth']

# A problem missing from `qas` raises KeyError here, as before.
new_ds = [{'problem': problem, 'answer': qas[problem]} for problem in problems]

new_ds = Dataset.from_list(new_ds)
new_ds.push_to_hub('FlippyDora/raft1_train_numia_prompt_0-10000')

100%|██████████| 9999/9999 [00:00<00:00, 13660.91it/s]


In [5]:
# prepare dataset
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)
ds = load_dataset('HuggingFaceH4/MATH-500')['test']
# Clamp the configured end index to the dataset size, then keep rows [start, end).
script_args.end = min(script_args.end, len(ds))
ds = ds.select(range(script_args.start, script_args.end))

In [25]:
# prepare model
# Restrict this process to GPU 8 before the vLLM engine is created.
# NOTE(review): torch is already imported and set_seed() has already called
# torch.cuda.is_available() by this point - confirm the environment variable
# still takes effect when set this late.
os.environ['CUDA_VISIBLE_DEVICES'] = '8'
# vLLM engine used for sampling; it is asked to use half of the GPU memory.
llm = LLM(script_args.model_name_or_path, gpu_memory_utilization=0.5, dtype=torch.bfloat16)

INFO 02-27 23:15:23 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Qwen/Qwen2.5-Math-7B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Math-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Qwen/Qwen2.5-Math-7B, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 02-27 23:15:24 model_runner.py:720] Starting to load model Qwen/Qwen2.5-Math-7B...


OutOfMemoryError: CUDA out of memory. Tried to allocate 260.00 MiB. GPU 0 has a total capacity of 47.53 GiB of which 197.81 MiB is free. Including non-PyTorch memory, this process has 5.31 GiB memory in use. Process 2940433 has 42.00 GiB memory in use. Of the allocated memory 5.00 GiB is allocated by PyTorch, and 15.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [4]:
def stage_1_sampling():
    """Sample ``script_args.stage_1_samples`` completions for every item of ``ds``.

    One single-turn user prompt is rendered per problem with the tokenizer's
    chat template, and all prompts are sent to ``llm.generate`` in one call.

    Returns:
        Whatever ``llm.generate`` returns for the list of prompts.
    """
    prompts = [
        tokenizer.apply_chat_template(
            [{'role': 'user', 'content': item['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}],
            tokenize=False,
            add_generation_prompt=True,
        )
        for item in ds
    ]
    sampling_params = SamplingParams(
        max_tokens=script_args.max_length,
        temperature=1.0,
        n=script_args.stage_1_samples,
    )
    return llm.generate(prompts, sampling_params)

In [11]:
# Run stage-1 sampling over `ds` and keep the raw result of llm.generate.
outputs = stage_1_sampling()

Processed prompts: 100%|██████████| 2/2 [01:04<00:00, 32.39s/it, est. speed input: 3.75 toks/s, output: 158.34 toks/s]


In [8]:
# Pre-generated outputs from the Hub, cut to the same [start, end) range as `ds`.
# The stage-1 selection cell reads its generations from here, not from `outputs`.
# NOTE(review): the dataset name mentions Qwen2-7B-Instruct while
# script_args.model_name_or_path defaults to Qwen/Qwen2.5-Math-7B - confirm this
# is the intended source.
tmp_ds = load_dataset('FlippyDora/math500_Qwen2-7B-Instruct_n8')['train'].select(range(script_args.start, script_args.end))

In [10]:
#TODO: currently, stage 1 selects all outputs with correct answers
# For each problem, keep the generations from `tmp_ds` that
# reward_labeling.is_equal accepts against the reference answer, and record
# their sample indices in `corrects`.
stage_1_collected_data = []
corrects = []
for i, item in enumerate(ds):
    problem_corrects = []
    kept_outputs = []
    for j in range(script_args.stage_1_samples):
        candidate = tmp_ds[i]['outputs'][j]['output']
        if reward_labeling.is_equal(candidate, item['answer'], dataset_name='math500'):
            problem_corrects.append(j)
            kept_outputs.append(candidate)
    corrects.append(problem_corrects)
    stage_1_collected_data.append({
        'problem': item['problem'],
        'answer': item['answer'],
        'outputs': kept_outputs,
    })

print(corrects)


[[], [0, 1, 2, 3, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]]


In [48]:
# calculate the accept rate from stage 1

def calc_accept_rate():
    """Return the stage-1 acceptance rate of every collected item.

    The rate of an item is the number of outputs kept for it divided by
    ``script_args.stage_1_samples``; results follow the order of
    ``stage_1_collected_data``.
    """
    return [
        len(item['outputs']) / script_args.stage_1_samples
        for item in stage_1_collected_data
    ]

def calc_sample_ratio(Gs, ps, alpha=None, beta=None):
    """Turn per-problem gradient norms and accept rates into sampling ratios.

    Each problem gets the unnormalised weight
    ``G / sqrt(p + alpha / p ** (beta - 1))``, or 0 when ``G`` or ``p`` is 0.
    The weights are then normalised to sum to 1.

    Args:
        Gs: list of gradients (one scalar per problem).
        ps: list of accept rates, aligned with ``Gs``.
        alpha: penalty weight; defaults to ``script_args.alpha`` when ``None``.
        beta: penalty exponent; defaults to ``script_args.beta`` when ``None``.

    Returns:
        A list of ratios aligned with the inputs. When every weight is 0 there
        is nothing to normalise, so all-zero ratios are returned; dividing by
        the zero total used to fail here.
    """
    if alpha is None:
        alpha = script_args.alpha
    if beta is None:
        beta = script_args.beta

    sample_sizes = []
    for G, p in zip(Gs, ps):
        if G == 0 or p == 0:
            # No accepted sample, or no gradient signal: allocate nothing.
            sample_sizes.append(0.0)
        else:
            sample_sizes.append(G / np.sqrt(p + alpha / np.power(p, beta - 1.0)))

    total = sum(sample_sizes)
    if total == 0:
        return [0.0 for _ in sample_sizes]
    return [sample_size / total for sample_size in sample_sizes]
    

In [13]:
# load model for gradient calculation
model = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path, torch_dtype=torch.bfloat16)
# The vLLM cell sets CUDA_VISIBLE_DEVICES='8'. If that restriction is active,
# fewer than nine devices are visible and 'cuda:8' is not a valid ordinal, so
# fall back to index 0 in that case; with nine or more visible devices keep
# using index 8 as before.
gpu_index = 8 if torch.cuda.device_count() > 8 else 0
model.to(torch.device(f'cuda:{gpu_index}'))

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  5.21it/s]


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [8]:
# Hand-built two-element stand-in used by the single-example debugging cells:
# an empty placeholder followed by one dummy record built from ds[0].
# NOTE(review): this rebinds `stage_1_collected_data`, and the `{}` placeholder
# has no 'outputs' key, which calc_grad and calc_accept_rate both read.
stage_1_collected_data = [
    dict(),
    dict(
        problem=ds[0]['problem'],
        answer=ds[0]['answer'],
        outputs=['The answer is 1.'],
    ),
]

In [9]:
# Build the token ids of one prompt followed by one collected output, for the
# single-example forward/backward cells.
conv = [{'role': 'user', 'content': ds[1]['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
# The response is appended as raw text after the generation prompt rather than
# being passed through the chat template as an assistant turn.
conv_chat += stage_1_collected_data[1]['outputs'][0]
# Place the ids on whatever device the model lives on (as calc_grad does)
# instead of a hard-coded 'cuda:8'.
input_ids = tokenizer(conv_chat, return_tensors='pt').input_ids.to(model.device)

In [14]:
def find_prompt_end(input_ids, marker_ids=None):
    """Return the index just past the first assistant-turn marker in ``input_ids``.

    Args:
        input_ids: sequence of token ids (callers pass a plain list).
        marker_ids: token ids of the marker to look for. When ``None`` the
            marker is obtained by tokenizing ``'<|im_start|>assistant\\n'``
            with the notebook's ``tokenizer``.

    Returns:
        ``i + len(marker)`` for the first position ``i`` at which the marker
        occurs, i.e. the index of the first response token, or ``None`` when
        the marker does not occur.
    """
    if marker_ids is None:
        end = tokenizer('<|im_start|>assistant\n')['input_ids']
    else:
        end = list(marker_ids)
    end_len = len(end)
    # "+ 1" so that a marker sitting at the very end of the sequence is found;
    # without it the last candidate window was never examined.
    for i in range(len(input_ids) - end_len + 1):
        if list(input_ids[i:i + end_len]) == end:
            return i + end_len
    return None


In [40]:
def calc_grad():
    """Estimate one gradient-norm score per item of ``stage_1_collected_data``.

    For each collected output, the prompt plus the output is run through
    ``model`` and the mean negative log-likelihood of the response tokens is
    back-propagated. The L2 norm of the resulting ``model.lm_head.weight``
    gradient is recorded. An item's score is the mean of those norms over its
    outputs, or 0 when it has no usable output.

    Returns:
        A list of scores aligned with ``stage_1_collected_data``.

    Raises:
        ValueError: if the assistant-turn marker cannot be located in a
            tokenized sequence.
    """
    all_grads = []
    for item in tqdm(stage_1_collected_data, desc='Calculating gradients'):
        grads = []
        if len(item['outputs']) > 0:
            conv = [
                # Plain (non-f) string: braces must not be doubled here,
                # otherwise the prompt contains a literal "\boxed{{}}".
                {'role': 'system', 'content': 'Please reason step by step, and put your final answer within \\boxed{}.'},
                {'role': 'user', 'content': item['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}
            ]
            # The prompt is the same for every output of this problem.
            prompt_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)

        for output in item['outputs']:
            input_ids = tokenizer(prompt_chat + output, return_tensors='pt').input_ids.to(model.device)
            resp_start = find_prompt_end(input_ids[0].tolist())
            if resp_start is None:
                raise ValueError('assistant-turn marker not found in the tokenized sequence')

            targets = input_ids[0, resp_start:]
            if targets.numel() == 0:
                # Nothing was generated after the prompt: no loss to differentiate.
                continue

            logits = model(input_ids).logits
            # The logits at position t score the token at position t + 1, so the
            # response tokens [resp_start:] are predicted by rows
            # [resp_start - 1 : -1]. Only those rows are normalised.
            log_probs = nn.functional.log_softmax(logits[0, resp_start - 1:-1], dim=-1)
            # Pick the log-probability of each response token that was actually
            # produced. Summing over positions and then averaging over the
            # vocabulary axis does not give the sequence likelihood.
            token_log_probs = log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)

            # get the gradient by loss backpropagation
            loss = -token_log_probs.sum() / targets.numel()
            model.zero_grad()  # drop any gradient left over from earlier cells
            loss.backward()
            grads.append(torch.norm(model.lm_head.weight.grad, p=2).item())
            model.zero_grad()

        all_grads.append(np.mean(grads) if grads else 0)

    return all_grads

In [41]:
# One gradient-norm score per entry of stage_1_collected_data.
all_grads = calc_grad()

Calculating gradients: 100%|██████████| 5/5 [00:13<00:00,  2.77s/it]


In [49]:
# Combine the stage-1 accept rates with the gradient scores into normalised
# per-problem sampling ratios.
accept_rates = calc_accept_rate()
sample_sizes = calc_sample_ratio(all_grads, accept_rates)

In [52]:
def float_to_int_preserve_sum(arr, N):
    """Scale ``arr`` by ``N`` and round to integers whose sum is exactly ``N``.

    Args:
        arr: sequence of non-negative weights (typically summing to 1).
        N: integer total the result must add up to.

    Returns:
        A list of Python ints, aligned with ``arr``. For non-empty ``arr`` the
        list sums to ``N``; an empty ``arr`` yields ``[]``.
    """
    # 1. Scale and round to the nearest integer.
    scaled_arr = np.asarray(arr, dtype=float) * N
    int_arr = np.round(scaled_arr).astype(int)

    # 2. How far the rounded values are from the requested total.
    error = int(N - int_arr.sum())

    # 3. Spread the correction one unit at a time, starting with the entries
    #    that rounding moved furthest in the opposite direction.
    if error != 0 and int_arr.size > 0:
        residuals = scaled_arr - int_arr
        order = np.argsort(-residuals if error > 0 else residuals, kind='stable')
        step = 1 if error > 0 else -1
        # Cycle through `order` so the total is still reached when the
        # correction is larger than the number of entries; a single pass could
        # only change each entry once.
        for k in range(abs(error)):
            int_arr[order[k % int_arr.size]] += step

    return int_arr.tolist()

# Convert the fractional ratios into integer per-problem sample counts.
# NOTE(review): `sample_sizes` is rebound from ratios to counts here, so
# re-running this cell without re-running the ratio cell feeds counts back in.
sample_sizes = float_to_int_preserve_sum(sample_sizes, script_args.stage_2_samples)
print(sample_sizes)

[0 2 2 2 2]
[0, 2, 2, 2, 2]


In [None]:
def stage_2_sampling(sample_sizes):
    """Draw ``sample_sizes[i]`` completions for the i-th problem of ``ds``.

    Every requested completion becomes its own prompt with ``n=1``, all prompts
    are generated in one ``llm.generate`` call, and the flat result is regrouped
    per problem.

    Args:
        sample_sizes: one non-negative integer per item of ``ds``.

    Returns:
        A list with one entry per problem; each entry is the list of generated
        texts for that problem (empty when its sample size is 0).

    Raises:
        ValueError: if ``sample_sizes`` and ``ds`` differ in length.
    """
    if len(sample_sizes) != len(ds):
        raise ValueError('sample_sizes must contain exactly one entry per item of ds')

    sampling_params = SamplingParams(
        max_tokens=script_args.max_length,
        temperature=1.0,
        n=1,
    )
    prompts = []
    for i, item in enumerate(ds):
        conv = [{'role': 'user', 'content': item['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
        conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
        # sample_sizes[i] is a count, so repeat the prompt that many times
        # (iterating over the integer itself raises TypeError).
        for _ in range(sample_sizes[i]):
            prompts.append(conv_chat)
    outputs = llm.generate(prompts, sampling_params)

    # Regroup the flat generation list into one list of texts per problem.
    idx_sum = 0
    new_outputs = []
    for size in sample_sizes:
        new_outputs.append([outputs[idx].outputs[0].text for idx in range(idx_sum, idx_sum + size)])
        idx_sum += size

    # Return the grouped texts: the caller indexes the result as [i][j] and
    # passes each element to the answer checker as a string.
    return new_outputs


In [None]:
# Run stage-2 sampling, then keep, per problem, the stage-2 generations that
# reward_labeling.is_equal accepts against the reference answer.
stage_2_outputs = stage_2_sampling(sample_sizes)
stage_2_collected_data = []
corrects_2 = []
for i, item in enumerate(ds):
    collected_data = {
        'problem': item['problem'],
        'answer': item['answer'],
        'outputs': []
    }
    problem_corrects = []
    for j, generation in enumerate(stage_2_outputs[i]):
        correct = reward_labeling.is_equal(generation, item['answer'], dataset_name='math500')
        if correct:
            problem_corrects.append(j)
            # Store the stage-2 generation that was just checked; the stage-1
            # `tmp_ds` output at the same index is a different text.
            collected_data['outputs'].append(generation)
    corrects_2.append(problem_corrects)
    stage_2_collected_data.append(collected_data)

In [10]:
# Take a fixed-seed shuffled sample of 10000 rows and list its distinct problems
# in first-seen order.
ds = load_dataset('dsrtrain/numia_prompt')['train'].shuffle(seed=1).select(range(10000))
problems = []
# Companion set for O(1) membership tests; `in problems` on the list was
# quadratic in the number of rows.
seen_problems = set()
for item in tqdm(ds):
    if item['problem'] not in seen_problems:
        seen_problems.add(item['problem'])
        problems.append(item['problem'])

100%|██████████| 10000/10000 [00:01<00:00, 6639.56it/s]


In [11]:
# Peek at the first five rows; the slice is displayed as a dict of column lists.
ds[:5]

{'data_source': ['numina_amc_aime',
  'numina_synthetic_math',
  'numina_synthetic_amc',
  'numina_synthetic_math',
  'numina_cn_k12'],
 'prompt': [[{'content': '\nWhen tackling complex reasoning tasks, you have access to the following actions. Use them as needed to progress through your thought process.\n\n[ASSESS]\n\n[ADVANCE]\n\n[VERIFY]\n\n[SIMPLIFY]\n\n[SYNTHESIZE]\n\n[PIVOT]\n\n[OUTPUT]\n\nYou should strictly follow the format below:\n\n[ACTION NAME]\n\n# Your action step 1\n\n# Your action step 2\n\n# Your action step 3\n\n...\n\nNext action: [NEXT ACTION NAME]\n\n',
    'role': 'system'},
   {'content': 'Given the parabola $y = x^2 - 8x + c$, determine the value of $c$ for which the vertex of the parabola will be a point on the $x$-axis.\n\nPresent the answer in LaTex format: \\boxed{Your answer}',
    'role': 'user'}],
  [{'content': '\nWhen tackling complex reasoning tasks, you have access to the following actions. Use them as needed to progress through your thought process.\

In [16]:
# Forward pass on the single hand-built example; `o.logits` is read in the next cell.
o = model(input_ids, return_dict=True)

In [17]:
logits = o.logits
resp_start = find_prompt_end(input_ids[0].tolist())
# The logits at position t score the token at position t + 1, so the response
# tokens [resp_start:] are predicted by rows [resp_start - 1 : -1].
log_probs = nn.functional.log_softmax(logits[0, resp_start - 1:-1], dim=-1)
# Log-probability of each response token that is actually in the sequence.
# Summing the full (position, vocab) matrix over positions does not give a
# sequence log-likelihood.
output_log_probs = log_probs.gather(-1, input_ids[0, resp_start:].unsqueeze(-1)).squeeze(-1)
output_log_probs_sen = output_log_probs.sum(dim=0) # whole sequence

In [18]:
# Negated mean of `output_log_probs_sen`, back-propagated through the model.
# No model.zero_grad() call is made in this cell.
loss = -output_log_probs_sen.mean()
loss.backward()

In [None]:
# sampling_params = SamplingParams(
#     temperature=1.0,
#     n=8,
#     max_tokens=script_args.max_length,
#     logprobs=1,
# )

# # generate
# conv = [{'role': 'user', 'content': ds[0]['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
# conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
# print(conv_chat)
# prompts = [conv_chat]
# outputs = llm.generate(prompts, sampling_params)

# print(type(outputs))

# def get_logprobs_vllm(prompts, sampling_params):
#     outputs = llm.generate(prompts, sampling_params)
#     logprobs = []
#     for output in outputs:
#         logprobs.append([])
#         for item in output.outputs:
#             logprobs[-1].append(item.cumulative_logprob)

#     return logprobs

# def get_uniform_rand(l, r):
#     return np.random.uniform(l, r)

# print('done!')