In [1]:
from datasets import load_dataset, Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, HfArgumentParser
import torch
import torch.nn as nn
import numpy as np
import random
import os
import json
from dataclasses import dataclass, field
from typing import Optional
from vllm import LLM, SamplingParams
import argparse
import sys
sys.path.append('/home/jiarui/EM-CoT/Online-DPO-R1')
import reward_labeling

@dataclass
class ScriptArguments:
    seed: Optional[int] = field(
        default=42,
        metadata={"help": "Random seed"}
    )
    max_length: Optional[int] = field(
        default=2048,
        metadata={"help": "Max length of newly generated tokens"}
    )
    model_name_or_path: Optional[str] = field(
        default='Qwen/Qwen2.5-Math-7B',
        metadata={"help": "Model name or path"}
    )
    epochs: Optional[int] = field(
        default=1,
        metadata={"help": "Number of epochs"}
    )
    alpha: Optional[float] = field(
        default=0.5,
        metadata={"help": "Penalty weight alpha"}
    )
    beta: Optional[float] = field(
        default=2.0,
        metadata={"help": "Penalty weight beta"}
    )
    lr: Optional[float] = field(
        default=0.5,
        metadata={"help": "Learning rate"}
    )
    end: Optional[int] = field(
        default=2,
        metadata={"help": "End index"}
    )
    stage_1_samples: Optional[int] = field(
        default=8,
        metadata={"help": "Number of samples for stage 1 per example"}
    )

# parser = argparse.ArgumentParser()
# parser.add_argument('--seed', type=int, default=42, help='Random seed')
# parser.add_argument('--max_length', type=int, default=2028, help='Max length of newly generated tokens')
# parser.add_argument('--model_name_or_path', type=str, default='Qwen/Qwen2.5-Math-7B', help='Model name or path')
# parser.add_argument('--epochs', type=int, default=1, help='Number of epochs')
# parser.add_argument('--alpha', type=float, default=0.5, help='Penalty weight alpha')
# parser.add_argument('--beta', type=float, default=2.0, help='Penalty weight beta')
# parser.add_argument('--lr', type=float, default=0.5, help='Learning rate')
# script_args = parser.parse_args()

script_args = ScriptArguments()

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(script_args.seed)

In [2]:
# prepare dataset
ds = load_dataset('HuggingFaceH4/MATH-500')['test']
script_args.end = min(len(ds), script_args.end)
ds = ds.select(range(script_args.end))
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)

In [16]:
# prepare model
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
llm = LLM(script_args.model_name_or_path, gpu_memory_utilization=0.5)

INFO 02-26 00:41:02 llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Qwen/Qwen2.5-Math-7B', speculative_config=None, tokenizer='Qwen/Qwen2.5-Math-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Qwen/Qwen2.5-Math-7B, use_v2_block_manager=False, enable_prefix_caching=False)


INFO 02-26 00:41:03 model_runner.py:720] Starting to load model Qwen/Qwen2.5-Math-7B...
INFO 02-26 00:41:03 weight_utils.py:225] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 02-26 00:41:06 model_runner.py:732] Loading model weights took 14.2418 GB
INFO 02-26 00:41:06 gpu_executor.py:102] # GPU blocks: 27574, # CPU blocks: 4681
INFO 02-26 00:41:09 model_runner.py:1024] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-26 00:41:09 model_runner.py:1028] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-26 00:41:18 model_runner.py:1225] Graph capturing finished in 9 secs.


In [17]:
def stage_1_sampling():
    sampling_params = SamplingParams(
        max_tokens=script_args.max_length,
        temperature=1.0,
        n=script_args.stage_1_samples,
    )
    prompts = []
    for i, item in enumerate(ds):
        conv = [{'role': 'user', 'content': item['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
        conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
        prompts.append(conv_chat)
    outputs = llm.generate(prompts, sampling_params)
    return outputs

In [11]:
outputs = stage_1_sampling()

Processed prompts: 100%|██████████| 2/2 [01:04<00:00, 32.39s/it, est. speed input: 3.75 toks/s, output: 158.34 toks/s]


In [40]:
stage_1_collected_data = []
corrects = []
for i, item in enumerate(ds):
    collected_data = {
        'problem': item['problem'],
        'answer': item['answer'],
        'outputs': []
    }
    problem_corrects = []
    for j in range(script_args.stage_1_samples):
        correct = reward_labeling.is_equal(outputs[i].outputs[j].text, item['answer'], dataset_name='math500')
        if correct:
            problem_corrects.append(j)
            collected_data['outputs'].append(outputs[i].outputs[j].text)
    corrects.append(problem_corrects)
    stage_1_collected_data.append(collected_data)

print(corrects)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[[], [0, 1, 5]]


In [None]:
# calculate the accept rate from stage 1



In [4]:
# load model for gradient calculation
model = AutoModelForCausalLM.from_pretrained(script_args.model_name_or_path, torch_dtype=torch.bfloat16)
model.to(torch.device('cuda:2'))

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(152064, 3584)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
          (k_proj): Linear(in_features=3584, out_features=512, bias=True)
          (v_proj): Linear(in_features=3584, out_features=512, bias=True)
          (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
          (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((3584,), eps=1e-06)
    (rotary_emb):

In [6]:
stage_1_collected_data = [
    {},
    {'problem': ds[0]['problem'],
     'answer': ds[0]['answer'],
     'outputs': ['The answer is 1.']}
]

In [7]:
conv = [{'role': 'user', 'content': ds[1]['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'},
 {'role': 'assistant', 'content': stage_1_collected_data[1]['outputs'][0]}]
conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=False)
input_ids = tokenizer(conv_chat, return_tensors='pt').input_ids.to(torch.device('cuda:2'))

In [8]:
conv = [{'role': 'user', 'content': ds[1]['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)

In [9]:
def find_prompt_end(input_ids):
    end = tokenizer('<|im_start|>assistant')['input_ids']
    end_len = len(end)
    input_len = len(input_ids)
    for i in range(input_len - end_len):
        found = True
        for j in range(end_len):
            if input_ids[i + j] != end[j]:
                found = False
                break
        if found:
            return i + end_len


In [10]:
o = model(input_ids, return_dict=True)

In [11]:
logits = o.logits
log_probs = nn.functional.log_softmax(logits, dim=-1)
output_log_probs = log_probs[0, find_prompt_end(input_ids[0].tolist()):]
output_log_probs_sen = output_log_probs.sum(dim=0) # whole sequence

In [13]:
loss = -output_log_probs_sen.mean()
loss.backward()

In [15]:
for n, p in model.named_parameters():
    print(n, p.grad)

model.embed_tokens.weight tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:2', dtype=torch.bfloat16)
model.layers.0.self_attn.q_proj.weight tensor([[ 0.0010, -0.0046,  0.0034,  ..., -0.0020, -0.0029,  0.0129],
        [ 0.0044, -0.0014,  0.0007,  ..., -0.0046,  0.0008,  0.0082],
        [ 0.0014,  0.0008, -0.0024,  ..., -0.0011, -0.0002, -0.0002],
        ...,
        [-0.0022, -0.0040,  0.0045,  ..., -0.0111, -0.0067,  0.0060],
        [ 0.0007, -0.0020,  0.0014,  ..., -0.0072, -0.0033,  0.0033],
        [ 0.0012, -0.0006, -0.0008,  ...,  0.0135,  0.0050, -0.0047]],
       device='cuda:2', dtype=torch.bfloat16)
model.layers.0.self_attn.q_proj.bias tensor([-0.0437,  0.0070, -0.0153,  ...,  0.6406,  0.3164, -0.3672],
       device='cuda:2', dtype=torch.bfloat16)
model.laye

In [None]:
sampling_params = SamplingParams(
    temperature=1.0,
    n=8,
    max_tokens=script_args.max_length,
    logprobs=1,
)

# generate
conv = [{'role': 'user', 'content': ds[0]['problem'] + f' Let\'s think step by step and output the final answer within \\boxed{{}}'}]
conv_chat = tokenizer.apply_chat_template(conv, tokenize=False, add_generation_prompt=True)
print(conv_chat)
prompts = [conv_chat]
outputs = llm.generate(prompts, sampling_params)

print(type(outputs))

def get_logprobs_vllm(prompts, sampling_params):
    outputs = llm.generate(prompts, sampling_params)
    logprobs = []
    for output in outputs:
        logprobs.append([])
        for item in output.outputs:
            logprobs[-1].append(item.cumulative_logprob)

    return logprobs

def get_uniform_rand(l, r):
    return np.random.uniform(l, r)

print('done!')