In [1]:
import json 
from tqdm import tqdm 
import pickle
import os 
from transformers import AutoTokenizer, AutoModelForCausalLM, default_data_collator
from torch.utils.data import Dataset, DataLoader

def load_dataset(json_file):
    data = []
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

data = load_dataset('/storage/hiu/project_2024/naacle/StudyNotes/datasets/MATH500/test.json')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = 'Qwen/Qwen2.5-Math-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]


In [6]:
def apply_prompt(sample):
    messages = [
    {"role": "system", "content": "Solve the given math problem."},
    {"role": "user", "content": f"{sample['Question']}.\n"}
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    return text 

In [6]:
batch_size = 8
text_list = []
generated_input_list = []
for sample in tqdm(data):
    text = apply_prompt(sample)
    if len(text_list) == batch_size:
        model_inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True).to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        labels = generated_ids.clone()
        labels[model_inputs['input_ids'].shape[1]:] = -100
        generated_inputs = {'input_ids': generated_ids,
                            'labels': labels,}
        generated_input_list.append(generated_inputs)
        text_list = []
    else:
        text_list.append(text)


os.makedirs("tmp", exist_ok=True)
with open(f"tmp/{model_name.replace('/', '_')}_generated_outputs.pkl", "wb") as out_f:  # Use 'wb' for binary write mode
        pickle.dump(generated_inputs, out_f)

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [11:38<00:00,  1.40s/it]


In [26]:
generated_ids.shape

torch.Size([1, 100])

In [27]:
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

'system\nSolve the given math problem.\nuser\nConvert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\\theta),$ where $r > 0$ and $0 \\le \\theta < 2 \\pi.$.\n\nassistant\nTo convert the point \\((0,3)\\) from rectangular coordinates to polar coordinates, we need to find the values of \\(r\\) and \\(\\theta'

In [40]:
import torch
labels = generated_ids[0].clone()
labels[:model_inputs.input_ids.shape[1]] = -100
labels

tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  1249,  5508,
          279,  1459,  1124,  1188,    15,    11,    18, 10699,     8,   504,
        51424, 13934,   311, 24660, 13934,    11,   582,  1184,   311,  1477,
          279,  2750,   315, 17767,    81, 57758,   323,  1124, 11520, 15976],
       device='cuda:0')

In [29]:
model_inputs.input_ids.shape

torch.Size([1, 68])

In [38]:
model_inputs.input_ids

tensor([[151644,   8948,    198,     50,   3948,    279,   2661,   6888,   3491,
             13, 151645,    198, 151644,    872,    198,  12012,    279,   1459,
           4930,     15,     11,     18,  15087,    304,  51424,  13934,    311,
          24660,  13934,     13,    220,  11252,    697,   4226,    304,    279,
           1352,   4930,     81,  26266,  15976,  98406,   1380,    400,     81,
            861,    220,     15,      3,    323,    400,     15,   1124,    273,
           1124,  15976,    366,    220,     17,   1124,   2493,   2418,    624,
         151645,    198, 151644,  77091,    198]], device='cuda:0')

In [39]:
generated_ids[0][labels == 1]

tensor([ 1249,  5508,   279,  1459,  1124,  1188,    15,    11,    18, 10699,
            8,   504, 51424, 13934,   311, 24660, 13934,    11,   582,  1184,
          311,  1477,   279,  2750,   315, 17767,    81, 57758,   323,  1124,
        11520, 15976], device='cuda:0')

In [41]:
generated_inputs = {'input_ids': generated_ids[0].unsqueeze(0),
                    'labels': labels.unsqueeze(0),}
model(**generated_inputs)

CausalLMOutputWithPast(loss=tensor(0.0175, device='cuda:0', grad_fn=<ToCopyBackward0>), logits=tensor([[[ 0.8438,  0.5859,  0.0996,  ...,  0.1523, -0.3594, -0.4746],
         [ 1.8750,  1.6250,  1.7656,  ...,  3.0938,  2.3906,  2.2344],
         [ 0.4238, -0.1660,  1.2656,  ..., -5.6875, -5.7188, -5.7500],
         ...,
         [-0.7227, -1.6328,  0.1426,  ..., -3.5781, -3.7031, -3.7344],
         [ 0.4414, -0.4062,  1.3359,  ..., -4.0000, -4.0938, -4.0938],
         [ 1.0938,  1.5781,  3.7812,  ..., -3.0625, -3.2031, -3.2188]]],
       device='cuda:0', dtype=torch.bfloat16, grad_fn=<ToCopyBackward0>), past_key_values=((tensor([[[[ 0.0000e+00,  1.5469e+00,  1.0703e+00,  ..., -1.2350e+02,
           -1.7000e+02, -1.2200e+02],
          [ 3.8906e+00,  1.0938e+00, -3.0078e-01,  ..., -1.2300e+02,
           -1.7000e+02, -1.2200e+02],
          [ 4.0000e+00,  1.1953e+00, -9.8438e-01,  ..., -1.2350e+02,
           -1.7000e+02, -1.2250e+02],
          ...,
          [ 2.4375e+00,  5.6250e-01

In [44]:
import pickle
import os 
os.makedirs("tmp", exist_ok=True)
with open(f"tmp/{model_name.replace('/', '_')}_generated_outputs.pkl", "wb") as out_f:  # Use 'wb' for binary write mode
        pickle.dump(generated_inputs, out_f)

In [10]:
data[0]['answer']

'\\left( 3, \\frac{\\pi}{2} \\right)'

In [45]:
with open(f"tmp/{model_name.replace('/', '_')}_generated_outputs.pkl", "rb") as in_f:  # Use 'rb' for binary read mode
    attention_data_base = pickle.load(in_f)

In [46]:
attention_data_base

{'input_ids': tensor([[151644,   8948,    198,     50,   3948,    279,   2661,   6888,   3491,
              13, 151645,    198, 151644,    872,    198,  12012,    279,   1459,
            4930,     15,     11,     18,  15087,    304,  51424,  13934,    311,
           24660,  13934,     13,    220,  11252,    697,   4226,    304,    279,
            1352,   4930,     81,  26266,  15976,  98406,   1380,    400,     81,
             861,    220,     15,      3,    323,    400,     15,   1124,    273,
            1124,  15976,    366,    220,     17,   1124,   2493,   2418,    624,
          151645,    198, 151644,  77091,    198,   1249,   5508,    279,   1459,
            1124,   1188,     15,     11,     18,  10699,      8,    504,  51424,
           13934,    311,  24660,  13934,     11,    582,   1184,    311,   1477,
             279,   2750,    315,  17767,     81,  57758,    323,   1124,  11520,
           15976]], device='cuda:0'),
 'labels': tensor([[ -100,  -100,  -100,  -100,