In [1]:
import torch
from transformers import DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
texts = ["Hello world", "How are you?"]

In [3]:

# Tokenize
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-1B')
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|reserved_special_token_0|>")
tokens = [tokenizer(t) for t in texts]

# Default collate function 
collate_fn = DataCollatorWithPadding(tokenizer, padding=True) #padding=True, 'max_length'

dataloader = torch.utils.data.DataLoader(dataset=tokens, collate_fn=collate_fn, batch_size=2) 
for batch in dataloader:
    print(batch)
    break

{'input_ids': tensor([[128000,   9906,   1917, 128002, 128002],
        [128000,   4438,    527,    499,     30]]), 'attention_mask': tensor([[1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1]])}


In [5]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer

texts = [
  "The quick brown fox jumps over the lazy dog.",
  "I am learning about NLP and AI today"  
]

# Tokenize
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokens = [tokenizer(t) for t in texts]

collate_fn = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataloader = torch.utils.data.DataLoader(dataset=tokens, collate_fn=collate_fn, batch_size=2)

for batch in dataloader:
    print(batch)

{'input_ids': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,  -100]])}


In [6]:
collate_fn(tokens)

{'input_ids': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,  -100]])}

In [8]:
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling
)
import math
import torch
from sklearn.metrics import accuracy_score  # This will no longer be used but kept for reference

In [10]:
#  Initialize WandB
wandb.init(
    project="COT",
    name="Value Model: Llama-3.2-1B-Instruct-LM"
)

# Load tokenizer and model
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Ensure this is the correct model name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize the model for causal language modeling
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add the special tokens to the tokenizer
special_tokens_dict = {
    'additional_special_tokens': [
        '<|reserved_special_token_10|>',
        '<|reserved_special_token_11|>',
        '<|reserved_special_token_12|>',
        '<|reserved_special_token_13|>',
        '[PAD]'
    ]
}
# Add pad token
tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.pad_token = '[PAD]'
tokenizer.pad_token_id = tokenizer.eos_token_id  # Assuming pad token is same as eos
model.resize_token_embeddings(len(tokenizer))

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshanghong_sim[0m ([33mstlm[0m). Use [1m`wandb login --relogin`[0m to force relogin


Embedding(128257, 2048)

In [11]:
# Define the dataset and tokenization function
dataset = load_dataset("LeonGuertler/PRM800K_train2_updated")
dataset = dataset["train"].train_test_split(test_size=0.01)

def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,          # Optional: Set max_length to control padding
        padding=False            # Let the data collator handle padding
    )
    tokenized_inputs["labels"] = examples["value_label"]  # Assign labels correctly
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Initialize DataCollatorWithPadding for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map: 100%|██████████| 911716/911716 [01:31<00:00, 9921.72 examples/s] 
Map: 100%|██████████| 9210/9210 [00:01<00:00, 8278.97 examples/s] 


In [13]:
# **Optionally, inspect a batch to verify masking**
# This step is for debugging purposes and is not required for training.
# If you choose to keep it, ensure tensors are on the correct device.
print(tokenized_datasets["train"])
print("----")
print(tokenized_datasets["train"][:2])
print("----")
# batch = tokenized_datasets["train"][:2][]
batch = data_collator(batch)

Dataset({
    features: ['text', 'value_label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 911716
})
----
{'text': ['We realize that $a^3+b^3$ is the sum of two cubes and thus can be expressed as $(a+b)(a^2-ab+b^2)$. From this, we have  \\begin{align*}\na^3 + b^3 & = (a+b)(a^2-ab+b^2) \\\\\n& = (a+b)((a^2+2ab+b^2)-3ab) \\\\\n& = (a+b)((a+b)^2-3ab)\n\\end{align*}Now, since $a+b=10$ and $ab=17$, we have $$a^3+b^3= (a+b)((a+b)^2-3ab)=10\\cdot(10^2-3\\cdot17)=10\\cdot49=', 'Let $y = f(x)$. Then, $f(f(x)) = f(y) = '], 'value_label': [1, 0], 'input_ids': [[128000, 1687, 13383, 430, 400, 64, 61, 18, 36193, 61, 18, 3, 374, 279, 2694, 315, 1403, 55204, 323, 8617, 649, 387, 13605, 439, 5035, 64, 36193, 2432, 64, 61, 17, 39130, 36193, 61, 17, 8, 13244, 5659, 420, 11, 584, 617, 220, 1144, 7413, 90, 6750, 9, 534, 64, 61, 18, 489, 293, 61, 18, 612, 284, 320, 64, 36193, 2432, 64, 61, 17, 39130, 36193, 61, 17, 8, 91255, 5, 284, 320, 64, 36193, 14699, 64, 61, 17, 10, 17, 370, 36193, 61, 17

In [14]:
batch

{'input_ids': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102],
        [  101,  1045,  2572,  4083,  2055, 17953,  2361,  1998,  9932,  2651,
           102,  -100]])}