In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
from transformers import DataCollatorForLanguageModeling

import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


path = 'G:/GitCode/math_association/experiments/models/test/checkpoint-3000'
device = "cuda"


tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path).to(device)
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
dataset = load_dataset("math_dataset",'arithmetic__add_or_sub')['train'].shuffle(seed=42).select(range(500))

def preprocess_data(examples):
    # Combine question and answer into one string
    # The model will learn to generate the answer following the question
    texts = [q.lstrip("b'").replace("\\n'",'') + " Answer: " + a.lstrip("b'").replace("\\n'",'') for q, a in zip(examples['question'], examples['answer'])]
    
    # Tokenize the texts. This will automatically add the necessary special tokens
    tokenized_inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=32)
    
    # For GPT-2 like models, the labels are usually the same as the input IDs
    # since the model is expected to predict the next token in the sequence
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    
    return tokenized_inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True, remove_columns=dataset.column_names)
print(tokenized_datasets)
# tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
small_train_dataset = tokenized_datasets.select(range(400))
small_eval_dataset =  tokenized_datasets.select(range(400,500))


print(small_train_dataset[0])
print(data_collator([small_train_dataset[0],small_train_dataset[1]]))
print(tokenizer.decode(small_train_dataset[0]['input_ids']))
print(tokenizer.decode(small_train_dataset[0]['labels']))


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 500/500 [00:00<00:00, 6189.05 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 500
})
{'input_ids': [7004, 83, 974, 532, 15, 13, 22, 422, 26422, 29703, 2816, 1120, 21033, 6469, 2637, 23998, 25, 26422, 29703, 2816, 1120, 21033, 6469, 13, 22, 6, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'labels': [7004, 83, 974, 532, 15, 13, 22, 422, 26422, 29703, 2816, 1120, 21033, 6469, 2637, 23998, 25, 26422, 29703, 2816, 1120, 21033, 6469, 13, 22, 6, 50256, 50256, 50256, 50256, 50256, 50256]}
{'input_ids': tensor([[ 7004,    83,   974,   532,    15,    13,    22,   422, 26422, 29703,
          2816,  1120, 21033,  6469,  2637, 23998,    25, 26422, 29703,  2816,
          1120, 21033,  6469,    13,    22,     6, 50256, 50256, 50256, 50256,
         50256, 50256],
        [13065, 21503,    13,  4089,   290, 19683, 24137,    17,    13,  2718,
          2637, 23998,    25, 19683,  1954, 




In [2]:


inputs = tokenizer.encode("Subtract -0.7 from 3' Answer: b'159", return_tensors="pt", max_length=100).to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Subtract -0.7 from 3' Answer: b'159.7\n'
