### Load dataset

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling

# Load the tokenizer published under the "gpt2" pretrained name; it is shared by the
# dataset loader, the data collator and the generation callback defined below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): with pad == eos, DataCollatorForLanguageModeling presumably masks every
# pad-token position in the labels, which would also exclude real EOS tokens from the
# loss — confirm against the collator's documentation before relying on EOS prediction.
tokenizer.pad_token = tokenizer.eos_token  # Prevents warnings

def load_dataset(path, block_size=512):
    """Wrap a plain-text file in a `TextDataset` using the notebook-level `tokenizer`.

    Args:
        path: Path to the text file to load.
        block_size: Value forwarded to `TextDataset(block_size=...)`; presumably the
            number of tokens per training example (confirm against the TextDataset
            docs). Defaults to 512, the value this notebook originally hard-coded.

    Returns:
        The constructed `TextDataset`.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=path,
        block_size=block_size,
    )

# Build the training set from the raw text file; the path is relative to the
# directory the notebook kernel runs in.
train_dataset = load_dataset("../data/data.txt")
# mlm=False turns off masked-language-model masking in the collator
# (presumably yielding plain causal-LM labels for GPT-2 — confirm in the collator docs).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

  from .autonotebook import tqdm as notebook_tqdm


### Automatic sample generation at the end of each epoch

In [2]:
from transformers import TrainerCallback
import torch

class GenerationCallback(TrainerCallback):
    """Trainer callback that samples one completion from the model after every epoch.

    The sampled text is printed and also written to ``sample_epoch_<n>.txt`` in the
    current working directory, giving a quick qualitative check of training progress.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the generated ids.
        model: Model to sample from (the same object that is being trained).
        prompt: Text the generation is conditioned on.
        max_length: Passed to ``generate`` as ``max_length``.
    """

    def __init__(self, tokenizer, model, prompt="Explain overfitting in machine learning.", max_length=150):
        self.tokenizer = tokenizer
        self.model = model
        self.prompt = prompt
        self.max_length = max_length

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate, print and save one sample for the epoch that just finished."""
        # state.epoch is a float. Round it once so the log line and the file name
        # always agree (previously one rounded and the other truncated).
        epoch = round(state.epoch)
        print(f"\nSample generation after epoch {epoch}")

        # Remember the current mode so sampling does not leave the model in eval
        # mode for whatever runs after this callback.
        was_training = self.model.training
        self.model.eval()
        try:
            inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.model.device)
            outputs = self.model.generate(
                **inputs,
                max_length=self.max_length,
                do_sample=True,
                top_p=0.95,
                temperature=0.8,
                # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`"
                # warning emitted on every call.
                pad_token_id=self.tokenizer.eos_token_id,
            )
        finally:
            self.model.train(was_training)

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("\nSample Output:\n", generated_text)

        with open(f"sample_epoch_{epoch}.txt", "w", encoding="utf-8") as f:
            f.write(generated_text)

### Training

In [12]:
from transformers import TrainingArguments, Trainer

# Start from the weights published under the "gpt2" pretrained name and fine-tune them.
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./gpt2-arxiv",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    save_steps=1000,
    # NOTE(review): None presumably keeps every checkpoint; with a save every 1000
    # steps the output directory can grow large — consider setting a limit.
    save_total_limit=None,
    prediction_loss_only=True,
    # Mixed precision needs a CUDA device; enable it only when one is present so the
    # cell also runs on CPU-only machines instead of failing at argument validation.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    # The callback receives the same model object the Trainer optimizes, so each
    # end-of-epoch sample reflects the current weights.
    callbacks=[GenerationCallback(tokenizer, model)]
)

trainer.train()

Step,Training Loss
100,3.892
200,3.7714
300,3.7284
400,3.691
500,3.6623
600,3.6443
700,3.6066
800,3.5984
900,3.5885
1000,3.5649


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Sample generation after epoch 1

Sample Output:
 Explain overfitting in machine learning. This paper examines the extent to which training samples overfitting leads to catastrophic forgetting of predictive data. We investigate the impact of data augmentation on this phenomenon and propose two techniques to mitigate catastrophic forgetting by augmenting training samples. First, we propose a novel pre-trained dataset augmentation method named Pre-AI, which consists of a pre-trained dataset and a pre-trained dataset augmentation method. The pre-trained dataset augmentation method is designed to ensure the model is able to accurately predict the predictive data and to mitigate catastrophic forgetting. Second, we propose a novel data augmentation technique that employs pre-trained datasets and pre-trained datasets augmentation. The pre-trained datasets and pre-trained datasets


KeyboardInterrupt: 

### Generate output

In [21]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint directory to sample from; it must already exist on disk.
ckpt_path = "./gpt2-arxiv/checkpoint-12000"
model = GPT2LMHeadModel.from_pretrained(ckpt_path)
# The tokenizer is loaded from the base "gpt2" name rather than from the checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model.eval()

prompt = "What is machine learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=60,
    eos_token_id=tokenizer.eos_token_id,
    # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`" warning.
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.2
)

# Decode once and reuse the text for both the printout and the saved file.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Derive the step suffix ("12000") from the checkpoint path so the file name cannot
# drift out of sync when ckpt_path is changed.
ckpt_step = ckpt_path.rstrip("/").rsplit("-", 1)[-1]
with open(f"sample_from_checkpoint_{ckpt_step}.txt", "w", encoding="utf-8") as f:
    f.write(generated_text)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is machine learning? In this paper, we present a novel approach to learn the relationship between models and data. We propose an algorithm that learns relationships using deep neural networks (DNNs) with high accuracy on real-world datasets such as Amazon Mechanical Turk or Google Docbook from pretraining for tasks like image classification in
