### Process dataset

In [None]:
import json
import re

# Parse the sampled JSON file into memory; "r" is open()'s default mode.
with open("data/sampled_arxiv.json", encoding="utf-8") as source:
    data = json.load(source)

def clean_text(text, max_length=1000):
    r"""Strip LaTeX math from *text*, collapse whitespace, and truncate.

    Removed, in this order: display math ``$$...$$``, inline math ``$...$``,
    display math ``\[...\]`` and ``\begin{env}...\end{env}`` environments.
    Every run of whitespace is then replaced by a single space.

    Args:
        text: Raw text to clean. Any value that is not a ``str`` is rejected.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The cleaned text cut to at most ``max_length`` characters, or ``None``
        when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    # Display math must go first: the inline pattern below would otherwise
    # consume each "$$" delimiter as an empty match and leave the formula body.
    text = re.sub(r"\$\$.*?\$\$", "", text, flags=re.DOTALL)
    text = re.sub(r"\$.*?\$", "", text)
    # Display math commonly spans several lines, so "." has to match newlines.
    text = re.sub(r"\\\[.*?\\\]", "", text, flags=re.DOTALL)
    # The back-reference pairs \begin{env} with the \end{} of the same name, so
    # a nested environment is removed together with its parent instead of
    # leaving a dangling \end{parent} behind.
    text = re.sub(r"\\begin\{([^}]*)\}.*?\\end\{\1\}", "", text, flags=re.DOTALL)
    text = re.sub(r"\s+", " ", text).strip()

    return text[:max_length]

# Clean each entry's abstract (falling back to its summary) and keep only the
# results that are non-empty and longer than 100 characters.
cleaned_candidates = (
    clean_text(record.get("abstract") or record.get("summary")) for record in data
)
cleaned_data = [text for text in cleaned_candidates if text and len(text) > 100]

print(f"{len(cleaned_data)} cleaned entries.")

output_path = "data/data.txt"
with open(output_path, "w", encoding="utf-8") as sink:
    # Each entry is written on its own line, followed by one blank line.
    sink.writelines(f"{text.strip()}\n\n" for text in cleaned_data)

print(f"Saved data to {output_path}.")

### Load dataset

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling

# Load the pretrained "gpt2" tokenizer; it is shared by the dataset, the data
# collator and the generation callback below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token  # Prevents warnings

def load_dataset(path, block_size=512):
    """Build a ``TextDataset`` over the text file at *path*.

    The module-level ``tokenizer`` is used to tokenize the file.

    Args:
        path: Path of the plain-text training file.
        block_size: Forwarded to ``TextDataset`` as ``block_size``; defaults
            to the previously hard-coded value of 512.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=path,
        block_size=block_size,
    )

# Read the corpus from the location the preprocessing step writes it to
# ("data/data.txt"); the earlier "../data/data.txt" pointed one directory above
# that file when both cells run from the same working directory.
train_dataset = load_dataset("data/data.txt")
# mlm=False: the collator is configured without masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

### Automatic sample generation at the end of each epoch

In [None]:
from transformers import TrainerCallback

class GenerationCallback(TrainerCallback):
    """Trainer callback that samples text from the model after every epoch.

    The sample is printed and also written to ``sample_epoch_<n>.txt`` in the
    current working directory.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Model whose ``generate`` method produces the sample.
        prompt: Text the generation is conditioned on.
        max_length: Forwarded to ``generate`` as ``max_length``.
    """

    def __init__(self, tokenizer, model, prompt="Explain overfitting in machine learning.", max_length=150):
        self.tokenizer = tokenizer
        self.model = model
        self.prompt = prompt
        self.max_length = max_length

    def on_epoch_end(self, args, state, control, **kwargs):
        """Generate, print and save one sample for the epoch that just ended."""
        # Use one rounded value for both the log line and the file name; the
        # two previously disagreed whenever state.epoch was fractional
        # (rounded for printing, truncated for the file name).
        epoch = round(state.epoch)
        print(f"\nSample generation after epoch {epoch}")

        # Remember the current mode so the callback leaves the model exactly
        # as it found it, even if generation raises.
        was_training = self.model.training
        self.model.eval()
        try:
            inputs = self.tokenizer(self.prompt, return_tensors="pt").to(self.model.device)
            outputs = self.model.generate(
                **inputs,
                max_length=self.max_length,
                do_sample=True,
                top_p=0.95,
                temperature=0.8,
                # Name the padding token explicitly instead of relying on
                # generate() to pick one.
                pad_token_id=self.tokenizer.eos_token_id,
            )
        finally:
            self.model.train(was_training)

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("\nSample Output:\n", generated_text)

        with open(f"sample_epoch_{epoch}.txt", "w", encoding="utf-8") as f:
            f.write(generated_text)

### Training

In [None]:
from transformers import TrainingArguments, Trainer

import torch

# Start from the pretrained "gpt2" weights.
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="./gpt2-arxiv",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    save_steps=1000,
    save_total_limit=None,
    prediction_loss_only=True,
    # Request fp16 mixed precision only when a CUDA device is present, so the
    # cell also runs (in full precision) on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_steps=100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    # Prints and saves one generated sample after every epoch.
    callbacks=[GenerationCallback(tokenizer, model)],
)

trainer.train()