### Setup

In [3]:
from pathlib import Path
import math
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    PreTrainedTokenizerFast,
    GPT2Config, GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    TrainingArguments, Trainer
)

### Model Configs
Setting up the dataset, Tokenizer, and model.

The dataset for this project is the full text of Moby Dick. It is spilt into two portions, a training and a validation text file.

Using a GPT2 architecture model with 8 heads.

In [4]:
# ---------- Train a tokenizer (Byte-level BPE) ----------
data_dir = Path("./data") 
data_dir.mkdir(exist_ok=True)


files = [str(data_dir/"train.txt")]
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
trainer = BpeTrainer(
    vocab_size=50257,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"]
)
tokenizer.train(files=files, trainer=trainer)
tokenizer.save("tokenizer.json")

hf_tok = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="<s>", eos_token="</s>",
    unk_token="<unk>", pad_token="<pad>"
)
hf_tok.save_pretrained("./my-tokenizer")

# ---------- Build a GPT-style model from scratch ----------
config = GPT2Config(
    vocab_size=hf_tok.vocab_size,
    n_positions=1024,
    n_ctx=1024,
    n_embd=512,
    n_layer=6,
    n_head=8,
    bos_token_id=hf_tok.bos_token_id,
    eos_token_id=hf_tok.eos_token_id
)
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(hf_tok.vocab_size)

# ---------- Load text dataset and tokenize ----------
raw = load_dataset("text", data_files={
    "train": str(data_dir/"train.txt"),
    "validation": str(data_dir/"val.txt")
})

def tok_fn(batch):
    return hf_tok(batch["text"])

tokenized = raw.map(tok_fn, batched=True, remove_columns=["text"])

# group into fixed-length blocks for causal LM
block_size = 1024
def group_texts(examples):
    # concatenate
    concat = {k: sum(examples[k], []) for k in examples.keys()}
    total_len = (len(concat["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i:i+block_size] for i in range(0, total_len, block_size)]
        for k, t in concat.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_ds = tokenized.map(group_texts, batched=True)

# ensure pad/bos/eos are aligned everywhere
model.config.pad_token_id = hf_tok.pad_token_id
model.config.bos_token_id = hf_tok.bos_token_id
model.config.eos_token_id = hf_tok.eos_token_id

# generation config too
model.generation_config.pad_token_id = hf_tok.pad_token_id
model.generation_config.bos_token_id = hf_tok.bos_token_id
model.generation_config.eos_token_id = hf_tok.eos_token_id

model.config.use_cache = False
model.config.loss_type ="ForCausalLMLoss"

### Model Training

In [5]:
# ---------- Train ----------
data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tok, mlm=False)

args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    eval_steps=1000,
    logging_steps=100,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=5e-4,
    weight_decay=0.1,
    warmup_steps=2000,
    lr_scheduler_type="cosine",
    num_train_epochs=10,
    fp16=True,                        # set bf16=True on Ampere+/TPUs if available
    gradient_checkpointing=True,      # helps memory
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=lm_ds["train"],
    eval_dataset=lm_ds["validation"],
    data_collator=data_collator,
    processing_class=hf_tok
)

trainer.train()

# ---------- Evaluate perplexity ----------
eval_res = trainer.evaluate()
print("Perplexity:", math.exp(eval_res["eval_loss"]))

# ---------- Save ----------
trainer.save_model("./my-tiny-gpt")
hf_tok.save_pretrained("./my-tiny-gpt")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,8.7314


Perplexity: 925.7794436520039


('./my-tiny-gpt\\tokenizer_config.json',
 './my-tiny-gpt\\special_tokens_map.json',
 './my-tiny-gpt\\tokenizer.json')

In [6]:
# ---------- Quick generation test ----------
prompt = "Once upon a time"
inputs = hf_tok(prompt, return_tensors="pt").to(model.device)
gen_ids = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.9,
    top_p=0.95
)
print(hf_tok.decode(gen_ids[0], skip_special_tokens=True))

Once upon a time, now over the Pequod its,, the sea.! them, and a; and to it of the whale, and the most, the. And now, and the with the’s, and I in the. But, and so,


### Loading model

Trained model utilizing google colab GPU resources. Loading that model for better results. The model does not produce valid sentences, however, it captures the style and feel of the book. Given the limited data and model size, these results are impressive

In [7]:
# --- paths ---
checkpoint_dir = "./best_model"   # change to your checkpoint-* dir

# --- load model & tokenizer from checkpoint ---
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)

# GPT2-like models may not have pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id


prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
gen_ids = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.9,
    top_p=0.95
)
print(tokenizer.decode(gen_ids[0], skip_special_tokens=True))

Once upon a time, and over cried out upon which I must?”“Come out a good fare about all eagerness to ye grin at thy black black vomit Ahab. I stood in a leg.When the skin of the insane’s too I got, I know him
