In [2]:
import torch
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast
from transformers import TextDataset
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import json
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset, Dataset

# Paths
filepath = "../out/cleaned_books_small.txt"
save_dir = "./custom_tokenizer"
vocab_file = f"{save_dir}/custom_vocab_small-vocab.txt"

  from .autonotebook import tqdm as notebook_tqdm


### Creating a small text of a big corpus for testing the workflow

In [9]:
filepath_original = "../out/cleaned_books.txt"

# I want to create a small version of the cleaned_books.txt file for testing
with open(filepath_original, "r") as f:
    lines = f.readlines()

with open("../out/cleaned_books_small.txt", "w") as f:
    f.writelines(lines[0:10])

### Training a tokenizer on own text data

This code trains a **WordPiece tokenizer** on a our own dataset and saves it for later use. Instead of tokenizing immediately, storing the trained tokenizer allows for on-the-fly tokenization. This way we can use it for different datasets and models as needed.

In [11]:
# Initialize a WordPiece tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

# Train the tokenizer
tokenizer.train(
    files=[filepath],
    vocab_size=5000,
    min_frequency=3,
    limit_alphabet=1000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# Save tokenizer
tokenizer.save_model(save_dir, "custom_vocab_small")

print("Tokenizer training complete! Saved to", save_dir)




Tokenizer training complete! Saved to ./custom_tokenizer


### Sliding window

In [27]:
from transformers import BertTokenizer
from torch.utils.data import Dataset

# Load your trained tokenizer
tokenizer = BertTokenizer(vocab_file="custom_vocab-vocab.txt", do_lower_case=False)

# File containing books (one per line)
file_path = "../out/cleaned_books.txt"

# Parameters for sliding window
block_size = 128  # Max sequence length
stride = 64  # Overlap between chunks (adjust as needed)


class SlidingWindowDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size, stride):
        self.examples = []

        # Read the file
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():  # Skip empty lines
                    tokens = tokenizer.encode(line.strip(), add_special_tokens=True)

                    # Process the line using a sliding window
                    for i in range(0, len(tokens) - block_size + 1, stride):
                        chunk = tokens[i : i + block_size]  # Take a block of size 128
                        self.examples.append(chunk)

                    # If the last chunk is shorter than block_size, include it
                    if len(tokens) > block_size and len(tokens) % stride != 0:
                        self.examples.append(tokens[-block_size:])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return {"input_ids": self.examples[i]}


# Create dataset using sliding window
dataset = SlidingWindowDataset(file_path, tokenizer, block_size, stride)

# Print dataset statistics
print(f"Total sequences: {len(dataset)}")  # More sequences now!


Total sequences: 437213


In [31]:
# Access the first item
example = dataset[1]  

# Print the tokenized output
print("First sample (token IDs):", example)
print(len(example["input_ids"]))

First sample (token IDs): {'input_ids': [45390, 5853, 18, 384, 1117, 300, 290, 3721, 877, 317, 302, 13828, 18, 2256, 290, 3721, 921, 302, 8053, 865, 4227, 18, 48353, 354, 1624, 18, 731, 3618, 302, 3997, 18, 48353, 2147, 422, 445, 427, 319, 728, 2304, 18, 430, 3997, 627, 302, 1111, 18, 49239, 2676, 18, 18, 18, 3907, 921, 48353, 610, 5853, 18, 17, 371, 18, 18, 18, 336, 562, 8385, 16, 908, 301, 1111, 18, 371, 18, 18, 18, 336, 374, 388, 308, 290, 3721, 8825, 18, 18, 18, 477, 18, 18, 18, 862, 562, 336, 35, 17, 583, 948, 317, 290, 5133, 2035, 300, 4438, 4230, 16, 908, 48353, 18, 17, 801, 302, 48069, 35, 8385, 2343, 308, 290, 2805, 18, 17, 2008, 546, 5, 5002, 537, 489, 1239, 8284, 334, 5]}
128


In [32]:
# Decode the first example back to text
decoded_text = tokenizer.decode(example["input_ids"], skip_special_tokens=False)
print("Decoded text:", decoded_text)

Decoded text: sluipt dichterbij. De deur van de kast staat op een kier. Uit de kast komt een schoentje tevoorschijn. Lila is bang. Er volgt een been. Lila vraagt zich af wat dat alles betekent. Het been wordt een kind. Ooooh... zachtjes komt Lila iets dichterbij. - Ik... ik ben Tom, zegt het kind. Ik... ik had me in de kast verstopt... En... Waar ben ik? - Je bent op de stortplaats van Merlijn, zegt Lila. - Op een vuilnisbelt? Tom kijkt in de verte. - Nou zeg! Mama zal wel erg ongerust zijn!


### Saving and Using a Custom Tokenizer

I save my trained tokenizer in **Hugging Face format**, which makes it reusable with models from the `transformers` library. In the second block, I use this tokenizer to process my dataset into **tokenized sequences of length 128**.


In [12]:
# Load tokenizer with only the vocab file
hf_tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)

# Add special tokens to tokenizer
hf_tokenizer.add_special_tokens({
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]"
})

# Save tokenizer in Hugging Face format
hf_tokenizer.save_pretrained(save_dir)

print("Tokenizer successfully saved in Hugging Face format!")

Tokenizer successfully saved in Hugging Face format!


In [13]:
# Load dataset as a single string
with open("../out/cleaned_books_small.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("custom_tokenizer")

# Tokenize entire text
tokenized_text = tokenizer(text, return_tensors="pt")["input_ids"][0]  # Flatten tensor

# Split into chunks of 128 tokens
chunk_size = 128
chunks = [tokenized_text[i : i + chunk_size] for i in range(0, len(tokenized_text), chunk_size)]

# Convert to dataset format
dataset = Dataset.from_dict({"input_ids": chunks})

# Split into train (80%), val (10%), and test (10%)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
valid_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Final dataset splits
final_datasets = {
    "train": train_test_split["train"],
    "validation": valid_test_split["train"],
    "test": valid_test_split["test"],
}

print("Dataset:", final_datasets)

# Convert to PyTorch format
for split in final_datasets:
    final_datasets[split].set_format(type="torch", columns=["input_ids"])

print("Tokenization complete!")

# Print the first item in the test set
print(final_datasets["test"][0])


Dataset: {'train': Dataset({
    features: ['input_ids'],
    num_rows: 3616
}), 'validation': Dataset({
    features: ['input_ids'],
    num_rows: 452
}), 'test': Dataset({
    features: ['input_ids'],
    num_rows: 452
})}
Tokenization complete!
{'input_ids': tensor([  13, 4663,   14,  223,  338,  182, 3617,  311,  175, 2967,   14,    8,
           8,  427,   28,    8,    8,  463,  492,  306,  205,  182, 3617,  232,
          14,   14,   14,    8,    8,  527,   28,    8,    8, 1179,   12,  205,
         338,  182, 3617,  311,  175, 2967,   14,    8,    8,   43,   14, 1313,
        1720,  208,  659, 1787,   28,    8,    8,  797,   14,   14,   14,  208,
         201,  576,   14,   14,   14,  576,  182,  387,  183,  231,   14,    8,
           8, 1179,   12,  208,  734,  252,  359,  358,  348,   12,    8,  283,
         191, 2274,   14,    8,  527,   14,   14,   14,    8,  257, 2721,  216,
        1675,  348,  198,  216, 4015,  177,  205,  380, 1873,  182,  580,  391,
         939,  610

### Making the model

In [14]:
# Define a new BERT configuration
config = BertConfig(
    vocab_size=len(tokenizer),
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072, 
    max_position_embeddings=128, 
    type_vocab_size=1,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id
)

# Initialize a new BERT model with this config
model = BertForMaskedLM(config)

# Adjust the model's vocabulary size to match the tokenizer
model.resize_token_embeddings(len(tokenizer))

print(model)


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(5000, 768, padding_idx=0)
      (position_embeddings): Embedding(128, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [15]:
# Randomly mask words in MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

In [42]:
training_args = TrainingArguments(
    output_dir="../bert_custom_checkpoints",
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=226,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=226, 
    num_train_epochs=4,
    save_total_limit=None,
    overwrite_output_dir=False,
    logging_dir="../logs",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [43]:
trainer.train()
trainer.save_model("../bert_custom_final")

KeyboardInterrupt: 

### Looking at the results of the model

#### Loading in the model and the tokenizer

In [4]:
model_path = "/Users/jonasklein/bert_custom_final"
tokenizer_path = "/Users/jonasklein/custom_tokenizer"

model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

Some weights of BertModel were not initialized from the model checkpoint at /Users/jonasklein/bert_custom_final and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
model = AutoModelForMaskedLM.from_pretrained(model_path)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=final_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.evaluate()

  trainer = Trainer(


{'eval_loss': 6.088505268096924,
 'eval_model_preparation_time': 0.0011,
 'eval_runtime': 7.6665,
 'eval_samples_per_second': 58.958,
 'eval_steps_per_second': 7.435}

In [40]:
import math
test_loss = trainer.evaluate()["eval_loss"]
perplexity = math.exp(test_loss)

print(f"Test Perplexity: {perplexity:.4f}")

Test Perplexity: 458.1433


#### Extract embedding for a certain word

In [None]:
token = "laag"

# Convert token to token ID
token_id = tokenizer.convert_tokens_to_ids(token)

# Check if token is in the vocabulary
if token_id is None or token_id == tokenizer.unk_token_id:
    print(f"Token '{token}' not found in the vocabulary! The embedding will be the one for the [UNK] token.")
else:
    print(f"Embedding for '{token}':")

# Convert token ID to tensor
input_ids = torch.tensor([[token_id]])  # Shape: (1, 1) -> Batch of 1, 1 token

# Get the model's input embeddings
#with torch.no_grad():
#    embedding_layer = model.get_input_embeddings()  # Extract input embeddings
#    token_embedding = embedding_layer(input_ids)  # Get embedding vector

# Get the output from the model (excluding MLM head, so the last hidden layer)
with torch.no_grad():
    outputs = model(input_ids, output_hidden_states=True) # This makes sure it is only passed through the BERT encoder
    hidden_states = outputs.hidden_states

# Extract the last hidden layer
last_hidden_state = hidden_states[-1]

# Retrieve the token embedding
print(last_hidden_state.shape)
token_embedding = last_hidden_state[0, -1, :]

print("Token Embedding Shape:", token_embedding.shape) # Shape: (768,)
print("Token Embedding:", token_embedding)

Embedding for 'laag':
torch.Size([1, 1, 768])
Token Embedding Shape: torch.Size([768])
Token Embedding: tensor([ 2.9236,  1.0079, -0.8346, -0.0273, -1.1866, -0.6646,  0.2908,  0.0036,
         1.3928, -0.6854, -0.1433,  1.7425, -1.7796,  0.0531,  0.0398, -0.6935,
        -0.5568, -0.5510,  0.5848, -0.2226,  1.6323, -0.3731,  1.0677, -1.6748,
         0.0895, -1.0691, -1.7554,  0.2883, -0.1404,  1.4276,  0.5130, -1.4716,
        -1.4637, -0.8977,  0.8635,  0.0059,  1.6054, -0.5841,  0.3528,  1.0565,
         2.3417, -1.2483, -1.0305, -1.0594,  0.3393,  0.3274, -0.8450, -0.0059,
        -0.1178,  1.7839, -0.0681, -0.7162, -1.1627,  0.0797, -0.7405,  1.3743,
        -0.4501,  0.0983,  0.3862, -0.8646, -0.8856, -0.1382, -0.9937, -1.0002,
         0.2911, -1.2921,  0.8013, -1.0002,  0.3577,  0.2958,  0.7073, -0.9498,
        -0.0648,  1.6411,  0.2564, -0.1371,  0.7749,  0.5836,  0.5437,  0.9387,
        -0.8600,  0.0299,  2.8112, -1.4452,  0.8423, -0.0889,  0.7509, -2.0065,
         0.0670,

In [52]:
print(tokenizer.convert_tokens_to_ids("laag"))

2358


#### Compute cosine similarity between embeddings of two different words

In [None]:
import torch.nn.functional as F

def cosine_similarity(embedding1, embedding2):
    return F.cosine_similarity(embedding1, embedding2, dim=-1)

token1 = "laag"
token2 = "hoog"

# Convert tokens to IDs
token_id1 = tokenizer.convert_tokens_to_ids(token1)
token_id2 = tokenizer.convert_tokens_to_ids(token2)

# Ensure both tokens exist in the vocabulary
if token_id1 is None or token_id2 is None:
    raise ValueError(f"One of the tokens ('{token1}', '{token2}') is not in the vocabulary.")

# Convert token IDs to tensors
input_ids1 = torch.tensor([[token_id1]])
input_ids2 = torch.tensor([[token_id2]])

with torch.no_grad():
    outputs = model(input_ids1, output_hidden_states=True) 
    hidden_states1 = outputs.hidden_states

last_hidden_state1 = hidden_states1[-1]

# Retrieve the token embedding
print(last_hidden_state1.shape)
token_embedding1 = last_hidden_state1[0, -1, :]

with torch.no_grad():
    outputs = model(input_ids2, output_hidden_states=True)
    hidden_states2 = outputs.hidden_states

# Extract the last hidden layer
last_hidden_state2 = hidden_states2[-1]

# Retrieve the token embedding
print(last_hidden_state2.shape)
token_embedding2 = last_hidden_state2[0, -1, :]

# Compute cosine similarity
similarity = cosine_similarity(token_embedding1, token_embedding2)
print(f"Cosine Similarity between '{token1}' and '{token2}':", similarity.item())


torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
Cosine Similarity between 'laag' and 'hoog': 0.5117028951644897
