# 1. BPE (Byte Pair Encoding) Tokenization

In [3]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

In [2]:
!pip install tokenizers

Defaulting to user installation because normal site-packages is not writeable
Collecting tokenizers
  Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Downloading tokenizers-0.20.1-cp312-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   -- ------------------------------------- 0.2/2.4 MB 2.4 MB/s eta 0:00:01
   ------ --------------------------------- 0.4/2.4 MB 3.6 MB/s eta 0:00:01
   ---------- ----------------------------- 0.6/2.4 MB 3.9 MB/s eta 0:00:01
   ------------- -------------------------- 0.8/2.4 MB 4.0 MB/s eta 0:00:01
   ---------------- ----------------------- 1.0/2.4 MB 4.0 MB/s eta 0:00:01
   ------------------- -------------------- 1.2/2.4 MB 4.1 MB/s eta 0:00:01
   ----------------------- ---------------- 1.4/2.4 MB 4.0 MB/s eta 0:00:01
   --------------------------- ------------ 1.6/2.4 MB 4.1 MB/s eta 0:00:01
   ------------

In [5]:

# Build a BPE tokenizer.
# `unk_token` has to be configured on the model itself: listing "[UNK]" among
# the trainer's special tokens only reserves a vocabulary entry for it, it does
# not make the model emit it for symbols that were never seen during training.
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Split raw text into words before the BPE model processes it
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Trainer configuration: tiny vocabulary for demonstration purposes, a pair
# must occur at least twice to be merged, and the special tokens are added
# to the vocabulary up front.
trainer = trainers.BpeTrainer(
    vocab_size=50,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

# Sample training data.
# NOTE: despite the name, `files` holds in-memory sentences, not file paths;
# that is why training goes through `train_from_iterator` below.
files = ["low lower newest", "newest lower low", "low and lower"]

# Train the tokenizer on the in-memory corpus
tokenizer.train_from_iterator(files, trainer=trainer)

# Encode a sample text and show both the sub-word tokens and their vocabulary IDs
encoded = tokenizer.encode("newest lower low")
print("Tokens:", encoded.tokens)
print("IDs:", encoded.ids)


Tokens: ['newest', 'lower', 'low']
IDs: [23, 18, 16]


# 2. WordPiece Tokenization

In [7]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     --------- ------------------------------ 10.2/44.1 kB ? eta -:--:--
     -------------------------- ----------- 30.7/44.1 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 44.1/44.1 kB 308.1 kB/s eta 0:00:00
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
    --------------------------------------- 0.1/10.0 MB 2.8 MB/s eta 0:00:04
   - -------------------------------------- 0.4/10.0 MB 3.8 MB/s eta 0:00:03
   -- ------------------------------------- 0.6/10.0 MB 4.1 MB/s eta 0:00:03
   --- ------------------------------------ 0.8/10.0 MB 4.3 MB/s eta 0:00:03
   ---- ----------------------------------- 1.1/10.0 MB 4.8 MB/s eta 0:00:02
   ----- --



In [8]:
from transformers import BertTokenizer

# Pre-trained BERT tokenizer; its vocabulary is WordPiece-based.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sentence used for the demonstration
text = "The quick brown fox jumps over the lazy dog."

# --- Compute everything first ------------------------------------------------

# Sub-word tokens only (no special tokens are added by `tokenize`)
tokens = tokenizer.tokenize(text)

# Vocabulary IDs for the same text. Because `add_special_tokens=True`, the
# sequence is wrapped in [CLS] ... [SEP], so it is two entries longer than
# `tokens` and the two lists do not line up index-for-index.
input_ids = tokenizer.encode(text, add_special_tokens=True)

# Map the IDs back to a string (special tokens are kept in the result)
decoded_text = tokenizer.decode(input_ids)

# --- Then report ---------------------------------------------------------------
print(f"Original Text: {text}")
print(f"Tokens: {tokens}")
print(f"Input IDs: {input_ids}")
print(f"Decoded Text: {decoded_text}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Original Text: The quick brown fox jumps over the lazy dog.
Tokens: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
Input IDs: [101, 1996, 4248, 2829, 4419, 14523, 2058, 1996, 13971, 3899, 1012, 102]
Decoded Text: [CLS] the quick brown fox jumps over the lazy dog. [SEP]
