In [1]:
from tokenizers import ByteLevelBPETokenizer

In [7]:
data_path = 'smaller.txt'
tokenizer = ByteLevelBPETokenizer()

In [8]:
# Customize training
tokenizer.train(files=data_path, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer.save_model('smaller_tokenizer')

['smaller_tokenizer\\vocab.json', 'smaller_tokenizer\\merges.txt']

In [13]:
inp = "Hello, good morning"
t = tokenizer.encode(inp)

print(t.ids)
print(t.tokens)

[7482, 16, 6800, 339, 272, 2107]
['Hello', ',', 'Ġgood', 'Ġm', 'or', 'ning']


In [14]:
tokenizer.decode(t.ids)

'Hello, good morning'

In [None]:
!pip install datasets

In [10]:
# Load GPT2 Tokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset




In [17]:
new_tokenizer = GPT2Tokenizer.from_pretrained("./smaller_tokenizer")
new_tokenizer.add_special_tokens({
    "eos_token":"</s>",
    "bos_token":"<s>",
    "unk_token":"<unk>",
    "pad_token":"<pad>",
    "mask_token":"<mask>"
})

example = """"class LinearLayer():
def __init__(self, input_size, output_size):
    self.weight = torch.randn(input_size, output_size)
    self.bias = torch.zeros(output_size)

def __call__(self, x):
    return x @ self.weights + self.bias
"""

t = new_tokenizer.tokenize(example)
print(t)

['"', 'class', 'ĠLinear', 'Layer', '():', 'Ċ', 'def', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'Ċ', 'ĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'Ċ', 'ĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'Ċ', 'Ċ', 'def', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'Ċ', 'ĠĠĠ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'Ċ']


In [18]:
gpt_tokenizer = new_tokenizer.encode(inp)
print(gpt_tokenizer)

[7482, 16, 6800, 339, 272, 2107]


In [19]:
tokenizer.decode(gpt_tokenizer)

'Hello, good morning'