# Example 

In [2]:
from transformers import AutoTokenizer 
def tok_list(tokenizer, text):
    """Return the decoded string for each token id that ``tokenizer`` yields for ``text``.

    The text is encoded with ``add_special_tokens=False`` and every id in the
    resulting ``"input_ids"`` entry is passed to ``tokenizer.decode`` one at a
    time, so the returned list has one string per token id.
    """
    encoding = tokenizer(text, add_special_tokens=False)
    pieces = []
    for token_id in encoding["input_ids"]:
        pieces.append(tokenizer.decode(token_id))
    return pieces
# Load the pretrained "t5-base" and "camembert-base" tokenizers.
# NOTE(review): `tokenzier_camembert` is a misspelling of "tokenizer"; the name is
# left unchanged here because other cells may refer to it.
tokenizer_T5 = AutoTokenizer.from_pretrained("t5-base")
tokenzier_camembert = AutoTokenizer.from_pretrained("camembert-base")

# Print the per-token decoded pieces each tokenizer produces for a sample word.
print(f'T5 tokens for "sex": {tok_list(tokenizer_T5,"sex")}')
print(f'CamemBERT tokens for "being": {tok_list(tokenzier_camembert,"being")}')
   

T5 tokens for "sex": ['', 's', 'ex']
CamemBERT tokens for "being": ['be', 'ing']


# The tokenizer model and measuring tokenizer performance

In [3]:
from transformers import AutoTokenizer
# Sample Python source (kept as a raw string) used as tokenizer input in the cells below.
python_code = r"""def say_hello():
 print("Hello, World!")
# Print it
say_hello()
"""
# Load the pretrained "gpt2" tokenizer and print the tokens it produces for the sample.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer(python_code).tokens())


['def', 'Ġsay', '_', 'hello', '():', 'Ċ', 'Ġprint', '("', 'Hello', ',', 'ĠWorld', '!"', ')', 'Ċ', '#', 'ĠPrint', 'Ġit', 'Ċ', 'say', '_', 'hello', '()', 'Ċ']


In [6]:
# Inspect the normalizer attached to the backend tokenizer (the recorded output is None).
print(tokenizer.backend_tokenizer.normalizer)

None


In [11]:
# Run only the pre-tokenization step on the sample code; the recorded output is a list of
# (piece, (start, end)) tuples.
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))

[('def', (0, 3)), ('Ġsay', (3, 7)), ('_', (7, 8)), ('hello', (8, 13)), ('():', (13, 16)), ('Ċ', (16, 17)), ('Ġprint', (17, 23)), ('("', (23, 25)), ('Hello', (25, 30)), (',', (30, 31)), ('ĠWorld', (31, 37)), ('!")', (37, 40)), ('Ċ', (40, 41)), ('#', (41, 42)), ('ĠPrint', (42, 48)), ('Ġit', (48, 51)), ('Ċ', (51, 52)), ('say', (52, 55)), ('_', (55, 56)), ('hello', (56, 61)), ('()', (61, 63)), ('Ċ', (63, 64))]


In [12]:
# Compare the UTF-8 encoding of an ASCII character (one byte) with that of a
# non-ASCII character (three bytes for the euro sign).
a, e = "a", "€"
# `a` encodes to a single byte, so `ord` on the length-1 bytes object gives its value.
byte = ord(a.encode("utf-8"))
print(f'`{a}` is encoded as `{a.encode("utf-8")}` with a single byte: {byte}')
# Iterating a bytes object already yields ints, so `list` gives the byte values directly.
byte = list(e.encode("utf-8"))
print(f'`{e}` is encoded as `{e.encode("utf-8")}` with three bytes: {byte}')


`a` is encoded as `b'a'` with a single byte: 97
`€` is encoded as `b'\xe2\x82\xac'` with three bytes: [226, 130, 172]


In [13]:
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
# Mapping returned by the GPT-2 tokenizer module's `bytes_to_unicode`, plus its inverse
# (values become keys).
byte_to_unicode_map = bytes_to_unicode()
unicode_to_byte_map = {char: byte_value for byte_value, char in byte_to_unicode_map.items()}
# The base vocabulary is the inverse map's keys, in iteration order.
base_vocab = list(unicode_to_byte_map)
print(f'Size of our base vocabulary: {len(base_vocab)}')
print(f'First element: `{base_vocab[0]}`, last element: `{base_vocab[-1]}`')


Size of our base vocabulary: 256
First element: `!`, last element: `Ń`


In [33]:
# Display the whole base vocabulary list as the cell's output (bare last expression).
base_vocab

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '¡',
 '¢',
 '£',
 '¤',
 '¥',
 '¦',
 '§',
 '¨',
 '©',
 'ª',
 '«',
 '¬',
 '®',
 '¯',
 '°',
 '±',
 '²',
 '³',
 '´',
 'µ',
 '¶',
 '·',
 '¸',
 '¹',
 'º',
 '»',
 '¼',
 '½',
 '¾',
 '¿',
 'À',
 'Á',
 'Â',
 'Ã',
 'Ä',
 'Å',
 'Æ',
 'Ç',
 'È',
 'É',
 'Ê',
 'Ë',
 'Ì',
 'Í',
 'Î',
 'Ï',
 'Ð',
 'Ñ',
 'Ò',
 'Ó',
 'Ô',
 'Õ',
 'Ö',
 '×',
 'Ø',
 'Ù',
 'Ú',
 'Û',
 'Ü',
 'Ý',
 'Þ',
 'ß',
 'à',
 'á',
 'â',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ê

In [14]:
# Pre-tokenize the sample code again (same call as the In [11] cell) for reference next to
# the base vocabulary.
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(python_code))

[('def', (0, 3)), ('Ġsay', (3, 7)), ('_', (7, 8)), ('hello', (8, 13)), ('():', (13, 16)), ('Ċ', (16, 17)), ('Ġprint', (17, 23)), ('("', (23, 25)), ('Hello', (25, 30)), (',', (30, 31)), ('ĠWorld', (31, 37)), ('!")', (37, 40)), ('Ċ', (40, 41)), ('#', (41, 42)), ('ĠPrint', (42, 48)), ('Ġit', (48, 51)), ('Ċ', (51, 52)), ('say', (52, 55)), ('_', (55, 56)), ('hello', (56, 61)), ('()', (61, 63)), ('Ċ', (63, 64))]


In [15]:
# Report the tokenizer's size as given by len(tokenizer).
print(f"Size of the vocabulary: {len(tokenizer)}")


Size of the vocabulary: 50257


In [16]:
# Run the full tokenizer on the sample code and print the resulting tokens.
print(tokenizer(python_code).tokens())


['def', 'Ġsay', '_', 'hello', '():', 'Ċ', 'Ġprint', '("', 'Hello', ',', 'ĠWorld', '!"', ')', 'Ċ', '#', 'ĠPrint', 'Ġit', 'Ċ', 'say', '_', 'hello', '()', 'Ċ']


# Training a Tokenizer

In [25]:
# Rank the vocabulary items by the length of their token string, longest first.
tokens = sorted(
    tokenizer.vocab.items(),
    key=lambda entry: len(entry[0]),
    reverse=True,
)
# Show the five longest raw items, then the eight longest tokens next to their
# string form.
print(tokens[:5])
print([f't: {t} {tokenizer.convert_tokens_to_string([t])}' for t, _ in tokens[:8]])




In [32]:
# Rank the vocabulary items by their second element (the value stored for each token),
# largest first, and print the string form of the top twelve.
tokens = sorted(
    tokenizer.vocab.items(),
    key=lambda entry: entry[1],
    reverse=True,
)
print([f'{tokenizer.convert_tokens_to_string([t])}' for t, _ in tokens[:12]])

['<|endoftext|>', ' gazed', ' informants', ' Collider', ' regress', 'ominated', ' amplification', 'Compar', '…."', ' (/', 'Commission', ' Hitman']


In [None]:
from tqdm.auto import tqdm 
from datasets import load_dataset
# Upper bound used by `batch_iterator` for the range it steps through.
length = 100_000
dataset_name = "transformerbook/codeparrot-train"
# Open the train split in streaming mode and keep a single shared iterator over it.
dataset = load_dataset(dataset_name, split="train", streaming=True)
iter_dataset = iter(dataset)

def batch_iterator(batch_size = 10 ):
    """Yield lists of ``'content'`` strings pulled from the shared ``iter_dataset``.

    One batch is produced per step of ``range(0, length, batch_size)``, with a
    tqdm progress bar over those steps. Each batch holds up to ``batch_size``
    items; the last one may be shorter if the dataset runs out.

    Args:
        batch_size: Number of examples to pull from ``iter_dataset`` per batch.

    Yields:
        list: The ``'content'`` field of each example in the batch.
    """
    from itertools import islice  # local import keeps this cell self-contained

    for _ in tqdm(range(0, length, batch_size)):
        # islice stops quietly when the stream is exhausted. A bare next() here would
        # raise StopIteration inside a generator, which Python turns into RuntimeError.
        batch = [example['content'] for example in islice(iter_dataset, batch_size)]
        if not batch:
            # Nothing left in the dataset: end the generator instead of yielding empty batches.
            return
        yield batch
# Train a new tokenizer on the batches from `batch_iterator`.
# `train_new_from_iterator` is a method of a loaded fast tokenizer instance, not an
# attribute of the `AutoTokenizer` factory class, so it is called on the `tokenizer`
# loaded earlier in the notebook.
new_tokenizer = tokenizer.train_new_from_iterator(batch_iterator(),
                                                  vocab_size = 12500 , 
                                                  initial_alphabet = base_vocab)