In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

dataset = load_dataset('csv', data_files='dataset/train.csv', split='train').select(range(100000))
dataset = dataset.filter(lambda x : isinstance(x['text'], str))

In [3]:
print(dataset['text'][0])

One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.

Lily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."

Together, they shared the needle and sewed the button on Lily's shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.


In [4]:
from tqdm import tqdm

training_corpus = [
  f'<|start_story|>{text}<|end_story|>' for i in tqdm(range(0, len(dataset['text']), 4096)) for text in dataset['text'][i:i+4096]
]

100%|██████████| 25/25 [00:24<00:00,  1.04it/s]


In [5]:
example = """<|start_story|>Once upon a time, in a big forest, there lived a rhinoceros named Roxy. Roxy loved to climb. She climbed trees, rocks, and hills. One day, Roxy found an icy hill. She had never seen anything like it before. It was shiny and cold, and she wanted to climb it. Roxy tried to climb the icy hill, but it was very slippery. She tried again and again, but she kept falling down. Roxy was sad. She wanted to climb the icy hill so much. Then, she saw a little bird named Billy. Billy saw that Roxy was sad and asked, "Why are you sad, Roxy?" Roxy told Billy about the icy hill and how she couldn't climb it. Billy said, "I have an idea! Let's find some big leaves to put under your feet. They will help you climb the icy hill." Roxy and Billy looked for big leaves and found some. Roxy put the leaves under her feet and tried to climb the icy hill again. This time, Roxy didn't slip. She climbed and climbed until she reached the top of the icy hill. Roxy was so happy! She and Billy played on the icy hill all day. From that day on, Roxy and Billy were the best of friends, and they climbed and played together all the time. And Roxy learned that with a little help from a friend, she could climb anything. <|end_story|>""""""<|start_story|>Once upon a time, in a big forest, there lived a rhinoceros named Roxy.<|end_story|>"""

from tokenizers import (
  decoders, models, Tokenizer, pre_tokenizers, trainers, processors
)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
print(tokenizer.pre_tokenizer.pre_tokenize_str(example))


[('<|', (0, 2)), ('start', (2, 7)), ('_', (7, 8)), ('story', (8, 13)), ('|>', (13, 15)), ('Once', (15, 19)), ('Ġupon', (19, 24)), ('Ġa', (24, 26)), ('Ġtime', (26, 31)), (',', (31, 32)), ('Ġin', (32, 35)), ('Ġa', (35, 37)), ('Ġbig', (37, 41)), ('Ġforest', (41, 48)), (',', (48, 49)), ('Ġthere', (49, 55)), ('Ġlived', (55, 61)), ('Ġa', (61, 63)), ('Ġrhinoceros', (63, 74)), ('Ġnamed', (74, 80)), ('ĠRoxy', (80, 85)), ('.', (85, 86)), ('ĠRoxy', (86, 91)), ('Ġloved', (91, 97)), ('Ġto', (97, 100)), ('Ġclimb', (100, 106)), ('.', (106, 107)), ('ĠShe', (107, 111)), ('Ġclimbed', (111, 119)), ('Ġtrees', (119, 125)), (',', (125, 126)), ('Ġrocks', (126, 132)), (',', (132, 133)), ('Ġand', (133, 137)), ('Ġhills', (137, 143)), ('.', (143, 144)), ('ĠOne', (144, 148)), ('Ġday', (148, 152)), (',', (152, 153)), ('ĠRoxy', (153, 158)), ('Ġfound', (158, 164)), ('Ġan', (164, 167)), ('Ġicy', (167, 171)), ('Ġhill', (171, 176)), ('.', (176, 177)), ('ĠShe', (177, 181)), ('Ġhad', (181, 185)), ('Ġnever', (185, 191)), 

In [6]:
trainer = trainers.BpeTrainer(vocab_size=2048, special_tokens=["<|start_story|>", "<|end_story|>"])
tokenizer.train_from_iterator(tqdm(iter(training_corpus)), trainer=trainer)

99983it [00:08, 12210.04it/s]


In [7]:
encoding = tokenizer.encode(example)
print(encoding.tokens)
print(encoding.ids)

['<|start_story|>', 'Once', 'Ġupon', 'Ġa', 'Ġtime', ',', 'Ġin', 'Ġa', 'Ġbig', 'Ġforest', ',', 'Ġthere', 'Ġlived', 'Ġa', 'Ġr', 'h', 'in', 'o', 'cer', 'os', 'Ġnamed', 'ĠR', 'o', 'x', 'y', '.', 'ĠR', 'o', 'x', 'y', 'Ġloved', 'Ġto', 'Ġclimb', '.', 'ĠShe', 'Ġclimbed', 'Ġtrees', ',', 'Ġrocks', ',', 'Ġand', 'Ġhill', 's', '.', 'ĠOne', 'Ġday', ',', 'ĠR', 'o', 'x', 'y', 'Ġfound', 'Ġan', 'Ġ', 'icy', 'Ġhill', '.', 'ĠShe', 'Ġhad', 'Ġnever', 'Ġseen', 'Ġanything', 'Ġlike', 'Ġit', 'Ġbefore', '.', 'ĠIt', 'Ġwas', 'Ġshiny', 'Ġand', 'Ġcold', ',', 'Ġand', 'Ġshe', 'Ġwanted', 'Ġto', 'Ġclimb', 'Ġit', '.', 'ĠR', 'o', 'x', 'y', 'Ġtried', 'Ġto', 'Ġclimb', 'Ġthe', 'Ġ', 'icy', 'Ġhill', ',', 'Ġbut', 'Ġit', 'Ġwas', 'Ġvery', 'Ġsl', 'i', 'pper', 'y', '.', 'ĠShe', 'Ġtried', 'Ġagain', 'Ġand', 'Ġagain', ',', 'Ġbut', 'Ġshe', 'Ġkept', 'Ġfall', 'ing', 'Ġdown', '.', 'ĠR', 'o', 'x', 'y', 'Ġwas', 'Ġsad', '.', 'ĠShe', 'Ġwanted', 'Ġto', 'Ġclimb', 'Ġthe', 'Ġ', 'icy', 'Ġhill', 'Ġso', 'Ġmuch', '.', 'ĠThen', ',', 'Ġshe', 'Ġsaw', 'Ġa

In [8]:
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

In [9]:
print(tokenizer.decode(encoding.ids))

Once upon a time, in a big forest, there lived a rhinoceros named Roxy. Roxy loved to climb. She climbed trees, rocks, and hills. One day, Roxy found an icy hill. She had never seen anything like it before. It was shiny and cold, and she wanted to climb it. Roxy tried to climb the icy hill, but it was very slippery. She tried again and again, but she kept falling down. Roxy was sad. She wanted to climb the icy hill so much. Then, she saw a little bird named Billy. Billy saw that Roxy was sad and asked, "Why are you sad, Roxy?" Roxy told Billy about the icy hill and how she couldn't climb it. Billy said, "I have an idea! Let's find some big leaves to put under your feet. They will help you climb the icy hill." Roxy and Billy looked for big leaves and found some. Roxy put the leaves under her feet and tried to climb the icy hill again. This time, Roxy didn't slip. She climbed and climbed until she reached the top of the icy hill. Roxy was so happy! She and Billy played on the icy hill al

In [10]:
from transformers import PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
  tokenizer_object=tokenizer,
  bos_token='<|start_story|>',
  eos_token='<|end_story|>',
)

In [18]:
encoding = wrapped_tokenizer.encode('<|start_story|>Once upon a time, ')

In [20]:
wrapped_tokenizer.decode(encoding, skip_special_tokens=True)

'Once upon a time, '

In [21]:
tokenizer.save('tokenizer.json', pretty=True)