In [1]:
# One-off setup, kept commented out for reference: load the Llama-2-7b
# tokenizer through `transformers` (fast variant) and save it into the local
# tokenizer_checkpoints/ directory.
# NOTE(review): presumably this is what produced the tokenizer files loaded by
# the cells below -- confirm before deleting this cell.
# import torch
# from transformers import AutoTokenizer

# model_id="meta-llama/Llama-2-7b-hf"

# hf_tokenizer = AutoTokenizer.from_pretrained(model_id,
#                                          cache_dir="/home/pranav-pc/projects/OpenTransformer/multiformer/tokenizer_checkpoints/external",
#                                          use_fast=True)
# hf_tokenizer.save_pretrained('/home/pranav-pc/projects/OpenTransformer/multiformer/tokenizer_checkpoints/')

In [2]:
from tokenizers import Tokenizer as HFTokenizer

# Build the Hugging Face `tokenizers` tokenizer from its serialized JSON file.
hf_tokenizer_path = "/home/pranav-pc/projects/OpenTransformer/multiformer/tokenizer_checkpoints/tokenizer.json"
hf_tokenizer = HFTokenizer.from_file(hf_tokenizer_path)

In [3]:
from sentencepiece import SentencePieceProcessor

In [23]:
# Load the SentencePiece model; BOS/EOS insertion is switched on, so every
# encoded sequence is produced with both options enabled.
sp_model_path = "/home/pranav-pc/projects/OpenTransformer/multiformer/tokenizer_checkpoints/tokenizer.model"
sp_tokenizer = SentencePieceProcessor(model_file=sp_model_path, add_bos=True, add_eos=True)

## Benchmark

In [5]:
# Benchmark corpus: the "text" column of the IMDB test split, fetched through
# the Hugging Face `datasets` library.
from datasets import load_dataset

dataset = load_dataset("imdb")
test_split = dataset["test"]
raw_text = test_split["text"]

In [6]:
# Hugging Face encode timing: one `encode` call per text inside a Python list
# comprehension, keeping only the `.ids` of each result.
%timeit [hf_tokenizer.encode(text).ids for text in raw_text]

6.25 s ± 44.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# SentencePiece encode timing: the whole `raw_text` collection is handed over
# in a single `encode` call (no Python-level loop).
%timeit sp_tokenizer.encode(raw_text)

546 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Hugging Face round-trip timing: each text is encoded and its ids are decoded
# again, one text at a time inside a list comprehension.
%timeit [hf_tokenizer.decode(hf_tokenizer.encode(text).ids) for text in raw_text]

8.9 s ± 175 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# SentencePiece round-trip timing: one `encode` call over the whole collection,
# whose result is passed straight to a single `decode` call.
%timeit sp_tokenizer.decode(sp_tokenizer.encode(raw_text))

769 ms ± 24.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


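In these runs SentencePiece is roughly 11× faster than the Hugging Face tokenizer, both for encoding (546 ms vs. 6.25 s) and for the encode–decode round trip (769 ms vs. 8.9 s). Hugging Face offers richer functionality, but `SentencePieceProcessor` suffices for our use case. Note that the Hugging Face timings loop over the texts one at a time in Python, whereas SentencePiece receives the whole list in a single call, so part of the gap may come from batching rather than from the tokenizers themselves.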

In [55]:
# Encode the whole corpus with SentencePiece in one call; the next cell takes
# len() of each element of the result.
tokens = sp_tokenizer.encode(raw_text)

In [62]:
# Length of every encoded sequence in `tokens`, displayed as the cell output.
list(map(len, tokens))

[363,
 319,
 160,
 516,
 169,
 241,
 404,
 234,
 200,
 248,
 172,
 249,
 186,
 559,
 190,
 258,
 340,
 206,
 397,
 354,
 222,
 610,
 185,
 517,
 383,
 271,
 378,
 278,
 188,
 109,
 299,
 289,
 549,
 214,
 319,
 204,
 796,
 319,
 169,
 266,
 348,
 45,
 270,
 306,
 256,
 186,
 509,
 80,
 210,
 47,
 813,
 84,
 242,
 227,
 665,
 570,
 987,
 902,
 442,
 233,
 875,
 71,
 218,
 112,
 101,
 144,
 67,
 627,
 187,
 843,
 353,
 407,
 287,
 1139,
 632,
 313,
 692,
 228,
 487,
 482,
 211,
 652,
 208,
 180,
 215,
 425,
 755,
 271,
 795,
 366,
 308,
 237,
 234,
 926,
 194,
 369,
 190,
 310,
 700,
 206,
 195,
 78,
 400,
 392,
 217,
 216,
 298,
 81,
 245,
 989,
 223,
 187,
 514,
 188,
 188,
 367,
 286,
 1490,
 626,
 289,
 308,
 490,
 548,
 229,
 239,
 364,
 194,
 227,
 209,
 424,
 688,
 917,
 279,
 308,
 447,
 203,
 209,
 591,
 490,
 343,
 742,
 331,
 236,
 60,
 173,
 451,
 160,
 228,
 157,
 113,
 81,
 385,
 63,
 346,
 56,
 104,
 188,
 452,
 152,
 253,
 915,
 226,
 180,
 183,
 294,
 245,
 176,
 371,
 7