# HuggingFace's Tokenizers timing experiment

## Notebook set-up

### Imports

In [1]:
import os, time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import pandas as pd
import torch
from tokenizers import BertWordPieceTokenizer
from tokenizers.processors import TemplateProcessing
from tqdm.auto import tqdm
from transformers import BertTokenizer

from re_sent_splitter import split_into_sentences

In [2]:
# Locations of the WordPiece vocab file, the raw corpus and the model
# checkpoint directory. The vm_* paths live under /home/americanthinker and the
# local_* paths under /Users/americanthinker1 -- presumably a remote VM and a
# local machine respectively (TODO confirm).
# NOTE(review): these are hard-coded, user-specific absolute paths, which makes
# the notebook non-portable. Of these names, only vm_data is read by the cells
# shown in this notebook.
vm_tok_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
vm_data = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Data/english_docs_aa.txt'
local_tok_path = '/Users/americanthinker1/NationalSecurityBERT/Preprocessing/Tokenization/wp-vocab-30500-vocab.txt'
local_data = '/Users/americanthinker1/aws_data/processed_data/processed_chunks/english_docs_aa.txt'
model_path = '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/model_checkpoints/'

### Loading the text data

In [3]:
# Load the corpus as a list of strings, one entry per line of the file.
# The encoding is pinned so the result does not depend on the machine's locale.
with open(vm_data, encoding='utf-8') as f:
    docs = f.read().splitlines()

In [4]:
# Sanity check: number of lines (documents) loaded into `docs`.
print(f'Number of docs: {len(docs)}')

Number of docs: 98862


### Initializing the tokenizers

In [5]:
# WordPiece tokenizer from the `tokenizers` library, built from the project's
# vocab file with lowercasing and accent stripping enabled.
# NOTE(review): the vocab path is relative to the notebook's working directory,
# whereas the corpus is opened via the absolute `vm_data` path -- confirm both
# resolve on the same machine.
# The `transformers` counterpart is currently disabled:
#Transformers_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer = BertWordPieceTokenizer('../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt', strip_accents=True, lowercase=True)

### Initializing the executor

In [6]:
# Thread pool shared by the concurrent.futures timing cells, with one worker
# per CPU reported by the OS.
n_cpus = os.cpu_count()
executor = ThreadPoolExecutor(max_workers=n_cpus)
print(f'Number of CPUs: {n_cpus}')

Number of CPUs: 24


## Transformers vs Tokenizers

### Tokenizers' BertWordPieceTokenizer

In [7]:
# First 500 documents: the small slice used by the per-document timing cells.
sample = docs[:500]

# Run the sentence splitter over every document in the full corpus, keeping
# one result per document (progress shown via tqdm).
sentences = [split_into_sentences(document) for document in tqdm(docs)]

  0%|          | 0/98862 [00:00<?, ?it/s]

In [8]:
# Flatten the per-document sentence lists into one list, keeping only
# sentences with more than four whitespace-delimited words.
all_sentences = [
    sentence
    for doc in sentences
    for sentence in doc
    if len(sentence.split()) > 4
]

In [9]:
# `all_sentences` was already restricted to sentences of more than four
# whitespace-delimited words when it was built, so the second filtering pass
# that used to live here was a no-op and has been dropped.
# Word count of every kept sentence; the summary DataFrame cell reads this.
lengths = [len(sentence.split()) for sentence in all_sentences]

In [84]:
# One-column DataFrame of sentence lengths (words per sentence), built so the
# distribution can be summarised with pandas. `pd` comes from the notebook's
# top import cell.
df = pd.DataFrame(lengths)

In [85]:
# Summary statistics (count, mean, std, quartiles, min/max) of `df`.
df.describe()

Unnamed: 0,0
count,7105466.0
mean,24.11707
std,20.76012
min,5.0
25%,15.0
50%,21.0
75%,29.0
max,1851.0


In [32]:
# Baseline: encode the sample one document at a time on a single thread and
# report the elapsed wall-clock time in seconds.
# Uses `tokenizer`, the only tokenizer object this notebook defines, and the
# `time` module from the top import cell.
s = time.perf_counter()
Tokenizers_tokenized = []

for doc_count, doc in enumerate(sample, start=1):
    Tokenizers_tokenized.append(tokenizer.encode(doc).ids)
    # Progress line every 1000 documents.
    if doc_count % 1000 == 0:
        print(f'{doc_count} docs completed. {len(sample) - doc_count} docs to go')
e = time.perf_counter() - s
e

1000 docs completed. 9000 docs to go
2000 docs completed. 8000 docs to go
3000 docs completed. 7000 docs to go
4000 docs completed. 6000 docs to go
5000 docs completed. 5000 docs to go
6000 docs completed. 4000 docs to go
7000 docs completed. 3000 docs to go
8000 docs completed. 2000 docs to go
9000 docs completed. 1000 docs to go
10000 docs completed. 0 docs to go


85.16536681899743

### concurrent.futures submit

In [10]:
%%timeit -n 1 -r 5
submit_tokenized = []
encoded_futures = [executor.submit(Tokenizers_tokenizer.encode, sentence) for sentence in text_data]
for encoded_future in as_completed(encoded_futures):
    submit_tokenized.append(encoded_future.result().ids)

1min 8s ± 1.57 s per loop (mean ± std. dev. of 5 runs, 1 loop each)


### concurrent.futures map

In [11]:
%%timeit -n 1 -r 5
map_tokenized = [encoded.ids for encoded in executor.map(Tokenizers_tokenizer.encode, text_data)]

1min 9s ± 3.91 s per loop (mean ± std. dev. of 5 runs, 1 loop each)


### encode_batch

In [36]:
%%timeit -n 1 -r 5
batch_encode_tokenized = Tokenizers_tokenizer.encode_batch(sample)

7.56 s ± 181 ms per loop (mean ± std. dev. of 5 runs, 1 loop each)


In [10]:
# Configure `tokenizer` in place for fixed-size model input: truncation,
# padding, and a BERT-style [CLS] ... [SEP] template.
MAX_SEQ_LEN = 50  # maximum encoded sequence length passed to enable_truncation

# Resolve every special token against the loaded vocab once, and fail loudly
# if any is missing rather than passing None ids into the tokenizer.
special_ids = {
    token: tokenizer.token_to_id(token)
    for token in ("[PAD]", "[CLS]", "[SEP]", "[MASK]")
}
missing = [token for token, token_id in special_ids.items() if token_id is None]
assert not missing, f"special tokens missing from vocab: {missing}"

tokenizer.enable_truncation(max_length=MAX_SEQ_LEN)
# Pass the vocab's real [PAD] id instead of relying on the library default of 0.
tokenizer.enable_padding(pad_id=special_ids["[PAD]"], pad_token="[PAD]")
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_ids["[CLS]"]),
        ("[SEP]", special_ids["[SEP]"]),
        ("[MASK]", special_ids["[MASK]"]),
    ],
)

In [29]:
# Time a single encode_batch call over every retained sentence and print the
# elapsed wall-clock time, rounded to two decimals.
t_start = time.perf_counter()
batch = tokenizer.encode_batch(all_sentences)
elapsed = time.perf_counter() - t_start
print(round(elapsed, 2), 'seconds')

124.79 seconds


In [40]:
# Raw text of the first retained sentence (the input behind batch[0]).
all_sentences[0]

'Introduction Under normal physiological conditions, all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources; yet the intracellular “redox buffer” mechanism provides significant protection mainly by the antioxidant network [1].'

In [16]:
# Tokens of the first encoding in `batch`, space-joined for display.
' '.join(batch[0].tokens)

'[CLS] introduction under normal physiological conditions , all cells in the body are exposed chronically to oxidants from both endogenous and exogenous sources ; yet the intracellular “ redox buffer ” mechanism provides significant protection mainly by the antioxidant network [ 1 ] . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [17]:
# Special-tokens mask of the first encoding, rendered as a space-separated string.
' '.join(str(flag) for flag in batch[0].special_tokens_mask)

'1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1'

In [1]:
# NOTE(review): scratch arithmetic -- presumably converts 346 seconds to
# minutes; where 346 comes from is not shown in this notebook. TODO confirm or remove.
346/60

5.766666666666667