In [1]:
import os
import sys
# Add a directory four levels up to the import search path. The path is relative, so it
# is resolved against the kernel's current working directory. Presumably this is the
# project root that holds the `utils` package imported in the next cell — TODO confirm.
sys.path.append("../../../../")
# Must be set before the tokenizer library is loaded to take effect.
# NOTE(review): this looks like the Hugging Face `tokenizers` switch that disables its
# internal thread pool, commonly set to avoid fork-related warnings when DataLoader
# worker processes are used — confirm this is the intent.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from utils.helper import ModelConfig, color_print
from utils.dataset_utils.load_dataset import (
    load_data,
)
from utils.model_utils.load_model import load_model
from utils.model_utils.save_module import save_module
from utils.model_utils.evaluate import evaluate_model, get_sparsity, similar
from utils.dataset_utils.sampling import SamplingDataset
from utils.prune_utils.prune import (
    prune_concern_identification,
    recover_tangling_identification,
)

In [3]:
# --- Experiment configuration: every tunable used by the cells below ---

# Dataset/model key; passed to ModelConfig(...) and load_data(...).
name = "YahooAnswersTopics"

# Use the first CUDA device when one is available, otherwise fall back to the CPU so
# the notebook can still be executed (slowly) on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Placeholder only: rebound by the tuple returned from load_model(...) further down.
checkpoint = None

# DataLoader settings forwarded to load_data(...).
batch_size = 32
num_workers = 48

# Forwarded to SamplingDataset(...) and similar(...).
num_samples = 16

# Forwarded to prune_concern_identification(...) as `sparsity_ratio`.
ci_ratio = 0.3

# Seed forwarded to SamplingDataset(...) and similar(...).
seed = 44

# Layer filters forwarded to prune_concern_identification(...).
include_layers = ["attention", "intermediate", "output"]
exclude_layers = None

In [4]:
# Capture the wall-clock time at which this run began and echo it to the cell output.
script_start_time = datetime.now()
# The f-string format spec is handed to datetime.__format__, i.e. the same strftime pattern.
print(f"Script started at: {script_start_time:%Y-%m-%d %H:%M:%S}")

Script started at: 2024-08-19 22:53:15


In [5]:
# Build the configuration object for the dataset/model key `name` on the chosen `device`.
model_config = ModelConfig(name, device)
# Number of target classes, read from the config mapping; it bounds the per-concern loop below.
num_labels = model_config.config["num_labels"]
# load_model returns a 3-tuple. The third element rebinds `checkpoint`, replacing the
# `None` placeholder assigned in the configuration cell.
model, tokenizer, checkpoint = load_model(model_config)

Loading the model.




{'model_name': 'fabriceyhc/bert-base-uncased-yahoo_answers_topics', 'task_type': 'classification', 'architectures': 'bert', 'dataset_name': 'YahooAnswersTopics', 'num_labels': 10, 'cache_dir': 'Models'}




The model fabriceyhc/bert-base-uncased-yahoo_answers_topics is loaded.




In [6]:
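# Baseline evaluation of the unpruned model. Disabled; the output recorded in the next
# cell shows the run took about 30 minutes.
# NOTE: `test_dataloader` is not defined at this point in the notebook (it is first
# created in the pruning-loop cell), so call `load_data(...)` before re-enabling these lines.
# print("Evaluate the original model")
# result = evaluate_model(model, model_config, test_dataloader)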

In [7]:
# Recorded output of the (now commented-out) baseline evaluation cell above, kept for reference:
# Evaluate the original model
# Evaluating: 100%|███████████████████████████████████████████████████████████████████| 1875/1875 [30:03<00:00,  1.04it/s]
# Loss: 1.0044
# Precision: 0.6874, Recall: 0.6865, F1-Score: 0.6839
#               precision    recall  f1-score   support

#            0       0.57      0.57      0.57      6000
#            1       0.74      0.66      0.69      6000
#            2       0.71      0.78      0.74      6000
#            3       0.54      0.53      0.53      6000
#            4       0.80      0.82      0.81      6000
#            5       0.90      0.84      0.87      6000
#            6       0.61      0.43      0.50      6000
#            7       0.62      0.73      0.67      6000
#            8       0.64      0.76      0.70      6000
#            9       0.75      0.75      0.75      6000

#     accuracy                           0.69     60000
#    macro avg       0.69      0.69      0.68     60000
# weighted avg       0.69      0.69      0.68     60000

In [8]:
# Build the dataloaders once. None of load_data's arguments depend on `concern`, so
# calling it inside the loop repeated the dataset load/tokenization for every label.
# NOTE(review): assumes the returned loaders can be iterated more than once (as a
# standard torch DataLoader can) — confirm against load_data.
train_dataloader, valid_dataloader, test_dataloader = load_data(
    name, batch_size=batch_size, num_workers=num_workers
)

# For every class label ("concern"): sample data, prune a copy of the model for that
# label, then evaluate the pruned copy and compare it with the original model.
for concern in range(num_labels):
    # Sample sets drawn from the training loader for the current label. The 5th
    # positional argument (True/False) presumably selects positive vs. negative
    # examples for `concern`, and the 6th (4) presumably a batch size — TODO confirm
    # against SamplingDataset's signature.
    positive_samples = SamplingDataset(
        train_dataloader, concern, num_samples, num_labels, True, 4, device=device, resample=False, seed=seed
    )
    negative_samples = SamplingDataset(
        train_dataloader, concern, num_samples, num_labels, False, 4, device=device, resample=False, seed=seed
    )
    # NOTE(review): not consumed anywhere in this cell, and the target index 200 lies
    # outside range(num_labels). Kept because a later cell may rely on it — confirm intent.
    all_samples = SamplingDataset(
        train_dataloader, 200, num_samples, num_labels, False, 4, device=device, resample=False, seed=seed
    )

    # Work on a deep copy so the original `model` is left untouched for the comparison
    # below and for the next iteration.
    module = copy.deepcopy(model)

    # The return value is not used and `module` is evaluated right after, so the
    # pruning is presumably applied to `module` in place — TODO confirm.
    prune_concern_identification(
        module,
        model_config,
        positive_samples,
        negative_samples,
        include_layers=include_layers,
        exclude_layers=exclude_layers,
        sparsity_ratio=ci_ratio,
    )

    print(f"Evaluate the pruned model {concern}")
    # `result` is not read again in this cell; the binding is kept for interactive inspection.
    result = evaluate_model(module, model_config, test_dataloader)
    get_sparsity(module)

    # Compare the original and pruned models using the validation loader for this concern.
    similar(model, module, valid_dataloader, concern, num_samples, num_labels, device=device, seed=seed)

    # NOTE(review): the filename below has no `concern` component, so enabling it as-is
    # would write to the same path on every iteration.
    # save_module(module, "Modules/", f"ci_{name}_{ci_ratio}p.pt")

{'dataset_name': 'YahooAnswersTopics', 'path': 'yahoo_answers_topics', 'config_name': 'yahoo_answers_topics', 'text_column': 'question_title', 'label_column': 'topic', 'cache_dir': 'Datasets/Yahoo', 'task_type': 'classification'}




Downloading the Dataset YahooAnswersTopics




Tokenizing dataset:   0%|          | 0/700000 [00:00<?, ?it/s]

Tokenizing dataset:   0%|          | 164/700000 [00:00<07:06, 1639.52it/s]

Tokenizing dataset:   0%|          | 344/700000 [00:00<06:44, 1730.87it/s]

Tokenizing dataset:   0%|          | 522/700000 [00:00<06:39, 1750.74it/s]

Tokenizing dataset:   0%|          | 698/700000 [00:00<06:56, 1680.14it/s]

Tokenizing dataset:   0%|          | 870/700000 [00:00<06:53, 1691.89it/s]

Tokenizing dataset:   0%|          | 1040/700000 [00:00<07:02, 1653.82it/s]

Tokenizing dataset:   0%|          | 1207/700000 [00:00<07:01, 1658.16it/s]

Tokenizing dataset:   0%|          | 1381/700000 [00:00<06:55, 1682.20it/s]

Tokenizing dataset:   0%|          | 1556/700000 [00:00<06:50, 1700.16it/s]

Tokenizing dataset:   0%|          | 1736/700000 [00:01<06:43, 1728.96it/s]

Tokenizing dataset:   0%|          | 1916/700000 [00:01<06:38, 1750.03it/s]

Tokenizing dataset:   0%|          | 2096/700000 [00:01<06:36, 1762.31it/s]

Tokenizing dataset:   0%|          | 2273/700000 [00:01<06:36, 1757.75it/s]

Tokenizing dataset:   0%|          | 2453/700000 [00:01<06:34, 1767.52it/s]

Tokenizing dataset:   0%|          | 2636/700000 [00:01<06:30, 1785.15it/s]

Tokenizing dataset:   0%|          | 2815/700000 [00:01<06:33, 1771.18it/s]

Tokenizing dataset:   0%|          | 2993/700000 [00:01<06:35, 1761.05it/s]

Tokenizing dataset:   0%|          | 3170/700000 [00:01<06:41, 1736.19it/s]

Tokenizing dataset:   0%|          | 3344/700000 [00:01<06:43, 1727.96it/s]

Tokenizing dataset:   1%|          | 3517/700000 [00:02<06:44, 1723.74it/s]

Tokenizing dataset:   1%|          | 3690/700000 [00:02<06:44, 1719.33it/s]

Tokenizing dataset:   1%|          | 3862/700000 [00:02<06:47, 1708.90it/s]

Tokenizing dataset:   1%|          | 4033/700000 [00:02<06:49, 1699.87it/s]

Tokenizing dataset:   1%|          | 4232/700000 [00:02<06:29, 1784.15it/s]

Tokenizing dataset:   1%|          | 4476/700000 [00:02<05:51, 1978.68it/s]

Tokenizing dataset:   1%|          | 4742/700000 [00:02<05:18, 2179.70it/s]

Tokenizing dataset:   1%|          | 5005/700000 [00:02<05:00, 2314.01it/s]

Tokenizing dataset:   1%|          | 5262/700000 [00:02<04:50, 2388.21it/s]

Tokenizing dataset:   1%|          | 5514/700000 [00:02<04:46, 2426.24it/s]

Tokenizing dataset:   1%|          | 5757/700000 [00:03<05:02, 2296.57it/s]

Tokenizing dataset:   1%|          | 5989/700000 [00:03<05:17, 2185.07it/s]

Tokenizing dataset:   1%|          | 6210/700000 [00:03<05:28, 2111.81it/s]

Tokenizing dataset:   1%|          | 6423/700000 [00:03<05:35, 2065.39it/s]

Tokenizing dataset:   1%|          | 6631/700000 [00:03<05:39, 2040.72it/s]

Tokenizing dataset:   1%|          | 6836/700000 [00:03<05:44, 2013.55it/s]

Tokenizing dataset:   1%|          | 7038/700000 [00:03<05:48, 1989.29it/s]

Tokenizing dataset:   1%|          | 7238/700000 [00:03<05:54, 1953.60it/s]

Tokenizing dataset:   1%|          | 7434/700000 [00:03<06:42, 1721.28it/s]

Tokenizing dataset:   1%|          | 7613/700000 [00:04<06:38, 1737.99it/s]

Tokenizing dataset:   1%|          | 7794/700000 [00:04<06:34, 1755.85it/s]

Tokenizing dataset:   1%|          | 7976/700000 [00:04<06:30, 1772.79it/s]

Tokenizing dataset:   1%|          | 8156/700000 [00:04<06:31, 1768.82it/s]

Tokenizing dataset:   1%|          | 8344/700000 [00:04<06:24, 1800.09it/s]

Tokenizing dataset:   1%|          | 8537/700000 [00:04<06:16, 1835.73it/s]

Tokenizing dataset:   1%|          | 8727/700000 [00:04<06:13, 1853.06it/s]

Tokenizing dataset:   1%|▏         | 8913/700000 [00:04<06:13, 1848.91it/s]

Tokenizing dataset:   1%|▏         | 9103/700000 [00:04<06:10, 1862.33it/s]

Tokenizing dataset:   1%|▏         | 9292/700000 [00:04<06:09, 1868.18it/s]

Tokenizing dataset:   1%|▏         | 9480/700000 [00:05<06:11, 1858.39it/s]

Tokenizing dataset:   1%|▏         | 9668/700000 [00:05<06:10, 1864.73it/s]

Tokenizing dataset:   1%|▏         | 9857/700000 [00:05<06:09, 1869.96it/s]

Tokenizing dataset:   1%|▏         | 10045/700000 [00:05<06:38, 1731.19it/s]

Tokenizing dataset:   1%|▏         | 10221/700000 [00:05<06:43, 1709.59it/s]

Tokenizing dataset:   1%|▏         | 10406/700000 [00:05<06:34, 1747.94it/s]

Tokenizing dataset:   2%|▏         | 10585/700000 [00:05<06:32, 1758.65it/s]

Tokenizing dataset:   2%|▏         | 10762/700000 [00:05<06:38, 1730.71it/s]

Tokenizing dataset:   2%|▏         | 10936/700000 [00:06<14:09, 811.30it/s] 

Tokenizing dataset:   2%|▏         | 11121/700000 [00:06<11:42, 980.18it/s]

Tokenizing dataset:   2%|▏         | 11307/700000 [00:06<10:01, 1144.95it/s]

Tokenizing dataset:   2%|▏         | 11488/700000 [00:06<08:55, 1285.61it/s]

Tokenizing dataset:   2%|▏         | 11665/700000 [00:06<08:12, 1397.62it/s]

Tokenizing dataset:   2%|▏         | 11842/700000 [00:06<07:41, 1490.15it/s]

Tokenizing dataset:   2%|▏         | 12022/700000 [00:06<07:18, 1569.61it/s]

Tokenizing dataset:   2%|▏         | 12203/700000 [00:07<07:00, 1634.64it/s]

Tokenizing dataset:   2%|▏         | 12379/700000 [00:07<06:55, 1654.97it/s]

Tokenizing dataset:   2%|▏         | 12556/700000 [00:07<06:47, 1685.36it/s]

Tokenizing dataset:   2%|▏         | 12736/700000 [00:07<06:40, 1716.36it/s]

Tokenizing dataset:   2%|▏         | 12917/700000 [00:07<06:34, 1743.22it/s]

Tokenizing dataset:   2%|▏         | 13095/700000 [00:07<06:32, 1751.23it/s]

Tokenizing dataset:   2%|▏         | 13279/700000 [00:07<06:26, 1775.75it/s]

Tokenizing dataset:   2%|▏         | 13467/700000 [00:07<06:20, 1806.34it/s]

Tokenizing dataset:   2%|▏         | 13649/700000 [00:07<06:27, 1773.33it/s]

Tokenizing dataset:   2%|▏         | 13828/700000 [00:08<08:34, 1334.06it/s]

Tokenizing dataset:   2%|▏         | 13994/700000 [00:08<08:05, 1411.55it/s]

Tokenizing dataset:   2%|▏         | 14168/700000 [00:08<07:38, 1494.35it/s]

Tokenizing dataset:   2%|▏         | 14343/700000 [00:08<07:19, 1560.80it/s]

Tokenizing dataset:   2%|▏         | 14516/700000 [00:08<07:06, 1607.38it/s]

Tokenizing dataset:   2%|▏         | 14694/700000 [00:08<06:54, 1654.42it/s]

Tokenizing dataset:   2%|▏         | 14872/700000 [00:08<06:45, 1690.15it/s]

Tokenizing dataset:   2%|▏         | 15052/700000 [00:08<06:37, 1721.94it/s]

Tokenizing dataset:   2%|▏         | 15234/700000 [00:08<06:31, 1748.52it/s]

Tokenizing dataset:   2%|▏         | 15411/700000 [00:08<07:16, 1569.39it/s]

Tokenizing dataset:   2%|▏         | 15573/700000 [00:09<07:30, 1520.46it/s]

Tokenizing dataset:   2%|▏         | 15730/700000 [00:09<07:26, 1531.99it/s]

Tokenizing dataset:   2%|▏         | 15886/700000 [00:09<08:00, 1422.99it/s]

Tokenizing dataset:   2%|▏         | 16032/700000 [00:09<08:01, 1420.23it/s]

Tokenizing dataset:   2%|▏         | 16192/700000 [00:09<07:45, 1469.87it/s]

Tokenizing dataset:   2%|▏         | 16363/700000 [00:09<07:24, 1537.68it/s]

Tokenizing dataset:   2%|▏         | 16531/700000 [00:09<07:13, 1578.23it/s]

Tokenizing dataset:   2%|▏         | 16708/700000 [00:09<06:58, 1632.08it/s]

Tokenizing dataset:   2%|▏         | 16889/700000 [00:09<06:46, 1682.40it/s]

Tokenizing dataset:   2%|▏         | 17071/700000 [00:10<06:36, 1722.78it/s]

Tokenizing dataset:   2%|▏         | 17265/700000 [00:10<06:22, 1785.74it/s]

Tokenizing dataset:   2%|▏         | 17459/700000 [00:10<06:12, 1830.02it/s]

Tokenizing dataset:   3%|▎         | 17652/700000 [00:10<06:07, 1857.57it/s]

Tokenizing dataset:   3%|▎         | 17839/700000 [00:10<06:29, 1749.67it/s]

Tokenizing dataset:   3%|▎         | 18016/700000 [00:10<06:43, 1688.97it/s]

Tokenizing dataset:   3%|▎         | 18187/700000 [00:10<06:43, 1687.90it/s]

Tokenizing dataset:   3%|▎         | 18374/700000 [00:10<06:32, 1738.10it/s]

Tokenizing dataset:   3%|▎         | 18561/700000 [00:10<06:23, 1775.13it/s]

Tokenizing dataset:   3%|▎         | 18747/700000 [00:10<06:18, 1798.18it/s]

Tokenizing dataset:   3%|▎         | 18938/700000 [00:11<06:12, 1828.13it/s]

Tokenizing dataset:   3%|▎         | 19128/700000 [00:11<06:08, 1846.91it/s]

Tokenizing dataset:   3%|▎         | 19318/700000 [00:11<06:05, 1862.65it/s]

Tokenizing dataset:   3%|▎         | 19508/700000 [00:11<06:03, 1873.13it/s]

Tokenizing dataset:   3%|▎         | 19725/700000 [00:11<05:46, 1961.16it/s]

Tokenizing dataset:   3%|▎         | 20036/700000 [00:11<04:55, 2303.87it/s]

Tokenizing dataset:   3%|▎         | 20343/700000 [00:11<04:28, 2531.34it/s]

Tokenizing dataset:   3%|▎         | 20638/700000 [00:11<04:15, 2655.93it/s]

Tokenizing dataset:   3%|▎         | 20944/700000 [00:11<04:04, 2776.22it/s]

Tokenizing dataset:   3%|▎         | 21243/700000 [00:11<03:59, 2838.79it/s]

Tokenizing dataset:   3%|▎         | 21555/700000 [00:12<03:52, 2922.11it/s]

Tokenizing dataset:   3%|▎         | 21848/700000 [00:12<04:25, 2557.58it/s]

Tokenizing dataset:   3%|▎         | 22113/700000 [00:12<04:50, 2330.16it/s]

Tokenizing dataset:   3%|▎         | 22355/700000 [00:12<05:10, 2181.39it/s]

Tokenizing dataset:   3%|▎         | 22580/700000 [00:12<05:26, 2072.53it/s]

Tokenizing dataset:   3%|▎         | 22792/700000 [00:12<05:32, 2034.39it/s]

Tokenizing dataset:   3%|▎         | 22999/700000 [00:12<05:37, 2008.65it/s]

Tokenizing dataset:   3%|▎         | 23202/700000 [00:12<05:40, 1985.28it/s]

Tokenizing dataset:   3%|▎         | 23402/700000 [00:13<05:43, 1967.94it/s]

Tokenizing dataset:   3%|▎         | 23600/700000 [00:13<05:44, 1961.14it/s]

Tokenizing dataset:   3%|▎         | 23797/700000 [00:13<05:49, 1933.21it/s]

Tokenizing dataset:   3%|▎         | 23991/700000 [00:13<05:54, 1907.42it/s]

Tokenizing dataset:   3%|▎         | 24182/700000 [00:13<05:59, 1882.00it/s]

Tokenizing dataset:   3%|▎         | 24371/700000 [00:13<06:02, 1866.34it/s]

Tokenizing dataset:   4%|▎         | 24558/700000 [00:13<06:04, 1855.10it/s]

Tokenizing dataset:   4%|▎         | 24744/700000 [00:13<06:10, 1822.06it/s]

Tokenizing dataset:   4%|▎         | 24928/700000 [00:13<06:09, 1826.50it/s]

Tokenizing dataset:   4%|▎         | 25113/700000 [00:13<06:08, 1831.90it/s]

Tokenizing dataset:   4%|▎         | 25298/700000 [00:14<06:07, 1834.75it/s]

Tokenizing dataset:   4%|▎         | 25482/700000 [00:14<06:07, 1835.57it/s]

Tokenizing dataset:   4%|▎         | 25666/700000 [00:14<06:09, 1825.92it/s]

Tokenizing dataset:   4%|▎         | 25849/700000 [00:14<06:10, 1822.01it/s]

Tokenizing dataset:   4%|▎         | 26032/700000 [00:14<06:34, 1709.58it/s]

Tokenizing dataset:   4%|▎         | 26206/700000 [00:14<06:32, 1717.52it/s]

Tokenizing dataset:   4%|▍         | 26448/700000 [00:14<05:50, 1919.35it/s]

Tokenizing dataset:   4%|▍         | 26723/700000 [00:14<05:11, 2159.97it/s]

Tokenizing dataset:   4%|▍         | 27022/700000 [00:14<04:40, 2402.71it/s]

Tokenizing dataset:   4%|▍         | 27306/700000 [00:14<04:25, 2529.82it/s]

Tokenizing dataset:   4%|▍         | 27607/700000 [00:15<04:11, 2671.66it/s]

Tokenizing dataset:   4%|▍         | 27890/700000 [00:15<04:07, 2718.36it/s]

Tokenizing dataset:   4%|▍         | 28185/700000 [00:15<04:01, 2787.11it/s]

Tokenizing dataset:   4%|▍         | 28482/700000 [00:15<03:56, 2839.51it/s]

Tokenizing dataset:   4%|▍         | 28767/700000 [00:15<03:59, 2807.72it/s]

Tokenizing dataset:   4%|▍         | 29049/700000 [00:15<04:01, 2778.76it/s]

Tokenizing dataset:   4%|▍         | 29342/700000 [00:15<03:57, 2822.24it/s]

Tokenizing dataset:   4%|▍         | 29625/700000 [00:15<04:04, 2742.86it/s]

Tokenizing dataset:   4%|▍         | 29900/700000 [00:15<04:04, 2740.07it/s]

Tokenizing dataset:   4%|▍         | 30175/700000 [00:16<04:05, 2728.15it/s]

Tokenizing dataset:   4%|▍         | 30455/700000 [00:16<04:03, 2748.32it/s]

Tokenizing dataset:   4%|▍         | 30734/700000 [00:16<04:02, 2759.65it/s]

Tokenizing dataset:   4%|▍         | 31011/700000 [00:16<04:06, 2712.70it/s]

Tokenizing dataset:   4%|▍         | 31283/700000 [00:16<04:11, 2659.86it/s]

Tokenizing dataset:   5%|▍         | 31550/700000 [00:16<04:12, 2643.16it/s]

Tokenizing dataset:   5%|▍         | 31815/700000 [00:16<04:15, 2614.96it/s]

Tokenizing dataset:   5%|▍         | 32092/700000 [00:16<04:11, 2658.87it/s]

Tokenizing dataset:   5%|▍         | 32367/700000 [00:16<04:08, 2685.53it/s]

Tokenizing dataset:   5%|▍         | 32636/700000 [00:16<04:09, 2679.03it/s]

Tokenizing dataset:   5%|▍         | 32905/700000 [00:17<04:08, 2680.63it/s]

Tokenizing dataset:   5%|▍         | 33176/700000 [00:17<04:08, 2687.42it/s]

Tokenizing dataset:   5%|▍         | 33449/700000 [00:17<04:07, 2697.50it/s]

Tokenizing dataset:   5%|▍         | 33726/700000 [00:17<04:05, 2717.57it/s]

Tokenizing dataset:   5%|▍         | 33998/700000 [00:17<04:06, 2698.62it/s]

Tokenizing dataset:   5%|▍         | 34270/700000 [00:17<04:06, 2702.32it/s]

Tokenizing dataset:   5%|▍         | 34548/700000 [00:17<04:04, 2724.69it/s]

Tokenizing dataset:   5%|▍         | 34821/700000 [00:17<04:04, 2724.91it/s]

Tokenizing dataset:   5%|▌         | 35097/700000 [00:17<04:03, 2735.18it/s]

Tokenizing dataset:   5%|▌         | 35371/700000 [00:17<04:13, 2624.94it/s]

Tokenizing dataset:   5%|▌         | 35635/700000 [00:18<04:14, 2612.53it/s]

Tokenizing dataset:   5%|▌         | 35909/700000 [00:18<04:10, 2649.73it/s]

Tokenizing dataset:   5%|▌         | 36195/700000 [00:18<04:04, 2709.72it/s]

Tokenizing dataset:   5%|▌         | 36487/700000 [00:18<03:59, 2769.46it/s]

Tokenizing dataset:   5%|▌         | 36765/700000 [00:18<04:02, 2739.40it/s]

Tokenizing dataset:   5%|▌         | 37040/700000 [00:18<06:27, 1712.08it/s]

Tokenizing dataset:   5%|▌         | 37334/700000 [00:18<05:36, 1967.00it/s]

Tokenizing dataset:   5%|▌         | 37629/700000 [00:18<05:02, 2191.52it/s]

Tokenizing dataset:   5%|▌         | 37931/700000 [00:19<04:36, 2395.22it/s]

Tokenizing dataset:   5%|▌         | 38207/700000 [00:19<04:25, 2488.91it/s]

Tokenizing dataset:   5%|▌         | 38480/700000 [00:19<04:20, 2537.70it/s]

Tokenizing dataset:   6%|▌         | 38751/700000 [00:19<04:19, 2550.99it/s]

Tokenizing dataset:   6%|▌         | 39018/700000 [00:19<04:18, 2557.78it/s]

Tokenizing dataset:   6%|▌         | 39283/700000 [00:19<04:15, 2581.43it/s]

Tokenizing dataset:   6%|▌         | 39550/700000 [00:19<04:13, 2605.31it/s]

Tokenizing dataset:   6%|▌         | 39815/700000 [00:19<04:18, 2553.51it/s]

Tokenizing dataset:   6%|▌         | 40074/700000 [00:19<04:19, 2545.43it/s]

Tokenizing dataset:   6%|▌         | 40331/700000 [00:19<04:21, 2527.40it/s]

Tokenizing dataset:   6%|▌         | 40586/700000 [00:20<04:23, 2501.87it/s]

Tokenizing dataset:   6%|▌         | 40848/700000 [00:20<04:20, 2534.90it/s]

Tokenizing dataset:   6%|▌         | 41103/700000 [00:20<04:19, 2539.05it/s]

Tokenizing dataset:   6%|▌         | 41366/700000 [00:20<04:16, 2564.76it/s]

Tokenizing dataset:   6%|▌         | 41631/700000 [00:20<04:14, 2589.56it/s]

Tokenizing dataset:   6%|▌         | 41891/700000 [00:20<04:26, 2473.62it/s]

Tokenizing dataset:   6%|▌         | 42149/700000 [00:20<04:22, 2503.53it/s]

Tokenizing dataset:   6%|▌         | 42412/700000 [00:20<04:19, 2538.72it/s]

Tokenizing dataset:   6%|▌         | 42684/700000 [00:20<04:13, 2589.80it/s]

Tokenizing dataset:   6%|▌         | 42961/700000 [00:21<04:08, 2640.54it/s]

Tokenizing dataset:   6%|▌         | 43254/700000 [00:21<04:00, 2725.91it/s]

Tokenizing dataset:   6%|▌         | 43528/700000 [00:21<04:01, 2716.28it/s]

Tokenizing dataset:   6%|▋         | 43803/700000 [00:21<04:00, 2726.02it/s]

Tokenizing dataset:   6%|▋         | 44092/700000 [00:21<03:56, 2772.99it/s]

Tokenizing dataset:   6%|▋         | 44371/700000 [00:21<03:56, 2777.70it/s]

Tokenizing dataset:   6%|▋         | 44649/700000 [00:21<03:55, 2777.55it/s]

Tokenizing dataset:   6%|▋         | 44927/700000 [00:21<04:22, 2492.74it/s]

Tokenizing dataset:   6%|▋         | 45182/700000 [00:21<04:45, 2296.88it/s]

Tokenizing dataset:   6%|▋         | 45418/700000 [00:22<04:58, 2190.42it/s]

Tokenizing dataset:   7%|▋         | 45642/700000 [00:22<05:09, 2117.04it/s]

Tokenizing dataset:   7%|▋         | 45857/700000 [00:22<05:18, 2052.45it/s]

Tokenizing dataset:   7%|▋         | 46065/700000 [00:22<05:24, 2014.25it/s]

Tokenizing dataset:   7%|▋         | 46268/700000 [00:22<05:28, 1989.16it/s]

Tokenizing dataset:   7%|▋         | 46468/700000 [00:22<05:32, 1963.46it/s]

Tokenizing dataset:   7%|▋         | 46665/700000 [00:22<05:34, 1950.84it/s]

Tokenizing dataset:   7%|▋         | 46861/700000 [00:22<05:34, 1953.36it/s]

Tokenizing dataset:   7%|▋         | 47060/700000 [00:22<05:32, 1961.80it/s]

Tokenizing dataset:   7%|▋         | 47259/700000 [00:22<05:31, 1968.38it/s]

Tokenizing dataset:   7%|▋         | 47456/700000 [00:23<05:31, 1968.51it/s]

Tokenizing dataset:   7%|▋         | 47656/700000 [00:23<05:30, 1976.60it/s]

Tokenizing dataset:   7%|▋         | 47855/700000 [00:23<05:29, 1980.20it/s]

Tokenizing dataset:   7%|▋         | 48054/700000 [00:23<05:29, 1979.23it/s]

Tokenizing dataset:   7%|▋         | 48252/700000 [00:23<05:29, 1975.44it/s]

Tokenizing dataset:   7%|▋         | 48450/700000 [00:23<05:32, 1961.40it/s]

Tokenizing dataset:   7%|▋         | 48647/700000 [00:23<05:35, 1942.25it/s]

Tokenizing dataset:   7%|▋         | 48842/700000 [00:23<05:37, 1929.95it/s]

Tokenizing dataset:   7%|▋         | 49036/700000 [00:23<05:37, 1931.49it/s]

Tokenizing dataset:   7%|▋         | 49234/700000 [00:23<05:34, 1943.91it/s]

Tokenizing dataset:   7%|▋         | 49429/700000 [00:24<05:35, 1936.37it/s]

Tokenizing dataset:   7%|▋         | 49623/700000 [00:24<05:38, 1921.93it/s]

Tokenizing dataset:   7%|▋         | 49819/700000 [00:24<05:36, 1931.24it/s]

Tokenizing dataset:   7%|▋         | 50013/700000 [00:24<05:37, 1924.82it/s]

Tokenizing dataset:   7%|▋         | 50206/700000 [00:24<05:42, 1896.53it/s]

Tokenizing dataset:   7%|▋         | 50396/700000 [00:24<05:44, 1885.39it/s]

Tokenizing dataset:   7%|▋         | 50585/700000 [00:24<05:46, 1871.94it/s]

Tokenizing dataset:   7%|▋         | 50773/700000 [00:24<05:48, 1862.87it/s]

Tokenizing dataset:   7%|▋         | 50960/700000 [00:24<05:57, 1817.15it/s]

Tokenizing dataset:   7%|▋         | 51142/700000 [00:25<05:58, 1811.93it/s]

Tokenizing dataset:   7%|▋         | 51325/700000 [00:25<05:57, 1814.66it/s]

Tokenizing dataset:   7%|▋         | 51507/700000 [00:25<05:57, 1816.09it/s]

Tokenizing dataset:   7%|▋         | 51689/700000 [00:25<05:57, 1814.56it/s]

Tokenizing dataset:   7%|▋         | 51873/700000 [00:25<05:55, 1822.10it/s]

Tokenizing dataset:   7%|▋         | 52059/700000 [00:25<05:53, 1833.14it/s]

Tokenizing dataset:   7%|▋         | 52301/700000 [00:25<05:22, 2006.63it/s]

Tokenizing dataset:   8%|▊         | 52555/700000 [00:25<04:58, 2165.97it/s]

Tokenizing dataset:   8%|▊         | 52806/700000 [00:25<04:45, 2268.69it/s]

Tokenizing dataset:   8%|▊         | 53070/700000 [00:25<04:32, 2378.31it/s]

Tokenizing dataset:   8%|▊         | 53338/700000 [00:26<04:22, 2468.10it/s]

Tokenizing dataset:   8%|▊         | 53599/700000 [00:26<04:17, 2509.71it/s]

Tokenizing dataset:   8%|▊         | 53859/700000 [00:26<04:14, 2535.24it/s]

Tokenizing dataset:   8%|▊         | 54113/700000 [00:26<04:14, 2535.09it/s]

Tokenizing dataset:   8%|▊         | 54374/700000 [00:26<04:12, 2556.18it/s]

Tokenizing dataset:   8%|▊         | 54662/700000 [00:26<04:03, 2650.50it/s]

Tokenizing dataset:   8%|▊         | 54948/700000 [00:26<03:57, 2712.20it/s]

Tokenizing dataset:   8%|▊         | 55247/700000 [00:26<03:50, 2795.33it/s]

Tokenizing dataset:   8%|▊         | 55536/700000 [00:26<03:48, 2822.19it/s]

Tokenizing dataset:   8%|▊         | 55827/700000 [00:26<03:46, 2846.93it/s]

Tokenizing dataset:   8%|▊         | 56120/700000 [00:27<03:44, 2871.10it/s]

Tokenizing dataset:   8%|▊         | 56408/700000 [00:27<03:47, 2834.65it/s]

Tokenizing dataset:   8%|▊         | 56692/700000 [00:27<03:47, 2825.73it/s]

Tokenizing dataset:   8%|▊         | 56975/700000 [00:27<03:53, 2758.68it/s]

Tokenizing dataset:   8%|▊         | 57252/700000 [00:27<03:57, 2701.07it/s]

Tokenizing dataset:   8%|▊         | 57523/700000 [00:27<03:59, 2680.86it/s]

Tokenizing dataset:   8%|▊         | 57792/700000 [00:27<04:00, 2668.70it/s]

Tokenizing dataset:   8%|▊         | 58061/700000 [00:27<04:00, 2673.73it/s]

Tokenizing dataset:   8%|▊         | 58329/700000 [00:27<04:00, 2667.35it/s]

Tokenizing dataset:   8%|▊         | 58596/700000 [00:27<04:01, 2657.25it/s]

Tokenizing dataset:   8%|▊         | 58862/700000 [00:28<04:02, 2643.33it/s]

Tokenizing dataset:   8%|▊         | 59127/700000 [00:28<04:04, 2620.99it/s]

Tokenizing dataset:   8%|▊         | 59390/700000 [00:28<04:05, 2613.51it/s]

Tokenizing dataset:   9%|▊         | 59658/700000 [00:28<04:03, 2629.41it/s]

Tokenizing dataset:   9%|▊         | 59921/700000 [00:28<04:05, 2611.35it/s]

Tokenizing dataset:   9%|▊         | 60197/700000 [00:28<04:01, 2652.81it/s]

Tokenizing dataset:   9%|▊         | 60481/700000 [00:28<03:56, 2706.12it/s]

Tokenizing dataset:   9%|▊         | 60752/700000 [00:28<03:56, 2702.13it/s]

Tokenizing dataset:   9%|▊         | 61023/700000 [00:28<03:59, 2663.62it/s]

Tokenizing dataset:   9%|▉         | 61294/700000 [00:28<03:58, 2677.12it/s]

Tokenizing dataset:   9%|▉         | 61569/700000 [00:29<03:56, 2698.65it/s]

Tokenizing dataset:   9%|▉         | 61860/700000 [00:29<03:51, 2759.60it/s]

Tokenizing dataset:   9%|▉         | 62139/700000 [00:29<03:50, 2767.22it/s]

Tokenizing dataset:   9%|▉         | 62432/700000 [00:29<03:46, 2813.35it/s]

Tokenizing dataset:   9%|▉         | 62718/700000 [00:29<03:45, 2825.33it/s]

Tokenizing dataset:   9%|▉         | 63010/700000 [00:29<03:43, 2853.42it/s]

Tokenizing dataset:   9%|▉         | 63303/700000 [00:29<03:41, 2873.91it/s]

Tokenizing dataset:   9%|▉         | 63591/700000 [00:29<03:42, 2860.57it/s]

Tokenizing dataset:   9%|▉         | 63881/700000 [00:29<03:41, 2870.30it/s]

Tokenizing dataset:   9%|▉         | 64169/700000 [00:29<03:43, 2838.89it/s]

Tokenizing dataset:   9%|▉         | 64453/700000 [00:30<03:45, 2818.67it/s]

Tokenizing dataset:   9%|▉         | 64735/700000 [00:30<03:51, 2747.75it/s]

Tokenizing dataset:   9%|▉         | 65011/700000 [00:30<03:54, 2708.17it/s]

Tokenizing dataset:   9%|▉         | 65283/700000 [00:30<03:55, 2689.91it/s]

Tokenizing dataset:   9%|▉         | 65560/700000 [00:30<03:53, 2712.31it/s]

Tokenizing dataset:   9%|▉         | 65832/700000 [00:30<03:57, 2672.59it/s]

Tokenizing dataset:   9%|▉         | 66100/700000 [00:30<03:59, 2648.04it/s]

Tokenizing dataset:   9%|▉         | 66365/700000 [00:30<04:00, 2631.20it/s]

Tokenizing dataset:  10%|▉         | 66629/700000 [00:30<04:01, 2621.54it/s]

Tokenizing dataset:  10%|▉         | 66892/700000 [00:30<04:03, 2602.54it/s]

Tokenizing dataset:  10%|▉         | 67153/700000 [00:31<04:05, 2574.12it/s]

Tokenizing dataset:  10%|▉         | 67411/700000 [00:31<04:07, 2560.12it/s]

Tokenizing dataset:  10%|▉         | 67669/700000 [00:31<04:06, 2563.10it/s]

Tokenizing dataset:  10%|▉         | 67926/700000 [00:31<04:09, 2534.70it/s]

Tokenizing dataset:  10%|▉         | 68182/700000 [00:31<04:08, 2541.16it/s]

Tokenizing dataset:  10%|▉         | 68437/700000 [00:31<04:11, 2516.06it/s]

Tokenizing dataset:  10%|▉         | 68692/700000 [00:31<04:10, 2523.47it/s]

Tokenizing dataset:  10%|▉         | 68952/700000 [00:31<04:07, 2544.67it/s]

Tokenizing dataset:  10%|▉         | 69207/700000 [00:32<07:01, 1495.45it/s]

Tokenizing dataset:  10%|▉         | 69477/700000 [00:32<06:03, 1735.23it/s]

Tokenizing dataset:  10%|▉         | 69746/700000 [00:32<05:23, 1946.90it/s]

Tokenizing dataset:  10%|█         | 70025/700000 [00:32<04:53, 2148.23it/s]

Tokenizing dataset:  10%|█         | 70303/700000 [00:32<04:32, 2308.91it/s]

Tokenizing dataset:  10%|█         | 70561/700000 [00:32<04:24, 2377.01it/s]

Tokenizing dataset:  10%|█         | 70819/700000 [00:32<04:18, 2432.28it/s]

Tokenizing dataset:  10%|█         | 71079/700000 [00:32<04:13, 2479.16it/s]

Tokenizing dataset:  10%|█         | 71337/700000 [00:32<04:11, 2503.61it/s]

Tokenizing dataset:  10%|█         | 71600/700000 [00:33<04:07, 2539.36it/s]

Tokenizing dataset:  10%|█         | 71869/700000 [00:33<04:03, 2583.09it/s]

Tokenizing dataset:  10%|█         | 72134/700000 [00:33<04:01, 2602.24it/s]

Tokenizing dataset:  10%|█         | 72400/700000 [00:33<03:59, 2618.46it/s]

Tokenizing dataset:  10%|█         | 72664/700000 [00:33<04:00, 2613.60it/s]

Tokenizing dataset:  10%|█         | 72927/700000 [00:33<03:59, 2613.81it/s]

Tokenizing dataset:  10%|█         | 73197/700000 [00:33<03:57, 2639.27it/s]

Tokenizing dataset:  10%|█         | 73471/700000 [00:33<03:54, 2666.46it/s]

Tokenizing dataset:  11%|█         | 73739/700000 [00:33<03:56, 2651.36it/s]

Tokenizing dataset:  11%|█         | 74005/700000 [00:33<03:56, 2641.64it/s]

Tokenizing dataset:  11%|█         | 74270/700000 [00:34<03:57, 2639.79it/s]

Tokenizing dataset:  11%|█         | 74538/700000 [00:34<03:55, 2651.63it/s]

Tokenizing dataset:  11%|█         | 74804/700000 [00:34<03:59, 2607.85it/s]

Tokenizing dataset:  11%|█         | 75066/700000 [00:34<04:00, 2603.86it/s]

Tokenizing dataset:  11%|█         | 75327/700000 [00:34<04:01, 2587.80it/s]

Tokenizing dataset:  11%|█         | 75591/700000 [00:34<03:59, 2602.29it/s]

Tokenizing dataset:  11%|█         | 75879/700000 [00:34<03:52, 2681.82it/s]

Tokenizing dataset:  11%|█         | 76148/700000 [00:34<03:54, 2655.44it/s]

Tokenizing dataset:  11%|█         | 76430/700000 [00:34<03:50, 2703.21it/s]

Tokenizing dataset:  11%|█         | 76724/700000 [00:34<03:44, 2772.51it/s]

Tokenizing dataset:  11%|█         | 77002/700000 [00:35<03:45, 2764.29it/s]

Tokenizing dataset:  11%|█         | 77279/700000 [00:35<03:46, 2749.36it/s]

Tokenizing dataset:  11%|█         | 77555/700000 [00:35<03:48, 2727.53it/s]

Tokenizing dataset:  11%|█         | 77834/700000 [00:35<03:46, 2744.27it/s]

Tokenizing dataset:  11%|█         | 78109/700000 [00:35<03:46, 2742.65it/s]

Tokenizing dataset:  11%|█         | 78384/700000 [00:35<03:48, 2721.73it/s]

Tokenizing dataset:  11%|█         | 78657/700000 [00:35<03:52, 2676.78it/s]

Tokenizing dataset:  11%|█▏        | 78925/700000 [00:35<03:54, 2645.15it/s]

Tokenizing dataset:  11%|█▏        | 79190/700000 [00:35<03:55, 2636.99it/s]

Tokenizing dataset:  11%|█▏        | 79454/700000 [00:35<03:56, 2619.52it/s]

Tokenizing dataset:  11%|█▏        | 79717/700000 [00:36<03:58, 2595.48it/s]

Tokenizing dataset:  11%|█▏        | 79977/700000 [00:36<03:59, 2591.26it/s]

Tokenizing dataset:  11%|█▏        | 80244/700000 [00:36<03:57, 2611.29it/s]

Tokenizing dataset:  12%|█▏        | 80511/700000 [00:36<03:55, 2627.86it/s]

Tokenizing dataset:  12%|█▏        | 80784/700000 [00:36<03:53, 2657.02it/s]

Tokenizing dataset:  12%|█▏        | 81054/700000 [00:36<03:51, 2669.46it/s]

Tokenizing dataset:  12%|█▏        | 81333/700000 [00:36<03:48, 2705.30it/s]

Tokenizing dataset:  12%|█▏        | 81605/700000 [00:36<03:48, 2709.19it/s]

Tokenizing dataset:  12%|█▏        | 81880/700000 [00:36<03:47, 2720.15it/s]

Tokenizing dataset:  12%|█▏        | 82162/700000 [00:36<03:44, 2748.13it/s]

Tokenizing dataset:  12%|█▏        | 82437/700000 [00:37<03:46, 2720.78it/s]

Tokenizing dataset:  12%|█▏        | 82710/700000 [00:37<03:51, 2666.65it/s]

Tokenizing dataset:  12%|█▏        | 82977/700000 [00:37<03:54, 2629.45it/s]

Tokenizing dataset:  12%|█▏        | 83241/700000 [00:37<03:56, 2611.25it/s]

Tokenizing dataset:  12%|█▏        | 83511/700000 [00:37<03:53, 2636.09it/s]

Tokenizing dataset:  12%|█▏        | 83790/700000 [00:37<03:49, 2680.58it/s]

Tokenizing dataset:  12%|█▏        | 84070/700000 [00:37<03:46, 2715.40it/s]

Tokenizing dataset:  12%|█▏        | 84355/700000 [00:37<03:43, 2755.33it/s]

Tokenizing dataset:  12%|█▏        | 84632/700000 [00:37<03:43, 2757.61it/s]

Tokenizing dataset:  12%|█▏        | 84908/700000 [00:38<03:45, 2724.76it/s]

Tokenizing dataset:  12%|█▏        | 85181/700000 [00:38<03:46, 2717.28it/s]

Tokenizing dataset:  12%|█▏        | 85453/700000 [00:38<03:47, 2702.59it/s]

Tokenizing dataset:  12%|█▏        | 85724/700000 [00:38<03:49, 2680.86it/s]

Tokenizing dataset:  12%|█▏        | 85993/700000 [00:38<03:49, 2677.78it/s]

Tokenizing dataset:  12%|█▏        | 86262/700000 [00:38<03:49, 2679.52it/s]

Tokenizing dataset:  12%|█▏        | 86530/700000 [00:38<03:51, 2653.10it/s]

Tokenizing dataset:  12%|█▏        | 86796/700000 [00:38<03:53, 2629.14it/s]

Tokenizing dataset:  12%|█▏        | 87059/700000 [00:38<03:58, 2574.42it/s]

Tokenizing dataset:  12%|█▏        | 87317/700000 [00:38<04:00, 2545.95it/s]

Tokenizing dataset:  13%|█▎        | 87572/700000 [00:39<04:05, 2497.58it/s]

Tokenizing dataset:  13%|█▎        | 87822/700000 [00:39<04:07, 2470.85it/s]

Tokenizing dataset:  13%|█▎        | 88070/700000 [00:39<04:08, 2465.32it/s]

Tokenizing dataset:  13%|█▎        | 88327/700000 [00:39<04:05, 2493.31it/s]

Tokenizing dataset:  13%|█▎        | 88581/700000 [00:39<04:03, 2506.03it/s]

Tokenizing dataset:  13%|█▎        | 88832/700000 [00:39<04:04, 2499.31it/s]

Tokenizing dataset:  13%|█▎        | 89082/700000 [00:39<04:09, 2450.40it/s]

Tokenizing dataset:  13%|█▎        | 89337/700000 [00:39<04:06, 2479.40it/s]

Tokenizing dataset:  13%|█▎        | 89592/700000 [00:39<04:04, 2498.25it/s]

Tokenizing dataset:  13%|█▎        | 89854/700000 [00:39<04:00, 2532.19it/s]

Tokenizing dataset:  13%|█▎        | 90108/700000 [00:40<04:01, 2520.31it/s]

Tokenizing dataset:  13%|█▎        | 90364/700000 [00:40<04:00, 2530.81it/s]

Tokenizing dataset:  13%|█▎        | 90618/700000 [00:40<04:03, 2504.32it/s]

Tokenizing dataset:  13%|█▎        | 90869/700000 [00:40<04:04, 2490.15it/s]

Tokenizing dataset:  13%|█▎        | 91123/700000 [00:40<04:03, 2503.20it/s]

Tokenizing dataset:  13%|█▎        | 91382/700000 [00:40<04:00, 2527.43it/s]

Tokenizing dataset:  13%|█▎        | 91656/700000 [00:40<03:54, 2589.23it/s]

Tokenizing dataset:  13%|█▎        | 91918/700000 [00:40<03:54, 2597.97it/s]

Tokenizing dataset:  13%|█▎        | 92185/700000 [00:40<03:52, 2617.64it/s]

Tokenizing dataset:  13%|█▎        | 92454/700000 [00:40<03:50, 2636.97it/s]

Tokenizing dataset:  13%|█▎        | 92722/700000 [00:41<03:49, 2648.55it/s]

Tokenizing dataset:  13%|█▎        | 92989/700000 [00:41<03:48, 2652.89it/s]

Tokenizing dataset:  13%|█▎        | 93263/700000 [00:41<03:46, 2677.64it/s]

Tokenizing dataset:  13%|█▎        | 93537/700000 [00:41<03:44, 2696.05it/s]

Tokenizing dataset:  13%|█▎        | 93807/700000 [00:41<03:47, 2658.83it/s]

Tokenizing dataset:  13%|█▎        | 94074/700000 [00:41<03:49, 2640.21it/s]

Tokenizing dataset:  13%|█▎        | 94346/700000 [00:41<03:47, 2662.03it/s]

Tokenizing dataset:  14%|█▎        | 94613/700000 [00:41<03:49, 2639.47it/s]

Tokenizing dataset:  14%|█▎        | 94879/700000 [00:41<03:48, 2644.70it/s]

Tokenizing dataset:  14%|█▎        | 95144/700000 [00:41<03:53, 2594.48it/s]

Tokenizing dataset:  14%|█▎        | 95404/700000 [00:42<03:54, 2579.26it/s]

Tokenizing dataset:  14%|█▎        | 95663/700000 [00:42<03:57, 2548.59it/s]

Tokenizing dataset:  14%|█▎        | 95919/700000 [00:42<03:57, 2539.22it/s]

Tokenizing dataset:  14%|█▎        | 96174/700000 [00:42<03:58, 2530.03it/s]

Tokenizing dataset:  14%|█▍        | 96442/700000 [00:42<03:54, 2573.59it/s]

Tokenizing dataset:  14%|█▍        | 96701/700000 [00:42<03:54, 2577.43it/s]

Tokenizing dataset:  14%|█▍        | 96961/700000 [00:42<03:53, 2584.07it/s]

Tokenizing dataset:  14%|█▍        | 97230/700000 [00:42<03:50, 2615.45it/s]

Tokenizing dataset:  14%|█▍        | 97503/700000 [00:42<03:47, 2649.63it/s]

Tokenizing dataset:  14%|█▍        | 97774/700000 [00:43<03:46, 2664.70it/s]

Tokenizing dataset:  14%|█▍        | 98052/700000 [00:43<03:43, 2698.75it/s]

Tokenizing dataset:  14%|█▍        | 98322/700000 [00:43<03:45, 2672.22it/s]

Tokenizing dataset:  14%|█▍        | 98591/700000 [00:43<03:44, 2677.01it/s]

Tokenizing dataset:  14%|█▍        | 98878/700000 [00:43<03:39, 2732.44it/s]

Tokenizing dataset:  14%|█▍        | 99152/700000 [00:43<03:41, 2717.60it/s]

Tokenizing dataset:  14%|█▍        | 99424/700000 [00:43<03:41, 2713.16it/s]

Tokenizing dataset:  14%|█▍        | 99703/700000 [00:43<03:39, 2733.32it/s]

Tokenizing dataset:  14%|█▍        | 99977/700000 [00:43<03:39, 2730.34it/s]

Tokenizing dataset:  14%|█▍        | 100251/700000 [00:43<03:39, 2732.14it/s]

Tokenizing dataset:  14%|█▍        | 100537/700000 [00:44<03:36, 2769.52it/s]

Tokenizing dataset:  14%|█▍        | 100814/700000 [00:44<03:36, 2761.85it/s]

Tokenizing dataset:  14%|█▍        | 101091/700000 [00:44<03:39, 2731.62it/s]

Tokenizing dataset:  14%|█▍        | 101372/700000 [00:44<03:37, 2754.59it/s]

Tokenizing dataset:  15%|█▍        | 101648/700000 [00:44<03:42, 2691.40it/s]

Tokenizing dataset:  15%|█▍        | 101918/700000 [00:44<03:43, 2681.44it/s]

Tokenizing dataset:  15%|█▍        | 102187/700000 [00:44<03:43, 2677.99it/s]

Tokenizing dataset:  15%|█▍        | 102455/700000 [00:44<03:43, 2673.22it/s]

Tokenizing dataset:  15%|█▍        | 102725/700000 [00:44<03:42, 2680.89it/s]

Tokenizing dataset:  15%|█▍        | 102999/700000 [00:44<03:41, 2698.44it/s]

Tokenizing dataset:  15%|█▍        | 103269/700000 [00:45<03:44, 2659.53it/s]

Tokenizing dataset:  15%|█▍        | 103536/700000 [00:45<03:44, 2661.76it/s]

Tokenizing dataset:  15%|█▍        | 103803/700000 [00:45<03:45, 2640.59it/s]

Tokenizing dataset:  15%|█▍        | 104068/700000 [00:45<03:47, 2615.61it/s]

Tokenizing dataset:  15%|█▍        | 104330/700000 [00:45<03:49, 2592.72it/s]

Tokenizing dataset:  15%|█▍        | 104590/700000 [00:45<03:50, 2582.91it/s]

Tokenizing dataset:  15%|█▍        | 104850/700000 [00:45<03:50, 2585.79it/s]

Tokenizing dataset:  15%|█▌        | 105118/700000 [00:45<03:47, 2612.05it/s]

Tokenizing dataset:  15%|█▌        | 105381/700000 [00:45<03:47, 2616.88it/s]

Tokenizing dataset:  15%|█▌        | 105643/700000 [00:45<03:49, 2587.32it/s]

Tokenizing dataset:  15%|█▌        | 105904/700000 [00:46<03:49, 2592.71it/s]

Tokenizing dataset:  15%|█▌        | 106168/700000 [00:46<03:47, 2605.08it/s]

Tokenizing dataset:  15%|█▌        | 106433/700000 [00:46<03:46, 2617.29it/s]

Tokenizing dataset:  15%|█▌        | 106721/700000 [00:46<03:40, 2693.17it/s]

Tokenizing dataset:  15%|█▌        | 106991/700000 [00:46<03:41, 2677.77it/s]

Tokenizing dataset:  15%|█▌        | 107261/700000 [00:46<03:40, 2683.16it/s]

Tokenizing dataset:  15%|█▌        | 107537/700000 [00:46<03:38, 2705.34it/s]

Tokenizing dataset:  15%|█▌        | 107814/700000 [00:46<03:37, 2721.90it/s]

Tokenizing dataset:  15%|█▌        | 108087/700000 [00:46<03:37, 2716.72it/s]

Tokenizing dataset:  15%|█▌        | 108374/700000 [00:46<03:34, 2761.92it/s]

Tokenizing dataset:  16%|█▌        | 108657/700000 [00:47<03:32, 2780.53it/s]

Tokenizing dataset:  16%|█▌        | 108947/700000 [00:47<03:30, 2813.73it/s]

Tokenizing dataset:  16%|█▌        | 109229/700000 [00:47<06:53, 1429.18it/s]

Tokenizing dataset:  16%|█▌        | 109493/700000 [00:47<05:58, 1644.97it/s]

Tokenizing dataset:  16%|█▌        | 109763/700000 [00:47<05:17, 1858.70it/s]

Tokenizing dataset:  16%|█▌        | 110018/700000 [00:47<04:52, 2013.71it/s]

Tokenizing dataset:  16%|█▌        | 110276/700000 [00:47<04:34, 2150.99it/s]

Tokenizing dataset:  16%|█▌        | 110534/700000 [00:48<04:20, 2260.61it/s]

Tokenizing dataset:  16%|█▌        | 110788/700000 [00:48<04:12, 2335.52it/s]

Tokenizing dataset:  16%|█▌        | 111044/700000 [00:48<04:05, 2397.81it/s]

Tokenizing dataset:  16%|█▌        | 111298/700000 [00:48<04:01, 2433.69it/s]

Tokenizing dataset:  16%|█▌        | 111552/700000 [00:48<04:00, 2444.15it/s]

Tokenizing dataset:  16%|█▌        | 111804/700000 [00:48<03:59, 2452.92it/s]

Tokenizing dataset:  16%|█▌        | 112055/700000 [00:48<03:59, 2459.67it/s]

Tokenizing dataset:  16%|█▌        | 112305/700000 [00:48<03:59, 2454.64it/s]

Tokenizing dataset:  16%|█▌        | 112553/700000 [00:48<03:59, 2450.74it/s]

Tokenizing dataset:  16%|█▌        | 112816/700000 [00:48<03:54, 2501.51it/s]

Tokenizing dataset:  16%|█▌        | 113072/700000 [00:49<03:53, 2517.06it/s]

Tokenizing dataset:  16%|█▌        | 113344/700000 [00:49<03:47, 2575.88it/s]

Tokenizing dataset:  16%|█▌        | 113603/700000 [00:49<03:48, 2565.65it/s]

Tokenizing dataset:  16%|█▋        | 113883/700000 [00:49<03:42, 2632.84it/s]

Tokenizing dataset:  16%|█▋        | 114147/700000 [00:49<03:43, 2620.24it/s]

Tokenizing dataset:  16%|█▋        | 114410/700000 [00:49<03:44, 2605.53it/s]

Tokenizing dataset:  16%|█▋        | 114686/700000 [00:49<03:40, 2649.53it/s]

Tokenizing dataset:  16%|█▋        | 114956/700000 [00:49<03:39, 2662.67it/s]

Tokenizing dataset:  16%|█▋        | 115227/700000 [00:49<03:38, 2676.63it/s]

Tokenizing dataset:  16%|█▋        | 115495/700000 [00:50<03:45, 2596.10it/s]

Tokenizing dataset:  17%|█▋        | 115756/700000 [00:50<03:46, 2575.50it/s]

Tokenizing dataset:  17%|█▋        | 116014/700000 [00:50<03:49, 2541.45it/s]

Tokenizing dataset:  17%|█▋        | 116269/700000 [00:50<03:51, 2523.46it/s]

Tokenizing dataset:  17%|█▋        | 116523/700000 [00:50<03:50, 2526.90it/s]

Tokenizing dataset:  17%|█▋        | 116776/700000 [00:50<03:50, 2525.56it/s]

Tokenizing dataset:  17%|█▋        | 117029/700000 [00:50<03:51, 2520.53it/s]

Tokenizing dataset:  17%|█▋        | 117282/700000 [00:50<03:51, 2521.45it/s]

Tokenizing dataset:  17%|█▋        | 117539/700000 [00:50<03:49, 2534.77it/s]

Tokenizing dataset:  17%|█▋        | 117794/700000 [00:50<03:49, 2538.22it/s]

Tokenizing dataset:  17%|█▋        | 118057/700000 [00:51<03:47, 2562.93it/s]

Tokenizing dataset:  17%|█▋        | 118314/700000 [00:51<03:47, 2557.61it/s]

Tokenizing dataset:  17%|█▋        | 118576/700000 [00:51<03:45, 2575.84it/s]

Tokenizing dataset:  17%|█▋        | 118834/700000 [00:51<03:46, 2569.79it/s]

Tokenizing dataset:  17%|█▋        | 119091/700000 [00:51<03:46, 2565.86it/s]

Tokenizing dataset:  17%|█▋        | 119348/700000 [00:51<03:49, 2530.81it/s]

Tokenizing dataset:  17%|█▋        | 119619/700000 [00:51<03:44, 2582.55it/s]

Tokenizing dataset:  17%|█▋        | 119878/700000 [00:51<03:45, 2574.71it/s]

Tokenizing dataset:  17%|█▋        | 120136/700000 [00:51<03:47, 2550.47it/s]

Tokenizing dataset:  17%|█▋        | 120392/700000 [00:51<04:08, 2330.54it/s]

Tokenizing dataset:  17%|█▋        | 120629/700000 [00:52<04:32, 2126.73it/s]

Tokenizing dataset:  17%|█▋        | 120863/700000 [00:52<04:25, 2181.12it/s]

Tokenizing dataset:  17%|█▋        | 121130/700000 [00:52<04:10, 2313.22it/s]

Tokenizing dataset:  17%|█▋        | 121379/700000 [00:52<04:04, 2362.26it/s]

Tokenizing dataset:  17%|█▋        | 121648/700000 [00:52<03:55, 2453.72it/s]

Tokenizing dataset:  17%|█▋        | 121927/700000 [00:52<03:46, 2550.10it/s]