# Notebook to check that all tokenization methods don't exceed the maximum context window of the models

In [1]:
import json
from tqdm import tqdm
from tokenization_methods.method_mapping import *
from custom_tokenizer_abstract import *
from custom_models import load_custom_class

# Evaluation splits to scan; each is a JSON Lines file (one JSON object per line).
data_paths = [
    "data/multinli_1.0/multinli_1.0_dev_matched.jsonl",
    "data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl",
    "data/snli_1.0/snli_1.0_test.jsonl",
]
for data_path in data_paths:
    # Column-wise store for the current split; rebuilt from scratch for every file.
    data = {
        "label": [],
        "sent1": [],
        "sent2": [],
    }
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(data_path, "r", encoding="utf-8") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank/trailing lines instead of crashing in json.loads.
                continue
            aux_dict = json.loads(line)
            data["label"].append(aux_dict["gold_label"])
            data["sent1"].append(aux_dict["sentence1"])  # premise
            data["sent2"].append(aux_dict["sentence2"])  # hypothesis

    # Values that do not change across tokenization methods / models for this split.
    total = len(data["sent1"])
    dataset_name = data_path.split('/')[-1]

    for tok_method in TOK_METHOD_MAP.values():
        for model_name, model_parameters in MODEL_MAP.items():
            max_length = model_parameters["max_length"]
            # Only the tokenizer is needed for a length check, hence load_model=False.
            tokenizer_nli = load_custom_class(model_parameters["model_link"], load_model=False)
            custom_tokenizer = CustomTokenizerGeneral(
                tokenizer_nli,
                tok_method,
                separator=model_parameters["separator_marker"],
                special_space_token=model_parameters["special_space_token"],
                max_length=max_length,
            )

            progress = tqdm(
                zip(data["sent1"], data["sent2"]),
                total=total,
                desc=f"{model_name} --- {tok_method.__name__} -- {dataset_name}",
            )
            for premise, hypothesis in progress:
                tok_ids = custom_tokenizer((premise, hypothesis))["input_ids"]
                # Fail fast as soon as one tokenized pair is longer than the
                # max_length configured for this model.
                assert len(tok_ids) <= max_length, f"More than {max_length} tokens; observed {len(tok_ids)} tokens."



Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_noun -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:37<00:00, 102.69it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_noun -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:34<00:00, 106.22it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_noun -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:33<00:00, 106.94it/s]


Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_verb -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:31<00:00, 109.38it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_verb -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:32<00:00, 107.57it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_verb -- multinli_1.0_dev_matched.jsonl: 100%|██████████| 10000/10000 [01:32<00:00, 108.22it/s]


Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_noun -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:36<00:00, 103.57it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_noun -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:36<00:00, 104.11it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_noun -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:35<00:00, 105.10it/s]


Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_verb -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:34<00:00, 106.04it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_verb -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:34<00:00, 106.25it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_verb -- multinli_1.0_dev_mismatched.jsonl: 100%|██████████| 10000/10000 [01:33<00:00, 106.56it/s]


Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_noun -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:20<00:00, 124.78it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_noun -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:17<00:00, 129.49it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_noun -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:15<00:00, 132.12it/s]


Tensors and operations will be done on cuda:0.


roberta --- adverserial_pos_synonym_verb -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:12<00:00, 138.30it/s]


Tensors and operations will be done on cuda:0.


bart --- adverserial_pos_synonym_verb -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:13<00:00, 136.44it/s]


Tensors and operations will be done on cuda:0.


minilm --- adverserial_pos_synonym_verb -- snli_1.0_test.jsonl: 100%|██████████| 10000/10000 [01:13<00:00, 136.17it/s]


# Remarks
* Although DistilRoBERTa, BART, and MiniLM2 display tokens with a leading blank space, their vocabularies actually use "Ġ" instead of " "; we call this character the <tt>special_space_token</tt>.
* Tokenizers may also have separation indicators (which we term <tt>separator_marker</tt>) that show which subwords were part of an original word (e.g., "dog" -> "d ##og"); the models mentioned above, however, do not mark this separation, so we represent it as "" and do not modify the strings further.
* DistilRoBERTa and MiniLM2 have the same type of tokenizer processor (based on RoBERTa), while BART has its own.

In [None]:
# Sentence pair used to show how each model's tokenizer splits text natively.
test_sentence = (
    "This is a test sentence to showcase",
    "How the tokenizers running natively separate tokens stylishly.",
)
for model_name, model_parameters in MODEL_MAP.items():
    tokenizer_nli: AutoTokenizer = load_custom_class(model_parameters["model_link"], load_model=False)
    # Encode the pair jointly, then decode every id on its own so that each
    # printed list entry corresponds to exactly one token.
    token_ids = tokenizer_nli.encode(*test_sentence)
    print(list(map(tokenizer_nli.decode, token_ids)))