# Notebook to check that all tokenization methods don't exceed the maximum context window of the models

In [None]:
import json
from tqdm import tqdm
from method_mapping import *
from custom_tokenizer_abstract import *
from custom_models import load_custom_class

# Evaluation files to scan; each line of a file is one JSON object.
data_paths = [
    "data/multinli_1.0/multinli_1.0_dev_matched.jsonl",
    "data/multinli_1.0/multinli_1.0_dev_mismatched.jsonl",
    "data/snli_1.0/snli_1.0_test.jsonl",
]
# Running collection of every example read so far, across all files.
data = {
    "label": [],
    "sent1": [],
    "sent2": [],
}
for data_path in data_paths:
    # Remember where this file's examples start inside the shared lists, so
    # that the check below covers only this file. Iterating over the whole
    # accumulated `data` would re-tokenize every previously checked file
    # while the progress bar names only the current one.
    first_new_index = len(data["sent1"])

    # Explicit encoding so reading does not depend on the platform default.
    with open(data_path, "r", encoding="utf-8") as file:
        for line in file:
            aux_dict = json.loads(line)
            data["label"].append(aux_dict["gold_label"])
            data["sent1"].append(aux_dict["sentence1"])  # premise
            data["sent2"].append(aux_dict["sentence2"])  # hypothesis

    file_premises = data["sent1"][first_new_index:]
    file_hypotheses = data["sent2"][first_new_index:]
    total = len(file_premises)

    # Check every (tokenization method, model) combination on this file.
    for tok_method in TOK_METHOD_MAP.values():
        for model_name, model_parameters in MODEL_MAP.items():
            tokenizer_nli = load_custom_class(model_parameters["model_link"], load_model=False)
            custom_tokenizer = CustomTokenizerGeneral(
                tokenizer_nli,
                tok_method,
                separator=model_parameters["separator_marker"],
                special_space_token=model_parameters["special_space_token"],
                max_length=model_parameters["max_length"],
            )

            for premise, hypothesis in tqdm(zip(file_premises, file_hypotheses), total=total, desc=f"{model_name} --- {tok_method.__name__} -- {data_path.split('/')[-1]}"):
                tok_ids = custom_tokenizer((premise, hypothesis))["input_ids"]
                # Stop at the first pair whose token count exceeds the model's limit.
                assert len(tok_ids) <= model_parameters["max_length"], f"More than {model_parameters['max_length']} tokens; observed {len(tok_ids)} tokens."



# Remarks
* DistilRoBERTa, BART, and MiniLM2 display tokens with a blank space in front, but their vocabularies actually use "Ġ" instead of " "; we call this character the <tt>special_space_token</tt>.
* Some tokenizers also use separation indicators (which we term <tt>separator_marker</tt>) to show which subwords were part of an original word (e.g., "dog" -> "d ##og"); the models mentioned above, however, do not mark this separation, so we represent their marker as "" and do not modify the strings further.
* DistilRoBERTa and MiniLM2 have the same type of tokenizer processor (based on RoBERTa), while BART has its own.

In [4]:
# Print, for every configured model, how its native tokenizer splits a
# sentence pair into individual tokens.
test_sentence = (
    "This is a test sentence to showcase",
    "How the tokenizers running natively separate tokens stylishly.",
)
for model_name, model_parameters in MODEL_MAP.items():
    # Only the tokenizer is requested here (load_model=False).
    tokenizer_nli: AutoTokenizer = load_custom_class(model_parameters["model_link"], load_model=False)
    first_text, second_text = test_sentence
    pair_ids = tokenizer_nli.encode(first_text, second_text)
    # Decode each id on its own so every token appears as a separate string.
    decoded_tokens = list(map(tokenizer_nli.decode, pair_ids))
    print(decoded_tokens)

['<s>', 'This', ' is', ' a', ' test', ' sentence', ' to', ' showcase', '</s>', '</s>', 'How', ' the', ' token', 'izers', ' running', ' native', 'ly', ' separate', ' tokens', ' stylish', 'ly', '.', '</s>']
['<s>', 'This', ' is', ' a', ' test', ' sentence', ' to', ' showcase', '</s>', '</s>', 'How', ' the', ' token', 'izers', ' running', ' native', 'ly', ' separate', ' tokens', ' stylish', 'ly', '.', '</s>']
['<s>', 'This', ' is', ' a', ' test', ' sentence', ' to', ' showcase', '</s>', '</s>', 'How', ' the', ' token', 'izers', ' running', ' native', 'ly', ' separate', ' tokens', ' stylish', 'ly', '.', '</s>']
