In [50]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [132]:
import json
import ssl

import nltk

# Fetch the sentence-tokenizer resources only when they are not already
# available locally, so re-running this cell does not trigger a download
# attempt every time.
for res in ["punkt", "punkt_tab"]:
    try:
        nltk.data.find(f"tokenizers/{res}")
    except LookupError:
        nltk.download(res)

from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [133]:
# Load the pretrained "bert-base-uncased" tokenizer by name.
from transformers import AutoTokenizer
tok_hf = AutoTokenizer.from_pretrained("bert-base-uncased")
# Save the tokenizer files under ./bert-base-uncased. The call is the last
# expression in the cell, so the tuple of written file paths is displayed.
tok_hf.save_pretrained("./bert-base-uncased")

('./bert-base-uncased/tokenizer_config.json',
 './bert-base-uncased/special_tokens_map.json',
 './bert-base-uncased/vocab.txt',
 './bert-base-uncased/added_tokens.json',
 './bert-base-uncased/tokenizer.json')

In [134]:
import sys, os

# Make modules that live in the notebook's working directory importable.
# The membership check keeps sys.path from gaining a duplicate entry each
# time this cell is re-run.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from Tokenizer import FullTokenizer

In [135]:
# Tokenizer object: the project-local FullTokenizer, built from the vocab.txt
# saved under ./bert-base-uncased, with lowercasing enabled.
my_tok = FullTokenizer("./bert-base-uncased/vocab.txt", do_lower_case = True)

In [136]:
def make_tokenized_documents(raw_docs, tokenizer):
    """Turn raw text documents into nested lists of token IDs.

    Every document is split into sentences with ``sent_tokenize``; every
    sentence is passed through ``tokenizer.tokenize`` and then
    ``tokenizer.convert_tokens_to_ids``.

    Args:
        raw_docs: Iterable of document strings.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.

    Returns:
        A list with one entry per kept document. Each entry is a list of
        sentences, and each sentence is the ID sequence returned by the
        tokenizer. Sentences whose ID sequence is empty are skipped, and
        documents left with no sentences are omitted entirely.
    """
    def encode(sentence):
        # tokenize first, then map the resulting tokens to vocabulary IDs
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))

    tokenized = []
    for text in raw_docs:
        encoded_sentences = [ids for ids in map(encode, sent_tokenize(text)) if ids]
        if encoded_sentences:
            tokenized.append(encoded_sentences)
    return tokenized

In [137]:
# IDs of the special tokens, read from the Hugging Face tokenizer.
# The pad ID falls back to 0 when the tokenizer does not define one.
special_ids = dict(
    cls=tok_hf.cls_token_id,
    sep=tok_hf.sep_token_id,
    pad=0 if tok_hf.pad_token_id is None else tok_hf.pad_token_id,
    mask=tok_hf.mask_token_id,
)

In [138]:
# Vocabulary size as reported by the Hugging Face tokenizer; passed to
# build_pretraining_instances further down.
vocab_size = tok_hf.vocab_size

In [141]:
import sys, os

# Make modules that live in the notebook's working directory importable.
# The membership check keeps sys.path from gaining a duplicate entry each
# time this cell is re-run.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from bert_pretrain_data import build_pretraining_instances, BertPretrainDataset, bert_collate_fn

In [142]:
# `tokenized_documents` was not defined by any earlier cell, so this cell
# raised NameError on a fresh kernel. Build it here from a small demo corpus.
# NOTE(review): the corpus is reconstructed from the tokens printed for the
# first three instances below; the token after "bert" is masked in every
# printed instance, so "pre" is an assumption. Replace raw_docs with the real
# corpus if it differs.
raw_docs = [
    "Hello world. This is a test document.",
    "BERT pre-training uses MLM and NSP.",
]
tokenized_documents = make_tokenized_documents(raw_docs, my_tok)

# Build the pre-training instances; the fixed seed is passed through so the
# builder can make its random choices repeatable.
instances = build_pretraining_instances(
    tokenized_documents,
    special_ids=special_ids,
    vocab_size=vocab_size,
    max_seq_len=128,
    short_seq_prob=0.1,
    nsp_prob=0.5,
    mask_prob=0.15,
    seed=123,
)

In [152]:
# Examples of the first 3 instances, printed in human-readable form.

# Label value this cell treats as "no MLM target at this position".
MLM_IGNORE_LABEL = -100

for i, inst in enumerate(instances[:3]):
    print(f"---- Instance {i} ----")
    print("NSP label:", "IsNext" if inst.nsp_label == 1 else "NotNext")
    print("len(input_ids):", len(inst.input_ids))

    # Map all IDs back to token strings in one call instead of one call per ID.
    tokens = tok_hf.convert_ids_to_tokens(inst.input_ids)

    # Wrap every position that carries an MLM label in asterisks. A starred
    # token is not always "[MASK]": the recorded output also shows a starred
    # "-" that kept its original text.
    marked_tokens = [
        f"*{token}*" if label != MLM_IGNORE_LABEL else token
        for token, label in zip(tokens, inst.mlm_labels)
    ]
    print("Tokens:", " ".join(marked_tokens))
    print()

---- Instance 0 ----
NSP label: IsNext
len(input_ids): 12
Tokens: [CLS] hello world . [SEP] this *[MASK]* a test document . [SEP]

---- Instance 1 ----
NSP label: NotNext
len(input_ids): 23
Tokens: [CLS] hello world . this *[MASK]* *[MASK]* test document . [SEP] bert *[MASK]* - training uses ml ##m and ns ##p . [SEP]

---- Instance 2 ----
NSP label: NotNext
len(input_ids): 23
Tokens: [CLS] bert *[MASK]* *-* training *[MASK]* ml ##m and ns ##p . [SEP] hello world . this is a test document . [SEP]



In [153]:
# Read the vocab that tok_hf.save_pretrained("./bert-base-uncased") wrote
# earlier in this notebook, so the cell does not depend on a separate
# vocab.txt happening to exist in the working directory.
vocab_file = "./bert-base-uncased/vocab.txt"
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=True)

In [155]:
# Sanity-check the tokenizer built from the vocab file:
#   1. tokenize a sample sentence,
#   2. look up the IDs of a few known tokens (including [CLS] and [SEP]),
#   3. report the number of entries in the loaded vocabulary.
print(tokenizer.tokenize("The quick brown fox jumped over the lazy dog."))
print(tokenizer.convert_tokens_to_ids(["[CLS]", "the", "quick", "brown", "[SEP]"]))
print(len(tokenizer.vocab))

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog', '.']
[101, 1996, 4248, 2829, 102]
30522
