<a href="https://colab.research.google.com/github/ML-Bioinfo-CEITEC/cDNA-pretraining/blob/main/experiments/kmer_tokenization/BPETokenizer_for_DNA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -qq transformers datasets --quiet

In [None]:
# CONFIG - values a reader may want to tune before running the notebook
k = 10  # chunk length used by the pre-tokenizer regex; also recorded in the tokenizer name
limit = 263659  # how many training sequences to use; 263659 is the full dataset
vocab_size_limit = 30000  # vocab_size passed to BpeTrainer; 30000 is the default
hf_name = 'Vlasta'  # Hugging Face Hub namespace the trained tokenizer is loaded back from

In [2]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; the token it stores is presumably what
# authorises the push_to_hub call further down - verify if running non-interactively.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [3]:
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Split
from tokenizers.processors import BertProcessing
from tokenizers import Regex

In [4]:
# Fetch the Human DNA dataset from the Hugging Face Hub (a cached copy is reused when present).
ds = load_dataset("simecek/Human_DNA_v0")
# Take the 'Seq' column of the train split; the tokenizer is later trained on these sequences.
train_portion = ds['train']['Seq']

Using custom data configuration simecek--Human_DNA_v0-d7be3fc44fadbb72
Reusing dataset parquet (/root/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-d7be3fc44fadbb72/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
#split-to-kmers-pretokenizer
# Small demo of the pre-tokenizer used for training: the regex matches runs of
# exactly `num` letters and behavior='isolated' keeps every match as its own
# piece, so the input is cut into consecutive 3-letter chunks and a shorter
# remainder (here 'CT') is kept as the last piece.
num = 3
# In the f-string, {{ and }} are literal braces and {num} is interpolated,
# so the pattern below is "[a-zA-Z]{3}".
pre_tokenizer = Split(Regex(f"[a-zA-Z]{{{num}}}"), behavior='isolated')
pre_tokenizer.pre_tokenize_str('ACCTTTTTTAGGAGGTNNCGACT')


[('ACC', (0, 3)),
 ('TTT', (3, 6)),
 ('TTT', (6, 9)),
 ('AGG', (9, 12)),
 ('AGG', (12, 15)),
 ('TNN', (15, 18)),
 ('CGA', (18, 21)),
 ('CT', (21, 23))]

In [6]:
def compute_limit_tokens(k):
  """
  Upper bound on the vocabulary size when the dataset is cut into chunks of
  size k and the tokenizer ends up enumerating every possible token.

  Counts all strings of length 1..k over a 4-letter alphabet (the letter N is
  NOT included) and adds 5 for the special tokens.
  """
  kmer_count = sum(4**length for length in range(1, k+1))
  return kmer_count + 5

compute_limit_tokens(8)

87385

In [7]:
# Number of sequences available for training (the upper bound for `limit`).
len(train_portion)

263659

In [8]:
def batch_iterator(dataset, batch_size):
    """Yield consecutive slices of `dataset` holding up to `batch_size` items each.

    The last slice is shorter when len(dataset) is not a multiple of batch_size.
    """
    total = len(dataset)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield dataset[start:stop]

In [None]:
%%time

# Train a BPE tokenizer on the DNA sequences. Each sequence is first cut into
# chunks of k letters by the pre-tokenizer, and the trained vocabulary size is
# then compared with the theoretical maximum for that k.

# Name the tokenizer is published under; records the vocab limit and k.
tokenizer_name = f"DNA_BPE_max_vocab_{vocab_size_limit}_maxk_{k}"

unk_token = "[UNK]"
spl_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

tokenizer = Tokenizer(BPE(unk_token = unk_token))

# Same k-letter chunking as in the demo cell, now with the configured k.
pre_tokenizer = Split(Regex(f"[a-zA-Z]{{{k}}}"), behavior='isolated')
tokenizer.pre_tokenizer = pre_tokenizer
# The [SEP]/[CLS] ids are taken from their positions in spl_tokens.
# NOTE(review): this assumes the trainer assigns special-token ids in list order - confirm.
tokenizer.post_processor = BertProcessing(sep=('[SEP]', spl_tokens.index('[SEP]')), cls=('[CLS]', spl_tokens.index('[CLS]')))
# Padding experiments, currently disabled:
# tokenizer.enable_padding(length=30)
# tokenizer.enable_padding()
# print(tokenizer.padding)

# Training corpus: the first `limit` sequences.
TRA = train_portion[:limit]

#Default vocab_size_limit is 30000
trainer = BpeTrainer(special_tokens = spl_tokens, vocab_size=vocab_size_limit)
tokenizer.train_from_iterator(TRA, trainer)
# Batched alternative, currently disabled:
# tokenizer.train_from_iterator(batch_iterator(TRA, batch_size=1000), trainer)


# If the two numbers below are equal, BPE merely enumerated every possible
# combination instead of selecting a subset of tokens.
print(f"BPE tokenizer vocab size: {tokenizer.get_vocab_size()}")
print(f"Theoretical max vocab size: {compute_limit_tokens(k)}")



In [None]:
# Learned vocabulary without the special tokens, ordered by (length, token).
# Special tokens such as "[MASK]" are 5-6 characters long and would otherwise be
# reported as the longest token whenever every learned token is shorter than
# that. The alphabetical tie-break keeps the printed sample the same on every run.
tokens = sorted(
    (tok for tok in tokenizer.get_vocab() if tok not in spl_tokens),
    key=lambda tok: (len(tok), tok),
)
max_token_len = len(tokens[-1])
print('SHORTEST TOKENS\n', *tokens[:10])
print(f'\nLONGEST TOKENS (max {max_token_len})', *tokens[-10:], sep='\n')
# Drop any suffix added by a previous run of this cell before appending it, so
# re-running the cell does not keep growing the name (and change the Hub repo).
base_name = tokenizer_name.split('_max_tokenlen_')[0]
tokenizer_name = f'{base_name}_max_tokenlen_{max_token_len}'

In [None]:
# Round-trip a short probe sequence (containing N's and a [MASK]) through the
# trained tokenizer: show the ids, the tokens, and the decoded text.
demo_sequence = 'NNACTGACACGAAAAAAAGGGC[MASK]GCGCAACTCCAG'
encoding = tokenizer.encode(demo_sequence)
token_ids = encoding.ids
print(token_ids)
print(encoding.tokens)
print(tokenizer.decode(token_ids))

In [None]:
from transformers import PreTrainedTokenizerFast

# Which vocabulary entry plays which special-token role in the wrapper.
special_token_roles = {
    'pad_token': '[PAD]',
    'mask_token': '[MASK]',
    'sep_token': '[SEP]',
    'cls_token': '[CLS]',
    'unk_token': '[UNK]',
}

# Wrap the trained tokenizer so it can be used through the transformers API.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    model_max_length=512,
    padding_side='right',
    truncation_side='right',
    **special_token_roles,
)

# Publish the wrapped tokenizer on the Hugging Face Hub under tokenizer_name.
fast_tokenizer.push_to_hub(tokenizer_name)

In [None]:
from transformers import AutoTokenizer


def show_tokenizer_behavior(title, hf_tokenizer, long_text, short_text):
    """Print how a tokenizer encodes and decodes two probe sequences.

    Args:
        title: Heading printed (after a blank line) before the report.
        hf_tokenizer: Callable tokenizer that also exposes ``decode``,
            ``special_tokens_map`` and ``padding_side``.
        long_text: Probe encoded with default call arguments.
        short_text: Probe encoded with ``padding=True``.
    """
    long_encoded = hf_tokenizer(long_text)
    # padding=True pads to the longest sequence in the batch, so no padding
    # shows up when only a single sequence is passed.
    short_encoded = hf_tokenizer(short_text, padding=True)

    print(f"\n{title}")
    for encoded in (long_encoded, short_encoded):
        print(encoded)
        print(hf_tokenizer.decode(encoded['input_ids']))
    print(hf_tokenizer.special_tokens_map)
    print(hf_tokenizer.padding_side)


# Load the tokenizer pushed above back from the Hub, plus DNABERT's as a reference.
myDownloadedTokenizer = AutoTokenizer.from_pretrained(f"{hf_name}/{tokenizer_name}")
dnabert_tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

# Probes: one long input of repeated space-separated 6-mers and one short input.
long_sequence = '[MASK]' + 'AAAAAA '*1000
short_sequence = 'NNACTGACACGAAAAAAAGGGC[MASK]GCGCAACTCCAG'

show_tokenizer_behavior("DNABERT BEHAVIOR", dnabert_tokenizer, long_sequence, short_sequence)
show_tokenizer_behavior("OURS BEHAVIOR", myDownloadedTokenizer, long_sequence, short_sequence)



