In [1]:
from datasets import load_dataset

ds = load_dataset("simecek/Human_DNA_v0")

Using custom data configuration simecek--Human_DNA_v0-3127ba11a87ac1a1
Reusing dataset parquet (/home/jovyan/.cache/huggingface/datasets/simecek___parquet/simecek--Human_DNA_v0-3127ba11a87ac1a1/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
def kmers_stride1(s, k=6):
    return [s[i:i + k] for i in range(0, len(s)-k+1)]

def batch_iterator(k=6, max_seqs=1_000):
    for i in range(0, min(max_seqs, len(ds["train"]))):
        yield kmers_stride1(ds["train"][i]['Seq'], k=k)    

## Training from scratch with `tokenizers` library

I am able to train WordLevel tokenizer but it is slow and I still clearly miss some pieces (e.g. my tokenizer is not callable).

In [3]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

In [4]:
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.Strip()
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
#tokenizer.decoder = decoders.Whitespace()

trainer = trainers.WordLevelTrainer(
    min_frequency=3,
    show_progress=True,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

In [5]:
%%time
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(ds["train"]))

CPU times: user 1min 1s, sys: 29.6 s, total: 1min 31s
Wall time: 12 s


In [6]:
output = tokenizer.encode("ACCTGA CCCGGG")

In [7]:
output.tokens

['ACCTGA', 'CCCGGG']

In [8]:
output.ids

[1641, 2872]

In [9]:
tokenizer.token_to_id("ACCTGA")

1641

In [10]:
output = tokenizer.encode_batch(["ACCTGA CCCGGG", "CCCGGG ACCTGA"])
output[1].tokens

['CCCGGG', 'ACCTGA']

In [11]:
output[1].attention_mask

[1, 1]

## Retraining old tokenizer

This seems to work but on closer inspection it breaks 7-character word into several tokens. So it is not the best solution after all.

In [12]:
from transformers import AutoTokenizer

old_tokenizer =  AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

In [13]:
%%time
K = 7

new_tokenizer = old_tokenizer.train_new_from_iterator(batch_iterator(k=K, max_seqs=1000), vocab_size=4**K+5)




CPU times: user 47.2 s, sys: 4.8 s, total: 52 s
Wall time: 6.87 s


In [20]:
def tokenize_function(s, k=6):
    seq_split = " ".join(kmers_stride1(s['Seq'], k))
    return new_tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'}, k=K)

{'input_ids': [2, 6349, 3110, 3361, 3197, 10016, 13645, 15506, 10920, 11, 21, 836, 105, 13083, 13447, 10313, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [21]:
new_tokenizer.decode(tokenize_function({'Seq':'ACCTGCTGGACGATCATA'}, k=7)['input_ids'])

'[CLS] ACCTGCT CCTGCTG CTGCTGG TGCTGGA GCTGGAC CTGGACG TGGACGA GGACGAT GACGATC ACGATCA CGATCAT GATCATA [SEP]'

In [22]:
dl = list(old_tokenizer.vocab.items())
sorted(dl, key=lambda x: x[1])[:10]

[('[PAD]', 0),
 ('[UNK]', 1),
 ('[CLS]', 2),
 ('[SEP]', 3),
 ('[MASK]', 4),
 ('AAAAAA', 5),
 ('AAAAAT', 6),
 ('AAAAAC', 7),
 ('AAAAAG', 8),
 ('AAAATA', 9)]

## Contructing a tokenizer from the vocab file

This would seems the most promising way but it is not working for me somehow.

In [28]:
from transformers import BertTokenizer, BertTokenizerFast, PreTrainedTokenizerFast, ElectraTokenizer, ElectraTokenizerFast

In [23]:
from itertools import product

K = 7

alphabet = ('A', 'C', 'T', 'G')
vocab = list(map(''.join, product(alphabet, repeat=K)))

print(len(vocab), 4**K)
vocab[1:7]



16384 16384


['AAAAAAC', 'AAAAAAT', 'AAAAAAG', 'AAAAACA', 'AAAAACC', 'AAAAACT']

In [24]:
full_vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] + vocab
vocab_dict = {k: i for i, k in enumerate(full_vocab)}
#vocab_dict 

In [25]:
old_tokenizer.vocab = vocab_dict 

AttributeError: can't set attribute

In [26]:
with open('vocab.txt', 'w') as f:
    for item in vocab:
        f.write("%s\n" % item)

In [29]:
BertTokenizer("vocab.txt")

PreTrainedTokenizer(name_or_path='', vocab_size=16384, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## Just add new words to the dictionary of the old tokenizer

In [31]:
old_tokenizer.add_tokens(vocab)

16384

In [32]:
def tokenize_function(s, k=6):
    seq_split = " ".join(kmers_stride1(s['Seq'], k))
    return old_tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'}, k=K)

{'input_ids': [2, 5563, 9952, 11124, 15809, 18166, 11212, 16161, 19575, 16846, 5929, 11415, 16973, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [33]:
old_tokenizer.decode(tokenize_function({'Seq':'ACCTGCTGGACGATCATA'}, k=7)['input_ids'])

'[CLS] ACCTGCT CCTGCTG CTGCTGG TGCTGGA GCTGGAC CTGGACG TGGACGA GGACGAT GACGATC ACGATCA CGATCAT GATCATA [SEP]'

In [34]:
# but this is really suspicious
old_tokenizer.vocab_size

4101