# Data Tokenization

- G-quadruplex data from: https://github.com/ML-Bioinfo-CEITEC/penguinn/tree/master/Datasets
- Tokenization from: https://github.com/ML-Bioinfo-CEITEC/ECCB2022/blob/main/notebooks/03_Transformers_and_transfer_learning.ipynb

In [2]:
!pip install transformers datasets tokenizers --quiet

In [3]:
# Hugging Face Hub identifiers: the tokenizer checkpoint to load and the
# dataset repository the tokenized data is pushed to.
TOKENIZER = "armheb/DNA_bert_6"
DATASET = 'roa7n/G_quad_DNA_tokenized_K6'

# Local train/test text files; both live in the same directory.
DATA_DIR = '/home/jovyan/data/g_quad'
TRAIN_PATH = f'{DATA_DIR}/train_set_1_1.txt'
TEST_PATH = f'{DATA_DIR}/test_set_1_1.txt'

In [4]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. Presumably required so that the
# push_to_hub call at the end of the notebook can authenticate -- confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load the Tokenizer:

In [5]:
from transformers import BertTokenizer

# Load the pretrained tokenizer named by TOKENIZER. `num_labels` is a
# model/config option and has no meaning for a tokenizer, so it is not passed.
tokenizer = BertTokenizer.from_pretrained(TOKENIZER)
tokenizer

PreTrainedTokenizer(name_or_path='armheb/DNA_bert_6', vocab_size=4101, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

#### Define custom K-mer tokenization:

In [6]:
def kmers(s, k=6):
  """Split a sequence into all overlapping substrings of length k (stride 1).

  Args:
    s: Sequence to split; anything that supports len() and slicing,
      here a DNA string.
    k: Length of each k-mer. Must be a positive integer.

  Returns:
    List of the len(s) - k + 1 consecutive k-mers in order of position;
    an empty list when s is shorter than k.

  Raises:
    ValueError: If k is smaller than 1.
  """
  if k < 1:
    # k == 0 would yield len(s) + 1 empty strings and a negative k produces
    # meaningless slices; neither is a valid k-mer size.
    raise ValueError(f'k must be a positive integer, got {k}')
  return [s[start:start + k] for start in range(len(s) - k + 1)]

def tokenization(x):
  """Tokenize one example.

  The sequence stored under x["seq"] is split into overlapping k-mers
  (default k of `kmers`), the k-mers are joined with single spaces and the
  resulting string is passed to the module-level `tokenizer`.

  Returns whatever the tokenizer call returns for that string.
  """
  kmer_sentence = " ".join(kmers(x["seq"]))
  return tokenizer(kmer_sentence)

# Walk one short sequence through the whole pipeline:
# raw sequence -> space-joined k-mers -> token ids -> decoded tokens.
example = {'seq': 'ATGGAAAGAGGCACCATTCT'}
example_kmers = " ".join(kmers(example['seq']))
tokenized_example = tokenization(example)
decoded_example = tokenizer.decode(tokenized_example['input_ids'])

# Report every stage in pipeline order.
print(f'Example: {example}')
print(f'Example_kmers: {example_kmers}')
print(f'Tokenization example: {tokenized_example}')
print(f'Decoded tokenized example: {decoded_example}')

Example: {'seq': 'ATGGAAAGAGGCACCATTCT'}
Example_kmers: ATGGAA TGGAAA GGAAAG GAAAGA AAAGAG AAGAGG AGAGGC GAGGCA AGGCAC GGCACC GCACCA CACCAT ACCATT CCATTC CATTCT
Tokenization example: {'input_ids': [2, 501, 1989, 3848, 3089, 56, 212, 835, 3325, 999, 3983, 3629, 2214, 650, 2587, 2142, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decoded tokenized example: [CLS] ATGGAA TGGAAA GGAAAG GAAAGA AAAGAG AAGAGG AGAGGC GAGGCA AGGCAC GGCACC GCACCA CACCAT ACCATT CCATTC CATTCT [SEP]


### Tokenize the data:

In [7]:
import pandas as pd

# Read the header-less training file as fixed-width columns and name the
# two resulting columns in a single expression.
df_train = (
    pd.read_fwf(TRAIN_PATH, header=None)
    .set_axis(['seq', 'label'], axis='columns')
)
df_train

Unnamed: 0,seq,label
0,TCATAGACCCGGTCTCATATGACAAGGAGGGGCATGTCAGACAGTA...,positive
1,NNNNACGAGATCACACAGGTTCTCCGTCGTACACCTCAGTTTTTTC...,positive
2,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTCCTACCAGAG...,positive
3,TTTGAATATAAAAAAGTACCTAGAAGAATAAAAGGCACATTCCTTT...,negative
4,NNNNNNNNNNNNNNNGCCTCCCAAACTGCTGGGATTACAGGCGTGA...,negative
...,...,...
299995,CTCTATAAAAGAATTTTAAAATAAGAATTTTCCATGTAAATTAAAG...,negative
299996,NNNNNNNNNNNNNNNNNNCAACAGCAGAGTGTTAAACCAAGTGTGA...,negative
299997,GCCACAGCATTGGAGGAAGGCCTCTCTAAGGCAAGATGTCAGCACT...,negative
299998,NNNNNNNNNNNNNNNACATAAAAGCTACATTTTCAATACAAGATAA...,negative


In [8]:
# rename labels (str -> int)
# The cell overwrites the column it reads, so it must survive being re-run:
# an already-converted 1 is kept as 1 instead of silently becoming 0.
# Any other value (including 'negative') still maps to 0.
df_train['label'] = df_train['label'].map(lambda x: 1 if x in (1, 'positive') else 0)
df_train

Unnamed: 0,seq,label
0,TCATAGACCCGGTCTCATATGACAAGGAGGGGCATGTCAGACAGTA...,1
1,NNNNACGAGATCACACAGGTTCTCCGTCGTACACCTCAGTTTTTTC...,1
2,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGTCCTACCAGAG...,1
3,TTTGAATATAAAAAAGTACCTAGAAGAATAAAAGGCACATTCCTTT...,0
4,NNNNNNNNNNNNNNNGCCTCCCAAACTGCTGGGATTACAGGCGTGA...,0
...,...,...
299995,CTCTATAAAAGAATTTTAAAATAAGAATTTTCCATGTAAATTAAAG...,0
299996,NNNNNNNNNNNNNNNNNNCAACAGCAGAGTGTTAAACCAAGTGTGA...,0
299997,GCCACAGCATTGGAGGAAGGCCTCTCTAAGGCAAGATGTCAGCACT...,0
299998,NNNNNNNNNNNNNNNACATAAAAGCTACATTTTCAATACAAGATAA...,0


In [10]:
# Load the header-less test file, name its two columns and turn the string
# labels into integers: 'positive' -> 1, everything else -> 0.
df_test = (
    pd.read_fwf(TEST_PATH, header=None)
    .set_axis(['seq', 'label'], axis='columns')
    .assign(label=lambda frame: frame['label'].eq('positive').astype('int64'))
)
df_test

Unnamed: 0,seq,label
0,NNNNNNNNNNNATGTTTATTTTGACTATTTACCACTGTTCTTGGTG...,0
1,AACCCGGGTCCCCTGGGTCCGGGGTGGGGTCGGTTAAGGTAGTGGT...,1
2,NNNNTTGTTCTCTTCTTAGTTCCTTGAGGAGCAACATTATGTTATT...,0
3,NNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGGAGGTACAGATGAG...,1
4,NNNCATGTACCTTATGCATTCAAATTGATGACTTGCTGACTTGTGT...,0
...,...,...
99995,NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...,0
99996,GATATGGACAATAAGGTCCAGGCTGACGTGGTGTCAGATGGAGATG...,0
99997,CAGCCCTGAGGGCAGGACTTGACCCCAGGCTTCCGGGCACTTACCT...,0
99998,CTTGCGTCACGTATGGCCTCATGTCTGTGCACAGGTCATGGAGACT...,0


In [11]:
from datasets import Dataset, DatasetDict

# Convert the training DataFrame into a `datasets.Dataset`; the displayed
# result lists its features and row count.
dataset_train = Dataset.from_pandas(df_train)
dataset_train

Dataset({
    features: ['seq', 'label'],
    num_rows: 300000
})

In [10]:
# Convert the test DataFrame into a `datasets.Dataset`, mirroring the
# training split.
dataset_test = Dataset.from_pandas(df_test)
dataset_test

Dataset({
    features: ['seq', 'label'],
    num_rows: 100000
})

In [11]:
# Sanity check: show the first training example as a dict holding the raw
# sequence ('seq') and its integer label ('label').
dataset_train[0]

{'seq': 'TCATAGACCCGGTCTCATATGACAAGGAGGGGCATGTCAGACAGTACCGAAGAAAACTGATCCTTTCCCTTTAGGGGGTCGGGGAACACGAAGGACCCACTCTGCTACGGGGTGGGACGAAGTCGGGGGGGAGGTACTCGACATGGGTGATAGGTTGGTCAGGGTTACCCTACTTGAACCATGGAGTCTACCTTTACGTC',
 'label': 1}

In [12]:
# Run `tokenization` over every example of both splits, one example per call
# (batched=False). The tokenizer's outputs are added as new columns next to
# the original 'seq' and 'label' columns.
dataset_train_tokenized = dataset_train.map(tokenization, batched=False)
dataset_test_tokenized = dataset_test.map(tokenization, batched=False)
dataset_train_tokenized

  0%|          | 0/300000 [00:00<?, ?ex/s]

  0%|          | 0/100000 [00:00<?, ?ex/s]

Dataset({
    features: ['seq', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 300000
})

In [13]:
# Bundle both tokenized splits into one DatasetDict keyed by split name.
dataset_tokenized = DatasetDict({
    'train': dataset_train_tokenized,
    'test': dataset_test_tokenized,
})

dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['seq', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 300000
    })
    test: Dataset({
        features: ['seq', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100000
    })
})

In [14]:
# Upload both splits to the Hub dataset repository named by DATASET.
dataset_tokenized.push_to_hub(DATASET)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Check the structure of the tokenized dataset:

In [15]:
# Raw sequence of the first training example, kept alongside the token ids.
dataset_tokenized['train'][0]['seq']

'TCATAGACCCGGTCTCATATGACAAGGAGGGGCATGTCAGACAGTACCGAAGAAAACTGATCCTTTCCCTTTAGGGGGTCGGGGAACACGAAGGACCCACTCTGCTACGGGGTGGGACGAAGTCGGGGGGGAGGTACTCGACATGGGTGATAGGTTGGTCAGGGTTACCCTACTTGAACCATGGAGTCTACCTTTACGTC'

In [16]:
# Token ids produced for that same first training example.
print(dataset_tokenized['train'][0]['input_ids'])

[2, 1560, 2129, 311, 1231, 815, 3248, 692, 2754, 2811, 3038, 3947, 3485, 1638, 2441, 1558, 2124, 289, 1143, 461, 1829, 3208, 532, 2113, 248, 980, 3908, 3332, 1027, 4093, 4070, 3980, 3618, 2171, 477, 1896, 3473, 1591, 2253, 808, 3218, 569, 2263, 847, 3376, 1201, 693, 2760, 2833, 3125, 197, 773, 3079, 14, 44, 161, 630, 2507, 1823, 3182, 426, 1690, 2651, 2399, 1391, 1454, 1706, 2714, 2649, 2392, 1364, 1348, 1284, 1028, 4098, 4091, 4064, 3956, 3524, 1796, 3073, 4085, 4039, 3853, 3111, 144, 561, 2229, 712, 2836, 3137, 247, 975, 3887, 3245, 679, 2702, 2603, 2206, 620, 2467, 1662, 2537, 1943, 3664, 2356, 1220, 772, 3074, 4092, 4068, 3972, 3585, 2039, 4048, 3889, 3253, 712, 2834, 3131, 224, 884, 3524, 1796, 3076, 4100, 4100, 4097, 4088, 4052, 3906, 3321, 983, 3918, 3371, 1184, 625, 2487, 1741, 2854, 3212, 548, 2180, 514, 2044, 4065, 3958, 3529, 1816, 3156, 322, 1274, 988, 3940, 3458, 1531, 2013, 3944, 3476, 1604, 2306, 1018, 4057, 3927, 3407, 1327, 1198, 681, 2711, 2638, 2346, 1180, 609, 2421,