In [1]:
import os, sys
import numpy as np
import pandas as pd
import pickle
import random
from collections import Counter
import pysam
import time
import tqdm
import glob
import torch
import json
from tokenizers import (ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer, BertWordPieceTokenizer)

In [1]:
# File locations used by the rest of this notebook.
input_txt_path = './pretraining_data/token_NC_000019.9.txt' # Token sequence file generated by `sample_chromosome_matrix.py`
output_txt_path = './pretraining_data/pretraining_data_ch19.txt' # Pretraining dataset written by this notebook (one sampled segment per line)

In [18]:
# Load the lookup tables from ./resource. Context managers make sure the file
# handles are closed, and the encoding is pinned to UTF-8 so the result does
# not depend on the platform's default locale.
with open('./resource/snp_vocab.json', 'r', encoding='utf-8') as vocab_file:
    nt_biallele_code = json.load(vocab_file)
with open('./resource/nucleotide_to_index.json', 'r', encoding='utf-8') as index_file:
    nt_to_index = json.load(index_file)

# Reverse lookup of `nt_to_index` (value -> key).
# NOTE(review): if two keys share a value, the later key wins — confirm values are unique.
index_to_nt = {v: k for k, v in nt_to_index.items()}

# Remove leading and trailing 'N' tokens

In [7]:
# Strip the run of 'N' tokens at both ends of the token sequence stored in
# `input_txt_path` and save the result next to it as `clean_<input file name>`.

# The rest of this cell refers to `txt_path`, which was never defined in the
# notebook (hidden kernel state); bind it explicitly to the configured input.
txt_path = input_txt_path

n_token_code = nt_biallele_code['N']
with open(txt_path, 'rb') as in_file:
    seq = in_file.read().decode('utf-8')

# Index of the first token that is not 'N' (-1 when every token is 'N').
s_idx = next((i for i in range(len(seq)) if seq[i] != n_token_code), -1)

# Index of the last token that is not 'N' (-1 when every token is 'N').
# The scan runs down to index 0 inclusive, so a sequence whose only non-'N'
# token sits at position 0 is not trimmed away entirely.
e_idx = next((i for i in range(len(seq) - 1, -1, -1) if seq[i] != n_token_code), -1)

# Trim sequence (empty when there is nothing but 'N' tokens)
trim_seq = seq[s_idx:e_idx+1] if s_idx != -1 else ''
print(f'{txt_path.split("/")[-1]} | len(seq) {len(seq)} | s_idx {s_idx} | e_idx {e_idx} | len(trim_seq) {len(trim_seq)}')

# Save to file
prefix_path = '/'.join(txt_path.split("/")[:-1])
out_path = f'{prefix_path}/clean_{txt_path.split("/")[-1]}'
with open(out_path, 'wb') as out_file:
    out_file.write(trim_seq.encode('utf8'))

token_NC_000019.9.txt | len(seq) 59128983 | s_idx 60000 | e_idx 59118982 | len(trim_seq) 59058983


# Generate sentences from Chromosome 19

In [10]:
# Cut the cleaned chromosome sequence into variable-length "sentences".
# The sequence is tiled `num_iterations` times, each pass starting at a
# different random offset and using random segment lengths, and every segment
# is written to `output_txt_path` on its own line.
num_iterations = 50

# Dedicated seeded generator so the generated dataset is reproducible.
rng = np.random.default_rng(0)

possible_start_idxs = list(range(4096))
possible_segment_lens = list(range(512, 4096))
# Converted once so the per-segment draw below does not rebuild an array each time.
segment_len_choices = np.asarray(possible_segment_lens)

# Location of the trimmed sequence, derived from `input_txt_path` with the
# same `<dir>/clean_<name>` rule used when the trimmed file was written
# (no dependency on variables left over from another cell).
input_dir = '/'.join(input_txt_path.split("/")[:-1])
input_name = input_txt_path.split("/")[-1]
clean_txt_path = f'{input_dir}/clean_{input_name}'

# Read the input before opening the output, so a missing input file does not
# leave behind a freshly truncated output file.
with open(clean_txt_path, 'rb') as clean_file:
    seq = clean_file.read().decode('utf-8')
ch19_len_clean = len(seq)  # was referenced but never defined

start_idxs = rng.choice(possible_start_idxs, num_iterations, replace=False)

with open(output_txt_path, 'wb') as out_file:
    for start_idx in start_idxs:
        s_idx = int(start_idx)
        while s_idx < ch19_len_clean:
            segment_len = int(rng.choice(segment_len_choices))
            # The final segment of a pass may be shorter than `segment_len`
            # because slicing stops at the end of the sequence.
            segment = seq[s_idx:s_idx+segment_len]

            out_file.write((segment + '\n').encode('utf8'))
            s_idx = s_idx + segment_len

# Build BPE tokenizer vocab

In [4]:
# Load the generated sentences, one per line. The file ends with a newline,
# so a plain split leaves an empty trailing entry; drop empty lines so no
# empty "sentence" ends up in the training data.
with open(output_txt_path, 'r', encoding='utf-8') as dataset_file:
    seqs = [line for line in dataset_file.read().split('\n') if line]

# Values of the SNP vocabulary, later passed as the tokenizer's initial alphabet.
unigram_tokens = list(nt_biallele_code.values())

In [5]:
# Train a SentencePiece-style BPE tokenizer on the generated sentences.

# Fixed seed so the shuffle (and therefore the training subset) is repeatable.
# Note: the shuffle is in place, so `seqs` stays shuffled for later cells.
random.seed(0)
random.shuffle(seqs)

char_bpe_tokenizer = SentencePieceBPETokenizer()
max_num_sentences = 3000000  # upper bound on the number of sentences used for training

char_bpe_tokenizer.train_from_iterator(seqs[:max_num_sentences], special_tokens=['[UNK]','[CLS]','[SEP]'], vocab_size=32000, initial_alphabet=unigram_tokens)

# Make sure the target directory exists before saving, so the cell also works
# on a fresh checkout.
tokenizer_dir = './tokenizers/'
os.makedirs(tokenizer_dir, exist_ok=True)
char_bpe_tokenizer.save_model(directory=tokenizer_dir, prefix='chr19_diploid')

['./tokenizers/chr19_diploid-vocab.json',
 './tokenizers/chr19_diploid-merges.txt']

# Test Tokenizer

In [6]:
# Reload the tokenizer from the files saved by the training step.
# `from_file` is the dedicated loader for vocab/merges files; passing file
# paths straight to the constructor is the older, deprecated route.
char_bpe_tokenizer = SentencePieceBPETokenizer.from_file(
    './tokenizers/chr19_diploid-vocab.json',
    './tokenizers/chr19_diploid-merges.txt',
    unk_token='[UNK]'
)

In [7]:
# Smoke test: show how the first three sentences are split into tokens.
for sample_idx in range(3):
    encoding = char_bpe_tokenizer.encode(seqs[sample_idx])
    print(encoding.tokens)

['▁', 'TAGAG', 'ACTT', 'AAGTGACC', 'ACCCC', 'AGGGCTG', 'CCG', 'TTGCTCAGG', 'TGTGTT', 'TCTGG', 'CATTCC', 'CAAGTTGG', 'TACCCTT', 'ACATGCAA', 'TATTTT', 'ACATT', 'AGAATG', 'CATG', 'CGTTTG', 'AAATAG', 'CGAA', 'TTGGTGAA', 'CATGTAA', 'ACCAGATG', 'CTAAG', 'AGGGCTTTG', 'AGAG', 'CT', 'AGCCTGGG', 'CACC', 'CAATT', 'ACA', '锕', 'AAGGGG', 'CATT', 'ATTTCC', 'ACAG', 'CAAAG', 'CATATT', 'TCAGGGTCC', 'ATACAG', 'CAA', 'ACAAGG', 'TCAAATCTG', 'AAAG', 'ATGGGG', 'TCTGCAA', 'ACCTCC', 'ATCTT', 'CTTTG', 'TAGTGATG', 'TTTTAATCC', 'CATTCC', 'TCTCCAA', 'TCCTAG', 'CTCAAGAA', 'TCTGTTAG', 'TAAGAG', 'CATGGGGG', 'AGAATG', 'ATGAGAAG', 'CCTGGTT', 'TAAAAAACAA', 'ACACATCAA', 'ATTCTCTG', 'CTACC', 'ACTT', 'ATGAGTT', 'CTGTGACTT', 'TAGTCC', 'TTTG', 'CTTGGAA', 'TCTTCC', 'AGAA', 'TCATACC', 'CTAAG', 'ATAAGG', 'ATTTAGG', 'CACAGGCAG', 'TTTTTT', 'GGGAGG', 'AGGG', 'AGAGAAG', 'ATCC', 'CAGGAAG', 'TACAAGG', 'AGGGAGTGG', 'GGGG', 'AGCAG', 'ATGTAGG', 'CGTG', 'TTTTG', 'AAGCAGG', 'TGATCC', 'CTG', 'TGGGAA', 'ATGGAG', 'TCTTATT', 'CCTG', 'CTGGGACC