This notebook documents processing Human genome (downloaded from ensembl.org as gzipped FASTA file) into Hugging Face datasets:


*   [Human_DNA_v0](https://huggingface.co/datasets/simecek/Human_DNA_v0): DNA splitted into 10kb pieces
*   [Human_DNA_v0_DNABert6tokenized](https://huggingface.co/datasets/simecek/Human_DNA_v0_DNABert6tokenized): DNA tokenized and ready for language model training (tensors of 512 tokens)



## 0) PIP installation & FASTA.GZ download

In [None]:
# Config:


In [1]:
!pip install transformers datasets Bio



In [2]:
# !wget http://ftp.ensembl.org/pub/release-106/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz

In [3]:
# !wget http://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz

# !result=$(wget -qO- http://ftp.ensembl.org/pub/release-106/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz)
# !echo "$result"

## 1) Preprocessing FASTA into pandas DataFrame

Chromosomes are cutted into 10 kb pieces. Pieces with >1% N frequency are filtered out.

In [4]:
import gzip
from Bio import SeqIO
from Bio.Seq import Seq
from tqdm.autonotebook import tqdm

import os

def _fastagz2dict(fasta_path, fasta_total=None, stop_id=None, region_name_transform=lambda x: x):
    # load gzipped fasta into dictionary
    fasta = {}
    print('20')
    print(os.getcwd())
    print(os.system("ls"))

    with gzip.open(fasta_path, "rt") as handle:
        print('30')
        for record in tqdm(SeqIO.parse(handle, "fasta"), total=fasta_total):
            fasta[region_name_transform(record.id)] = str(record.seq)
            
            print(record.id)
            if stop_id and (record.id == stop_id):
                # stop, do not read small contigs
                break
    return fasta

print('10')

dna_raw = _fastagz2dict("Mus_musculus.GRCm39.dna.toplevel.fa.gz", 22, "MT")
# use location /content/... if running on google colab
# dna_raw = _fastagz2dict("/content/Homo_sapiens.GRCh38.dna.toplevel.fa.gz", 24, "MT")

10
20
/home/jovyan/cDNA-pretraining
comet_key.txt
content
DNA_data.ipynb
download_dataset.sh
env_init.ipynb
Homo_sapiens.GRCh38.dna.toplevel.fa.gz
Human_DNA_small.ipynb
model
Model_tryout_cDNA.ipynb
Mus_musculus.GRCm39.dna.toplevel.fa.gz
README.md
Training_with_cDNA.ipynb
0
30


  from tqdm.autonotebook import tqdm


  0%|          | 0/22 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
X
Y
MT


Mouse:


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
X
Y
MT
JH584299.1
GL456233.2
JH584301.1
GL456211.1
GL456221.1
JH584297.1
JH584296.1
GL456354.1
JH584298.1
JH584300.1
GL456219.1
GL456210.1
JH584303.1
JH584302.1
GL456212.1
JH584304.1
GL456379.1
GL456366.1
GL456367.1
GL456239.1
GL456383.1
GL456385.1
GL456360.1
GL456378.1
MU069435.1
GL456389.1
GL456372.1
GL456370.1
GL456381.1
GL456387.1
GL456390.1
GL456394.1
GL456392.1
GL456382.1
GL456359.1
GL456396.1
GL456368.1
MU069434.1
JH584295.1

In [5]:
sum([len(x) for x in dna_raw.values()]) / 10**5

27234.31143

In [16]:
def kmers(s, k=6):
    return [s[i:i + k] for i in range(0, len(s), k) if i + k <= len(s)]

kmers("ACTGACTA", 3)

['ACT', 'GAC']

In [7]:
output = dict()

for chr in dna_raw:
    for i, chunk in enumerate(kmers(dna_raw[chr], 10_000)):
        key = chr + "_" + str(i)
        output[key] = chunk

len(output.keys())

272332

In [8]:
import pandas as pd

s  = pd.Series(output,index=output.keys())
sum(s.str.count("N") >= 100)

7493

In [9]:
df = pd.DataFrame(s[s.str.count("N") < 100])
df

Unnamed: 0,0
1_305,CAGTAAAAGAAACAAATTTTAGAGGAACTAAAATATTGCAATCTCC...
1_306,ACAGCAGAGGACTGCTGATTCTGGTTTTAGTCAGAGAAGATGTACC...
1_307,TATTTTGTGGTTACTTTGAGGAGAGTTGGAATTAGGTCTTCTTTGA...
1_308,TGTTTTGATTATTATGTGCCTGTTTGGCATTCTGTAAGCTTCTTGT...
1_309,TTTGCTGGTCCTTTGAGTTGAAAATCTTCATTCTCATCCACTCCTA...
...,...
Y_9081,TTTTATCATGTTTAGGTGTGGGCCTCGAATTCCTGATCTTTCCAAA...
Y_9082,GTGTGGGATTAAATTAGGAATCAGGGAATCTGTGCAGGTTTAAATT...
Y_9083,GACTGGGACAGAGACAGATGGACAGACCCTGAGCTGCACAGGTGTT...
Y_9084,GAGAGAGGCTCTGTCCCTGCACTTCTACAGGTACAACCTGGAGCGG...


## 2) From pandas DataFrame into HF Dataset (splitted into train/test and uploaded to HF Hub)

In [10]:
from datasets import Dataset

raw_dataset = Dataset.from_pandas(df).shuffle(seed=42).rename_column(original_column_name="0", new_column_name="Seq").remove_columns(['__index_level_0__'])
raw_dataset

Dataset({
    features: ['Seq'],
    num_rows: 264839
})

In [11]:
raw_dataset[0]

{'Seq': 'GCTGCATTATTTTTAAAGAAAACAATCAATATACAGGAAGTAGCTAAACTTTAAGTGGTTTTGTGTTCTAAAGACAGTCTCAGGGGGTGGGTATGGGGGACTTTTGGGATAGCATTGAAAATGTAAACGAGGAAAATACCTAATTAAAAAAAAAAAAAAGATAGTCTCGATGGGCAGTGGTGGTGCACGCCTTTAATCCCAGCACTTGGGAGGCAGAGGCAGGTGGATTTCTGAGTTTGAGGCCAGCCTGGTCTACAGAGTGAGTTCCAGGACAGGCAGGGCTACACAGAGAAACCCTGTCTCAACCCCCACTGCTCCCTGCTTTAAAAAAAAAAAAAAAAAGACAGGGTCTCAATATGTGGAGCTGGCTGGCCTCAATCTCAGAATCACTTGCCTCTGCTGAAATGAAAGCATGTGCCGTCCTCCCAGTTGAGGGCAGAACGTTCTCTCCACCCGCACGGTCTATCATGGGTCTCACAGCTATGCCAGCCCCGAGAGAGGATTCCCACCTCTCCACCATGCCCTTTGGCACCCATGTGGCTTTGTGCACACTGTGCATGTATGTTGAGAACTGAGAACCTCTGTGGAGTTGGCTCTACTTCCAACTTTACCTGGGTTCTAGGGATCAAATGCTGCTCACCACTTTGCACAGCAATGCCTTTACCTGCTAGCTTACTGCCAGCTTTAAAGGGGAAGGTAGGTGATAGATATAAAGAGCTAAACAGTCACCACGTGGCATGTTGTGGGGAACAGTTTATCGTAGATGTTTCATCATCCCTCCAAAAGCACTAACTGTGGTTTAAAGCATGATATAAAAACCCACCCTCTTCTCATCTTACAAAACTGAACTCTAGTCACCACCTTTCATCTCCTGTGTTCTTTCAATGAAGTACCAATTAAAGGCCCAATAATTAAGGACTGCACTGCCGAGGGAAAGCAGCACAGCAGCTGTCAGAACCACTGAGCATAGCGCCACTTGCTTCGCCCGGCT

In [12]:
splitted_datasets = raw_dataset.train_test_split(0.1, seed=42)
splitted_datasets

DatasetDict({
    train: Dataset({
        features: ['Seq'],
        num_rows: 238355
    })
    test: Dataset({
        features: ['Seq'],
        num_rows: 26484
    })
})

In [17]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
dataset_name = 'Mouse_DNA_v0'
user_name = 'davidcechak'

In [19]:
splitted_datasets.push_to_hub(dataset_name)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/5 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

## 3) Tokenization and chunking into 512 tokens tensors

In [20]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("armheb/DNA_bert_6")

In [23]:
from datasets import load_dataset

splitted_datasets = load_dataset(user_name + '/' + dataset_name)

Downloading:   0%|          | 0.00/680 [00:00<?, ?B/s]

Using custom data configuration davidcechak--Mouse_DNA_v0-be31eaf5d3dd6916


Downloading and preparing dataset None/None (download: 1.13 GiB, generated: 2.47 GiB, post-processed: Unknown size, total: 3.60 GiB) to /home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Mouse_DNA_v0-be31eaf5d3dd6916/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/122M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/davidcechak___parquet/davidcechak--Mouse_DNA_v0-be31eaf5d3dd6916/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
def tokenize_function(s, k=6):
  seq_split = " ".join(kmers(s['Seq'], k))
  return tokenizer(seq_split)

tokenize_function({'Seq':'ACCTGCTGGACGATCATA'})  

{'input_ids': [2, 675, 2000, 393, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [25]:
tokenized_datasets = splitted_datasets.map(tokenize_function, remove_columns='Seq', num_proc=8)
tokenized_datasets

            

#0:   0%|          | 0/29795 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/29795 [00:00<?, ?ex/s]

#2:   0%|          | 0/29795 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


 

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/29794 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


 

#4:   0%|          | 0/29794 [00:00<?, ?ex/s]

 

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#5:   0%|          | 0/29794 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#6:   0%|          | 0/29794 [00:00<?, ?ex/s]

#7:   0%|          | 0/29794 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


              

#1:   0%|          | 0/3311 [00:00<?, ?ex/s]

#0:   0%|          | 0/3311 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/3311 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


 

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#3:   0%|          | 0/3311 [00:00<?, ?ex/s]

#4:   0%|          | 0/3310 [00:00<?, ?ex/s]

#5:   0%|          | 0/3310 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


#7:   0%|          | 0/3310 [00:00<?, ?ex/s]

#6:   0%|          | 0/3310 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1668 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 238355
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26484
    })
})

In [26]:
tokenized_datasets['train'][0]['input_ids']

[2,
 1342,
 1173,
 1774,
 1352,
 2058,
 460,
 2190,
 2550,
 2933,
 1546,
 3498,
 1065,
 1094,
 298,
 3234,
 45,
 3235,
 1274,
 1349,
 204,
 1509,
 3738,
 3145,
 3701,
 1005,
 1413,
 740,
 3366,
 4003,
 1113,
 1379,
 1093,
 437,
 1233,
 1160,
 3962,
 1633,
 1868,
 2361,
 1522,
 1441,
 104,
 330,
 1385,
 2379,
 1389,
 75,
 2172,
 2415,
 1101,
 185,
 1096,
 677,
 3106,
 3097,
 1293,
 1111,
 2302,
 3162,
 76,
 1441,
 1337,
 1674,
 2084,
 2280,
 200,
 309,
 1149,
 1410,
 1943,
 1178,
 3209,
 342,
 3333,
 2167,
 502,
 1307,
 1630,
 3675,
 780,
 1527,
 518,
 349,
 2179,
 568,
 1197,
 1350,
 925,
 1386,
 2295,
 1683,
 2142,
 2233,
 605,
 1645,
 1617,
 3362,
 1085,
 2017,
 3108,
 2507,
 479,
 1556,
 1990,
 60,
 799,
 165,
 1900,
 2169,
 822,
 924,
 2234,
 29,
 417,
 1897,
 2657,
 3932,
 109,
 1956,
 3630,
 3260,
 1881,
 3558,
 347,
 1081,
 908,
 2262,
 426,
 1482,
 3899,
 804,
 3278,
 3908,
 3989,
 524,
 2723,
 1909,
 3893,
 347,
 2302,
 618,
 2539,
 825,
 422,
 483,
 1658,
 1098,
 394,
 3658,


In [27]:
from itertools import chain
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
# grabbed from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py

def group_texts(examples, max_length=512):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= max_length:
        total_length = (total_length // max_length) * max_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_length] for i in range(0, total_length, max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

chunked_datasets = tokenized_datasets.map(group_texts, batched=True, desc=f"Grouping texts in chunks of 512")
chunked_datasets

Grouping texts in chunks of 512:   0%|          | 0/239 [00:00<?, ?ba/s]

Grouping texts in chunks of 512:   0%|          | 0/27 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 776322
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 86258
    })
})

In [28]:
chunked_datasets.push_to_hub("Mouse_DNA_v0_DNABert6tokenized")

Pushing split train to the Hub.
Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]