In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from genomic_benchmarks.loc2seq.with_biopython import _fastagz2dict
from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm, trange
from pathlib import Path
import yaml
import tarfile

np.random.seed(42)

  from tqdm.autonotebook import tqdm


## Load genomic references

In [3]:
human = _fastagz2dict(Path.home() / ".genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
                      24, 'MT')
human.keys()

  0%|          | 0/24 [00:00<?, ?it/s]

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'])

In [4]:
worm = _fastagz2dict(Path.home() / ".genomic_benchmarks/fasta/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz", 6)
worm.keys()

  0%|          | 0/6 [00:00<?, ?it/s]

dict_keys(['I', 'II', 'III', 'IV', 'V', 'X', 'MtDNA'])

In [5]:
human_chr_lengths = pd.Series({chr: len(human[chr]) for chr in human})
worm_chr_lengths = pd.Series({chr: len(worm[chr]) for chr in worm})
human_chr_lengths, worm_chr_lengths

(1     248956422
 2     242193529
 3     198295559
 4     190214555
 5     181538259
 6     170805979
 7     159345973
 8     145138636
 9     138394717
 10    133797422
 11    135086622
 12    133275309
 13    114364328
 14    107043718
 15    101991189
 16     90338345
 17     83257441
 18     80373285
 19     58617616
 20     64444167
 21     46709983
 22     50818468
 X     156040895
 Y      57227415
 MT        16569
 dtype: int64,
 I        15072434
 II       15279421
 III      13783801
 IV       17493829
 V        20924180
 X        17718942
 MtDNA       13794
 dtype: int64)

## Utils for random generation

In [6]:
def get_random_chr(chr_lengths: pd.Series):
    chr_probs = chr_lengths / chr_lengths.sum()
    chrs = chr_lengths.index.to_list()
    return chrs[np.argwhere(np.random.multinomial(1, chr_probs))[0][0]]

def get_random_int(int_len, chr_lengths: pd.Series):
    c = get_random_chr(chr_lengths)
    c_len = chr_lengths[c]
    pos = np.random.randint(c_len)-int_len+1
    strand = ['+', '-'][np.random.randint(2)]
    return c, pos, pos+int_len, strand

In [7]:
get_random_int(200, human_chr_lengths)

('2', 191099811, 191100011, '-')

## Data generation

In [8]:
human_df = pd.DataFrame.from_records([get_random_int(200, human_chr_lengths) for i in trange(50_000)], 
                                     columns = ["region", "start", "end", "strand"])
human_df["region"] = "chr" + human_df["region"]
human_df.index.name = "id"
human_df.head()

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,chr9,85652772,85652972,-
1,chr21,3385158,3385358,-
2,chr19,36601495,36601695,-
3,chr4,137584141,137584341,-
4,chr4,183378100,183378300,+


In [9]:
worm_df = pd.DataFrame.from_records([get_random_int(200, worm_chr_lengths) for i in trange(50_000)],
                                    columns = ["region", "start", "end", "strand"])
worm_df.index.name = "id"
worm_df.head()

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,X,1014719,1014919,+
1,V,5305780,5305980,+
2,II,5680456,5680656,+
3,V,2118470,2118670,+
4,V,9614692,9614892,-


## Train/test split

In [10]:
train_human, test_human = train_test_split(human_df, shuffle=True, random_state=42)
train_human.shape, test_human.shape

((37500, 4), (12500, 4))

In [11]:
train_worm, test_worm = train_test_split(worm_df, shuffle=True, random_state=42)
train_worm.shape, test_worm.shape

((37500, 4), (12500, 4))

## YAML file

In [12]:
BASE_FILE_PATH = Path("../../datasets/demo_human_or_worm/")

# copied from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    for child in pth.iterdir():
        if child.is_file():
            child.unlink()
        else:
            rm_tree(child)
    pth.rmdir()

if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)
    
BASE_FILE_PATH.mkdir()
(BASE_FILE_PATH / 'train').mkdir()
(BASE_FILE_PATH / 'test').mkdir()

In [13]:
with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    desc = {
        'version': 0,
        'classes': {
            'human': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
                'extra_processing': 'ENSEMBL_HUMAN_GENOME' 
            },    
            'worm': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-104/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz'
            }
        }
    }
    
    yaml.dump(desc, fw)

desc

{'version': 0,
 'classes': {'human': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'worm': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-104/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz'}}}

## CSV files

In [14]:
train_human.to_csv(BASE_FILE_PATH / 'train' / 'human.csv.gz', index=True, compression='gzip')
train_worm.to_csv(BASE_FILE_PATH / 'train' / 'worm.csv.gz', index=True, compression='gzip')
test_human.to_csv(BASE_FILE_PATH / 'test' / 'human.csv.gz', index=True, compression='gzip')
test_worm.to_csv(BASE_FILE_PATH / 'test' / 'worm.csv.gz', index=True, compression='gzip')

## Test that it can be downloaded

In [16]:
from genomic_benchmarks.loc2seq import download_dataset

download_dataset("demo_human_or_worm")

Reference /home/petr/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.
Reference /home/petr/.genomic_benchmarks/fasta/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz already exists. Skipping.




  0%|          | 0/24 [00:00<?, ?it/s]

0it [00:00, ?it/s]

PosixPath('/home/petr/.genomic_benchmarks/demo_human_or_worm')

In [18]:
from genomic_benchmarks.data_check import info

info("demo_human_or_worm", 0)

Dataset `demo_human_or_worm` has 2 classes: human, worm.

All lenghts of genomic intervals equals 200.

Totally 100000 sequences have been found, 75000 for training and 25000 for testing.


Unnamed: 0,train,test
human,37500,12500
worm,37500,12500
