In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to them take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from genomic_benchmarks.loc2seq.with_biopython import _fastagz2dict
from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import yaml

  from tqdm.autonotebook import tqdm


## Load genomic reference and download data from GitHub

In [3]:
# Load the gzipped Ensembl GRCh38 FASTA into a dict keyed by sequence name.
# NOTE(review): 24 and 'MT' are passed positionally; presumably the expected
# number of sequences (the progress bar total is 24) and the name of the last
# one to read (the keys end at 'MT') -- confirm against _fastagz2dict.
genome = _fastagz2dict("~/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
                      24, 'MT')
# Display the loaded sequence names as the cell output.
genome.keys()

  0%|          | 0/24 [00:00<?, ?it/s]

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'])

In [4]:
!wget https://raw.githubusercontent.com/solovictor/CNNPromoterData/master/human_non_tata.fa
!wget https://raw.githubusercontent.com/solovictor/CNNPromoterData/master/human_nonprom_big.fa

--2021-10-14 23:24:11--  https://raw.githubusercontent.com/solovictor/CNNPromoterData/master/human_non_tata.fa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6339520 (6.0M) [text/plain]
Saving to: ‘human_non_tata.fa’


2021-10-14 23:24:12 (19.3 MB/s) - ‘human_non_tata.fa’ saved [6339520/6339520]

--2021-10-14 23:24:12--  https://raw.githubusercontent.com/solovictor/CNNPromoterData/master/human_nonprom_big.fa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8790727 (8.4M) [text/plain]
Saving to: ‘human_nonprom_big.fa’


2021-10-14 

## Get promoters

In [5]:
# slow! Looks up every sequence of the promoter FASTA in the reference genome
# (one progress bar per chromosome in the output below). The result maps each
# sequence id to a (region, start, end, strand) tuple; sequences that are not
# found in the reference are absent from it (19811 read vs. 19657 found).
promoters = fasta2loc("./human_non_tata.fa", genome)

19811 sequences read and parsed.


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/248956422 [00:00<?, ?it/s]

  0%|          | 0/242193529 [00:00<?, ?it/s]

  0%|          | 0/198295559 [00:00<?, ?it/s]

  0%|          | 0/190214555 [00:00<?, ?it/s]

  0%|          | 0/181538259 [00:00<?, ?it/s]

  0%|          | 0/170805979 [00:00<?, ?it/s]

  0%|          | 0/159345973 [00:00<?, ?it/s]

  0%|          | 0/145138636 [00:00<?, ?it/s]

  0%|          | 0/138394717 [00:00<?, ?it/s]

  0%|          | 0/133797422 [00:00<?, ?it/s]

  0%|          | 0/135086622 [00:00<?, ?it/s]

  0%|          | 0/133275309 [00:00<?, ?it/s]

  0%|          | 0/114364328 [00:00<?, ?it/s]

  0%|          | 0/107043718 [00:00<?, ?it/s]

  0%|          | 0/101991189 [00:00<?, ?it/s]

  0%|          | 0/90338345 [00:00<?, ?it/s]

  0%|          | 0/83257441 [00:00<?, ?it/s]

  0%|          | 0/80373285 [00:00<?, ?it/s]

  0%|          | 0/58617616 [00:00<?, ?it/s]

  0%|          | 0/64444167 [00:00<?, ?it/s]

  0%|          | 0/46709983 [00:00<?, ?it/s]

  0%|          | 0/50818468 [00:00<?, ?it/s]

  0%|          | 0/156040895 [00:00<?, ?it/s]

  0%|          | 0/57227415 [00:00<?, ?it/s]

  0%|          | 0/16569 [00:00<?, ?it/s]

19657 sequences found in the reference.


### A few checks

In [6]:
# Number of promoter sequences that were located in the reference.
len(promoters)

19657

In [7]:
# Spot-check one entry: each value is a (region, start, end, strand) tuple.
promoters['FP000001']

('1', 925542, 925793, '+')

In [8]:
# Slice the reference at the coordinates reported for FP000001 (plus strand)
# to inspect the sequence that was matched there.
genome['1'][925542:925793]

'GCCGCCTCTTCCTGCCGCGCAGGCCGAGGGTCCCGACGGCGCCGCTCACCGCTCCGGGACTCAGCCTTTCTGGGCCCGGCCTGCGGTTCCCTCGGGGCCGGGGAGAGGGTGGAGCGCGGGAGGAGGGGCGCCGGGTGGGGACGCCCAGGCCCTTCGTCGGGGGAGGGCGCTCCACCCGGGCTGGAGTTGCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGAAGCGGGCTGGGAAGTCG'

In [9]:
# A minus-strand entry, used below to check the reverse-complement handling.
promoters['FP000003']

('1', 959200, 959451, '-')

In [10]:
from Bio.Seq import Seq

def _rev(seq, strand):
    # reverse complement
    if strand == '-':
        return str(Seq(seq).reverse_complement())
    else:
        return seq

In [11]:
# Same check for FP000003: slice its coordinates and reverse-complement the
# result, because this entry lies on the '-' strand.
_rev(genome['1'][959200:959451], "-")

'GCATCTGGGCCCCACCGGGGCTGCCCGCACCGAGCACGCGAACGCGCCCTCCCGCCCTGAGGCCGCCGGCGTTGCGGTCGGAGAACCATAGAGCCACTCGGCTGGGCGTGGCGCGGCGGGGCGGGGAAAGGGGCGGGGCCTGGGCGGCGGAAGTGCGCAGCCGCGCGGCATTCTGGGGCCGGAAGTGGGGTGCACGCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGTAAGCCGCGGGTC'

In [12]:
# Tabulate the promoter locations, one row per sequence id, with the ids as
# an index named 'id'.
promoters_df = (
    pd.DataFrame
    .from_dict(promoters, orient='index', columns=['region', 'start', 'end', 'strand'])
    .rename_axis('id')
)

# Persist with the index, so 'id' becomes the first CSV column.
promoters_df.to_csv("positive.csv")

promoters_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FP000001,1,925542,925793,+
FP000002,1,939072,939323,+
FP000003,1,959200,959451,-
FP000004,1,960383,960634,+
FP000005,1,966281,966532,+


## Get non-promoters

In [13]:
# slow! Same lookup for the non-promoter FASTA. With use_seq_ids=False the
# result is keyed by the sequence itself instead of a FASTA id (see the index
# of the frame built in the next cell).
nonpromoters = fasta2loc("./human_nonprom_big.fa", genome, use_seq_ids=False)

27731 sequences read and parsed.


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/248956422 [00:00<?, ?it/s]

  0%|          | 0/242193529 [00:00<?, ?it/s]

  0%|          | 0/198295559 [00:00<?, ?it/s]

  0%|          | 0/190214555 [00:00<?, ?it/s]

  0%|          | 0/181538259 [00:00<?, ?it/s]

  0%|          | 0/170805979 [00:00<?, ?it/s]

  0%|          | 0/159345973 [00:00<?, ?it/s]

  0%|          | 0/145138636 [00:00<?, ?it/s]

  0%|          | 0/138394717 [00:00<?, ?it/s]

  0%|          | 0/133797422 [00:00<?, ?it/s]

  0%|          | 0/135086622 [00:00<?, ?it/s]

  0%|          | 0/133275309 [00:00<?, ?it/s]

  0%|          | 0/114364328 [00:00<?, ?it/s]

  0%|          | 0/107043718 [00:00<?, ?it/s]

  0%|          | 0/101991189 [00:00<?, ?it/s]

  0%|          | 0/90338345 [00:00<?, ?it/s]

  0%|          | 0/83257441 [00:00<?, ?it/s]

  0%|          | 0/80373285 [00:00<?, ?it/s]

  0%|          | 0/58617616 [00:00<?, ?it/s]

  0%|          | 0/64444167 [00:00<?, ?it/s]

  0%|          | 0/46709983 [00:00<?, ?it/s]

  0%|          | 0/50818468 [00:00<?, ?it/s]

  0%|          | 0/156040895 [00:00<?, ?it/s]

  0%|          | 0/57227415 [00:00<?, ?it/s]

  0%|          | 0/16569 [00:00<?, ?it/s]

16474 sequences found in the reference.


In [14]:
# Tabulate the non-promoter locations; the index (named 'id') holds the
# sequences themselves at this point.
nonpromoters_df = (
    pd.DataFrame
    .from_dict(nonpromoters, orient='index', columns=['region', 'start', 'end', 'strand'])
    .rename_axis('id')
)

nonpromoters_df.head()

Unnamed: 0_level_0,region,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AGAGCAGAAGACCGAAAGGTGAGTCGGCCTGCGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCG,1,67685520,67685771,+
TCGGCCTGCGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCT,1,67685543,67685794,+
CGGACTCTTCCGGCCCGAACTTCTCTTACCTACCCCGCGCTCCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCT,1,67685551,67685802,+
CCCCGGTGCAGCCGGGCTGTGGAAGGCTTGCAGGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCTGTGGCAGGGGCACTCTCGGGACTTCTCACGGGACGCCCGGT,1,67685592,67685843,+
GGGGAGGAAGCTAAAAAGTTTGCACAGGGCAACTCCCGCCCTTGCTCCCTCGGGACTCTCCGTGGAGCTCCCACGGACTGAAAGAGCGTGCCCCCCAACCCGAACGAGCCCCGCCGGGGCCTTTGCAAAGGGCAGCAGTGGCCGTCGCTGCCCGTGCGGCTCCCGTGGCTGGCAGCCTGTGGCAGGGGCACTCTCGGGACTTCTCACGGGACGCCCGGTCCTTGGGCGTGCAGGGGTCATGGGGGGTGACG,1,67685624,67685875,+


In [15]:
# check one sequence: the reference slice at the coordinates of row 3
# (chromosome 1, plus strand) must equal the sequence stored in the index
genome['1'][67685592:67685843] == nonpromoters_df.index[3]

True

In [16]:
# Replace the sequence-valued index with a compact integer 'id' column
# (0..n-1) placed first, matching the column layout of the promoter table.
#
# Written without in-place mutation so the cell is safe to re-run: the
# previous `reset_index(inplace=True)` moved the index into a column on every
# execution, so a second run added a spurious 'index' column that would then
# be written to negative.csv.
nonpromoters_df = (
    nonpromoters_df
    .reset_index(drop=True)                 # discard the sequence index
    .drop(columns='id', errors='ignore')    # only present when re-running
)
nonpromoters_df.insert(0, 'id', list(range(len(nonpromoters_df))))
nonpromoters_df.head()

Unnamed: 0,id,region,start,end,strand
0,0,1,67685520,67685771,+
1,1,1,67685543,67685794,+
2,2,1,67685551,67685802,+
3,3,1,67685592,67685843,+
4,4,1,67685624,67685875,+


In [17]:
# Persist the non-promoter table; 'id' is already a regular column here, so
# the index is not written.
nonpromoters_df.to_csv("negative.csv", index=False)

# Number of non-promoter sequences that were located in the reference.
len(nonpromoters)

16474

In [18]:
# Reload both tables from disk so the promoter ids come back as an ordinary
# 'id' column (positive.csv was written with its index), giving both frames
# the same five-column layout.
#
# 'region' is read as str explicitly: chromosome names mix digits with
# X/Y/MT, and a column that happened to parse as integers would make the
# "chr" + ... concatenation below raise a TypeError.
promoters_df = pd.read_csv("positive.csv", dtype={'region': str})
nonpromoters_df = pd.read_csv("negative.csv", dtype={'region': str})

# Prefix the region names with "chr" (e.g. '1' -> 'chr1').
promoters_df['region'] = "chr" + promoters_df['region']
nonpromoters_df['region'] = "chr" + nonpromoters_df['region']

## Train/test split

In [19]:
# Shuffle and split the promoters with a fixed seed for reproducibility.
# No test_size is given, so scikit-learn's default split applies (the shapes
# below show roughly 75% train / 25% test).
train_promoters, test_promoters = train_test_split(promoters_df, shuffle=True, random_state=42)
train_promoters.shape, test_promoters.shape

((14742, 5), (4915, 5))

In [20]:
# Same seeded shuffle-and-split for the non-promoters, again with the default
# test size.
train_nonpromoters, test_nonpromoters = train_test_split(nonpromoters_df, shuffle=True, random_state=42)
train_nonpromoters.shape, test_nonpromoters.shape

((12355, 5), (4119, 5))

## YAML file

In [23]:
# Output directory of the generated dataset (metadata.yaml plus train/ and
# test/ subfolders); relative, so it resolves against the notebook's working
# directory.
BASE_FILE_PATH = Path("../../datasets/human_nontata_promoters/")

# adapted from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    """Recursively delete the directory `pth` and everything inside it.

    Symbolic links found inside the tree are removed as links and never
    followed: descending into a linked directory would delete files that live
    outside `pth` (and then fail on `rmdir` of the link), and a dangling link
    cannot be iterated at all.

    Raises whatever the underlying filesystem calls raise, e.g.
    FileNotFoundError if `pth` does not exist.
    """
    for child in pth.iterdir():
        if child.is_symlink() or child.is_file():
            child.unlink()
        else:
            rm_tree(child)
    # The directory is empty at this point, so it can be removed itself.
    pth.rmdir()

# Start from a clean slate: wipe any previous export, then recreate the
# dataset folder with its train/ and test/ subfolders.
if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)

BASE_FILE_PATH.mkdir()
for split_name in ('train', 'test'):
    (BASE_FILE_PATH / split_name).mkdir()

In [24]:
# Dataset descriptor: both classes are described by the same reference genome
# entry. A fresh dict is built per class on purpose -- sharing one dict object
# between the two keys would make yaml.dump emit anchors/aliases.
reference_url = 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz'
desc = {
    'version': 0,
    'classes': {
        class_name: {
            'type': 'fa.gz',
            'url': reference_url,
            'extra_processing': 'ENSEMBL_HUMAN_GENOME',
        }
        for class_name in ('positive', 'negative')
    },
}

# Write the descriptor next to the train/ and test/ folders.
with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    yaml.dump(desc, fw)

desc

{'version': 0,
 'classes': {'positive': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'negative': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'}}}

## CSV files

In [25]:
# Write the four splits as gzip-compressed CSVs (without the pandas index) to
# <BASE_FILE_PATH>/<subset>/<label>.csv.gz.
split_tables = [
    ('train', 'positive', train_promoters),
    ('train', 'negative', train_nonpromoters),
    ('test', 'positive', test_promoters),
    ('test', 'negative', test_nonpromoters),
]
for subset, label, table in split_tables:
    table.to_csv(BASE_FILE_PATH / subset / (label + '.csv.gz'), index=False, compression='gzip')

## Cleaning

In [26]:
# Remove the downloaded FASTA files and the intermediate CSVs from the working
# directory; the final outputs were written under BASE_FILE_PATH by the cells
# above.
!rm human_non_tata.fa human_nonprom_big.fa
!rm positive.csv negative.csv