In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tools.loc2seq.with_biopython import _fastagz2dict  # TODO: should be later updated to the name of the package

## Load genome into the memory

In [3]:
genome = _fastagz2dict("/home/petr/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz", fasta_total=24, stop_id="MT")

  0%|          | 0/24 [00:00<?, ?it/s]

In [10]:
list(genome.keys())

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 'X',
 'Y',
 'MT']

## Loading database

In [22]:
from Bio import SeqIO
import requests
from io import StringIO
from tqdm.auto import tqdm

In [5]:
link = "https://raw.githubusercontent.com/solovictor/CNNPromoterData/master/human_non_tata.fa"
data = requests.get(link).text

In [12]:
def update_tree(root, seq_str, seq_name, direction):
    position = root

    for c in seq_str:
        if c in position:
            position = position[c]
        else:
            position[c] = {}
            position = position[c]
    position['terminal'] = (seq_name, direction, len(seq_str))



In [30]:
tree = {}
Nseqs = 0

for seq in SeqIO.parse(StringIO(data), "fasta"):
    s = str(seq.seq)
    rev = str(seq.seq.reverse_complement())
    Nseqs += 1
   
    update_tree(tree, s, s, '+')
    update_tree(tree, rev, s, '-')
    
Nseqs

19811

In [31]:
results = {}
curr_positions = []
chrom = '1'

for i, c in tqdm(enumerate(genome['1']), total=len(genome['1'])):
    
    prev_positions = curr_positions + [tree]
    curr_positions = []
    
    for pos in prev_positions:
        if c in pos:
            pos = pos[c]
            curr_positions.append(pos)
            
            if 'terminal' in pos:
                results[pos['terminal'][0]] = ("chr"+chrom, i-pos['terminal'][2]+1, i+1, pos['terminal'][1])
    

  0%|          | 0/248956422 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [25]:
results[0]

['GCCGCCTCTTCCTGCCGCGCAGGCCGAGGGTCCCGACGGCGCCGCTCACCGCTCCGGGACTCAGCCTTTCTGGGCCCGGCCTGCGGTTCCCTCGGGGCCGGGGAGAGGGTGGAGCGCGGGAGGAGGGGCGCCGGGTGGGGACGCCCAGGCCCTTCGTCGGGGGAGGGCGCTCCACCCGGGCTGGAGTTGCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGAAGCGGGCTGGGAAGTCG',
 '+',
 251,
 925792]

In [29]:
len(results)

2074

In [28]:
genome['1'][(925792-250):925793]


'GCCGCCTCTTCCTGCCGCGCAGGCCGAGGGTCCCGACGGCGCCGCTCACCGCTCCGGGACTCAGCCTTTCTGGGCCCGGCCTGCGGTTCCCTCGGGGCCGGGGAGAGGGTGGAGCGCGGGAGGAGGGGCGCCGGGTGGGGACGCCCAGGCCCTTCGTCGGGGGAGGGCGCTCCACCCGGGCTGGAGTTGCAGAGCCCAGCAGATCCCTGCGGCGTTCGCGAGGGTGGGACGGGAAGCGGGCTGGGAAGTCG'

In [32]:
import re

r = re.compile(".*" + str(seq.seq) + ".*")

In [42]:
%%time
bool(r.match(genome['1']))

CPU times: user 1.18 s, sys: 0 ns, total: 1.18 s
Wall time: 1.18 s


False

In [47]:
%%time
str(seq.seq) in genome['1']

CPU times: user 804 ms, sys: 0 ns, total: 804 ms
Wall time: 802 ms


False

In [48]:
%%time
genome['1'].find(str(seq.seq))

CPU times: user 781 ms, sys: 0 ns, total: 781 ms
Wall time: 779 ms


-1

In [39]:
bool(r.match(genome['Y']))

True

In [12]:
!ls ~/.genomic_benchmarks/demo_mouse_enhancers/test/negative

0.txt	 112.txt  16.txt  29.txt  41.txt  54.txt  67.txt  7.txt   92.txt
100.txt  113.txt  17.txt  2.txt   42.txt  55.txt  68.txt  80.txt  93.txt
101.txt  114.txt  18.txt  30.txt  43.txt  56.txt  69.txt  81.txt  94.txt
102.txt  115.txt  19.txt  31.txt  44.txt  57.txt  6.txt   82.txt  95.txt
103.txt  116.txt  1.txt   32.txt  45.txt  58.txt  70.txt  83.txt  96.txt
104.txt  117.txt  20.txt  33.txt  46.txt  59.txt  71.txt  84.txt  97.txt
105.txt  118.txt  21.txt  34.txt  47.txt  5.txt   72.txt  85.txt  98.txt
106.txt  119.txt  22.txt  35.txt  48.txt  60.txt  73.txt  86.txt  99.txt
107.txt  11.txt   23.txt  36.txt  49.txt  61.txt  74.txt  87.txt  9.txt
108.txt  120.txt  24.txt  37.txt  4.txt   62.txt  75.txt  88.txt
109.txt  12.txt   25.txt  38.txt  50.txt  63.txt  76.txt  89.txt
10.txt	 13.txt   26.txt  39.txt  51.txt  64.txt  77.txt  8.txt
110.txt  14.txt   27.txt  3.txt   52.txt  65.txt  78.txt  90.txt
111.txt  15.txt   28.txt  40.txt  53.txt  66.txt  79.txt  91.txt


In [13]:
!cat ~/.genomic_benchmarks/demo_mouse_enhancers/test/negative/0.txt

TAGGTGAAATTTACACTTTTGACCTAAACTTCTGGGTCCACCCTATGGCTTTAAGATGGATGGATAGGGGATCAAGACCCTTAACTTGCATTGCATTTTCCTTCTTCTTGAACTCATATTNNNNNNNNNNNNNNNNNGCTCATGGTGGTGATGATTCTGTTGGCCTGGACGTTCTGATTTGTAACTGCTAGACAGAGATACCCTTAAAGAGACAGATAGATCCTATCACATTTTGTAACATAAANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGCTGCTAAGGCACATTGTAACAGTGATGAAAATGATTTACATGAATGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN