# Data preparation

In [1]:
import pandas as pd
from tqdm import tqdm_notebook, tqdm
from collections import defaultdict
import numpy as np
from Sparse_vector.sparse_vector import SparseVector
import os
from joblib import load, dump, Parallel, delayed

taget_antigen = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
chroms = [f'chr3']

In [10]:
%%bash
mkdir ../data/mm9_dna
mkdir ../data/mm9_dna/raw
mkdir ../data/mm9_dna/sparse
cd ../data/mm9_dna/raw
wget http://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes/chr3.fa.gz
gzip -d chr3.fa.gz

--2021-08-27 01:11:43--  http://hgdownload.cse.ucsc.edu/goldenPath/mm9/chromosomes/chr3.fa.gz
Распознаётся hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)… 128.114.119.163
Подключение к hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.119.163|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 51311935 (49M) [application/x-gzip]
Сохранение в: «chr3.fa.gz»

     0K .......... .......... .......... .......... ..........  0% 88.5K 9m26s
    50K .......... .......... .......... .......... ..........  0%  188K 6m56s
   100K .......... .......... .......... .......... ..........  0%  176K 6m12s
   150K .......... .......... .......... .......... ..........  0%  194K 5m43s
   200K .......... .......... .......... .......... ..........  0% 6.33M 4m35s
   250K .......... .......... .......... .......... ..........  0%  191K 4m33s
   300K .......... .......... .......... .......... ..........  0%  191K 4m31s
   350K .......... .......... .......... ...

In [11]:
from Bio import SeqIO

for record in tqdm(SeqIO.parse("../data/mm9_dna/raw/chr3.fa", "fasta")):
    if record.id in chroms:
        dump(str(record.seq.upper()), f"../data/mm9_dna/sparse/{record.id}.pkl", 3)

1it [00:05,  5.30s/it]


# Sequences

In [2]:
dna = {chrom:load(f'../data/mm9_dna/sparse/{chrom}.pkl') for chrom in tqdm(chroms)}
lens_of_chroms = {chrom: len(dna[chrom]) for chrom in dna}

100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


In [3]:
files = sorted(os.listdir('../data/mm9_features/raw/'))[::-1]

def sparser(file, lens_of_chroms, chroms):
#     print(file)
    loc_dd = {chrom:SparseVector(lens_of_chroms[chrom]) for chrom in chroms}
    df = pd.read_csv(f'../data/mm9_features/raw/{file}', header=None, sep='\t')
    for chrom, sub_df in tqdm(df.groupby(0), desc=file):
        if chrom not in chroms:
            continue
        vec = np.zeros(lens_of_chroms[chrom])
        for inter in sub_df.values:
            vec[inter[1]:inter[2]+1] = np.maximum(vec[inter[1]:inter[2]+1], inter[4])
        loc_dd[chrom] = SparseVector(vec)

    dump(loc_dd, f'../data/mm9_features/sparse/{file[:-4]}.pkl', 3)  

In [None]:
Parallel(n_jobs = -1)(delayed(sparser)(file, lens_of_chroms, chroms) 
                                      for file in files if file.endswith('.bed'))

# ZDNA prepare

In [7]:
def processor(file):
    loc_dd = {chrm:SparseVector(lens_of_chroms[chrm]) for chrm in chroms}
    df = pd.read_csv(f'../data/mm9_zdna/raw/{file}', sep='\t', header=None)
    for chrom, sub_df in tqdm(df.groupby(0)):
        if chrom not in chroms:
            continue
        vec = np.zeros(lens_of_chroms[chrom])
        for inter in sub_df.values:
            vec[inter[1]:inter[2]+1] = np.maximum(vec[inter[1]:inter[2]+1], 1)
        loc_dd[chrom] = SparseVector(vec)

    dump(loc_dd, f'../data/mm9_zdna/sparse/{file[:-4]}.pkl', 3)

In [10]:
for file in os.listdir('../data/mm9_zdna/raw/'):
    if file.startswith("zdna_"):
        processor(file)

100%|██████████| 27/27 [00:01<00:00, 13.55it/s]


In [11]:
loc_dd = {chrm:SparseVector(lens_of_chroms[chrm]) for chrm in chroms}
df = pd.read_csv('../data/mm9_zdna/raw/blacklist_mm9.bed', sep='\t', header=None, names=range(3), index_col=False)
for chrom, sub_df in tqdm(df.groupby(0)):
    if chrom not in chroms:
        continue
    vec = np.zeros(lens_of_chroms[chrom])
    for inter in sub_df.values:
        vec[inter[1]:inter[2]+1] = np.maximum(vec[inter[1]:inter[2]+1], 1)
    loc_dd[chrom] = SparseVector(vec)
dump(loc_dd, f'../data/mm9_zdna/sparse/blacklist_mm9.pkl', 3)

100%|██████████| 24/24 [00:02<00:00, 11.41it/s]


['../data/mm9_zdna/sparse/blacklist_mm9.pkl']

In [12]:
%%bash
mkdir ../data/mm9_dna
mkdir ../data/mm9_dna/raw
mkdir ../data/mm9_dna/sparse