# Processing

In [1]:
import pandas as pd
from tqdm import tqdm_notebook, tqdm
from collections import defaultdict
import numpy as np
from sparse_vector.sparse_vector import SparseVector
import os
from joblib import load, dump, Parallel, delayed

# Antigen-class labels. The identifier's spelling is kept unchanged because it
# is a module-level name; it is not referenced in the cells shown here.
taget_antigen = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
# Chromosome names used as dictionary keys below: chr1..chr22, chrX, chrY, chrM.
chroms = [f'chr{label}' for label in [*range(1, 23), 'X', 'Y', 'M']]

In [2]:
def chrom_reader(chrom):
    """Load every stored piece of one chromosome and join them into one string.

    A directory entry of ``../data/hg19_dna/`` is used when its name contains
    ``"<chrom>_"``. Entries are visited in sorted (lexicographic) name order,
    each is read with joblib ``load``, and the loaded pieces are concatenated
    with ``''.join`` in that order.
    """
    marker = f"{chrom}_"
    pieces = []
    for file in sorted(os.listdir(f'../data/hg19_dna/')):
        if marker in file:
            pieces.append(load(f"../data/hg19_dna/{file}"))
    return ''.join(pieces)

# DNA sequence loading and processing

In [3]:
# Only the chromosome lengths are used by the later cells, so measure each
# sequence as soon as it is loaded and let it go immediately. This keeps a
# single chromosome in memory at a time instead of the full set of sequences.
lens_of_chroms = {chrom: len(chrom_reader(chrom)) for chrom in tqdm_notebook(chroms)}

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [4]:
# Raw feature files to convert, in descending (reverse-sorted) name order.
files = sorted(os.listdir('../data/hg19_features/raw/'), reverse=True)

def sparser(file, lens_of_chroms, chroms):
    """Convert one raw feature table into per-chromosome sparse vectors on disk.

    Reads ``../data/hg19_features/raw/<file>`` as a header-less, tab-separated
    table and groups its rows by column 0 (the chromosome name). Groups whose
    chromosome is not in ``chroms`` are skipped. For every other group a dense
    zero vector of length ``lens_of_chroms[chrom]`` is created, and each row
    raises positions ``row[1] .. row[2]`` (both ends included) to at least
    ``row[4]`` via an element-wise maximum. The dense vector is then wrapped in
    ``SparseVector``.

    The resulting ``{chrom: SparseVector}`` dict is written with joblib to
    ``../data/hg19_features/sparse/<file without its last 4 characters>.pkl``
    using compression level 3. Nothing is returned.
    """
    table = pd.read_csv(f'../data/hg19_features/raw/{file}', header=None, sep='\t')
    per_chrom = {}
    for chrom, rows in tqdm(table.groupby(0), desc=file):
        if chrom in chroms:
            track = np.zeros(lens_of_chroms[chrom])
            for row in rows.values:
                # NOTE(review): the end coordinate is treated as inclusive (+1).
                # Standard BED intervals are half-open, so confirm this is intended.
                span = slice(row[1], row[2] + 1)
                track[span] = np.maximum(track[span], row[4])
            per_chrom[chrom] = SparseVector(track)

    dump(per_chrom, f'../data/hg19_features/sparse/{file[:-4]}.pkl', compress=3)

In [5]:
# Resume support: drop inputs that already have a converted output. Names are
# compared with their last four characters removed (e.g. `.bed` vs `.pkl`).
done_files = {name[:-4] for name in os.listdir('../data/hg19_features/sparse/')}
files = [file for file in files if file[:-4] not in done_files]

In [None]:
# Convert every remaining `.bed` file, one `sparser` task per file, using all
# available cores through joblib's multiprocessing backend.
Parallel(n_jobs=-1, backend="multiprocessing")(
    delayed(sparser)(file, lens_of_chroms, chroms)
    for file in files
    if file.endswith('.bed')
)

# ZDNA

In [19]:
# Give every chromosome an entry up front, so chromosomes absent from the table
# still appear in the output (presumably SparseVector(n) is an empty vector of
# length n; confirm against the SparseVector implementation).
loc_dd = {chrm: SparseVector(lens_of_chroms[chrm]) for chrm in chroms}
df = pd.read_csv(f'../data/hg19_zdna/raw/ZDNA.bed', sep=',')
for chrom, sub_df in tqdm(df.groupby('chrom')):
    if chrom in chroms:
        # Dense mask for this chromosome: positions covered by any row become 1.
        vec = np.zeros(lens_of_chroms[chrom])
        for inter in sub_df.values:
            # Columns 1 and 2 are used as start and end; the end is inclusive.
            vec[inter[1]:inter[2] + 1] = np.maximum(vec[inter[1]:inter[2] + 1], 1)
        loc_dd[chrom] = SparseVector(vec)

# Persist with joblib at compression level 3.
dump(loc_dd, f'../data/hg19_zdna/sparse/ZDNA.pkl', compress=3)

100%|██████████| 24/24 [00:43<00:00,  1.82s/it]


['../data/hg19_zdna/sparse/ZDNA.pkl']

# ZHUNT

In [20]:
# Give every chromosome an entry up front, so chromosomes absent from the table
# still appear in the output (presumably SparseVector(n) is an empty vector of
# length n; confirm against the SparseVector implementation).
loc_dd = {chrm: SparseVector(lens_of_chroms[chrm]) for chrm in chroms}
df = pd.read_csv(f'../data/hg19_zdna/raw/zhunt.bed', sep=',')
for chrom, sub_df in tqdm(df.groupby('chrom')):
    if chrom in chroms:
        # Dense mask for this chromosome: positions covered by any row become 1.
        vec = np.zeros(lens_of_chroms[chrom])
        for inter in sub_df.values:
            # Columns 1 and 2 are used as start and end; the end is inclusive.
            vec[inter[1]:inter[2] + 1] = np.maximum(vec[inter[1]:inter[2] + 1], 1)
        loc_dd[chrom] = SparseVector(vec)

# Persist with joblib at compression level 3.
dump(loc_dd, f'../data/hg19_zdna/sparse/ZHUNT.pkl', compress=3)

100%|██████████| 24/24 [00:43<00:00,  1.82s/it]


['../data/hg19_zdna/sparse/ZHUNT.pkl']