# Download all files

In [None]:
!wget http://dbarchive.biosciencedbc.jp/kyushu-u/metadata/experimentList.tab -O ../data/ChipSeq/experimentList.tab

In [None]:
!wget http://dbarchive.biosciencedbc.jp/kyushu-u/metadata/fileList.tab -O ../data/ChipSeq/fileList.tab

In [None]:
!wget http://dbarchive.biosciencedbc.jp/kyushu-u/hg19/allPeaks_light/allPeaks_light.hg19.05.bed.gz -O ../data/ChipSeq/allPeaks_light.hg19.05.bed.gz

# Processing

In [4]:
import pandas as pd
from tqdm import tqdm_notebook
from collections import defaultdict
import numpy as np
from sparse_vector.sparse_vector import SparseVector
import os
from joblib import load, dump

taget_antigen = ['DNase-seq', 'Histone', 'RNA polymerase', 'TFs and others']
chroms = [f'chr{i}' for i in list(range(1, 23)) + ['X', 'Y','M']]

In [5]:
experiments = []
for line in open('../data/ChipSeq/experimentList.tab'):
    experiments.append(line.split('\t')[:9])
    
experiments = pd.DataFrame(experiments, columns = ["Experiments ID", 
                                                     "Genome assembly",
                                                     "Antigen class",
                                                     "Antigen",
                                                     "Cell type class",
                                                     "Cell type",
                                                     "Cell type description",
                                                     "Logs",
                                                     "Title"])

hg19_exp = experiments[experiments["Genome assembly"] == 'hg19']
hg19_exp = hg19_exp[hg19_exp["Antigen class"].isin(taget_antigen)]

hg19_grb = hg19_exp.groupby('Antigen class')

In [6]:
exp = {}
for key, df in hg19_exp.groupby(["Antigen class", 'Antigen']):
    for ID in df["Experiments ID"].values:
        exp[ID] = key

In [7]:
all_feutures = list(hg19_exp.groupby(["Antigen class", 'Antigen']).groups.keys())

first_part = set(all_feutures[:len(all_feutures) // 2])
secon_part = set(all_feutures[len(all_feutures) // 2:])

first_part_exp = {i:exp[i] for i in exp if exp[i] in first_part}
secon_part_exp = {i:exp[i] for i in exp if exp[i] in secon_part}

In [8]:
open_files = {}
for file in first_part:
    open_files[file] = open(f"../data/hg19_features/raw/{'_'.join(file)}.bed", 'x')


FileNotFoundError: [Errno 2] No such file or directory: '../data/hg19_features/raw/TFs and others_CLOCK.bed'

In [6]:
for line in tqdm_notebook(open('../data/ChipSeq/allPeaks_light.hg19.05.bed')):
    exp_name = line.split('\t')[3]
    if exp_name in first_part_exp:
        open_files[first_part_exp[exp_name]].write(line)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
for file in open_files:
    open_files[file].close()

In [8]:
open_files = {}
for file in secon_part:
    open_files[file] = open(f"../data/hg19_features/raw/{'_'.join(file)}.bed", 'x')


In [9]:
for line in tqdm_notebook(open('../data/ChipSeq/allPeaks_light.hg19.05.bed')):
    exp_name = line.split('\t')[3]
    if exp_name in secon_part_exp:
        open_files[secon_part_exp[exp_name]].write(line)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
for file in open_files:
    open_files[file].close()

# DNA sequence downloading and processing

In [2]:
def chrom_reader(chrom):
    return ''.join([i[:-1] for i in list(open(f'../data/hg19_dna/{chrom}.fa'))[1:]])

dna = {chrom:chrom_reader(chrom) for chrom in tqdm_notebook(chroms)}
lens_of_chroms = {chrom: len(dna[chrom]) for chrom in dna}
del dna

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




# Creating sparse vector

In [None]:
for file in tqdm_notebook(os.listdir('../data/hg19_features/raw/')):
    df = pd.read_csv(f'../data/hg19_features/raw/{file}', header=None, sep='\t')
    loc_dd = {}
    for chrom, sub_df in df.groupby(0):
        if chrom in chroms:
            vec = np.zeros(lens_of_chroms[chrom], dtype=np.int16)
            for line in sub_df.values:
                 vec[line[1]:line[2]] = np.maximum(vec[line[1]:line[2]], line[4])
            loc_dd[chrom] = SparseVector(vec)
    dump(loc_dd, f'../data/hg19_features/sparse/{file}.pkl')