Skip to content

Commit

Permalink
Ability to change the resolution of the filtering dependent on the av…
Browse files Browse the repository at this point in the history
…erage size of the bed file. This will require changes to the API on the REST side for accessing this file, but has drastically reduced its size and is less than the original file. Issue was a problem when storing large sequence features at the single base pair level.
  • Loading branch information
markmcdowall committed May 16, 2017
1 parent 4dcbc2a commit 4f42f19
Showing 1 changed file with 39 additions and 17 deletions.
56 changes: 39 additions & 17 deletions tool/bed_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class bedIndexerTool(Tool):
Tool for running indexers over a BED file for use in the RESTful API
"""

self.feature_break_length = 50

def __init__(self):
"""
Init function
Expand Down Expand Up @@ -127,6 +125,8 @@ def bed_feature_length(self, file_bed):
long_feature_count += 1
long_feature_length += l

total_feature_length = small_feature_length + long_feature_length
total_feature_count = small_feature_count + long_feature_count
return total_feature_length / total_feature_count

@task(file_sorted_bed=FILE_IN, file_chrom=FILE_IN, file_bb=FILE_OUT, bed_type=IN)
Expand Down Expand Up @@ -218,13 +218,17 @@ def bed2hdf5(self, file_id, assembly, feature_length, file_sorted_bed, file_hdf5
grp = f[str(assembly)]
meta = f['meta']

dset = grp['data']
dset1 = grp['data1']
dset1k = grp['data1k']
fset = grp['files']
cset = grp['chromosomes']
file_idx_1 = [fs for fs in fset[0] if fs != '']
file_idx_1k = [fs for fs in fset[1] if fs != '']
if file_id not in file_idx:
file_idx.append(file_id)
if file_id not in file_idx_1 and file_id not in file_idx_1k:
if feature_length == 1000:
file_idx_1k.append(file_id)
else:
file_idx_1.append(file_id)
dset1.resize((dset1.shape[0], dset1.shape[1]+1, MAX_CHROMOSOME_SIZE))
dset1k.resize((dset1k.shape[0], dset1k.shape[1]+1, MAX_CHROMOSOME_SIZE/1000))
chrom_idx = [c for c in cset if c != '']
Expand All @@ -239,11 +243,21 @@ def bed2hdf5(self, file_id, assembly, feature_length, file_sorted_bed, file_hdf5
fset = grp.create_dataset('files', (2, MAX_FILES), dtype=dtf)
cset = grp.create_dataset('chromosomes', (MAX_CHROMOSOMES,), dtype=dtc)

file_idx = [file_id]
file_idx_1 = []
file_idx_1k = []
chrom_idx = []

dset1 = grp.create_dataset('data', (0, 1, MAX_CHROMOSOME_SIZE), maxshape=(MAX_CHROMOSOMES, MAX_FILES, MAX_CHROMOSOME_SIZE), dtype='int8', chunks=True, compression="gzip")
dset1k = grp.create_dataset('data', (0, 1, MAX_CHROMOSOME_SIZE/1000), maxshape=(MAX_CHROMOSOMES, MAX_FILES, MAX_CHROMOSOME_SIZE/1000), dtype='int8', chunks=True, compression="gzip")
dset1 = grp.create_dataset('data1', (0, 1, MAX_CHROMOSOME_SIZE),
maxshape=(MAX_CHROMOSOMES, MAX_FILES, MAX_CHROMOSOME_SIZE),
dtype='str', chunks=True, compression="gzip")
dset1k = grp.create_dataset('data1k', (0, 1, MAX_CHROMOSOME_SIZE/1000),
maxshape=(MAX_CHROMOSOMES, MAX_FILES, MAX_CHROMOSOME_SIZE/1000),
dtype='str', chunks=True, compression="gzip")

if feature_length == 1000:
file_idx_1k.append(file_id)
else:
file_idx_1.append(file_id)

# Save the list of files
fset[0, 0:len(file_idx_1)] = file_idx_1
Expand All @@ -252,8 +266,10 @@ def bed2hdf5(self, file_id, assembly, feature_length, file_sorted_bed, file_hdf5
file_chrom_count = 0

if feature_length == 1000:
# dnp = np.full([MAX_CHROMOSOME_SIZE/1000], None, dtype='str')
dnp = np.zeros([MAX_CHROMOSOME_SIZE/1000], dtype='int8')
else:
# dnp = np.full([MAX_CHROMOSOME_SIZE], None, dtype='str')
dnp = np.zeros([MAX_CHROMOSOME_SIZE], dtype='int8')

previous_chrom = ''
Expand All @@ -274,33 +290,39 @@ def bed2hdf5(self, file_id, assembly, feature_length, file_sorted_bed, file_hdf5
loaded = False

if c != previous_chrom and previous_chrom != '':
#if c != 'chr2':
# loaded = True
# continue
file_chrom_count += 1
if previous_chrom not in chrom_idx:
chrom_idx.append(previous_chrom)
cset[0:len(chrom_idx)] = chrom_idx
dset1.resize((dset1.shape[0]+1, dset.shape[1], MAX_CHROMOSOME_SIZE))
dset1k.resize((dset1l.shape[0]+1, dset.shape[1], MAX_CHROMOSOME_SIZE/1000))
dset1.resize((dset1.shape[0]+1, dset1.shape[1], MAX_CHROMOSOME_SIZE))
dset1k.resize((dset1k.shape[0]+1, dset1k.shape[1], MAX_CHROMOSOME_SIZE/1000))

dset[chrom_idx.index(previous_chrom), file_idx.index(file_id), :] = dnp
loaded = True

if feature_length == 1000:
dset1k[chrom_idx.index(previous_chrom), file_idx_1k.index(file_id), :] = dnp
# dnp = np.full([MAX_CHROMOSOME_SIZE/1000], None, dtype='str')
dnp = np.zeros([MAX_CHROMOSOME_SIZE/1000], dtype='int8')
else:
dset1[chrom_idx.index(previous_chrom), file_idx_1.index(file_id), :] = dnp
# dnp = np.full([MAX_CHROMOSOME_SIZE], None, dtype='str')
dnp = np.zeros([MAX_CHROMOSOME_SIZE], dtype='int8')

previous_chrom = c
if feature_length == 1000:
dnp[(s/1000):(e/1000)+1] = 1
dnp[(s/1000):(e/1000)+1] = '1'
else:
dnp[s:e+1] = 1
dnp[s:e+1] = '1'

if loaded == False:
if previous_chrom not in chrom_idx:
chrom_idx.append(c)
cset[0:len(chrom_idx)] = chrom_idx
dset1.resize((dset1.shape[0]+1, dset.shape[1], MAX_CHROMOSOME_SIZE))
dset1k.resize((dset1l.shape[0]+1, dset.shape[1], MAX_CHROMOSOME_SIZE/1000))
dset1.resize((dset1.shape[0]+1, dset1.shape[1], MAX_CHROMOSOME_SIZE))
dset1k.resize((dset1k.shape[0]+1, dset1k.shape[1], MAX_CHROMOSOME_SIZE/1000))

if feature_length == 1000:
dset1k[chrom_idx.index(previous_chrom), file_idx_1k.index(file_id), :] = dnp
Expand Down Expand Up @@ -382,10 +404,10 @@ def run(self, input_files, metadata):

feature_length = self.bed_feature_length(bed_sorted_file)
storage_level = 1000
if feature_length < 100:
if feature_length < 10:
storage_level = 1

if not self.bed2hdf5(metadata['file_id'], assembly, storage_level, bed_file, hdf5_file):
if not self.bed2hdf5(metadata['file_id'], assembly, storage_level, bed_sorted_file, hdf5_file):
output_metadata.set_exception(
Exception(
"bed2hdf5: Could not process files {}, {}.".format(*input_files)))
Expand Down

0 comments on commit 4f42f19

Please sign in to comment.