added readcount_min
ploy-np committed Oct 29, 2020
1 parent 8ff79fe commit c362584
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions xpore/scripts/dataprep.py
@@ -30,6 +30,7 @@ def get_args():
# parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
optional.add_argument('--genome', dest='genome', help='to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates',default=False,action='store_true')
optional.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int, default=1)
+ optional.add_argument('--readcount_min', dest='readcount_min', help='minimum read counts per gene.',type=int, default=1)
optional.add_argument('--readcount_max', dest='readcount_max', help='maximum read counts per gene.',type=int, default=1000)
optional.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') #todo
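
For orientation, the two read-count bounds are independent argparse options with defaults 1 and 1000. Below is a minimal, runnable reproduction of just these two arguments; the rest of get_args() is omitted and the parse call is purely illustrative:

    import argparse

    # Reproduces only the two read-count options from the hunk above;
    # defaults follow the diff: readcount_min=1, readcount_max=1000.
    parser = argparse.ArgumentParser()
    parser.add_argument('--readcount_min', dest='readcount_min',
                        help='minimum read counts per gene.', type=int, default=1)
    parser.add_argument('--readcount_max', dest='readcount_max',
                        help='maximum read counts per gene.', type=int, default=1000)

    args = parser.parse_args(['--readcount_min', '5'])
    assert (args.readcount_min, args.readcount_max) == (5, 1000)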

@@ -174,7 +175,7 @@ def t2g(gene_id,ensembl):

return tx_ids, t2g_dict

- def parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_max,resume):
+ def parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_min,readcount_max,resume):

# Create output paths and locks.
out_paths,locks = dict(),dict()
@@ -245,11 +246,11 @@ def parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_max,resume):

# n_reads += len(f[tx_id])
for read_id in f[tx_id].keys():
- if (n_reads < readcount_max) and (read_id not in read_ids):
+ if (n_reads > readcount_min) and (n_reads < readcount_max) and (read_id not in read_ids):
data_dict[read_id] = f[tx_id][read_id]['events'][:]
read_ids += [read_id]
n_reads += 1
- elif n_reads >= readcount_max:
+ elif (n_reads <= readcount_min) or (n_reads >= readcount_max):
break

if len(read_ids) > 0:
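
A note on the gate added above: n_reads starts at 0, so on the first read (n_reads > readcount_min) is false and the new elif branch breaks out of the loop before any read is stored. Checking the minimum once, after collection, avoids this. Here is a minimal, self-contained sketch of that variant, assuming reads live in an h5py-style group mapping read IDs to 'events' arrays; collect_reads is an illustrative name, not xpore's API:

    # Illustrative sketch only -- not xpore code. Applies readcount_min
    # after collection instead of per read.
    def collect_reads(tx_group, readcount_min=1, readcount_max=1000):
        data_dict = {}
        for read_id in tx_group.keys():
            if len(data_dict) >= readcount_max:
                break                                # cap reached: stop early
            data_dict[read_id] = tx_group[read_id]['events'][:]
        if len(data_dict) < readcount_min:
            return {}                                # too few reads: skip
        return data_dict

With h5py, tx_group would correspond to f[tx_id] in the loop above.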
@@ -376,7 +377,7 @@ def preprocess_gene(gene_id,data_dict,t2g_mapping,out_paths,locks):
f.write(log_str + '\n')


- def parallel_preprocess_tx(out_dir,n_processes,readcount_max,resume):
+ def parallel_preprocess_tx(out_dir,n_processes,readcount_min,readcount_max,resume):

# Create output paths and locks.
out_paths,locks = dict(),dict()
@@ -416,7 +417,7 @@ def parallel_preprocess_tx(out_dir,n_processes,readcount_max,resume):
n_reads = 0
data_dict = dict()
for read_id in f[tx_id].keys():
- if n_reads < readcount_max:
+ if (n_reads > readcount_min) and (n_reads < readcount_max):
data_dict[read_id] = f[tx_id][read_id]['events'][:]
read_ids += [read_id]
n_reads += 1
@@ -521,6 +522,7 @@ def main():
out_dir = args.out_dir
ensembl_version = args.ensembl
ensembl_species = args.species
+ readcount_min = args.readcount_min
readcount_max = args.readcount_max
resume = args.resume
genome = args.genome
@@ -536,10 +538,10 @@ def main():
# (2) Create a .json file, where the info of all reads are stored per position, for modelling.
if genome:
ensembl = EnsemblRelease(ensembl_version,ensembl_species) # Default: human reference genome GRCh38 release 91 used in the ont mapping.
- parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_max,resume)
+ parallel_preprocess_gene(ensembl,out_dir,n_processes,readcount_min,readcount_max,resume)

else:
- parallel_preprocess_tx(out_dir,n_processes,readcount_max,resume)
+ parallel_preprocess_tx(out_dir,n_processes,readcount_min,readcount_max,resume)


if __name__ == '__main__':
@@ -548,7 +550,8 @@ def main():
--eventalign EVENTALIGN --summary SUMMARY --out_dir OUT_DIR \
[--ensembl ENSEMBL] [--genome] \
[--n_processes N_PROCESSES] \
- [--readcount_max READCOUNT_MAX] [--resume] \
+ [--readcount_min READCOUNT_MIN] [--readcount_max READCOUNT_MAX] \
+ [--resume] \
"""
main()
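
For reference, one plausible invocation using the new bound, assuming the script is run directly; the input and output paths are placeholders, not files from this repository:

    python xpore/scripts/dataprep.py --eventalign eventalign.txt \
        --summary summary.txt --out_dir out --n_processes 4 \
        --readcount_min 5 --readcount_max 1000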
