skip eventalign index

GoekeLab · Mar 1, 2021 · 740bb6d · 740bb6d
1 parent 7321ee4
commit 740bb6d
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 14 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -28,7 +28,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.5.3'
+release = '1.0.0'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst
@@ -23,7 +23,7 @@ Each dataset under the ``data`` directory contains the following directories:
 * ``bamtx`` : Transcriptome-aligned sequence
 * ``nanopolish``: Eventalign files obtained from `nanopolish eventalign <https://nanopolish.readthedocs.io/en/latest/quickstart_eventalign.html>`_
 
-1. Preprocess the data for each data set using ``xpore-dataprep`` (This step will take a while..).::
+1. Preprocess the data for each dataset using ``xpore-dataprep`` (this step takes approximately 5 hours per 1 million reads)::
 
     # Within each dataset directory i.e. demo/data/HEK293T-METTL3-KO-rep1 and demo/data/HEK293T-WT-rep1, run
     xpore-dataprep \
@@ -34,8 +34,7 @@ Each dataset under the ``data`` directory contains the following directories:
 
 The output files are stored under ``dataprep`` in each dataset directory:
 
-* ``eventalign.hdf5`` : Merged segments from ``nanopolish eventalign``, stored with the hierarchical keys ``<TRANSCRIPT_ID>/<READ_ID>/events`` 
-* ``eventalign.log`` : Log file
+* ``eventalign.index`` : Index file to access ``eventalign.txt``, the output from ``nanopolish eventalign``
 * ``data.json`` : Preprocessed data for ``xpore-diffmod``
 * ``data.index`` : File index of ``data.json`` for random access per gene
 * ``data.readcount`` : Summary of readcounts per gene

diff --git a/setup.py b/setup.py
@@ -13,8 +13,8 @@
     maintainer_email="naruemon.p@chula.ac.th",
     name=__pkg_name__,
     license="MIT",
-    description='xpore is a python package for Nanopore data analysis.',
-    version='v0.5.6',
+    description='xpore is a python package for Nanopore data analysis of differential RNA modifications.',
+    version='v1.0.0',
     long_description=README,
     long_description_content_type='text/markdown',
     url='https://github.com/GoekeLab/xpore',

diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py
@@ -25,8 +25,6 @@ def get_args():
     required.add_argument('--eventalign', dest='eventalign', help='eventalign filepath, the output from nanopolish.',required=True)
     required.add_argument('--summary', dest='summary', help='eventalign summary filepath, the output from nanopolish.',required=True)
     required.add_argument('--out_dir', dest='out_dir', help='output directory.',required=True)
-
-
 
 
     # Optional
@@ -36,12 +34,13 @@ def get_args():
 
     # Use customised db
     # These arguments will be passed to Genome from pyensembl
-    optional.add_argument('--customised_genome', dest='customised_genome', help='customised_genome.',default=False,action='store_true')
-    optional.add_argument('--reference_name', dest='reference_name', help='reference_name.',type=str)
-    optional.add_argument('--annotation_name', dest='annotation_name', help='annotation_name.',type=str)
-    optional.add_argument('--gtf_path_or_url', dest='gtf_path_or_url', help='gtf_path_or_url.',type=str)
-    optional.add_argument('--transcript_fasta_paths_or_urls', dest='transcript_fasta_paths_or_urls', help='transcript_fasta_paths_or_urls.',type=str)
+    optional.add_argument('--customised_genome', dest='customised_genome', help='if customised genome provided.',default=False,action='store_true')
+    optional.add_argument('--reference_name', dest='reference_name', help='reference name.',type=str)
+    optional.add_argument('--annotation_name', dest='annotation_name', help='annotation name.',type=str)
+    optional.add_argument('--gtf_path_or_url', dest='gtf_path_or_url', help='gtf file path or url.',type=str)
+    optional.add_argument('--transcript_fasta_paths_or_urls', dest='transcript_fasta_paths_or_urls', help='transcript fasta paths or urls.',type=str)
 
+    optional.add_argument('--skip_eventalign_index', dest='skip_eventalign_index', help='skip indexing the eventalign nanopolish output.',default=False,action='store_true')
 
     # parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
     optional.add_argument('--genome', dest='genome', help='to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates',default=False,action='store_true') 
@@ -626,7 +625,8 @@ def main():
     misc.makedirs(out_dir) #todo: check every level.
 
     # (1) For each read, combine multiple events aligned to the same positions, the results from nanopolish eventalign, into a single event per position.
-    parallel_index(eventalign_filepath,summary_filepath,chunk_size,out_dir,n_processes,resume)
+    if not args.skip_eventalign_index:
+        parallel_index(eventalign_filepath,summary_filepath,chunk_size,out_dir,n_processes,resume)
 
     # (2) Create a .json file, where the info of all reads are stored per position, for modelling.
     if genome: