Merge pull request #58 from yuukiiwa/master

replace if statement with KeyError in g2t mapping, add post-processing option, and add xpore wrapper
GoekeLab · Jul 15, 2021 · 1e5f255 · 1e5f255
2 parents 783cbde + 902b6fe
commit 1e5f255
Show file tree

Hide file tree

Showing 7 changed files with 120 additions and 76 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -26,9 +26,9 @@
 author = 'Ploy N. Pratanwanich'
 
 # The short X.Y version
-version = '1.0'
+version = '1.1'
 # The full version, including alpha/beta/rc tags
-release = '1.0'
+release = '1.1'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/setup.py b/setup.py
@@ -14,7 +14,7 @@
     name=__pkg_name__,
     license="MIT",
     description='xpore is a python package for Nanopore data analysis of differential RNA modifications.',
-    version='v1.0',
+    version='v1.1',
     long_description=README,
     long_description_content_type='text/markdown',
     url='https://github.com/GoekeLab/xpore',
@@ -30,8 +30,7 @@
             'ujson>=4.0.1'
             ],
     python_requires=">=3.8",
-    entry_points={'console_scripts': ["xpore-dataprep={}.scripts.dataprep:main".format(__pkg_name__),
-                                      "xpore-diffmod={}.scripts.diffmod:main".format(__pkg_name__)]},
+    entry_points={'console_scripts': ["xpore={}.scripts.xpore:main".format(__pkg_name__)]},
     classifiers=[
         # Trove classifiers
         # (https://pypi.python.org/pypi?%3Aaction=list_classifiers)

diff --git a/xpore/__init__.py b/xpore/__init__.py
@@ -1 +1 @@
-__version__ = "1.0"
+__version__ = "1.1"
diff --git a/xpore/scripts/dataprep.py b/xpore/scripts/dataprep.py
@@ -1,4 +1,3 @@
-import argparse
 import numpy as np
 import pandas as pd
 import os,re
@@ -13,33 +12,6 @@
 from . import helper
 from ..utils import misc
 
-def get_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-    optional = parser._action_groups.pop()
-    required = parser.add_argument_group('required arguments')
-
-    # Required arguments
-    required.add_argument('--eventalign', dest='eventalign', help='eventalign filepath, the output from nanopolish.',required=True)
-    ##required.add_argument('--summary', dest='summary', help='eventalign summary filepath, the output from nanopolish.',required=True)
-    required.add_argument('--out_dir', dest='out_dir', help='output directory.',required=True)
-    optional.add_argument('--gtf_path_or_url', dest='gtf_path_or_url', help='gtf file path or url.',type=str)
-    optional.add_argument('--transcript_fasta_paths_or_urls', dest='transcript_fasta_paths_or_urls', help='transcript fasta paths or urls.',type=str)
-
-    # Optional
-    optional.add_argument('--skip_eventalign_indexing', dest='skip_eventalign_indexing', help='skip indexing the eventalign nanopolish output.',default=False,action='store_true')
-
-    # parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
-    optional.add_argument('--genome', dest='genome', help='to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates',default=False,action='store_true') 
-    optional.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int, default=1)
-    optional.add_argument('--chunk_size', dest='chunk_size', help='number of lines from nanopolish eventalign.txt for processing.',type=int, default=1000000)
-    optional.add_argument('--readcount_min', dest='readcount_min', help='minimum read counts per gene.',type=int, default=1)
-    optional.add_argument('--readcount_max', dest='readcount_max', help='maximum read counts per gene.',type=int, default=1000)
-    optional.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') #todo
-
-    parser._action_groups.append(optional)
-    return parser.parse_args()
-
 def index(eventalign_result,pos_start,out_paths,locks):
     eventalign_result = eventalign_result.set_index(['contig','read_index'])
     pos_end=pos_start
@@ -276,9 +248,8 @@ def parallel_preprocess_gene(eventalign_filepath,fasta_dict,gtf_dict,out_dir,n_p
     for tx_id in set(df_eventalign_index.index):
         try:
 ##           g_id = ensembl.transcript_by_id(tx_id).gene_id 
-            if tx_id in gtf_dict:
-                g_id = gtf_dict[tx_id]['g_id'] 
-        except ValueError:
+            g_id = gtf_dict[tx_id]['g_id'] 
+        except KeyError:
             continue
         else:
 #             gene_ids = gene_ids.union([g_id])
@@ -687,8 +658,7 @@ def mergeGTFtxIDversion(gtf_path_or_url,out_dir):
     new_gtf.close()
     return new_gtf_path
 
-def main():
-    args = get_args()
+def dataprep(args):
     #
     n_processes = args.n_processes        
     eventalign_filepath = args.eventalign
@@ -720,10 +690,6 @@ def main():
             gtf_path_or_url = mergeGTFtxIDversion(gtf_path_or_url,out_dir)
         fasta_dict = readFasta(transcript_fasta_paths_or_urls)
         gtf_dict = readGTF(gtf_path_or_url)
-        print(len(gtf_dict))
         parallel_preprocess_gene(eventalign_filepath,fasta_dict,gtf_dict,out_dir,n_processes,readcount_min,readcount_max,resume)
     else:
         parallel_preprocess_tx(eventalign_filepath,out_dir,n_processes,readcount_min,readcount_max,resume)
-
-if __name__ == '__main__':
-    main()
diff --git a/xpore/scripts/diffmod.py b/xpore/scripts/diffmod.py
@@ -1,4 +1,3 @@
-import argparse
 import numpy as np
 import pandas
 import os
@@ -12,25 +11,6 @@
 from ..diffmod.gmm import GMM
 from ..diffmod import io
 from ..diffmod.statstest import StatsTest
-
-def get_args():
-    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-    optional = parser._action_groups.pop()
-    required = parser.add_argument_group('required arguments')
-
-    # Required arguments
-    required.add_argument('--config', dest='config', help='yaml configuraion filepath.',required=True)
-
-    # Optional arguments
-    optional.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int,default=1)
-    optional.add_argument('--save_models', dest='save_models', help='with this argument, the program will save the model parameters for each id.',default=False,action='store_true') # todo
-    optional.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') 
-
-    optional.add_argument('--ids', dest='ids', help='gene / transcript ids to model.',default=[],nargs='*')
-
-    parser._action_groups.append(optional)
-    return parser.parse_args()
 
 def execute(idx, data_dict, data_info, method, criteria, model_kmer, prior_params, out_paths, save_models,locks):
     """
@@ -96,8 +76,7 @@ def execute(idx, data_dict, data_info, method, criteria, model_kmer, prior_param
     with locks['log'], open(out_paths['log'],'a') as f:
         f.write(idx + '\n')
 
-def main():
-    args = get_args()
+def diffmod(args):
 
     n_processes = args.n_processes       
     config_filepath = args.config
@@ -205,16 +184,6 @@ def main():
     # Close data files
     for f in f_data.values():
         f.close()   
-        
+
     with open(out_paths['log'],'a+') as f:
         f.write(helper.decor_message('successfully finished'))
-
-
-if __name__ == '__main__':
-    """
-    Usage:
-        xpore-diffmod --config CONFIG [--n_processes N_PROCESSES] \
-                     [--save_models] [--resume] \
-                     [--ids [IDS [IDS ...]]]
-    """
-    main()
diff --git a/xpore/scripts/postprocessing.py b/xpore/scripts/postprocessing.py
@@ -0,0 +1,40 @@
+import os
+
+def run_postprocessing(diffmod_table_path,out_dir):
+    file=open(diffmod_table_path,"r")
+    header=file.readline()
+    entries=file.readlines()
+    outfile_path=os.path.join(out_dir,"majority_direction_kmer_diffmod.table")
+    outfile=open(outfile_path,"w")
+    outfile.write(header)
+    header=header.strip().split(',')
+    kmer_ind,dir_ind=header.index('kmer'),header.index('mod_assignment')    
+    dict={}
+    for ln in entries:
+        l=ln.strip().split(",")
+        if l[kmer_ind] not in dict:
+            dict[l[kmer_ind]]={l[dir_ind]:1}
+        else:
+            if l[dir_ind] not in dict[l[kmer_ind]]:
+                dict[l[kmer_ind]][l[dir_ind]]=1
+            else:
+                dict[l[kmer_ind]][l[dir_ind]]+=1
+    for k in dict:
+        if len(dict[k]) > 1:  ##consider one modification type per k-mer
+            if dict[k]['higher'] <= dict[k]['lower']: ##choose the majority
+                dict[k]['choose']='lower'
+            else:
+                dict[k]['choose']='higher'
+        else:
+            dict[k]['choose']=list(dict[k].keys())[0]
+    for ln in entries:
+        l=ln.strip().split(",")
+        if l[dir_ind] == dict[l[kmer_ind]]['choose']:
+            outfile.write(ln)
+    outfile.close()
+
+def postprocessing(args):
+    diffmod_dir = args.diffmod_dir
+    diffmod_table_path = os.path.join(diffmod_dir,"diffmod.table")
+    run_postprocessing(diffmod_table_path,diffmod_dir)
+
diff --git a/xpore/scripts/xpore.py b/xpore/scripts/xpore.py
@@ -0,0 +1,70 @@
+import sys
+
+from .dataprep import dataprep
+from .diffmod import diffmod
+from .postprocessing import postprocessing
+
+def parse_options(argv):
+
+    """Parses options from the command line """
+
+    from argparse import ArgumentParser
+    from xpore import __version__
+
+    parser = ArgumentParser(prog='xpore')
+    subparsers = parser.add_subparsers(help='Running modes', metavar='{dataprep, diffmod, postprocessing}')
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__))
+
+    ### RUN MODE "DATAPREP"
+    parser_dataprep = subparsers.add_parser('dataprep', help='run mode to preprocess nanopolish eventalign.txt before differential modification analysis')
+    optional_dataprep = parser_dataprep._action_groups.pop()
+    required_dataprep = parser_dataprep.add_argument_group('required arguments')
+    # Required arguments
+    required_dataprep.add_argument('--eventalign', dest='eventalign', help='eventalign filepath, the output from nanopolish.',required=True)
+    ##required.add_argument('--summary', dest='summary', help='eventalign summary filepath, the output from nanopolish.',required=True)
+    required_dataprep.add_argument('--out_dir', dest='out_dir', help='output directory.',required=True)
+    optional_dataprep.add_argument('--gtf_path_or_url', dest='gtf_path_or_url', help='gtf file path or url.',type=str)
+    optional_dataprep.add_argument('--transcript_fasta_paths_or_urls', dest='transcript_fasta_paths_or_urls', help='transcript fasta paths or urls.',type=str)
+    # Optional arguments
+    optional_dataprep.add_argument('--skip_eventalign_indexing', dest='skip_eventalign_indexing', help='skip indexing the eventalign nanopolish output.',default=False,action='store_true')
+    # parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
+    optional_dataprep.add_argument('--genome', dest='genome', help='to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates',default=False,action='store_true') 
+    optional_dataprep.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int, default=1)
+    optional_dataprep.add_argument('--chunk_size', dest='chunk_size', help='number of lines from nanopolish eventalign.txt for processing.',type=int, default=1000000)
+    optional_dataprep.add_argument('--readcount_min', dest='readcount_min', help='minimum read counts per gene.',type=int, default=1)
+    optional_dataprep.add_argument('--readcount_max', dest='readcount_max', help='maximum read counts per gene.',type=int, default=1000)
+    optional_dataprep.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') #todo
+    parser_dataprep._action_groups.append(optional_dataprep)
+    parser_dataprep.set_defaults(func=dataprep)
+
+    ### RUN MODE "DIFFMOD"
+    parser_diffmod = subparsers.add_parser('diffmod', help='run mode to perform differential modification analysis')
+    optional_diffmod = parser_diffmod._action_groups.pop()
+    required_diffmod = parser_diffmod.add_argument_group('required arguments')
+    # Required arguments
+    required_diffmod.add_argument('--config', dest='config', help='yaml configuraion filepath.',required=True)
+    # Optional arguments
+    optional_diffmod.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int,default=1)
+    optional_diffmod.add_argument('--save_models', dest='save_models', help='with this argument, the program will save the model parameters for each id.',default=False,action='store_true') # todo
+    optional_diffmod.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') 
+    optional_diffmod.add_argument('--ids', dest='ids', help='gene / transcript ids to model.',default=[],nargs='*')
+    parser_diffmod._action_groups.append(optional_diffmod)
+    parser_diffmod.set_defaults(func=diffmod)
+
+    ### RUN MODE "POSTPROCESSING"
+    parser_postprocessing = subparsers.add_parser('postprocessing', help='run mode to post process diffmod.table')
+    required_postprocessing = parser_postprocessing.add_argument_group('required arguments')
+    # Required arguments
+    required_postprocessing.add_argument('--diffmod_dir', dest='diffmod_dir', help='diffmod directory path, the output from xpore-diffmod.',required=True)
+    parser_postprocessing.set_defaults(func=postprocessing)
+
+    return parser.parse_args(argv[1:])
+
+def main(argv=sys.argv):
+
+    ### get command line options
+    options = parse_options(argv)
+    options.func(options)
+
+if __name__ == "__main__":
+    main(sys.argv)