Skip to content

Commit

Permalink
Merge pull request #58 from yuukiiwa/master
Browse files Browse the repository at this point in the history
replace if statement with KeyError in g2t mapping, add post-processing option, and add xpore wrapper
  • Loading branch information
ploy-np committed Jul 15, 2021
2 parents 783cbde + 902b6fe commit 1e5f255
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 76 deletions.
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
author = 'Ploy N. Pratanwanich'

# The short X.Y version
version = '1.0'
version = '1.1'
# The full version, including alpha/beta/rc tags
release = '1.0'
release = '1.1'


# -- General configuration ---------------------------------------------------
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
name=__pkg_name__,
license="MIT",
description='xpore is a python package for Nanopore data analysis of differential RNA modifications.',
version='v1.0',
version='v1.1',
long_description=README,
long_description_content_type='text/markdown',
url='https://github.com/GoekeLab/xpore',
Expand All @@ -30,8 +30,7 @@
'ujson>=4.0.1'
],
python_requires=">=3.8",
entry_points={'console_scripts': ["xpore-dataprep={}.scripts.dataprep:main".format(__pkg_name__),
"xpore-diffmod={}.scripts.diffmod:main".format(__pkg_name__)]},
entry_points={'console_scripts': ["xpore={}.scripts.xpore:main".format(__pkg_name__)]},
classifiers=[
# Trove classifiers
# (https://pypi.python.org/pypi?%3Aaction=list_classifiers)
Expand Down
2 changes: 1 addition & 1 deletion xpore/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.0"
__version__ = "1.1"
40 changes: 3 additions & 37 deletions xpore/scripts/dataprep.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import numpy as np
import pandas as pd
import os,re
Expand All @@ -13,33 +12,6 @@
from . import helper
from ..utils import misc

def get_args():
    """Parse the dataprep command line (read from ``sys.argv``).

    Returns the ``argparse.Namespace`` holding the two required paths
    (``eventalign``, ``out_dir``) and all optional settings with their
    defaults applied.
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Detach the built-in optional group so that the "required arguments"
    # group is registered first; it is re-attached at the end.
    optional_group = cli._action_groups.pop()
    required_group = cli.add_argument_group('required arguments')

    # Required arguments
    required_paths = (
        ('eventalign', 'eventalign filepath, the output from nanopolish.'),
        ##('summary', 'eventalign summary filepath, the output from nanopolish.'),
        ('out_dir', 'output directory.'),
    )
    for name, text in required_paths:
        required_group.add_argument('--' + name, dest=name, help=text, required=True)

    # Optional string-valued inputs
    string_options = (
        ('gtf_path_or_url', 'gtf file path or url.'),
        ('transcript_fasta_paths_or_urls', 'transcript fasta paths or urls.'),
    )
    for name, text in string_options:
        optional_group.add_argument('--' + name, dest=name, help=text, type=str)

    def add_switch(name, text):
        # Boolean switch: absent -> False, present -> True.
        optional_group.add_argument('--' + name, dest=name, help=text, default=False, action='store_true')

    def add_int(name, text, default):
        optional_group.add_argument('--' + name, dest=name, help=text, type=int, default=default)

    # Optional
    add_switch('skip_eventalign_indexing', 'skip indexing the eventalign nanopolish output.')

    # parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
    add_switch('genome', 'to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates')
    add_int('n_processes', 'number of processes to run.', 1)
    add_int('chunk_size', 'number of lines from nanopolish eventalign.txt for processing.', 1000000)
    add_int('readcount_min', 'minimum read counts per gene.', 1)
    add_int('readcount_max', 'maximum read counts per gene.', 1000)
    add_switch('resume', 'with this argument, the program will resume from the previous run.')  # todo

    cli._action_groups.append(optional_group)
    return cli.parse_args()

def index(eventalign_result,pos_start,out_paths,locks):
eventalign_result = eventalign_result.set_index(['contig','read_index'])
pos_end=pos_start
Expand Down Expand Up @@ -276,9 +248,8 @@ def parallel_preprocess_gene(eventalign_filepath,fasta_dict,gtf_dict,out_dir,n_p
for tx_id in set(df_eventalign_index.index):
try:
## g_id = ensembl.transcript_by_id(tx_id).gene_id
if tx_id in gtf_dict:
g_id = gtf_dict[tx_id]['g_id']
except ValueError:
g_id = gtf_dict[tx_id]['g_id']
except KeyError:
continue
else:
# gene_ids = gene_ids.union([g_id])
Expand Down Expand Up @@ -687,8 +658,7 @@ def mergeGTFtxIDversion(gtf_path_or_url,out_dir):
new_gtf.close()
return new_gtf_path

def main():
args = get_args()
def dataprep(args):
#
n_processes = args.n_processes
eventalign_filepath = args.eventalign
Expand Down Expand Up @@ -720,10 +690,6 @@ def main():
gtf_path_or_url = mergeGTFtxIDversion(gtf_path_or_url,out_dir)
fasta_dict = readFasta(transcript_fasta_paths_or_urls)
gtf_dict = readGTF(gtf_path_or_url)
print(len(gtf_dict))
parallel_preprocess_gene(eventalign_filepath,fasta_dict,gtf_dict,out_dir,n_processes,readcount_min,readcount_max,resume)
else:
parallel_preprocess_tx(eventalign_filepath,out_dir,n_processes,readcount_min,readcount_max,resume)

if __name__ == '__main__':
main()
35 changes: 2 additions & 33 deletions xpore/scripts/diffmod.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import numpy as np
import pandas
import os
Expand All @@ -12,25 +11,6 @@
from ..diffmod.gmm import GMM
from ..diffmod import io
from ..diffmod.statstest import StatsTest

def get_args():
    """Parse the diffmod command line (read from ``sys.argv``).

    Returns the ``argparse.Namespace`` with the required ``config`` path and
    the optional ``n_processes``, ``save_models``, ``resume`` and ``ids``
    settings (defaults applied).
    """
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Detach the built-in optional group so that the "required arguments"
    # group is registered first; it is re-attached at the end.
    optional_group = cli._action_groups.pop()
    required_group = cli.add_argument_group('required arguments')

    # Required arguments
    required_group.add_argument('--config', dest='config', help='yaml configuraion filepath.', required=True)

    # Optional arguments, registered in the order they should appear in --help
    optional_specs = (
        ('n_processes', dict(help='number of processes to run.', type=int, default=1)),
        ('save_models', dict(help='with this argument, the program will save the model parameters for each id.', default=False, action='store_true')),  # todo
        ('resume', dict(help='with this argument, the program will resume from the previous run.', default=False, action='store_true')),
        ('ids', dict(help='gene / transcript ids to model.', default=[], nargs='*')),
    )
    for name, settings in optional_specs:
        optional_group.add_argument('--' + name, dest=name, **settings)

    cli._action_groups.append(optional_group)
    return cli.parse_args()

def execute(idx, data_dict, data_info, method, criteria, model_kmer, prior_params, out_paths, save_models,locks):
"""
Expand Down Expand Up @@ -96,8 +76,7 @@ def execute(idx, data_dict, data_info, method, criteria, model_kmer, prior_param
with locks['log'], open(out_paths['log'],'a') as f:
f.write(idx + '\n')

def main():
args = get_args()
def diffmod(args):

n_processes = args.n_processes
config_filepath = args.config
Expand Down Expand Up @@ -205,16 +184,6 @@ def main():
# Close data files
for f in f_data.values():
f.close()

with open(out_paths['log'],'a+') as f:
f.write(helper.decor_message('successfully finished'))


if __name__ == '__main__':
"""
Usage:
xpore-diffmod --config CONFIG [--n_processes N_PROCESSES] \
[--save_models] [--resume] \
[--ids [IDS [IDS ...]]]
"""
main()
40 changes: 40 additions & 0 deletions xpore/scripts/postprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os

def run_postprocessing(diffmod_table_path,out_dir):
    """Keep, for every k-mer, only the rows with its majority modification direction.

    Reads the comma-separated table at ``diffmod_table_path`` (its header
    must contain the columns ``kmer`` and ``mod_assignment``), counts how
    often each ``mod_assignment`` value occurs per k-mer, and writes the
    header plus every row whose ``mod_assignment`` equals the most frequent
    value of its k-mer to ``<out_dir>/majority_direction_kmer_diffmod.table``.
    Rows are written unchanged and in their original order. When counts tie,
    ``'lower'`` is preferred.

    Args:
        diffmod_table_path: path of the input table.
        out_dir: directory in which the filtered table is written.

    Raises:
        ValueError: if the header lacks a ``kmer`` or ``mod_assignment`` column.
    """
    from collections import Counter, defaultdict

    # Read everything up front and close the input before any output is
    # produced; blank lines (e.g. a trailing empty line) carry no record.
    with open(diffmod_table_path, "r") as table:
        header = table.readline()
        entries = [ln for ln in table if ln.strip()]

    columns = header.strip().split(',')
    kmer_ind, dir_ind = columns.index('kmer'), columns.index('mod_assignment')

    # First pass: per-k-mer tally of each modification direction.
    records = []
    direction_counts = defaultdict(Counter)
    for ln in entries:
        fields = ln.strip().split(",")
        kmer, direction = fields[kmer_ind], fields[dir_ind]
        direction_counts[kmer][direction] += 1
        records.append((kmer, direction, ln))

    # Consider one modification type per k-mer: choose the majority label.
    # The secondary sort key makes 'lower' win a tie; labels other than
    # 'higher'/'lower' are tolerated instead of raising KeyError.
    chosen = {
        kmer: max(counts, key=lambda label: (counts[label], label == 'lower'))
        for kmer, counts in direction_counts.items()
    }

    # Second pass: emit only the rows that agree with their k-mer's choice.
    outfile_path = os.path.join(out_dir, "majority_direction_kmer_diffmod.table")
    with open(outfile_path, "w") as outfile:
        outfile.write(header)
        outfile.writelines(
            ln for kmer, direction, ln in records if direction == chosen[kmer]
        )

def postprocessing(args):
    """Run the post-processing mode on a diffmod output directory.

    ``args.diffmod_dir`` is used both as the location of the input
    ``diffmod.table`` and as the directory the filtered table is written to.
    """
    target_dir = args.diffmod_dir
    run_postprocessing(os.path.join(target_dir, "diffmod.table"), target_dir)

70 changes: 70 additions & 0 deletions xpore/scripts/xpore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import sys

from .dataprep import dataprep
from .diffmod import diffmod
from .postprocessing import postprocessing

def parse_options(argv):
    """Parse the ``xpore`` command line.

    Builds a top-level parser with one sub-parser per run mode
    (``dataprep``, ``diffmod``, ``postprocessing``). Each sub-parser stores
    the function implementing its mode in the ``func`` attribute of the
    returned namespace, so the caller can dispatch with
    ``options.func(options)``.

    Args:
        argv: full argument vector; ``argv[0]`` (the program name) is skipped.

    Returns:
        The ``argparse.Namespace`` of the selected run mode.

    Raises:
        SystemExit: raised by argparse on ``--help`` / ``--version``, on
            invalid arguments, or when no run mode is given.
    """
    from argparse import ArgumentParser
    from xpore import __version__

    parser = ArgumentParser(prog='xpore')
    # The run mode is mandatory: without ``required=True`` a bare ``xpore``
    # call yields a namespace that has no ``func`` attribute and the caller
    # fails with AttributeError instead of a usage message.
    subparsers = parser.add_subparsers(help='Running modes', metavar='{dataprep, diffmod, postprocessing}', required=True)
    parser.add_argument('-v', '--version', action='version', version='%(prog)s {version}'.format(version=__version__))

    ### RUN MODE "DATAPREP"
    parser_dataprep = subparsers.add_parser('dataprep', help='run mode to preprocess nanopolish eventalign.txt before differential modification analysis')
    # Detach the default optional group so "required arguments" is registered
    # first; it is re-attached below.
    optional_dataprep = parser_dataprep._action_groups.pop()
    required_dataprep = parser_dataprep.add_argument_group('required arguments')
    # Required arguments
    required_dataprep.add_argument('--eventalign', dest='eventalign', help='eventalign filepath, the output from nanopolish.',required=True)
    ##required.add_argument('--summary', dest='summary', help='eventalign summary filepath, the output from nanopolish.',required=True)
    required_dataprep.add_argument('--out_dir', dest='out_dir', help='output directory.',required=True)
    optional_dataprep.add_argument('--gtf_path_or_url', dest='gtf_path_or_url', help='gtf file path or url.',type=str)
    optional_dataprep.add_argument('--transcript_fasta_paths_or_urls', dest='transcript_fasta_paths_or_urls', help='transcript fasta paths or urls.',type=str)
    # Optional arguments
    optional_dataprep.add_argument('--skip_eventalign_indexing', dest='skip_eventalign_indexing', help='skip indexing the eventalign nanopolish output.',default=False,action='store_true')
    # parser.add_argument('--features', dest='features', help='Signal features to extract.',type=list,default=['norm_mean'])
    optional_dataprep.add_argument('--genome', dest='genome', help='to run on Genomic coordinates. Without this argument, the program will run on transcriptomic coordinates',default=False,action='store_true')
    optional_dataprep.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int, default=1)
    optional_dataprep.add_argument('--chunk_size', dest='chunk_size', help='number of lines from nanopolish eventalign.txt for processing.',type=int, default=1000000)
    optional_dataprep.add_argument('--readcount_min', dest='readcount_min', help='minimum read counts per gene.',type=int, default=1)
    optional_dataprep.add_argument('--readcount_max', dest='readcount_max', help='maximum read counts per gene.',type=int, default=1000)
    optional_dataprep.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true') #todo
    parser_dataprep._action_groups.append(optional_dataprep)
    parser_dataprep.set_defaults(func=dataprep)

    ### RUN MODE "DIFFMOD"
    parser_diffmod = subparsers.add_parser('diffmod', help='run mode to perform differential modification analysis')
    optional_diffmod = parser_diffmod._action_groups.pop()
    required_diffmod = parser_diffmod.add_argument_group('required arguments')
    # Required arguments
    required_diffmod.add_argument('--config', dest='config', help='yaml configuraion filepath.',required=True)
    # Optional arguments
    optional_diffmod.add_argument('--n_processes', dest='n_processes', help='number of processes to run.',type=int,default=1)
    optional_diffmod.add_argument('--save_models', dest='save_models', help='with this argument, the program will save the model parameters for each id.',default=False,action='store_true') # todo
    optional_diffmod.add_argument('--resume', dest='resume', help='with this argument, the program will resume from the previous run.',default=False,action='store_true')
    optional_diffmod.add_argument('--ids', dest='ids', help='gene / transcript ids to model.',default=[],nargs='*')
    parser_diffmod._action_groups.append(optional_diffmod)
    parser_diffmod.set_defaults(func=diffmod)

    ### RUN MODE "POSTPROCESSING"
    parser_postprocessing = subparsers.add_parser('postprocessing', help='run mode to post process diffmod.table')
    required_postprocessing = parser_postprocessing.add_argument_group('required arguments')
    # Required arguments
    required_postprocessing.add_argument('--diffmod_dir', dest='diffmod_dir', help='diffmod directory path, the output from xpore-diffmod.',required=True)
    parser_postprocessing.set_defaults(func=postprocessing)

    return parser.parse_args(argv[1:])

def main(argv=None):
    """Command-line entry point: parse the options and run the selected mode.

    Args:
        argv: full argument vector including the program name. Defaults to
            the current ``sys.argv``, looked up at call time rather than
            frozen when the module is imported.

    Raises:
        SystemExit: if no run mode was selected on the command line (or from
            argparse itself for invalid arguments).
    """
    if argv is None:
        argv = sys.argv

    ### get command line options
    options = parse_options(argv)

    # Each run mode registers its implementation as ``func``; the attribute
    # is absent when the command line named no run mode at all.
    run_mode = getattr(options, 'func', None)
    if run_mode is None:
        sys.exit("xpore: error: a running mode is required (dataprep, diffmod or postprocessing); see 'xpore --help'")
    run_mode(options)

# Script entry point: forward the process argument vector to main().
# NOTE(review): this module uses package-relative imports, so it presumably
# has to be started as part of the package (e.g. via the ``xpore`` console
# script) rather than as a stand-alone file — confirm.
if __name__ == "__main__":
    main(sys.argv)

0 comments on commit 1e5f255

Please sign in to comment.