Fix config and setup of user project #104

Merged · 17 commits · Sep 13, 2020
Changes shown are from 14 of the 17 commits.
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ tests/data/*
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Mac
.DS_Store
250 changes: 237 additions & 13 deletions autometa/__main__.py
@@ -30,8 +30,14 @@
import sys

import multiprocessing as mp
import pandas as pd

from autometa.common import coverage, kmers, markers, utilities
from autometa.common.metagenome import Metagenome
from autometa.config.user import AutometaUser
from autometa.taxonomy import vote
from autometa.binning import recursive_dbscan
from autometa.common.exceptions import AutometaError


logger = logging.getLogger("autometa")
@@ -97,6 +103,214 @@ def init_logger(fpath=None, verbosity=0):
return logger


@utilities.timeit
def run_autometa(mgargs):
"""Run the autometa metagenome binning pipeline using the provided metagenome args.

Pipeline
--------

#. Filter contigs in metagenome by length
#. Determine metagenome coverages
#. Count metagenome k-mers
#. Call metagenome ORFs
#. Assign contig taxonomy and filter by kingdom of interest
#. Annotate gene markers
#. Prepare primary table from annotations
#. Bin contigs using primary table

Parameters
----------
mgargs : argparse.Namespace
metagenome args

Returns
-------
pd.DataFrame
    Binning results indexed by contig (includes cluster, completeness and purity columns).

Raises
-------
TODO: Enumerate all exceptions raised from within the binning pipeline,
i.e. demarcate new (not yet handled) exceptions vs. handled exceptions.
Subclassing AutometaError is probably the most appropriate approach here.
"""

# 1. Apply length filter (if desired).
mg = Metagenome(
assembly=mgargs.files.metagenome,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
)
# TODO asynchronous execution here (work-queue/Makeflow tasks) 1a.
# Describe metagenome prior to length filter
# COMBAK: Checkpoint length filtered
try:
# Original (raw) file should not be manipulated so we return a new object
mg = mg.length_filter(
out=mgargs.files.length_filtered, cutoff=mgargs.parameters.length_cutoff
)
except FileExistsError as err:
logger.debug(f"{mgargs.files.length_filtered} already exists. Continuing..")
mg = Metagenome(
assembly=mgargs.files.length_filtered,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
)
# TODO asynchronous execution here (work-queue/Makeflow tasks) 1b.
# Describe metagenome after length filter

# TODO asynchronous execution here (work-queue/Makeflow tasks) 2a.
counts = kmers.count(
assembly=mg.assembly,
size=mgargs.parameters.kmer_size,
out=mgargs.files.kmer_counts,
force=mgargs.parameters.force,
verbose=mgargs.parameters.verbose,
multiprocess=mgargs.parameters.kmer_multiprocess,
cpus=mgargs.parameters.cpus,
)
kmers.normalize(
df=counts,
method=mgargs.parameters.kmer_transform,
out=mgargs.files.kmer_normalized,
force=mgargs.parameters.force,
)
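# NOTE: count() tabulates per-contig k-mer frequencies; normalize() then
# applies the configured transform (e.g. a centered log-ratio) so these
# compositional counts are comparable across contigs before embedding.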
# COMBAK: Checkpoint kmers
# TODO asynchronous execution here (work-queue/Makeflow tasks) 2b.
coverage.get(
fasta=mg.assembly,
out=mgargs.files.coverages,
from_spades=mgargs.parameters.cov_from_spades,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
sam=mgargs.files.sam,
bam=mgargs.files.bam,
lengths=mgargs.files.lengths,
bed=mgargs.files.bed,
cpus=mgargs.parameters.cpus,
)
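# NOTE: coverage.get() either parses depths from SPAdes contig headers
# (from_spades) or aligns reads against the assembly and tabulates
# per-contig depth via the sam/bam/bed intermediates.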
# TODO asynchronous execution here (work-queue/Makeflow tasks) 2c.
mg.call_orfs(
force=mgargs.parameters.force,
cpus=mgargs.parameters.cpus,
parallel=mgargs.parameters.parallel,
)

# 3. Assign taxonomy (if desired).
# COMBAK: Checkpoint coverages
if mgargs.parameters.do_taxonomy:
# First assign taxonomy
vote.assign(
method=mgargs.parameters.taxon_method,
outfpath=mgargs.files.taxonomy,
fasta=mg.assembly,
prot_orfs=mgargs.files.amino_acid_orfs,
nucl_orfs=mgargs.files.nucleotide_orfs,
blast=mgargs.files.blastp,
hits=mgargs.files.blastp_hits,
lca_fpath=mgargs.files.lca,
ncbi_dir=mgargs.databases.ncbi,
tmpdir=mgargs.parameters.tmpdir,
usepickle=mgargs.parameters.usepickle,
force=mgargs.parameters.force,
verbose=mgargs.parameters.verbose,
parallel=mgargs.parameters.parallel,
cpus=mgargs.parameters.cpus,
)
# Now filter by Kingdom
metabin = vote.get(
fpath=mgargs.files.taxonomy,
assembly=mg.assembly,
ncbi_dir=mgargs.databases.ncbi,
kingdom=mgargs.parameters.kingdom,
outdir=mgargs.parameters.outdir,
)
orfs_fpath = os.path.join(
mgargs.parameters.outdir, f"{mgargs.parameters.kingdom}.orfs.faa"
)
metabin.write_orfs(fpath=orfs_fpath, orf_type="prot")
contigs = set(metabin.contig_ids)
else:
orfs_fpath = mgargs.files.amino_acid_orfs
contigs = {record.id for record in mg.seqrecords}

# 4. Annotate marker genes for retained ORFs.
# markers.get(...)
# We use orfs_fpath here instead of our config filepath
# because we may have filtered out ORFs by taxonomy.
if mgargs.parameters.kingdom == "bacteria":
scans = mgargs.files.bacteria_hmmscan
markers_outfpath = mgargs.files.bacteria_markers
else:
scans = mgargs.files.archaea_hmmscan
markers_outfpath = mgargs.files.archaea_markers

markers_df = markers.get(
kingdom=mgargs.parameters.kingdom,
orfs=orfs_fpath,
dbdir=mgargs.databases.markers,
scans=scans,
out=markers_outfpath,
force=mgargs.parameters.force,
seed=mgargs.parameters.seed,
)

# 5. Prepare contigs' annotations for binning
# At this point, we should have kmer counts, coverages and possibly taxonomy info
# Embed the kmer counts to retrieve our initial master dataframe
master = kmers.embed(
kmers=mgargs.files.kmer_normalized,
out=mgargs.files.kmer_embedded,
force=mgargs.parameters.force,
embed_dimensions=mgargs.parameters.embed_dimensions,
do_pca=mgargs.parameters.do_pca,
pca_dimensions=mgargs.parameters.pca_dimensions,
method=mgargs.parameters.embed_method,
seed=mgargs.parameters.seed,
)
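# NOTE: embed() reduces the normalized k-mer table to `embed_dimensions`
# columns (optionally PCA first, then the configured manifold method,
# e.g. UMAP or BH-tSNE); these columns become the clustering features.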
# Add coverage and possibly taxonomy annotations
master = master[master.index.isin(contigs)]
if mgargs.parameters.do_taxonomy:
annotations = [mgargs.files.coverages, mgargs.files.taxonomy]
else:
annotations = [mgargs.files.coverages]
for fpath in annotations:
df = pd.read_csv(fpath, sep="\t", index_col="contig")
master = pd.merge(master, df, how="left", left_index=True, right_index=True)
master = master.convert_dtypes()

# 6. Bin contigs
bins_df = recursive_dbscan.binning(
master=master,
markers=markers_df,
domain=mgargs.parameters.kingdom,
completeness=mgargs.parameters.completeness,
purity=mgargs.parameters.purity,
taxonomy=mgargs.parameters.do_taxonomy,
starting_rank=mgargs.parameters.starting_rank,
method=mgargs.parameters.clustering_method,
reverse_ranks=mgargs.parameters.reverse_ranks,
verbose=mgargs.parameters.verbose,
)
binning_fpath = (
mgargs.files.bacteria_binning
if mgargs.parameters.kingdom == "bacteria"
else mgargs.files.archaea_binning
)
binning_cols = ["cluster", "completeness", "purity"]
bins_df[binning_cols].to_csv(binning_fpath, sep="\t", index=True, header=True)
return bins_df
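
For orientation, here is a minimal sketch of driving run_autometa directly. The nested Namespace mirrors the mgargs layout used above, but every attribute value is a hypothetical placeholder (normally user.prepare_binning_args(config) builds this object), so the call itself is left commented out:

from types import SimpleNamespace

# Hypothetical stand-in for user.prepare_binning_args(config); only a few of
# the attributes referenced above are shown, with placeholder values.
mgargs = SimpleNamespace(
    files=SimpleNamespace(
        metagenome="metagenome.fna",
        length_filtered="metagenome.filtered.fna",
        kmer_counts="kmers.tsv",
    ),
    parameters=SimpleNamespace(
        outdir="autometa_out",
        length_cutoff=3000,
        kmer_size=5,
        kingdom="bacteria",
        cpus=4,
    ),
    databases=SimpleNamespace(ncbi="databases/ncbi", markers="databases/markers"),
)
# bins_df = run_autometa(mgargs)  # requires the full set of attributes used above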


def main(args):
"""
Main logic for running autometa pipeline.
@@ -119,15 +333,20 @@ def main(args):
# Configure AutometaUser
# TODO: master from WorkQueue is AutometaUser
user = AutometaUser(nproc=args.cpus)
-    user.configure(dryrun=args.check_dependencies)
+    user.configure(dryrun=args.check_dependencies, update=args.update)

# all_bins = {}
for config in args.config:
# TODO: Add directions to master from WorkQueue
mgargs = user.prepare_binning_args(config)
-        user.run_binning(mgargs)
-        # user.refine_binning()
-        # user.process_binning()
-        # user.get_pangenomes()
+        bins = run_autometa(mgargs)

# TODO: refinement/processing/prep for pangenome algos
# refined_bins = refine_binning(bins)
# processed_bins = process_binning(refined_bins)
# sample = mgargs.parameters.metagenome_num
# all_bins.update({sample: processed_bins})
# pangenomes = get_pangenomes(all_bins)


def entrypoint():
@@ -177,6 +396,11 @@ def entrypoint():
help="Check user executables and databases accessible to Autometa and exit.",
action="store_true",
)
parser.add_argument(
"--update",
help="Update existing databases to most recent releases",
action="store_true",
)
args = parser.parse_args()

try:
@@ -185,16 +409,16 @@
logger.info("User cancelled run. Exiting...")
sys.exit(1)
except Exception as err:
issue_request = """
An error was encountered!
logger.exception(err)
if not issubclass(err.__class__, AutometaError):
issue_request = """
An error was encountered!

Please help us fix your problem!
Please help us fix your problem!

You may file an issue with us at https://github.com/KwanLab/Autometa/issues/new/choose
"""
err.issue_request = issue_request
logger.exception(err)
logger.info(issue_request)
You may file an issue with us at https://github.com/KwanLab/Autometa/issues/new/choose
"""
logger.info(issue_request)
sys.exit(1)
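
Since the new issubclass guard only prints the issue request for unanticipated errors, a "handled" error is simply any subclass of AutometaError. A minimal sketch follows; the class name is hypothetical, not part of this PR:

from autometa.common.exceptions import AutometaError

class StaleDatabaseError(AutometaError):
    """Hypothetical handled error: raised when a database is out of date."""

# Raising StaleDatabaseError inside main() would still be logged via
# logger.exception(err), but the generic "file an issue" prompt is skipped
# because issubclass(StaleDatabaseError, AutometaError) is True.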


24 changes: 13 additions & 11 deletions autometa/binning/recursive_dbscan.py
@@ -36,12 +36,12 @@
from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN

-from autometa.common.markers import Markers
+from autometa.common.markers import load as load_markers
from autometa.common import kmers

# TODO: This should be from autometa.common.kmers import Kmers
# So later we can simply/and more clearly do Kmers.load(kmers_fpath).embed(method)
-from autometa.common.exceptions import BinningError
+from autometa.common.exceptions import TableFormatError
from autometa.taxonomy.ncbi import NCBI

pd.set_option("mode.chained_assignment", None)
@@ -147,7 +147,7 @@ def run_dbscan(df, eps, dropcols=["cluster", "purity", "completeness"]):
if "coverage" in df.columns:
cols.append("coverage")
if np.any(df.isnull()):
-        raise BinningError(
+        raise TableFormatError(
f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations"
)
X = df.loc[:, cols].to_numpy()
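
To make the feature matrix above concrete, here is a toy DBSCAN pass over the kind of table run_dbscan receives; the embedding column names and the eps value are illustrative, not Autometa defaults:

import pandas as pd
from sklearn.cluster import DBSCAN

# Toy master table: a 2-D k-mer embedding plus per-contig coverage.
df = pd.DataFrame(
    {
        "x_1": [0.10, 0.12, 5.00],
        "x_2": [0.30, 0.28, 4.80],
        "coverage": [10.0, 10.5, 60.0],
    },
    index=["contig_1", "contig_2", "contig_3"],
)
X = df[["x_1", "x_2", "coverage"]].to_numpy()
df["cluster"] = DBSCAN(eps=0.75, min_samples=2).fit(X).labels_  # -1 marks noise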
@@ -289,9 +289,11 @@ def run_hdbscan(

Raises
-------
-    BinningError
-        Dataframe is missing kmer/coverage annotations
+    ValueError
+        sets `usecols` and `dropcols` may not share elements
+    TableFormatError
+        `df` is missing k-mer or coverage annotations.

"""
for col in dropcols:
if col in df.columns:
@@ -304,7 +306,7 @@
if "coverage" in df.columns:
cols.append("coverage")
if np.any(df.isnull()):
-        raise BinningError(
+        raise TableFormatError(
f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations"
)
X = df.loc[:, cols].to_numpy()
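
run_hdbscan assembles X the same way; a correspondingly minimal HDBSCAN call (min_cluster_size is illustrative) would be:

from hdbscan import HDBSCAN

# Reusing X from the DBSCAN sketch above.
df["cluster"] = HDBSCAN(min_cluster_size=2).fit(X).labels_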
@@ -552,12 +554,12 @@ def binning(

Raises
-------
-    BinningError
+    TableFormatError
        No marker information is available for contigs to be binned.
"""
    # The first check ensures marker information is available to assess binning quality...
if master.loc[master.index.isin(markers.index)].empty:
-        raise BinningError(
+        raise TableFormatError(
"No markers for contigs in table. Unable to assess binning quality"
)
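
For context on why binning() insists on marker information: bin quality is scored from single-copy marker counts. One common formulation is sketched below; it may differ in detail from Autometa's exact bookkeeping:

# Hypothetical single-copy marker counts observed in one candidate bin.
marker_counts = {"PF00001": 1, "PF00002": 2, "PF00003": 0}

expected = len(marker_counts)                              # expected once each
present = sum(1 for n in marker_counts.values() if n > 0)  # found at all
single = sum(1 for n in marker_counts.values() if n == 1)  # found exactly once

completeness = 100 * present / expected              # 66.7 for this toy bin
purity = 100 * single / present if present else 0.0  # 50.0 for this toy bin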

@@ -653,7 +655,7 @@ def main():
import logging as logger

logger.basicConfig(
-        format="%(asctime)s : %(name)s : %(levelname)s : %(message)s",
+        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
level=logger.DEBUG,
)
Expand Down Expand Up @@ -726,7 +728,7 @@ def main():
kmers_df, cov_df[["coverage"]], how="left", left_index=True, right_index=True,
)

-    markers_df = Markers.load(args.markers)
+    markers_df = load_markers(args.markers)
markers_df = markers_df.convert_dtypes()
# Taxonomy.load()
if args.taxonomy:
5 changes: 2 additions & 3 deletions autometa/binning/summary.py
@@ -39,8 +39,7 @@
from autometa import config
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.majority_vote import rank_taxids
-from autometa.common.metabin import MetaBin
-from autometa.common.markers import Markers
+from autometa.common import markers


logger = logging.getLogger(__name__)
@@ -173,7 +172,7 @@ def get_metabin_stats(bin_df, markers_fpath, assembly):
dataframe consisting of various metabin statistics indexed by cluster.
"""
stats = []
-    markers_df = Markers.load(markers_fpath)
+    markers_df = markers.load(markers_fpath)
for cluster, dff in bin_df.fillna(value={"cluster": "unclustered"}).groupby(
"cluster"
):
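
The fillna/groupby pattern above drives the per-cluster statistics; a self-contained sketch with toy data follows (columns other than "cluster" are illustrative):

import pandas as pd

# Toy binning table: one row per contig; None means unbinned.
bin_df = pd.DataFrame(
    {
        "cluster": ["bin_1", "bin_1", None],
        "length": [150_000, 90_000, 20_000],
    },
    index=["contig_1", "contig_2", "contig_3"],
)
for cluster, dff in bin_df.fillna(value={"cluster": "unclustered"}).groupby("cluster"):
    print(cluster, dff.shape[0], dff["length"].sum())
# bin_1 2 240000
# unclustered 1 20000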