Fix config and setup of user project #104

Merged · 17 commits · Sep 13, 2020
Changes shown are from 14 of the 17 commits.
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ tests/data/*
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace

# Mac
.DS_Store
250 changes: 237 additions & 13 deletions autometa/__main__.py
@@ -30,8 +30,14 @@
import sys

import multiprocessing as mp
import pandas as pd

from autometa.common import coverage, kmers, markers, utilities
from autometa.common.metagenome import Metagenome
from autometa.config.user import AutometaUser
from autometa.taxonomy import vote
from autometa.binning import recursive_dbscan
from autometa.common.exceptions import AutometaError


logger = logging.getLogger("autometa")
@@ -97,6 +103,214 @@ def init_logger(fpath=None, verbosity=0):
return logger


@utilities.timeit
def run_autometa(mgargs):
"""Run the autometa metagenome binning pipeline using the provided metagenome args.

Pipeline
--------

#. Filter contigs in metagenome by length
#. Determine metagenome coverages
#. Count metagenome k-mers
#. Call metagenome ORFs
#. Assign contig taxonomy and filter by kingdom of interest
#. Annotate gene markers
#. Prepare primary table from annotations
#. Bin contigs using primary table

Parameters
----------
mgargs : argparse.Namespace
metagenome args

Returns
-------
pd.DataFrame
    Binning results indexed by contig (includes cluster, completeness and purity columns).

Raises
-------
TODO: Enumerate all exceptions raised from within the binning pipeline,
i.e. demarcate new (not yet handled) exceptions vs. handled exceptions.
Subclassing AutometaError is probably the most appropriate approach here.
"""

# 1. Apply length filter (if desired).
mg = Metagenome(
assembly=mgargs.files.metagenome,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
)
# TODO asynchronous execution here (work-queue/Makeflow tasks) 1a.
# Describe metagenome prior to length filter
# COMBAK: Checkpoint length filtered
try:
# Original (raw) file should not be manipulated so we return a new object
mg = mg.length_filter(
out=mgargs.files.length_filtered, cutoff=mgargs.parameters.length_cutoff
)
except FileExistsError as err:
logger.debug(f"{mgargs.files.length_filtered} already exists. Continuing..")
mg = Metagenome(
assembly=mgargs.files.length_filtered,
outdir=mgargs.parameters.outdir,
nucl_orfs_fpath=mgargs.files.nucleotide_orfs,
prot_orfs_fpath=mgargs.files.amino_acid_orfs,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
)
# TODO asynchronous execution here (work-queue/Makeflow tasks) 1b.
# Describe metagenome after length filter

# TODO asynchronous execution here (work-queue/Makeflow tasks) 2a.
counts = kmers.count(
assembly=mg.assembly,
size=mgargs.parameters.kmer_size,
out=mgargs.files.kmer_counts,
force=mgargs.parameters.force,
verbose=mgargs.parameters.verbose,
multiprocess=mgargs.parameters.kmer_multiprocess,
cpus=mgargs.parameters.cpus,
)
kmers.normalize(
df=counts,
method=mgargs.parameters.kmer_transform,
out=mgargs.files.kmer_normalized,
force=mgargs.parameters.force,
)
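# NOTE: count() tabulates per-contig k-mer frequencies; normalize() then
# applies the configured transform (e.g. a centered log-ratio) so these
# compositional counts are comparable across contigs before embedding.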
# COMBAK: Checkpoint kmers
# TODO asynchronous execution here (work-queue/Makeflow tasks) 2b.
coverage.get(
fasta=mg.assembly,
out=mgargs.files.coverages,
from_spades=mgargs.parameters.cov_from_spades,
fwd_reads=mgargs.files.fwd_reads,
rev_reads=mgargs.files.rev_reads,
se_reads=mgargs.files.se_reads,
sam=mgargs.files.sam,
bam=mgargs.files.bam,
lengths=mgargs.files.lengths,
bed=mgargs.files.bed,
cpus=mgargs.parameters.cpus,
)
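# NOTE: coverage.get() either parses depths from SPAdes contig headers
# (from_spades) or aligns reads against the assembly and tabulates
# per-contig depth via the sam/bam/bed intermediates.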
# TODO asynchronous execution here (work-queue/Makeflow tasks) 2c.
mg.call_orfs(
force=mgargs.parameters.force,
cpus=mgargs.parameters.cpus,
parallel=mgargs.parameters.parallel,
)

# 3. Assign taxonomy (if desired).
# COMBAK: Checkpoint coverages
if mgargs.parameters.do_taxonomy:
# First assign taxonomy
vote.assign(
method=mgargs.parameters.taxon_method,
outfpath=mgargs.files.taxonomy,
fasta=mg.assembly,
prot_orfs=mgargs.files.amino_acid_orfs,
nucl_orfs=mgargs.files.nucleotide_orfs,
blast=mgargs.files.blastp,
hits=mgargs.files.blastp_hits,
lca_fpath=mgargs.files.lca,
ncbi_dir=mgargs.databases.ncbi,
tmpdir=mgargs.parameters.tmpdir,
usepickle=mgargs.parameters.usepickle,
force=mgargs.parameters.force,
verbose=mgargs.parameters.verbose,
parallel=mgargs.parameters.parallel,
cpus=mgargs.parameters.cpus,
)
# Now filter by Kingdom
metabin = vote.get(
fpath=mgargs.files.taxonomy,
assembly=mg.assembly,
ncbi_dir=mgargs.databases.ncbi,
kingdom=mgargs.parameters.kingdom,
outdir=mgargs.parameters.outdir,
)
orfs_fpath = os.path.join(
mgargs.parameters.outdir, f"{mgargs.parameters.kingdom}.orfs.faa"
)
metabin.write_orfs(fpath=orfs_fpath, orf_type="prot")
contigs = set(metabin.contig_ids)
else:
orfs_fpath = mgargs.files.amino_acid_orfs
contigs = {record.id for record in mg.seqrecords}

# 4. Annotate marker genes for retained ORFs.
# markers.get(...)
# We use orfs_fpath here instead of our config filepath
# because we may have filtered out ORFs by taxonomy.
if mgargs.parameters.kingdom == "bacteria":
scans = mgargs.files.bacteria_hmmscan
markers_outfpath = mgargs.files.bacteria_markers
else:
scans = mgargs.files.archaea_hmmscan
markers_outfpath = mgargs.files.archaea_markers

markers_df = markers.get(
kingdom=mgargs.parameters.kingdom,
orfs=orfs_fpath,
dbdir=mgargs.databases.markers,
scans=scans,
out=markers_outfpath,
force=mgargs.parameters.force,
seed=mgargs.parameters.seed,
)

# 5. Prepare contigs' annotations for binning
# At this point, we should have kmer counts, coverages and possibly taxonomy info
# Embed the kmer counts to retrieve our initial master dataframe
master = kmers.embed(
kmers=mgargs.files.kmer_normalized,
out=mgargs.files.kmer_embedded,
force=mgargs.parameters.force,
embed_dimensions=mgargs.parameters.embed_dimensions,
do_pca=mgargs.parameters.do_pca,
pca_dimensions=mgargs.parameters.pca_dimensions,
method=mgargs.parameters.embed_method,
seed=mgargs.parameters.seed,
)
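# NOTE: embed() reduces the normalized k-mer table to `embed_dimensions`
# columns (optionally PCA first, then the configured manifold method,
# e.g. UMAP or BH-tSNE); these columns become the clustering features.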
# Add coverage and possibly taxonomy annotations
master = master[master.index.isin(contigs)]
if mgargs.parameters.do_taxonomy:
annotations = [mgargs.files.coverages, mgargs.files.taxonomy]
else:
annotations = [mgargs.files.coverages]
for fpath in annotations:
df = pd.read_csv(fpath, sep="\t", index_col="contig")
master = pd.merge(master, df, how="left", left_index=True, right_index=True)
master = master.convert_dtypes()

# 6. Bin contigs
bins_df = recursive_dbscan.binning(
master=master,
markers=markers_df,
domain=mgargs.parameters.kingdom,
completeness=mgargs.parameters.completeness,
purity=mgargs.parameters.purity,
taxonomy=mgargs.parameters.do_taxonomy,
starting_rank=mgargs.parameters.starting_rank,
method=mgargs.parameters.clustering_method,
reverse_ranks=mgargs.parameters.reverse_ranks,
verbose=mgargs.parameters.verbose,
)
binning_fpath = (
mgargs.files.bacteria_binning
if mgargs.parameters.kingdom == "bacteria"
else mgargs.files.archaea_binning
)
binning_cols = ["cluster", "completeness", "purity"]
bins_df[binning_cols].to_csv(binning_fpath, sep="\t", index=True, header=True)
return bins_df
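
For orientation, here is a minimal sketch of driving run_autometa directly. The nested Namespace mirrors the mgargs layout used above, but every attribute value is a hypothetical placeholder (normally user.prepare_binning_args(config) builds this object), so the call itself is left commented out:

from types import SimpleNamespace

# Hypothetical stand-in for user.prepare_binning_args(config); only a few of
# the attributes referenced above are shown, with placeholder values.
mgargs = SimpleNamespace(
    files=SimpleNamespace(
        metagenome="metagenome.fna",
        length_filtered="metagenome.filtered.fna",
        kmer_counts="kmers.tsv",
    ),
    parameters=SimpleNamespace(
        outdir="autometa_out",
        length_cutoff=3000,
        kmer_size=5,
        kingdom="bacteria",
        cpus=4,
    ),
    databases=SimpleNamespace(ncbi="databases/ncbi", markers="databases/markers"),
)
# bins_df = run_autometa(mgargs)  # requires the full set of attributes used above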


def main(args):
"""
Main logic for running autometa pipeline.
@@ -119,15 +333,20 @@ def main(args):
# Configure AutometaUser
# TODO: master from WorkQueue is AutometaUser
user = AutometaUser(nproc=args.cpus)
-    user.configure(dryrun=args.check_dependencies)
+    user.configure(dryrun=args.check_dependencies, update=args.update)

# all_bins = {}
for config in args.config:
# TODO: Add directions to master from WorkQueue
mgargs = user.prepare_binning_args(config)
-        user.run_binning(mgargs)
-        # user.refine_binning()
-        # user.process_binning()
-        # user.get_pangenomes()
+        bins = run_autometa(mgargs)

# TODO: refinement/processing/prep for pangenome algos
# refined_bins = refine_binning(bins)
# processed_bins = process_binning(refined_bins)
# sample = mgargs.parameters.metagenome_num
# all_bins.update({sample: processed_bins})
# pangenomes = get_pangenomes(all_bins)


def entrypoint():
@@ -177,6 +396,11 @@ def entrypoint():
help="Check user executables and databases accessible to Autometa and exit.",
action="store_true",
)
parser.add_argument(
"--update",
help="Update existing databases to most recent releases",
action="store_true",
)
args = parser.parse_args()

try:
@@ -185,16 +409,16 @@
logger.info("User cancelled run. Exiting...")
sys.exit(1)
except Exception as err:
issue_request = """
An error was encountered!
logger.exception(err)
if not issubclass(err.__class__, AutometaError):
issue_request = """
An error was encountered!

Please help us fix your problem!
Please help us fix your problem!

You may file an issue with us at https://github.com/KwanLab/Autometa/issues/new/choose
"""
err.issue_request = issue_request
logger.exception(err)
logger.info(issue_request)
You may file an issue with us at https://github.com/KwanLab/Autometa/issues/new/choose
"""
logger.info(issue_request)
sys.exit(1)
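
Since the new issubclass guard only prints the issue request for unanticipated errors, a "handled" error is simply any subclass of AutometaError. A minimal sketch follows; the class name is hypothetical, not part of this PR:

from autometa.common.exceptions import AutometaError

class StaleDatabaseError(AutometaError):
    """Hypothetical handled error: raised when a database is out of date."""

# Raising StaleDatabaseError inside main() would still be logged via
# logger.exception(err), but the generic "file an issue" prompt is skipped
# because issubclass(StaleDatabaseError, AutometaError) is True.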


24 changes: 13 additions & 11 deletions autometa/binning/recursive_dbscan.py
@@ -36,12 +36,12 @@
from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN

-from autometa.common.markers import Markers
+from autometa.common.markers import load as load_markers
from autometa.common import kmers

# TODO: This should be from autometa.common.kmers import Kmers
# So later we can simply/and more clearly do Kmers.load(kmers_fpath).embed(method)
-from autometa.common.exceptions import BinningError
+from autometa.common.exceptions import TableFormatError
from autometa.taxonomy.ncbi import NCBI

pd.set_option("mode.chained_assignment", None)
@@ -147,7 +147,7 @@ def run_dbscan(df, eps, dropcols=["cluster", "purity", "completeness"]):
if "coverage" in df.columns:
cols.append("coverage")
if np.any(df.isnull()):
-        raise BinningError(
+        raise TableFormatError(
f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations"
)
X = df.loc[:, cols].to_numpy()
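
To make the feature matrix above concrete, here is a toy DBSCAN pass over the kind of table run_dbscan receives; the embedding column names and the eps value are illustrative, not Autometa defaults:

import pandas as pd
from sklearn.cluster import DBSCAN

# Toy master table: a 2-D k-mer embedding plus per-contig coverage.
df = pd.DataFrame(
    {
        "x_1": [0.10, 0.12, 5.00],
        "x_2": [0.30, 0.28, 4.80],
        "coverage": [10.0, 10.5, 60.0],
    },
    index=["contig_1", "contig_2", "contig_3"],
)
X = df[["x_1", "x_2", "coverage"]].to_numpy()
df["cluster"] = DBSCAN(eps=0.75, min_samples=2).fit(X).labels_  # -1 marks noise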
@@ -289,9 +289,11 @@ def run_hdbscan(

Raises
-------
-    BinningError
-        Dataframe is missing kmer/coverage annotations
+    ValueError
+        sets `usecols` and `dropcols` may not share elements
+    TableFormatError
+        `df` is missing k-mer or coverage annotations.

"""
for col in dropcols:
if col in df.columns:
@@ -304,7 +306,7 @@
if "coverage" in df.columns:
cols.append("coverage")
if np.any(df.isnull()):
-        raise BinningError(
+        raise TableFormatError(
f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations"
)
X = df.loc[:, cols].to_numpy()
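
run_hdbscan assembles X the same way; a correspondingly minimal HDBSCAN call (min_cluster_size is illustrative) would be:

from hdbscan import HDBSCAN

# Reusing X from the DBSCAN sketch above.
df["cluster"] = HDBSCAN(min_cluster_size=2).fit(X).labels_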
@@ -552,12 +554,12 @@ def binning(

Raises
-------
-    BinningError
+    TableFormatError
        No marker information is available for contigs to be binned.
"""
    # The first check ensures marker information is available to assess binning quality...
if master.loc[master.index.isin(markers.index)].empty:
-        raise BinningError(
+        raise TableFormatError(
"No markers for contigs in table. Unable to assess binning quality"
)
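
For context on why binning() insists on marker information: bin quality is scored from single-copy marker counts. One common formulation is sketched below; it may differ in detail from Autometa's exact bookkeeping:

# Hypothetical single-copy marker counts observed in one candidate bin.
marker_counts = {"PF00001": 1, "PF00002": 2, "PF00003": 0}

expected = len(marker_counts)                              # expected once each
present = sum(1 for n in marker_counts.values() if n > 0)  # found at all
single = sum(1 for n in marker_counts.values() if n == 1)  # found exactly once

completeness = 100 * present / expected              # 66.7 for this toy bin
purity = 100 * single / present if present else 0.0  # 50.0 for this toy bin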

@@ -653,7 +655,7 @@ def main():
import logging as logger

logger.basicConfig(
-        format="%(asctime)s : %(name)s : %(levelname)s : %(message)s",
+        format="[%(asctime)s %(levelname)s] %(name)s: %(message)s",
datefmt="%m/%d/%Y %I:%M:%S %p",
level=logger.DEBUG,
)
Expand Down Expand Up @@ -726,7 +728,7 @@ def main():
kmers_df, cov_df[["coverage"]], how="left", left_index=True, right_index=True,
)

-    markers_df = Markers.load(args.markers)
+    markers_df = load_markers(args.markers)
markers_df = markers_df.convert_dtypes()
# Taxonomy.load()
if args.taxonomy:
5 changes: 2 additions & 3 deletions autometa/binning/summary.py
@@ -39,8 +39,7 @@
from autometa import config
from autometa.taxonomy.ncbi import NCBI
from autometa.taxonomy.majority_vote import rank_taxids
-from autometa.common.metabin import MetaBin
-from autometa.common.markers import Markers
+from autometa.common import markers


logger = logging.getLogger(__name__)
@@ -173,7 +172,7 @@ def get_metabin_stats(bin_df, markers_fpath, assembly):
dataframe consisting of various metabin statistics indexed by cluster.
"""
stats = []
-    markers_df = Markers.load(markers_fpath)
+    markers_df = markers.load(markers_fpath)
for cluster, dff in bin_df.fillna(value={"cluster": "unclustered"}).groupby(
"cluster"
):
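
The fillna/groupby pattern above drives the per-cluster statistics; a self-contained sketch with toy data follows (columns other than "cluster" are illustrative):

import pandas as pd

# Toy binning table: one row per contig; None means unbinned.
bin_df = pd.DataFrame(
    {
        "cluster": ["bin_1", "bin_1", None],
        "length": [150_000, 90_000, 20_000],
    },
    index=["contig_1", "contig_2", "contig_3"],
)
for cluster, dff in bin_df.fillna(value={"cluster": "unclustered"}).groupby("cluster"):
    print(cluster, dff.shape[0], dff["length"].sum())
# bin_1 2 240000
# unclustered 1 20000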