Refactor autometa-taxonomy-lca (#211)
* Start fixing issue #170
🎨 LCA entrypoint now has an updated output parameter
🍏🎨 Update lca.nf with the updated entrypoint param

* 🎨🍏🐍 WIP

* 🎨🐛 Update entrypoint parameters

Fixes #170: stores pickled data structures for LCA/RMQ in the specified directory
🐛 Update entrypoint parameters in the autometa.sh workflow for autometa-taxonomy-lca
🎨🍏 Include meta.id in LCA outputs

* 🐛 Replace the LCA(...) instantiation's outdir param with cache

* 🐛 Replace an incorrect variable to prevent passing a pd.DataFrame to the load(...) func in markers.get(...)

* 🎨 Change the process label from process_high to process_medium in prepare_lca.nf
evanroyrees committed Jan 12, 2022
1 parent 974d324 commit cf23ce3
Showing 11 changed files with 507 additions and 204 deletions.
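
The central change across these files is that the LCA/RMQ lookup structures are pickled once to a user-specified cache directory and reloaded on later runs instead of being rebuilt per sample. Below is a minimal sketch of that load-or-build pattern, with hypothetical names; it is not Autometa's actual lca.py internals.

import os
import pickle

def load_or_build(cache: str, name: str, build):
    """Reload a pickled structure from the cache dir, or build and pickle it once."""
    os.makedirs(cache, exist_ok=True)
    fpath = os.path.join(cache, f"{name}.pkl")
    if os.path.exists(fpath) and os.path.getsize(fpath):
        with open(fpath, "rb") as fh:
            return pickle.load(fh)
    obj = build()
    with open(fpath, "wb") as fh:
        pickle.dump(obj, fh)
    return obj

# e.g. the sparse table used for range-minimum queries (name is illustrative):
# sparse_table = load_or_build(cache="cache", name="precomputed_lcas", build=compute_sparse_table)
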
2 changes: 1 addition & 1 deletion autometa/common/markers.py
@@ -193,7 +193,7 @@ def get(
)

if not os.path.exists(out) or not os.path.getsize(out):
out = hmmscan.filter_tblout_markers(
df = hmmscan.filter_tblout_markers(
infpath=scans,
outfpath=out,
cutoffs=cutoffs,
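
This one-line fix matters because `out` is the output file path that a later step loads from disk; assigning the returned DataFrame to `out` meant markers.get(...) handed a pd.DataFrame to load(...) instead of a path. A reduced sketch of the failure mode follows, with hypothetical stand-ins for the hmmscan helpers.

import pandas as pd

def load(fpath: str) -> pd.DataFrame:
    # Expects a filepath on disk
    return pd.read_csv(fpath, sep="\t")

def filter_markers(infpath: str, outfpath: str) -> pd.DataFrame:
    # Writes the filtered table to outfpath and also returns it as a DataFrame
    df = pd.read_csv(infpath, sep="\t")
    df.to_csv(outfpath, sep="\t", index=False)
    return df

out = "markers.filtered.tsv"
# Buggy: out = filter_markers(infpath="scans.tsv", outfpath=out)
#   -> a later load(out) would receive a DataFrame, not a filename.
# Fixed: keep the path and the returned table in separate variables.
df = filter_markers(infpath="scans.tsv", outfpath=out)
markers = load(out)
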
334 changes: 252 additions & 82 deletions autometa/taxonomy/lca.py

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions autometa/taxonomy/majority_vote.py
@@ -267,7 +267,6 @@ def majority_vote(
ncbi_dir: str,
verbose: bool = False,
orfs: str = None,
force: bool = False,
) -> str:
"""Wrapper for modified majority voting algorithm from Autometa 1.0
@@ -292,7 +291,8 @@
Path to assigned taxids table.
"""
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
outdir = os.path.dirname(os.path.realpath(out))
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
# retrieve lca taxids for each contig
classifications = lca.parse(lca_fpath=lca_fpath, orfs_fpath=orfs)
# Vote for majority lca taxid from contig lca taxids
@@ -304,7 +304,6 @@
def main():
import argparse

basedir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
parser = argparse.ArgumentParser(
description="Script to assign taxonomy via a modified majority voting"
" algorithm.",
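
The removed `force` parameter aside, the key change here is that the cache directory is derived from the requested output path and passed to LCA, so the pickled structures land alongside the results. A small illustration of that path handling (the LCA class itself is assumed, not reproduced):

import os

def cache_dir_for(out: str) -> str:
    """Resolve the directory holding the output table; used as the LCA cache location."""
    return os.path.dirname(os.path.realpath(out))

# e.g. out="results/taxids.tsv" resolves to the absolute .../results directory,
# which is then passed as: LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
outdir = cache_dir_for("results/taxids.tsv")
print(outdir)
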
122 changes: 122 additions & 0 deletions autometa/taxonomy/ncbi.py
@@ -24,6 +24,7 @@
"""


import gzip
import logging
import os
import string
@@ -34,6 +35,7 @@

from tqdm import tqdm

from autometa.common.utilities import file_length
from autometa.common.exceptions import DatabaseOutOfSyncError
from autometa.config.utilities import DEFAULT_CONFIG

@@ -65,23 +67,46 @@ def __init__(self, dirpath, verbose=False):
Parameters
----------
dirpath : str
Path to the database directory
verbose : bool, optional
log progress to terminal, by default False
"""
self.dirpath = dirpath
self.verbose = verbose
self.disable = not self.verbose
self.names_fpath = os.path.join(self.dirpath, "names.dmp")
self.nodes_fpath = os.path.join(self.dirpath, "nodes.dmp")
self.merged_fpath = os.path.join(self.dirpath, "merged.dmp")
# Set prot.accession2taxid filepath
self.accession2taxid_fpath = os.path.join(self.dirpath, "prot.accession2taxid")
acc2taxid_gz = ".".join([self.accession2taxid_fpath, "gz"])
if not os.path.exists(self.accession2taxid_fpath) and os.path.exists(
acc2taxid_gz
):
self.accession2taxid_fpath = acc2taxid_gz
# Set prot.accession2taxid.FULL.gz filepath
self.accession2taxidfull_fpath = os.path.join(
self.dirpath, "prot.accession2taxid.FULL"
)
acc2taxid_gz = ".".join([self.accession2taxidfull_fpath, "gz"])
if not os.path.exists(self.accession2taxidfull_fpath) and os.path.exists(
acc2taxid_gz
):
self.accession2taxidfull_fpath = acc2taxid_gz
# Set dead_prot.accession2taxid filepath
self.dead_accession2taxid_fpath = os.path.join(
self.dirpath, "dead_prot.accession2taxid"
)
acc2taxid_gz = ".".join([self.dead_accession2taxid_fpath, "gz"])
if not os.path.exists(self.dead_accession2taxid_fpath) and os.path.exists(
acc2taxid_gz
):
self.dead_accession2taxid_fpath = acc2taxid_gz
# Check formatting for nr database
self.nr_fpath = os.path.join(self.dirpath, "nr.gz")
nr_bname = os.path.splitext(os.path.basename(self.nr_fpath))[0]
nr_dmnd_fname = ".".join([nr_bname, "dmnd"])
@@ -93,6 +118,7 @@ def __init__(self, dirpath, verbose=False):
logger.warning(
f"DatabaseWarning: {self.nr_fpath} needs to be formatted for diamond!"
)
# Setup data structures from nodes.dmp, names.dmp and merged.dmp
self.nodes = self.parse_nodes()
self.names = self.parse_names()
self.merged = self.parse_merged()
@@ -450,6 +476,102 @@ def convert_taxid_dtype(self, taxid: int) -> int:
raise DatabaseOutOfSyncError(err_message)
return taxid

def search_prot_accessions(
self,
accessions: set,
sseqids_to_taxids: Dict[str, int] = None,
db: str = "live",
) -> Dict[str, int]:
"""Search prot.accession2taxid.gz and dead_prot.accession2taxid.gz
Parameters
----------
accessions : set
Set of subject sequence ids retrieved from diamond blastp search (sseqids)
sseqids_to_taxids : Dict[str, int], optional
Dictionary containing sseqids converted to taxids
db : str, optional
selection of one of the prot accession to taxid databases from NCBI. Choices are live, dead, full
* live: prot.accession2taxid.gz
* full: prot.accession2taxid.FULL.gz
* dead: dead_prot.accession2taxid.gz
Returns
-------
Dict[str, int]
Dictionary containing sseqids converted to taxids
"""
if not sseqids_to_taxids:
sseqids_to_taxids = {}
if not isinstance(db, str):
raise ValueError(f"db must be a string! Type Given: {type(db)}")
db = db.lower()
choices = {
"live": self.accession2taxid_fpath,
"dead": self.dead_accession2taxid_fpath,
"full": self.accession2taxidfull_fpath,
}
if db not in choices:
raise ValueError(f"db must be one of live, full or dead. Given: {db}")
fpath = choices.get(db)

# Revert to accession2taxid if FULL is not present
if db == "full" and (not os.path.exists(fpath) or not os.path.getsize(fpath)):
logger.warn(
"prot.accession2taxid.FULL.gz was not found. Reverting to prot.accession2taxid.gz"
)
logger.warn(
"To achieve greater resolution of your metagenome taxonomy, considering downloading the prot.accession2taxid.FULL.gz database file"
)
fpath = choices.get("live")
db = "live"

if not os.path.exists(fpath) or not os.path.getsize(fpath):
raise FileNotFoundError(fpath)

# "rt" open the database in text mode instead of binary to be handled like a text file
fh = gzip.open(fpath, "rt") if fpath.endswith(".gz") else open(fpath)
filename = os.path.basename(fpath)
# skip the header line
__ = fh.readline()
logger.debug(
f"Searching for {len(accessions):,} accessions in {filename}. This may take a while..."
)
n_lines = file_length(fpath, approximate=True) if self.verbose else None
desc = f"Parsing {filename}"
converted_sseqid_count = 0
for line in tqdm(
fh, disable=self.disable, desc=desc, total=n_lines, leave=False
):
if db == "full":
# FULL format is accession.version\ttaxid\n
acc_num = None # Just in case
acc_ver, taxid = line.strip().split("\t")
else:
# dead and live formats are accession\taccession.version\ttaxid\tgi\n
acc_num, acc_ver, taxid, _ = line.strip().split("\t")

taxid = int(taxid)
if acc_ver in accessions:
sseqids_to_taxids[acc_ver] = taxid
converted_sseqid_count += 1

# So prog will not have to search through the accessions set
if db == "full":
continue

# Search for base accession if using live or dead accession2taxid databases
if acc_num in accessions:
sseqids_to_taxids[acc_num] = taxid
converted_sseqid_count += 1

fh.close()
logger.debug(f"sseqids converted from {filename}: {converted_sseqid_count:,}")
return sseqids_to_taxids


if __name__ == "__main__":
print("file containing Autometa NCBI utilities class")
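
The new search_prot_accessions method streams one accession2taxid mapping at a time and can be called repeatedly to fill a single sseqid-to-taxid dictionary. A plausible usage sketch, assuming the class is exposed as NCBI in autometa.taxonomy.ncbi and that the accession set comes from a diamond blastp table; the exact call order used inside lca.py is not shown in this diff.

from autometa.taxonomy.ncbi import NCBI

ncbi = NCBI(dirpath="path/to/ncbi/databases", verbose=True)
sseqids = {"WP_012345678.1", "NP_000001.1"}  # hypothetical subject accessions

# Try the live mapping first, then fall back to dead_prot for anything unresolved.
sseqids_to_taxids = ncbi.search_prot_accessions(accessions=sseqids, db="live")
unresolved = sseqids - set(sseqids_to_taxids)
if unresolved:
    sseqids_to_taxids = ncbi.search_prot_accessions(
        accessions=unresolved, sseqids_to_taxids=sseqids_to_taxids, db="dead"
    )
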
4 changes: 2 additions & 2 deletions autometa/taxonomy/vote.py
@@ -124,14 +124,14 @@ def call_orfs():

def blast2lca():
if "lca" not in locals():
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
lca.blast2lca(
orfs=prot_orfs, out=lca_fpath, blast=blast, force=force, cpus=cpus
)

def majority_vote_lca(out=out):
if "lca" not in locals():
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
ctg_lcas = lca.parse(lca_fpath=lca_fpath, orfs_fpath=prot_orfs)
votes = majority_vote.rank_taxids(ctg_lcas=ctg_lcas, ncbi=lca, verbose=verbose)
out = majority_vote.write_votes(results=votes, out=out)
22 changes: 14 additions & 8 deletions modules/local/lca.nf → modules/local/prepare_lca.nf
@@ -4,9 +4,9 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

process LCA {
tag "Finding LCA for ${meta.id}"
label 'process_high'
process PREPARE_LCA {
tag "Preparing db cache from ${blastdb_dir}"
label 'process_medium'
publishDir "${params.interim_dir_internal}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }
@@ -18,19 +18,25 @@ process LCA {
container "jason-c-kwan/autometa:${params.autometa_image_tag}"
}

storeDir 'db/lca'
cache 'lenient'

input:
tuple val(meta), path(blast)
path(blastdb_dir)

output:
tuple val(meta), path("${meta.id}.lca.tsv"), emit: lca
path '*.version.txt' , emit: version

path "cache" , emit: cache
path '*.version.txt' , emit: version

script:
def software = getSoftwareName(task.process)
"""
autometa-taxonomy-lca --blast ${blast} --dbdir ${blastdb_dir} --output ${meta.id}.lca.tsv
autometa-taxonomy-lca \\
--blast . \\
--lca-output . \\
--dbdir ${blastdb_dir} \\
--cache cache \\
--only-prepare-cache
echo "TODO" > autometa.version.txt
"""
}
45 changes: 45 additions & 0 deletions modules/local/reduce_lca.nf
@@ -0,0 +1,45 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process REDUCE_LCA {
tag "Finding LCA for ${meta.id}"
label 'process_high'
publishDir "${params.interim_dir_internal}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "bioconda::autometa" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
} else {
container "jason-c-kwan/autometa:${params.autometa_image_tag}"
}

input:
tuple val(meta), path(blast)
path(blastdb_dir)
path(lca_cache)

output:
tuple val(meta), path("${meta.id}.lca.tsv"), emit: lca
path "${meta.id}.error_taxids.tsv" , emit: error_taxids
path "${meta.id}.sseqid2taxid.tsv" , emit: sseqid_to_taxids
path '*.version.txt' , emit: version


script:
def software = getSoftwareName(task.process)
"""
autometa-taxonomy-lca \\
--blast ${blast} \\
--dbdir ${blastdb_dir} \\
--cache ${lca_cache} \\
--lca-error-taxids ${meta.id}.error_taxids.tsv \\
--sseqid2taxid-output ${meta.id}.sseqid2taxid.tsv \\
--lca-output ${meta.id}.lca.tsv
echo "TODO" > autometa.version.txt
"""
}
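
Net effect of splitting the old LCA process: PREPARE_LCA builds the pickled cache once per blast database, and REDUCE_LCA reuses it for every sample. A rough Python equivalent of the two CLI calls issued by the script blocks above (paths and sample names are placeholders):

import subprocess

# One-time cache preparation (PREPARE_LCA): builds the pickled LCA/RMQ structures.
subprocess.run(
    [
        "autometa-taxonomy-lca",
        "--blast", ".",
        "--lca-output", ".",
        "--dbdir", "ncbi",
        "--cache", "cache",
        "--only-prepare-cache",
    ],
    check=True,
)

# Per-sample reduction (REDUCE_LCA): reuses the prepared cache for each blastp table.
subprocess.run(
    [
        "autometa-taxonomy-lca",
        "--blast", "sample.blastp.tsv",
        "--dbdir", "ncbi",
        "--cache", "cache",
        "--lca-error-taxids", "sample.error_taxids.tsv",
        "--sseqid2taxid-output", "sample.sseqid2taxid.tsv",
        "--lca-output", "sample.lca.tsv",
    ],
    check=True,
)
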
