Refactor autometa-taxonomy-lca (#211)
* Start fixing issue #170
🎨 LCA entrypoint now has an updated output parameter
🍏🎨 Update lca.nf with the updated entrypoint param

* 🎨🍏🐍 WIP

* 🎨🐛 Update entrypoint parameters

Fixes #170: stores pickled data structures for LCA/RMQ in the specified directory
🐛 Update entrypoint parameters in the autometa.sh workflow for autometa-taxonomy-lca
🎨🍏 Include meta.id in LCA outputs

* 🐛 Replace the LCA(...) instantiation's outdir param with cache

* 🐛 Replace an incorrect variable to prevent passing a pd.DataFrame to the load(...) func in markers.get(...)

* 🎨 Change the process label from process_high to process_medium in prepare_lca.nf
evanroyrees committed Jan 12, 2022
1 parent 974d324 commit cf23ce3
Showing 11 changed files with 507 additions and 204 deletions.
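
The central change across these files is that the LCA/RMQ lookup structures are pickled once to a user-specified cache directory and reloaded on later runs instead of being rebuilt per sample. Below is a minimal sketch of that load-or-build pattern, with hypothetical names; it is not Autometa's actual lca.py internals.

import os
import pickle

def load_or_build(cache: str, name: str, build):
    """Reload a pickled structure from the cache dir, or build and pickle it once."""
    os.makedirs(cache, exist_ok=True)
    fpath = os.path.join(cache, f"{name}.pkl")
    if os.path.exists(fpath) and os.path.getsize(fpath):
        with open(fpath, "rb") as fh:
            return pickle.load(fh)
    obj = build()
    with open(fpath, "wb") as fh:
        pickle.dump(obj, fh)
    return obj

# e.g. the sparse table used for range-minimum queries (name is illustrative):
# sparse_table = load_or_build(cache="cache", name="precomputed_lcas", build=compute_sparse_table)
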
2 changes: 1 addition & 1 deletion autometa/common/markers.py
@@ -193,7 +193,7 @@ def get(
)

if not os.path.exists(out) or not os.path.getsize(out):
out = hmmscan.filter_tblout_markers(
df = hmmscan.filter_tblout_markers(
infpath=scans,
outfpath=out,
cutoffs=cutoffs,
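
This one-line fix matters because `out` is the output file path that a later step loads from disk; assigning the returned DataFrame to `out` meant markers.get(...) handed a pd.DataFrame to load(...) instead of a path. A reduced sketch of the failure mode follows, with hypothetical stand-ins for the hmmscan helpers.

import pandas as pd

def load(fpath: str) -> pd.DataFrame:
    # Expects a filepath on disk
    return pd.read_csv(fpath, sep="\t")

def filter_markers(infpath: str, outfpath: str) -> pd.DataFrame:
    # Writes the filtered table to outfpath and also returns it as a DataFrame
    df = pd.read_csv(infpath, sep="\t")
    df.to_csv(outfpath, sep="\t", index=False)
    return df

out = "markers.filtered.tsv"
# Buggy: out = filter_markers(infpath="scans.tsv", outfpath=out)
#   -> a later load(out) would receive a DataFrame, not a filename.
# Fixed: keep the path and the returned table in separate variables.
df = filter_markers(infpath="scans.tsv", outfpath=out)
markers = load(out)
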
334 changes: 252 additions & 82 deletions autometa/taxonomy/lca.py

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions autometa/taxonomy/majority_vote.py
@@ -267,7 +267,6 @@ def majority_vote(
ncbi_dir: str,
verbose: bool = False,
orfs: str = None,
force: bool = False,
) -> str:
"""Wrapper for modified majority voting algorithm from Autometa 1.0
@@ -292,7 +291,8 @@
Path to assigned taxids table.
"""
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
outdir = os.path.dirname(os.path.realpath(out))
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
# retrieve lca taxids for each contig
classifications = lca.parse(lca_fpath=lca_fpath, orfs_fpath=orfs)
# Vote for majority lca taxid from contig lca taxids
@@ -304,7 +304,6 @@
def main():
import argparse

basedir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
parser = argparse.ArgumentParser(
description="Script to assign taxonomy via a modified majority voting"
" algorithm.",
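
The removed `force` parameter aside, the key change here is that the cache directory is derived from the requested output path and passed to LCA, so the pickled structures land alongside the results. A small illustration of that path handling (the LCA class itself is assumed, not reproduced):

import os

def cache_dir_for(out: str) -> str:
    """Resolve the directory holding the output table; used as the LCA cache location."""
    return os.path.dirname(os.path.realpath(out))

# e.g. out="results/taxids.tsv" resolves to the absolute .../results directory,
# which is then passed as: LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
outdir = cache_dir_for("results/taxids.tsv")
print(outdir)
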
122 changes: 122 additions & 0 deletions autometa/taxonomy/ncbi.py
@@ -24,6 +24,7 @@
"""


import gzip
import logging
import os
import string
@@ -34,6 +35,7 @@

from tqdm import tqdm

from autometa.common.utilities import file_length
from autometa.common.exceptions import DatabaseOutOfSyncError
from autometa.config.utilities import DEFAULT_CONFIG

@@ -65,23 +67,46 @@ def __init__(self, dirpath, verbose=False):
Parameters
----------
dirpath : str
Path to the database directory
verbose : bool, optional
log progress to terminal, by default False
"""
self.dirpath = dirpath
self.verbose = verbose
self.disable = not self.verbose
self.names_fpath = os.path.join(self.dirpath, "names.dmp")
self.nodes_fpath = os.path.join(self.dirpath, "nodes.dmp")
self.merged_fpath = os.path.join(self.dirpath, "merged.dmp")
# Set prot.accession2taxid filepath
self.accession2taxid_fpath = os.path.join(self.dirpath, "prot.accession2taxid")
acc2taxid_gz = ".".join([self.accession2taxid_fpath, "gz"])
if not os.path.exists(self.accession2taxid_fpath) and os.path.exists(
acc2taxid_gz
):
self.accession2taxid_fpath = acc2taxid_gz
# Set prot.accession2taxid.FULL.gz filepath
self.accession2taxidfull_fpath = os.path.join(
self.dirpath, "prot.accession2taxid.FULL"
)
acc2taxid_gz = ".".join([self.accession2taxidfull_fpath, "gz"])
if not os.path.exists(self.accession2taxidfull_fpath) and os.path.exists(
acc2taxid_gz
):
self.accession2taxidfull_fpath = acc2taxid_gz
# Set dead_prot.accession2taxid filepath
self.dead_accession2taxid_fpath = os.path.join(
self.dirpath, "dead_prot.accession2taxid"
)
acc2taxid_gz = ".".join([self.dead_accession2taxid_fpath, "gz"])
if not os.path.exists(self.dead_accession2taxid_fpath) and os.path.exists(
acc2taxid_gz
):
self.dead_accession2taxid_fpath = acc2taxid_gz
# Check formatting for nr database
self.nr_fpath = os.path.join(self.dirpath, "nr.gz")
nr_bname = os.path.splitext(os.path.basename(self.nr_fpath))[0]
nr_dmnd_fname = ".".join([nr_bname, "dmnd"])
@@ -93,6 +118,7 @@ def __init__(self, dirpath, verbose=False):
logger.warning(
f"DatabaseWarning: {self.nr_fpath} needs to be formatted for diamond!"
)
# Setup data structures from nodes.dmp, names.dmp and merged.dmp
self.nodes = self.parse_nodes()
self.names = self.parse_names()
self.merged = self.parse_merged()
@@ -450,6 +476,102 @@ def convert_taxid_dtype(self, taxid: int) -> int:
raise DatabaseOutOfSyncError(err_message)
return taxid

def search_prot_accessions(
self,
accessions: set,
sseqids_to_taxids: Dict[str, int] = None,
db: str = "live",
) -> Dict[str, int]:
"""Search prot.accession2taxid.gz and dead_prot.accession2taxid.gz
Parameters
----------
accessions : set
Set of subject sequence ids retrieved from diamond blastp search (sseqids)
sseqids_to_taxids : Dict[str, int], optional
Dictionary containing sseqids converted to taxids
db : str, optional
selection of one of the prot accession to taxid databases from NCBI. Choices are live, dead, full
* live: prot.accession2taxid.gz
* full: prot.accession2taxid.FULL.gz
* dead: dead_prot.accession2taxid.gz
Returns
-------
Dict[str, int]
Dictionary containing sseqids converted to taxids
"""
if not sseqids_to_taxids:
sseqids_to_taxids = {}
if not isinstance(db, str):
raise ValueError(f"db must be a string! Type Given: {type(db)}")
db = db.lower()
choices = {
"live": self.accession2taxid_fpath,
"dead": self.dead_accession2taxid_fpath,
"full": self.accession2taxidfull_fpath,
}
if db not in choices:
raise ValueError(f"db must be one of live, full or dead. Given: {db}")
fpath = choices.get(db)

# Revert to accession2taxid if FULL is not present
if db == "full" and (not os.path.exists(fpath) or not os.path.getsize(fpath)):
logger.warn(
"prot.accession2taxid.FULL.gz was not found. Reverting to prot.accession2taxid.gz"
)
logger.warn(
"To achieve greater resolution of your metagenome taxonomy, considering downloading the prot.accession2taxid.FULL.gz database file"
)
fpath = choices.get("live")
db = "live"

if not os.path.exists(fpath) or not os.path.getsize(fpath):
raise FileNotFoundError(fpath)

# "rt" open the database in text mode instead of binary to be handled like a text file
fh = gzip.open(fpath, "rt") if fpath.endswith(".gz") else open(fpath)
filename = os.path.basename(fpath)
# skip the header line
__ = fh.readline()
logger.debug(
f"Searching for {len(accessions):,} accessions in {filename}. This may take a while..."
)
n_lines = file_length(fpath, approximate=True) if self.verbose else None
desc = f"Parsing {filename}"
converted_sseqid_count = 0
for line in tqdm(
fh, disable=self.disable, desc=desc, total=n_lines, leave=False
):
if db == "full":
# FULL format is accession.version\ttaxid\n
acc_num = None # Just in case
acc_ver, taxid = line.strip().split("\t")
else:
# dead and live formats are accession\taccession.version\ttaxid\tgi\n
acc_num, acc_ver, taxid, _ = line.strip().split("\t")

taxid = int(taxid)
if acc_ver in accessions:
sseqids_to_taxids[acc_ver] = taxid
converted_sseqid_count += 1

# So prog will not have to search through the accessions set
if db == "full":
continue

# Search for base accession if using live or dead accession2taxid databases
if acc_num in accessions:
sseqids_to_taxids[acc_num] = taxid
converted_sseqid_count += 1

fh.close()
logger.debug(f"sseqids converted from {filename}: {converted_sseqid_count:,}")
return sseqids_to_taxids


if __name__ == "__main__":
print("file containing Autometa NCBI utilities class")
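
The new search_prot_accessions method streams one accession2taxid mapping at a time and can be called repeatedly to fill a single sseqid-to-taxid dictionary. A plausible usage sketch, assuming the class is exposed as NCBI in autometa.taxonomy.ncbi and that the accession set comes from a diamond blastp table; the exact call order used inside lca.py is not shown in this diff.

from autometa.taxonomy.ncbi import NCBI

ncbi = NCBI(dirpath="path/to/ncbi/databases", verbose=True)
sseqids = {"WP_012345678.1", "NP_000001.1"}  # hypothetical subject accessions

# Try the live mapping first, then fall back to dead_prot for anything unresolved.
sseqids_to_taxids = ncbi.search_prot_accessions(accessions=sseqids, db="live")
unresolved = sseqids - set(sseqids_to_taxids)
if unresolved:
    sseqids_to_taxids = ncbi.search_prot_accessions(
        accessions=unresolved, sseqids_to_taxids=sseqids_to_taxids, db="dead"
    )
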
4 changes: 2 additions & 2 deletions autometa/taxonomy/vote.py
@@ -124,14 +124,14 @@ def call_orfs():

def blast2lca():
if "lca" not in locals():
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
lca.blast2lca(
orfs=prot_orfs, out=lca_fpath, blast=blast, force=force, cpus=cpus
)

def majority_vote_lca(out=out):
if "lca" not in locals():
lca = LCA(dbdir=ncbi_dir, verbose=verbose)
lca = LCA(dbdir=ncbi_dir, verbose=verbose, cache=outdir)
ctg_lcas = lca.parse(lca_fpath=lca_fpath, orfs_fpath=prot_orfs)
votes = majority_vote.rank_taxids(ctg_lcas=ctg_lcas, ncbi=lca, verbose=verbose)
out = majority_vote.write_votes(results=votes, out=out)
22 changes: 14 additions & 8 deletions modules/local/lca.nf → modules/local/prepare_lca.nf
@@ -4,9 +4,9 @@ include { initOptions; saveFiles; getSoftwareName } from './functions'
params.options = [:]
options = initOptions(params.options)

process LCA {
tag "Finding LCA for ${meta.id}"
label 'process_high'
process PREPARE_LCA {
tag "Preparing db cache from ${blastdb_dir}"
label 'process_medium'
publishDir "${params.interim_dir_internal}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }
@@ -18,19 +18,25 @@ process LCA {
container "jason-c-kwan/autometa:${params.autometa_image_tag}"
}

storeDir 'db/lca'
cache 'lenient'

input:
tuple val(meta), path(blast)
path(blastdb_dir)

output:
tuple val(meta), path("${meta.id}.lca.tsv"), emit: lca
path '*.version.txt' , emit: version

path "cache" , emit: cache
path '*.version.txt' , emit: version

script:
def software = getSoftwareName(task.process)
"""
autometa-taxonomy-lca --blast ${blast} --dbdir ${blastdb_dir} --output ${meta.id}.lca.tsv
autometa-taxonomy-lca \\
--blast . \\
--lca-output . \\
--dbdir ${blastdb_dir} \\
--cache cache \\
--only-prepare-cache
echo "TODO" > autometa.version.txt
"""
}
45 changes: 45 additions & 0 deletions modules/local/reduce_lca.nf
@@ -0,0 +1,45 @@
// Import generic module functions
include { initOptions; saveFiles; getSoftwareName } from './functions'

params.options = [:]
options = initOptions(params.options)

process REDUCE_LCA {
tag "Finding LCA for ${meta.id}"
label 'process_high'
publishDir "${params.interim_dir_internal}",
mode: params.publish_dir_mode,
saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) }

conda (params.enable_conda ? "bioconda::autometa" : null)
if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE"
} else {
container "jason-c-kwan/autometa:${params.autometa_image_tag}"
}

input:
tuple val(meta), path(blast)
path(blastdb_dir)
path(lca_cache)

output:
tuple val(meta), path("${meta.id}.lca.tsv"), emit: lca
path "${meta.id}.error_taxids.tsv" , emit: error_taxids
path "${meta.id}.sseqid2taxid.tsv" , emit: sseqid_to_taxids
path '*.version.txt' , emit: version


script:
def software = getSoftwareName(task.process)
"""
autometa-taxonomy-lca \\
--blast ${blast} \\
--dbdir ${blastdb_dir} \\
--cache ${lca_cache} \\
--lca-error-taxids ${meta.id}.error_taxids.tsv \\
--sseqid2taxid-output ${meta.id}.sseqid2taxid.tsv \\
--lca-output ${meta.id}.lca.tsv
echo "TODO" > autometa.version.txt
"""
}
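
Net effect of splitting the old LCA process: PREPARE_LCA builds the pickled cache once per blast database, and REDUCE_LCA reuses it for every sample. A rough Python equivalent of the two CLI calls issued by the script blocks above (paths and sample names are placeholders):

import subprocess

# One-time cache preparation (PREPARE_LCA): builds the pickled LCA/RMQ structures.
subprocess.run(
    [
        "autometa-taxonomy-lca",
        "--blast", ".",
        "--lca-output", ".",
        "--dbdir", "ncbi",
        "--cache", "cache",
        "--only-prepare-cache",
    ],
    check=True,
)

# Per-sample reduction (REDUCE_LCA): reuses the prepared cache for each blastp table.
subprocess.run(
    [
        "autometa-taxonomy-lca",
        "--blast", "sample.blastp.tsv",
        "--dbdir", "ncbi",
        "--cache", "cache",
        "--lca-error-taxids", "sample.error_taxids.tsv",
        "--sseqid2taxid-output", "sample.sseqid2taxid.tsv",
        "--lca-output", "sample.lca.tsv",
    ],
    check=True,
)
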
