diff --git a/Makefile b/Makefile index 9a2e9733d..056aad0fe 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,155 @@ -hello: - @echo "Please inspect Makefile for list of commands" +.PHONY: clean black create_environment install image docs unit_test unit_test_data unit_test_data_download unit_test_data_build unit_test_wip unit_test_entrypoints +################################################################################# +# GLOBALS # +################################################################################# + +PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) +PROJECT_NAME = autometa +PYTHON_INTERPRETER = python3 +# This was retrieved from https://drive.google.com/file/d/1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk/view?usp=sharing +TEST_DATA_FILEID = 1bSlPldaq3C6Cf9Y5Rm7iwtUDcjxAaeEk + +ifeq (,$(shell which conda)) +HAS_CONDA=False +else +HAS_CONDA=True +endif + +################################################################################# +# COMMANDS # +################################################################################# + +## Delete all compiled Python files +clean: + find . -type f -name "*.py[co]" -delete + find . -type d -name "__pycache__" -delete + find . -type d -name "htmlcov" -delete + find . -type d -name "Autometa.egg-info" -delete + find . -type d -name "dist" -delete + find . -type d -name "build" -delete + +## Apply black formatting +black: + black --exclude autometa/validation autometa + +## Set up python interpreter environment +create_environment: requirements.txt +ifeq (True,$(HAS_CONDA)) + @echo ">>> Detected conda, creating conda environment." +ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER))) + conda create --name $(PROJECT_NAME) python=3 --file=requirements.txt +else + conda create --name $(PROJECT_NAME) python=2.7 --file=requirements.txt +endif + @echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)" +else + $(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper + @echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\ + export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n" + @bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)" + @echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)" +endif + +################################################################################# +# PROJECT RULES # +################################################################################# + +## Install autometa from source install: - python setup.py install + $(PYTHON_INTERPRETER) setup.py install + +## Install dependencies for test environment +test_environment: tests/requirements.txt + $(PYTHON_INTERPRETER) -m pip install --requirement=tests/requirements.txt +## Build docker image from Dockerfile (auto-tagged as jason-c-kwan/autometa:<current branch>) +image: Dockerfile + docker build . -t jason-c-kwan/autometa:`git branch --show-current` + +## Build documentation for autometa.readthedocs.io docs: - make clean html -C docs && open docs/build/html/index.html + make clean html -C docs + @echo "docs built. Open docs/build/html/index.html to view" -clean: - @echo "Removing everything under 'htmlcov'..."
- @rm -rf htmlcov && make clean -C docs +## Download test_data.json for unit testing +unit_test_data_download: + gdown --id $(TEST_DATA_FILEID) -O tests/data/test_data.json + +## Build test_data.json file for unit testing (requires all files from https://drive.google.com/open?id=189C6do0Xw-X813gspsafR9r8m-YfbhTS be downloaded into tests/data/) +unit_test_data_build: tests/data/records.fna + $(PYTHON_INTERPRETER) make_test_data.py + +## Run all unit tests +unit_test: tests/data/test_data.json test_environment + $(PYTHON_INTERPRETER) -m pytest --durations=0 --cov=autometa --emoji --cov-report=html tests + +## Run unit tests marked with WIP +unit_test_wip: tests/data/test_data.json test_environment + $(PYTHON_INTERPRETER) -m pytest -m "wip" --durations=0 --cov=autometa --emoji --cov-report=html tests + +## Run unit tests marked with entrypoint +unit_test_entrypoints: tests/data/test_data.json test_environment + $(PYTHON_INTERPRETER) -m pytest -m "entrypoint" --durations=0 --cov=autometa --emoji --cov-report=html tests -test: tests/data/test_data.json - python -m pytest --durations=0 --cov=autometa --emoji --cov-report html -test-wip: tests/data/test_data.json - python -m pytest -m "wip" --durations=0 --cov=autometa --emoji --cov-report html +################################################################################# +# Self Documenting Commands # +################################################################################# -test-entrypoints: tests/data/test_data.json - python -m pytest -m "entrypoint" --durations=0 --cov=autometa --emoji --cov-report html +.DEFAULT_GOAL := help -.PHONY: hello docs clean test test-wip test-entrypoints +# Inspired by +# sed script explained: +# /^##/: +# * save line in hold space +# * purge line +# * Loop: +# * append newline + line to hold space +# * go to next line +# * if line starts with doc comment, strip comment character off and loop +# * remove target prerequisites +# * append hold space (+ newline) to line +# * replace newline plus comments by `---` +# * print line +# Separate expressions are necessary because labels cannot be delimited by +# semicolon; see +.PHONY: help +help: + @echo "$$(tput bold)Available rules:$$(tput sgr0)" + @echo + @sed -n -e "/^## / { \ + h; \ + s/.*//; \ + :doc" \ + -e "H; \ + n; \ + s/^## //; \ + t doc" \ + -e "s/:.*//; \ + G; \ + s/\\n## /---/; \ + s/\\n/ /g; \ + p; \ + }" ${MAKEFILE_LIST} \ + | LC_ALL='C' sort --ignore-case \ + | awk -F '---' \ + -v ncol=$$(tput cols) \ + -v indent=19 \ + -v col_on="$$(tput setaf 6)" \ + -v col_off="$$(tput sgr0)" \ + '{ \ + printf "%s%*s%s ", col_on, -indent, $$1, col_off; \ + n = split($$2, words, " "); \ + line_length = ncol - indent; \ + for (i = 1; i <= n; i++) { \ + line_length -= length(words[i]) + 1; \ + if (line_length <= 0) { \ + line_length = ncol - indent - length(words[i]) - 1; \ + printf "\n%*s ", -indent, " "; \ + } \ + printf "%s ", words[i]; \ + } \ + printf "\n"; \ + }' \ + | more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars') diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index 21a2c088c..41ab346ee 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -58,7 +58,7 @@ def add_metrics( Parameters ---------- df : pd.DataFrame - index='contig' cols=['x','y','coverage','cluster'] + index='contig' cols=['x','y','coverage','gc_content','cluster'] markers_df : pd.DataFrame wide format, i.e. index=contig cols=[marker,marker,...] 
domain : str, optional @@ -82,31 +82,72 @@ def add_metrics( if domain not in marker_sets: raise KeyError(f"{domain} is not bacteria or archaea!") expected_number = marker_sets[domain] - clusters = dict(list(df.groupby("cluster"))) - metrics = {"purity": {}, "completeness": {}} - for cluster, dff in clusters.items(): - pfam_counts = markers_df[markers_df.index.isin(dff.index)].sum() - is_present = pfam_counts >= 1 - is_single_copy = pfam_counts == 1 - nunique_markers = pfam_counts[is_present].count() - num_single_copy_markers = pfam_counts[is_single_copy].count() - completeness = nunique_markers / expected_number * 100 - # Protect from divide by zero - if nunique_markers == 0: - purity = pd.NA - else: - purity = num_single_copy_markers / nunique_markers * 100 - metrics["completeness"].update({cluster: completeness}) - metrics["purity"].update({cluster: purity}) - metrics_df = pd.DataFrame(metrics, index=clusters.keys()) - merged_df = pd.merge(df, metrics_df, left_on="cluster", right_index=True) + metrics = [] + if "cluster" in df.columns: + clusters = dict(list(df.groupby("cluster"))) + for cluster, dff in clusters.items(): + pfam_counts = markers_df[markers_df.index.isin(dff.index)].sum() + is_present = pfam_counts >= 1 + is_single_copy = pfam_counts == 1 + nunique_markers = pfam_counts[is_present].count() + num_single_copy_markers = pfam_counts[is_single_copy].count() + completeness = nunique_markers / expected_number * 100 + # Protect from divide by zero + if nunique_markers == 0: + purity = pd.NA + else: + purity = num_single_copy_markers / nunique_markers * 100 + if dff.shape[0] <= 1: + coverage_stddev = 0.0 + gc_content_stddev = 0.0 + else: + coverage_stddev = dff.coverage.std() + gc_content_stddev = dff.gc_content.std() + metrics.append( + { + "cluster": cluster, + "completeness": completeness, + "purity": purity, + "coverage_stddev": coverage_stddev, + "gc_content_stddev": gc_content_stddev, + } + ) + # Account for exceptions where clusters were not recovered + if not metrics or "cluster" not in df.columns: + metrics_df = pd.DataFrame( + [ + { + "contig": contig, + "cluster": pd.NA, + "completeness": pd.NA, + "purity": pd.NA, + "coverage_stddev": pd.NA, + "gc_content_stddev": pd.NA, + } + for contig in df.index + ] + ) + metric_cols = ["completeness", "purity", "coverage_stddev", "gc_content_stddev"] + merged_df = df.copy() + for metric in metric_cols: + merged_df[metric] = pd.NA + + else: + metrics_df = pd.DataFrame(metrics).set_index("cluster") + merged_df = pd.merge(df, metrics_df, left_on="cluster", right_index=True) return merged_df, metrics_df def run_dbscan( df: pd.DataFrame, eps: float, - dropcols: List[str] = ["cluster", "purity", "completeness"], + dropcols: List[str] = [ + "cluster", + "purity", + "completeness", + "coverage_stddev", + "gc_content_stddev", + ], ): """Run clustering on `df` at provided `eps`. @@ -119,7 +160,7 @@ def run_dbscan( Parameters ---------- df : pd.DataFrame - Contigs with embedded k-mer frequencies as ['x','y'] columns and optionally 'coverage' column + Contigs with embedded k-mer frequencies as ['x','y'] columns and 'coverage' column eps : float The maximum distance between two samples for one to be considered as in the neighborhood of the other. 
This is not a maximum bound @@ -156,7 +197,8 @@ def run_dbscan( raise TableFormatError( f"df is missing {df.isnull().sum().sum()} kmer/coverage annotations" ) - X = df.loc[:, cols].to_numpy() + # We should have 'gc_content' as a column here but we are explicitly leaving it out of the classification algo + X = df[cols].to_numpy() clusterer = DBSCAN(eps=eps, min_samples=1, n_jobs=-1).fit(X) clusters = pd.Series(clusterer.labels_, index=df.index, name="cluster") return pd.merge(df, clusters, how="left", left_index=True, right_index=True) @@ -168,13 +210,15 @@ def recursive_dbscan( domain: str, completeness_cutoff: float, purity_cutoff: float, + coverage_stddev_cutoff: float, + gc_content_stddev_cutoff: float, verbose: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Carry out DBSCAN, starting at eps=0.3 and continuing until there is just one group. Break conditions to speed up pipeline: - Give up if we've got up to eps 1.3 and still no complete and pure clusters + Give up if we've got up to eps 1.3 and still no complete and pure clusters. Often when you start at 0.3 there are zero complete and pure clusters, because the groups are too small. Later, some are found as the groups enlarge enough, but after it becomes zero again, it is a lost cause and we may as well stop. On the @@ -184,17 +228,31 @@ def recursive_dbscan( Parameters ---------- table : pd.DataFrame - Contigs with embedded k-mer frequencies as ['x','y','z'] columns and - optionally 'coverage' column + Contigs with embedded k-mer frequencies ('x','y'), 'coverage' and 'gc_content' columns + markers_df : pd.DataFrame wide format, i.e. index=contig cols=[marker,marker,...] + domain : str Kingdom to determine metrics (the default is 'bacteria'). choices=['bacteria','archaea'] + completeness_cutoff : float `completeness_cutoff` threshold to retain cluster (the default is 20.0). + e.g. cluster completeness >= completeness_cutoff + purity_cutoff : float - `purity_cutoff` threshold to retain cluster (the default is 90.0). + `purity_cutoff` threshold to retain cluster (the default is 95.0). + e.g. cluster purity >= purity_cutoff + + coverage_stddev_cutoff : float + `coverage_stddev_cutoff` threshold to retain cluster (the default is 25.0). + e.g. cluster coverage std.dev. <= coverage_stddev_cutoff + + gc_content_stddev_cutoff : float + `gc_content_stddev_cutoff` threshold to retain cluster (the default is 5.0). + e.g. cluster gc_content std.dev. <= gc_content_stddev_cutoff + verbose : bool log stats for each recursive_dbscan clustering iteration. 
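For readers skimming the diff, here is a minimal sketch (made-up metric values, hypothetical bin names) of how the four cutoffs documented above combine when deciding whether a cluster is retained: completeness and purity act as lower bounds, the two standard-deviation cutoffs act as upper bounds, and the thresholds shown are simply the docstring defaults.

    import pandas as pd

    # Hypothetical per-cluster metrics, shaped like the metrics_df returned by add_metrics()
    metrics_df = pd.DataFrame(
        {
            "completeness": [85.0, 15.0, 40.0],
            "purity": [97.0, 99.0, 90.0],
            "coverage_stddev": [3.2, 1.1, 40.0],
            "gc_content_stddev": [1.5, 0.8, 7.2],
        },
        index=["bin_0001", "bin_0002", "bin_0003"],
    )

    # Lower bounds on completeness/purity, upper bounds on the two std. dev. metrics
    retained = metrics_df[
        (metrics_df["completeness"] >= 20.0)
        & (metrics_df["purity"] >= 95.0)
        & (metrics_df["coverage_stddev"] <= 25.0)
        & (metrics_df["gc_content_stddev"] <= 5.0)
    ]
    print(retained.index.tolist())  # ['bin_0001'] is the only cluster passing all four cutoffs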
@@ -207,7 +265,7 @@ def recursive_dbscan( DataFrame: index = contig - columns = ['x,'y','z','coverage','cluster','purity','completeness'] + columns = ['x,'y','coverage','gc_content','cluster','purity','completeness','coverage_stddev','gc_content_stddev'] """ eps = 0.3 step = 0.1 @@ -220,9 +278,12 @@ def recursive_dbscan( df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df, domain=domain) completeness_filter = metrics_df["completeness"] >= completeness_cutoff purity_filter = metrics_df["purity"] >= purity_cutoff - median_completeness = metrics_df[completeness_filter & purity_filter][ - "completeness" - ].median() + coverage_filter = metrics_df["coverage_stddev"] <= coverage_stddev_cutoff + gc_content_filter = metrics_df["gc_content_stddev"] <= gc_content_stddev_cutoff + filtered_df = metrics_df[ + completeness_filter & purity_filter & coverage_filter & gc_content_filter + ] + median_completeness = filtered_df.completeness.median() if median_completeness >= best_median: best_median = median_completeness best_df = df @@ -252,14 +313,18 @@ def recursive_dbscan( completeness_filter = best_df["completeness"] >= completeness_cutoff purity_filter = best_df["purity"] >= purity_cutoff - complete_and_pure_df = best_df.loc[completeness_filter & purity_filter] - unclustered_df = best_df.loc[~best_df.index.isin(complete_and_pure_df.index)] + coverage_filter = best_df["coverage_stddev"] <= coverage_stddev_cutoff + gc_content_filter = best_df["gc_content_stddev"] <= gc_content_stddev_cutoff + clustered_df = best_df[ + completeness_filter & purity_filter & coverage_filter & gc_content_filter + ] + unclustered_df = best_df.loc[~best_df.index.isin(clustered_df.index)] if verbose: logger.debug(f"Best completeness median: {best_median:4.2f}") logger.debug( - f"clustered: {len(complete_and_pure_df)} unclustered: {len(unclustered_df)}" + f"clustered: {clustered_df.shape[0]:,} unclustered: {unclustered_df.shape[0]:,}" ) - return complete_and_pure_df, unclustered_df + return clustered_df, unclustered_df def run_hdbscan( @@ -346,6 +411,8 @@ def recursive_hdbscan( domain: str, completeness_cutoff: float, purity_cutoff: float, + coverage_stddev_cutoff: float, + gc_content_stddev_cutoff: float, verbose: bool = False, ) -> Tuple[pd.DataFrame, pd.DataFrame]: """Recursively run HDBSCAN starting with defaults and iterating the min_samples @@ -354,8 +421,7 @@ def recursive_hdbscan( Parameters ---------- table : pd.DataFrame - Contigs with embedded k-mer frequencies as ['x','y','z'] columns and - optionally 'coverage' column + Contigs with embedded k-mer frequencies ('x','y'), 'coverage' and 'gc_content' columns markers_df : pd.DataFrame wide format, i.e. index=contig cols=[marker,marker,...] @@ -366,9 +432,19 @@ def recursive_hdbscan( completeness_cutoff : float `completeness_cutoff` threshold to retain cluster (the default is 20.0). + e.g. cluster completeness >= completeness_cutoff purity_cutoff : float - `purity_cutoff` threshold to retain cluster (the default is 90.0). + `purity_cutoff` threshold to retain cluster (the default is 95.00). + e.g. cluster purity >= purity_cutoff + + coverage_stddev_cutoff : float + `coverage_stddev_cutoff` threshold to retain cluster (the default is 25.0). + e.g. cluster coverage std.dev. <= coverage_stddev_cutoff + + gc_content_stddev_cutoff : float + `gc_content_stddev_cutoff` threshold to retain cluster (the default is 5.0). + e.g. cluster gc_content std.dev. <= gc_content_stddev_cutoff verbose : bool log stats for each recursive_dbscan clustering iteration. 
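As background for the completeness and purity cutoffs referenced above, the sketch below mirrors the per-cluster arithmetic that add_metrics() performs on the wide-format markers table; the marker counts are toy values and expected_number is only a stand-in for marker_sets[domain], whose actual value is not shown in this diff.

    import pandas as pd

    expected_number = 139  # stand-in for marker_sets[domain]; the real value depends on the marker set
    # Summed marker hits across the contigs of one hypothetical cluster
    pfam_counts = pd.Series({"PF00001": 1, "PF00002": 2, "PF00003": 0, "PF00004": 1})

    nunique_markers = pfam_counts[pfam_counts >= 1].count()          # markers present at least once -> 3
    num_single_copy_markers = pfam_counts[pfam_counts == 1].count()  # markers present exactly once -> 2
    completeness = nunique_markers / expected_number * 100
    purity = num_single_copy_markers / nunique_markers * 100 if nunique_markers else pd.NA
    print(f"completeness={completeness:.1f}%, purity={purity:.1f}%")  # ~2.2% complete, ~66.7% pure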
@@ -382,7 +458,7 @@ def recursive_hdbscan( DataFrame: index = contig - columns = ['x,'y','z','coverage','cluster','purity','completeness'] + columns = ['x,'y','coverage','gc_content','cluster','purity','completeness','coverage_stddev','gc_content_stddev'] """ max_min_cluster_size = 10000 max_min_samples = 10 @@ -400,12 +476,14 @@ def recursive_hdbscan( cache_dir=cache_dir, ) df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df, domain=domain) - completeness_filter = metrics_df["completeness"] >= completeness_cutoff purity_filter = metrics_df["purity"] >= purity_cutoff - median_completeness = metrics_df[completeness_filter & purity_filter][ - "completeness" - ].median() + coverage_filter = metrics_df["coverage_stddev"] <= coverage_stddev_cutoff + gc_content_filter = metrics_df["gc_content_stddev"] <= gc_content_stddev_cutoff + filtered_df = metrics_df[ + completeness_filter & purity_filter & coverage_filter & gc_content_filter + ] + median_completeness = filtered_df.completeness.median() if median_completeness >= best_median: best_median = median_completeness best_df = df @@ -441,23 +519,29 @@ def recursive_hdbscan( completeness_filter = best_df["completeness"] >= completeness_cutoff purity_filter = best_df["purity"] >= purity_cutoff - complete_and_pure_df = best_df.loc[completeness_filter & purity_filter] - unclustered_df = best_df.loc[~best_df.index.isin(complete_and_pure_df.index)] + coverage_filter = best_df["coverage_stddev"] <= coverage_stddev_cutoff + gc_content_filter = best_df["gc_content_stddev"] <= gc_content_stddev_cutoff + clustered_df = best_df[ + completeness_filter & purity_filter & coverage_filter & gc_content_filter + ] + unclustered_df = best_df.loc[~best_df.index.isin(clustered_df.index)] if verbose: logger.debug(f"Best completeness median: {best_median:4.2f}") logger.debug( - f"clustered: {len(complete_and_pure_df)} unclustered: {len(unclustered_df)}" + f"clustered: {clustered_df.shape[0]} unclustered: {unclustered_df.shape[0]}" ) - return complete_and_pure_df, unclustered_df + return clustered_df, unclustered_df def get_clusters( master: pd.DataFrame, markers_df: pd.DataFrame, - domain: str = "bacteria", - completeness: float = 20.0, - purity: float = 90.0, - method: str = "dbscan", + domain: str, + completeness: float, + purity: float, + coverage_stddev: float, + gc_content_stddev: float, + method: str, verbose: bool = False, ) -> pd.DataFrame: """Find best clusters retained after applying `completeness` and `purity` filters. @@ -466,23 +550,33 @@ def get_clusters( ---------- master : pd.DataFrame index=contig, - cols=['x','y','coverage'] + cols=['x','y','coverage','gc_content'] markers_df : pd.DataFrame wide format, i.e. index=contig cols=[marker,marker,...] domain : str - Kingdom to determine metrics (the default is 'bacteria'). + Kingdom to determine metrics. choices=['bacteria','archaea']. completeness : float - `completeness` threshold to retain cluster (the default is 20.). + completeness threshold to retain cluster. + e.g. cluster completeness >= completeness purity : float - `purity` threshold to retain cluster (the default is 90.). + `purity` threshold to retain cluster. + e.g. cluster purity >= purity + + coverage_stddev : float + cluster coverage std.dev. threshold to retain cluster. + e.g. cluster coverage std.dev. <= coverage_stddev + + gc_content_stddev : float + cluster GC content std.dev. threshold to retain cluster. + e.g. cluster GC content std.dev. 
<= gc_content_stddev method : str - Description of parameter `method` (the default is 'dbscan'). + Description of parameter `method`. choices = ['dbscan','hdbscan'] verbose : bool @@ -504,7 +598,14 @@ def get_clusters( # break when either clustered_df or unclustered_df is empty while True: clustered_df, unclustered_df = clusterer( - master, markers_df, domain, completeness, purity, verbose=verbose, + master, + markers_df, + domain, + completeness, + purity, + coverage_stddev, + gc_content_stddev, + verbose=verbose, ) # No contigs can be clustered, label as unclustered and add the final df # of (unclustered) contigs @@ -533,7 +634,12 @@ def rename_cluster(c): # continue with unclustered contigs master = unclustered_df df = pd.concat(clusters, sort=True) - for metric in ["purity", "completeness"]: + for metric in [ + "purity", + "completeness", + "coverage_stddev", + "gc_content_stddev", + ]: # Special case where no clusters are found in first round. # i.e. cluster = pd.NA for all contigs if metric not in df.columns: @@ -546,7 +652,9 @@ def binning( markers: pd.DataFrame, domain: str = "bacteria", completeness: float = 20.0, - purity: float = 90.0, + purity: float = 95.0, + coverage_stddev: float = 25.0, + gc_content_stddev: float = 5.0, taxonomy: bool = True, starting_rank: str = "superkingdom", method: str = "dbscan", @@ -561,7 +669,7 @@ def binning( ---------- master : pd.DataFrame index=contig, - cols=['x','y'] + cols=['x','y', 'coverage', 'gc_content'] taxa cols should be present if `taxonomy` is True. i.e. [taxid,superkingdom,phylum,class,order,family,genus,species] @@ -576,7 +684,14 @@ def binning( Description of parameter `completeness` (the default is 20.). purity : float, optional - Description of parameter `purity` (the default is 90.). + purity threshold to retain cluster (the default is 95.0). + e.g. cluster purity >= purity_cutoff + + coverage_stddev : float, optional + cluster coverage threshold to retain cluster (the default is 25.0). + + gc_content_stddev : float, optional + cluster GC content threshold to retain cluster (the default is 5.0). taxonomy : bool, optional Split canonical ranks and subset based on rank then attempt to find clusters (the default is True). @@ -612,6 +727,8 @@ def binning( raise TableFormatError( "No markers for contigs in table. Unable to assess binning quality" ) + if master.shape[0] <= 1: + raise BinningError("Not enough contigs in table for binning") logger.info(f"Using {method} clustering method") if not taxonomy: @@ -621,6 +738,8 @@ def binning( domain=domain, completeness=completeness, purity=purity, + coverage_stddev=coverage_stddev, + gc_content_stddev=gc_content_stddev, method=method, verbose=verbose, ) @@ -655,10 +774,10 @@ def binning( continue # Only cluster contigs that have not already been assigned a bin. # First filter with 'cluster' column - # Second filter with previous clustering rounds' clustered contigs rank_df = ( dff.loc[dff["cluster"].isna()] if "cluster" in dff.columns else dff ) + # Second filter with previous clustering rounds' clustered contigs if clustered_contigs: rank_df = rank_df.loc[~rank_df.index.isin(clustered_contigs)] # After all of the filters, are there multiple contigs to cluster? 
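Since binning() now takes the two standard-deviation cutoffs, a hedged usage sketch follows. The file paths are hypothetical, the markers table is assumed to already be in wide format (the real entrypoint builds it with load_markers() and embeds k-mers first), and the keyword values are just the new defaults from this PR.

    import pandas as pd

    from autometa.binning.recursive_dbscan import binning

    # Hypothetical inputs: master.tsv must provide 'x', 'y', 'coverage' and 'gc_content'
    # columns indexed by 'contig'; markers_wide.tsv has one column per marker, indexed by 'contig'.
    master_df = pd.read_csv("master.tsv", sep="\t", index_col="contig").convert_dtypes()
    markers_df = pd.read_csv("markers_wide.tsv", sep="\t", index_col="contig").convert_dtypes()

    bins_df = binning(
        master=master_df,
        markers=markers_df,
        domain="bacteria",
        completeness=20.0,        # cluster completeness >= 20%
        purity=95.0,              # cluster purity >= 95% (new default in this PR)
        coverage_stddev=25.0,     # cluster coverage std. dev. <= 25 (added in this PR)
        gc_content_stddev=5.0,    # cluster GC content std. dev. <= 5 (added in this PR)
        taxonomy=False,           # skip the rank-by-rank taxonomy splitting path
        method="dbscan",
    )
    bins_df[["cluster", "completeness", "purity"]].to_csv("binning.tsv", sep="\t")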
@@ -674,6 +793,8 @@ def binning( domain=domain, completeness=completeness, purity=purity, + coverage_stddev=coverage_stddev, + gc_content_stddev=gc_content_stddev, method=method, verbose=verbose, ) @@ -721,28 +842,64 @@ def main(): "composition, coverage and homology.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - parser.add_argument("kmers", help="") - parser.add_argument("coverage", help="") - parser.add_argument("markers", help="") - parser.add_argument("out", help="") - parser.add_argument("--embedded-kmers", help="") + parser.add_argument("kmers", help="Path to normalized k-mer frequencies table") + parser.add_argument("coverages", help="Path to metagenome coverages table") + parser.add_argument("gc_content", help="Path to metagenome GC contents table") + parser.add_argument("markers", help="Path to Autometa annotated markers table") + parser.add_argument("output", help="Path to write Autometa binning results") + parser.add_argument( + "--embedded-kmers", + help="Path to provide embedded k-mer frequencies table if embedding has already been performed.", + metavar="filepath", + ) parser.add_argument( "--embedding-method", - help="Embedding method to use", + help="Embedding method to use on normalized k-mer frequencies.", choices=["bhsne", "sksne", "umap"], + metavar="string", default="bhsne", ) parser.add_argument( "--clustering-method", - help="Clustering method to use", + help="Clustering algorithm to use for recursive binning.", choices=["dbscan", "hdbscan"], + metavar="string", default="dbscan", ) parser.add_argument( - "--completeness", help="", default=20.0, type=float + "--completeness", + help="completeness cutoff to retain cluster." + " e.g. cluster completeness >= `completeness`", + default=20.0, + metavar="float", + type=float, + ) + parser.add_argument( + "--purity", + help="purity cutoff to retain cluster. e.g. cluster purity >= `purity`", + default=95.0, + metavar="float", + type=float, + ) + parser.add_argument( + "--cov-stddev-limit", + help="coverage standard deviation limit to retain cluster." + " e.g. cluster coverage standard deviation <= `cov-stddev-limit`", + default=25.0, + metavar="float", + type=float, + ) + parser.add_argument( + "--gc-stddev-limit", + help="GC content standard deviation limit to retain cluster." + " e.g. cluster GC content standard deviation <= `gc-stddev-limit`", + default=5.0, + metavar="float", + type=float, + ) + parser.add_argument( + "--taxonomy", metavar="filepath", help="" ) - parser.add_argument("--purity", help="", default=90.0, type=float) - parser.add_argument("--taxonomy", help="") parser.add_argument( "--starting-rank", help="Canonical rank at which to begin subsetting taxonomy", default="superkingdom", choices=[ "superkingdom", "phylum", "class", "order", "family", "genus", "species", ], + metavar="string", ) parser.add_argument( "--reverse-ranks", action="store_true", default=False, help="Reverse order at which to split taxonomy by canonical-rank."
- " When --reverse-ranks given, contigs will be split in order of species, genus, family, order, class, phylum, superkingdom.", + " When `--reverse-ranks` is given, contigs will be split in order of" + " species, genus, family, order, class, phylum, superkingdom.", ) parser.add_argument( "--domain", help="Kingdom to consider (archaea|bacteria)", choices=["bacteria", "archaea"], + metavar="string", default="bacteria", ) parser.add_argument( - "--verbose", action="store_true", default=False, help="log debug information" + "--verbose", action="store_true", default=False, help="log debug information", ) args = parser.parse_args() - kmers_df = kmers.embed( kmers=args.kmers, out=args.embedded_kmers, method=args.embedding_method ) - cov_df = pd.read_csv(args.coverage, sep="\t", index_col="contig") + cov_df = pd.read_csv(args.coverages, sep="\t", index_col="contig") master_df = pd.merge( kmers_df, cov_df[["coverage"]], how="left", left_index=True, right_index=True, ) + gc_content_df = pd.read_csv(args.gc_content, sep="\t", index_col="contig") + master_df = pd.merge( + master_df, gc_content_df, how="left", left_index=True, right_index=True, + ) markers_df = load_markers(args.markers) markers_df = markers_df.convert_dtypes() @@ -801,6 +964,7 @@ def main(): taxa_present = True if "taxid" in master_df else False master_df = master_df.convert_dtypes() logger.debug(f"master_df shape: {master_df.shape}") + master_out = binning( master=master_df, markers=markers_df, @@ -810,13 +974,15 @@ def main(): domain=args.domain, completeness=args.completeness, purity=args.purity, + coverage_stddev=args.cov_stddev_limit, + gc_content_stddev=args.gc_stddev_limit, method=args.clustering_method, verbose=args.verbose, ) # Output table outcols = ["cluster", "completeness", "purity"] - master_out[outcols].to_csv(args.out, sep="\t", index=True, header=True) + master_out[outcols].to_csv(args.output, sep="\t", index=True, header=True) if __name__ == "__main__": diff --git a/docs/source/install.rst b/docs/source/install.rst index ca65f32b0..a08e4c391 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -25,7 +25,7 @@ Conda environment creation: hmmer \ prodigal \ diamond \ - ndcctools \ + nextflow \ parallel \ requests \ umap-learn \ @@ -47,8 +47,16 @@ If you are wanting to help develop autometa, you will need these additional depe .. 
code-block:: bash - conda install -n autometa \ - black pre_commit pytest pytest-cov pytest-html pytest-repeat pytest-variables + conda install -n autometa -c conda-forge \ + black \ + pre_commit \ + pytest \ + pytest-cov \ + pytest-html \ + pytest-repeat \ + pytest-variables \ + sphinx \ + sphinx_rtd_theme # Navigate to your autometa conda environment conda activate autometa diff --git a/make_test_data.py b/make_test_data.py index 27e7f2e12..fb37b785f 100644 --- a/make_test_data.py +++ b/make_test_data.py @@ -57,7 +57,8 @@ 6.2 kmers_embedded 6.3 taxonomy 6.4 coverage - 6.3 markers + 6.5 gc_content + 6.6 markers 7 summary 7.1 bin_df """ @@ -130,16 +131,13 @@ class TestData: binning_norm_kmers: str binning_embedded_kmers: str binning_coverage: str + binning_gc_content: str binning_markers: str binning_taxonomy: str summary_bin_df: str recruitment_binning: str data: typing.Dict = attr.ib(default={}) seed: int = 42 - sam_fpath: str - bed_fpath: str - fwd_reads: str - rev_read: str def prepare_metagenome(self, num_records: int = 4): logger.info("Preparing metagenome records test data...") @@ -345,6 +343,7 @@ def get_binning(self, num_contigs: int = None): "kmers_embedded": self.binning_embedded_kmers, "taxonomy": self.binning_taxonomy, "coverage": self.binning_coverage, + "gc_content": self.binning_gc_content, } markers_df = pd.read_csv(self.binning_markers, sep="\t", index_col="contig") @@ -358,7 +357,7 @@ def get_binning(self, num_contigs: int = None): df[df.index.isin(markers_df.index)].index.tolist()[:num_contigs] ) if annotation == "taxonomy": - for column in df.set_index("contig").select_dtypes(object).columns: + for column in df.select_dtypes(object).columns: df[column] = df[column].map(lambda taxon: taxon.lower()) # We need to reset the index from contig to None before json export. 
if contigs: @@ -413,6 +412,7 @@ def main(): binning_norm_kmers = os.path.join(outdir, "binning_kmers.am_clr.tsv.gz") binning_embedded_kmers = os.path.join(outdir, "binning_kmers.am_clr.bhsne.tsv.gz") binning_coverage = os.path.join(outdir, "binning_coverage.tsv.gz") + binning_gc_content = os.path.join(outdir, "binning_gc_content.tsv.gz") binning_markers = os.path.join(outdir, "binning_markers.tsv.gz") binning_taxonomy = os.path.join(outdir, "binning_taxonomy.tsv.gz") coverage_sam = os.path.join(outdir, "records.sam.gz") @@ -435,6 +435,7 @@ def main(): binning_norm_kmers=binning_norm_kmers, binning_embedded_kmers=binning_embedded_kmers, binning_coverage=binning_coverage, + binning_gc_content=binning_gc_content, binning_markers=binning_markers, binning_taxonomy=binning_taxonomy, coverage_sam=coverage_sam, diff --git a/requirements.txt b/requirements.txt index 3db8d3b0e..e9e561ea0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ biopython -pandas +gdown +pandas>=1.1 tqdm tsne numpy @@ -10,6 +11,8 @@ bedtools bowtie2 hmmer prodigal +make +nextflow parallel requests umap-learn diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 000000000..d7e615b4f --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,11 @@ +black +pre_commit +pytest +pytest-cov +pytest-html +pytest-repeat +pytest-variables +pytest-emoji +pytest-md +sphinx_rtd_theme +sphinx diff --git a/tests/unit_tests/test_recursive_dbscan.py b/tests/unit_tests/test_recursive_dbscan.py index a2caacadf..311648b3e 100644 --- a/tests/unit_tests/test_recursive_dbscan.py +++ b/tests/unit_tests/test_recursive_dbscan.py @@ -55,6 +55,16 @@ def fixture_coverage(variables, test_contigs, binning_testdir): return fpath +@pytest.fixture(name="gc_content", scope="module") +def fixture_gc_content(variables, test_contigs, binning_testdir): + binning_test_data = variables["binning"] + df = pd.read_json(binning_test_data["gc_content"]) + df.set_index("contig", inplace=True) + fpath = binning_testdir / "gc_content.tsv" + df.loc[test_contigs].to_csv(fpath, sep="\t", index=True, header=True) + return fpath + + @pytest.fixture(name="taxonomy", scope="module") def fixture_taxonomy(variables, test_contigs, binning_testdir): binning_test_data = variables["binning"] @@ -80,9 +90,9 @@ def fixture_markers(markers_fpath): @pytest.fixture(name="master", scope="module") -def fixture_master(embedded_kmers, coverage, taxonomy): +def fixture_master(embedded_kmers, coverage, gc_content, taxonomy): master = pd.read_csv(embedded_kmers, sep="\t", index_col="contig") - for fpath in [coverage, taxonomy]: + for fpath in [coverage, gc_content, taxonomy]: df = pd.read_csv(fpath, sep="\t", index_col="contig") master = pd.merge(master, df, how="left", right_index=True, left_index=True) master = master.convert_dtypes() @@ -101,7 +111,9 @@ def test_binning(master, markers, usetaxonomy, method): reverse_ranks=False, domain="bacteria", completeness=20.0, - purity=90.0, + purity=95.0, + coverage_stddev=25.0, + gc_content_stddev=5.0, method=method, verbose=True, ) @@ -109,6 +121,9 @@ def test_binning(master, markers, usetaxonomy, method): assert "cluster" in df.columns assert "purity" in df.columns assert "completeness" in df.columns + assert "coverage_stddev" in df.columns + assert "gc_content_stddev" in df.columns + assert "completeness" in df.columns assert df.shape[0] == num_contigs @@ -122,7 +137,7 @@ def test_binning_invalid_clustering_method(master, markers): reverse_ranks=False, domain="bacteria", completeness=20.0, - 
purity=90.0, + purity=95.0, method="invalid_clustering_method", verbose=False, ) @@ -146,7 +161,14 @@ def test_binning_empty_markers_table(master): @pytest.mark.entrypoint def test_recursive_dbscan_main( - monkeypatch, kmers, coverage, markers_fpath, embedded_kmers, taxonomy, tmp_path + monkeypatch, + kmers, + coverage, + gc_content, + markers_fpath, + embedded_kmers, + taxonomy, + tmp_path, ): out = tmp_path / "binning.tsv" @@ -154,14 +176,17 @@ class MockArgs: def __init__(self): self.domain = "bacteria" self.kmers = kmers - self.coverage = coverage + self.coverages = coverage + self.gc_content = gc_content self.markers = markers_fpath - self.out = out + self.output = out self.embedded_kmers = embedded_kmers self.embedding_method = "bhsne" self.clustering_method = "dbscan" self.completeness = 20.0 - self.purity = 90.0 + self.purity = 95.0 + self.cov_stddev_limit = 25.0 + self.gc_stddev_limit = 5.0 self.taxonomy = taxonomy self.starting_rank = "superkingdom" self.reverse_ranks = False