diff --git a/.dockerignore b/.dockerignore index 8163a24c7..dd0ac3d27 100644 --- a/.dockerignore +++ b/.dockerignore @@ -20,3 +20,4 @@ Autometa.egg-info # Ignore databases autometa/databases !autometa/databases/markers +/work diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 000000000..afb20bb1e --- /dev/null +++ b/.editorconfig @@ -0,0 +1,24 @@ +root = true + +[*] +charset = utf-8 +end_of_line = lf +insert_final_newline = true +trim_trailing_whitespace = true +indent_size = 4 +indent_style = space + +[*.{yml,yaml}] +indent_size = 2 + +# These files are edited and tested upstream in nf-core/modules +[/modules/nf-core/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +indent_size = unset + +[/assets/email*] +indent_size = unset diff --git a/.gitattribute b/.gitattribute new file mode 100644 index 000000000..7fe55006f --- /dev/null +++ b/.gitattribute @@ -0,0 +1 @@ +*.config linguist-language=nextflow diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..7fe55006f --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.config linguist-language=nextflow diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml new file mode 100644 index 000000000..191fabd22 --- /dev/null +++ b/.github/.dockstore.yml @@ -0,0 +1,6 @@ +# Dockstore config version, not pipeline version +version: 1.2 +workflows: + - subclass: nfl + primaryDescriptorPath: /nextflow.config + publish: True diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..e7e17dd3d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Join nf-core + url: https://nf-co.re/join + about: Please join the nf-core community here + - name: "Slack #autometa channel" + url: https://nfcore.slack.com/channels/autometa + about: Discussion about the nf-core/autometa pipeline diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..6b9f0e0ad --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,26 @@ + + + +## PR checklist + +- [ ] This comment contains a description of changes (with reason). +- [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/autometa/tree/master/.github/CONTRIBUTING.md) + - [ ] If necessary, also make a PR on the nf-core/autometa _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] Make sure your code lints (`nf-core lint .`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Usage Documentation in `docs/usage.md` is updated. +- [ ] Output Documentation in `docs/output.md` is updated. +- [ ] `CHANGELOG.md` is updated. +- [ ] `README.md` is updated (including new tool citations and authors/contributors). 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..b170ccb07 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,41 @@ +name: nf-core CI +# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +on: + push: + branches: + - dev + pull_request: + release: + types: [published] + +# Uncomment if we need an edge release of Nextflow again +# env: NXF_EDGE: 1 + +jobs: + test: + name: Run workflow tests + # Only run on push if this is the nf-core dev branch (merged PRs) + if: ${{ github.event_name == 'push_TODO' }} + runs-on: ubuntu-latest + env: + NXF_VER: ${{ matrix.nxf_ver }} + NXF_ANSI_LOG: false + strategy: + matrix: + # Nextflow versions: check pipeline minimum and current latest + nxf_ver: ["21.04.0", ""] + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Run pipeline with test data + run: | + docker build . -t jason-c-kwan/autometa:test + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --autometa_image_tag 'latest' diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 000000000..b5f1793d2 --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,52 @@ +name: nf-core linting +# This workflow is triggered on pushes and PRs to the repository. +# It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines +on: + pull_request: + release: + types: [published] + +jobs: + nf-core: + runs-on: ubuntu-latest + steps: + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - uses: actions/setup-python@v1 + with: + python-version: "3.6" + architecture: "x64" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nf-core + + - name: Run nf-core lint + env: + GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} + run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md + + - name: Save PR number + if: ${{ always() }} + run: echo ${{ github.event.pull_request.number }} > PR_number.txt + + - name: Upload linting log file artifact + if: ${{ always() }} + uses: actions/upload-artifact@v2 + with: + name: linting-logs + path: | + lint_log.txt + lint_results.md + PR_number.txt diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml new file mode 100644 index 000000000..0471addcc --- /dev/null +++ b/.github/workflows/linting_comment.yml @@ -0,0 +1,27 @@ +name: nf-core linting comment +# This workflow is triggered after the linting action is complete +# It posts an automated comment to the PR, even if the PR is coming from a fork + +on: + workflow_run: + workflows: ["nf-core linting"] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Download lint results + uses: dawidd6/action-download-artifact@v2 + with: + workflow: linting.yml + + - name: Get PR number + id: pr_number + run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" + + - name: Post PR comment + uses: marocchino/sticky-pull-request-comment@v2 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + number: ${{ steps.pr_number.outputs.pr_number }} + path: linting-logs/lint_results.md diff --git a/.gitignore b/.gitignore index f85809209..216e71f32 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ dist/ downloads/ eggs/ .eggs/ -lib/ +!nextflow/lib/ lib64/ parts/ sdist/ @@ -157,7 +157,12 @@ tests/data/* .DS_Store # nextflow -.nextflow -.nextflow.log* +.nextflow* pipeline_info -work +work/ +large_db/ +nf-params.json +autometa_interim_dir/ +autometa_outdir/ +autometa_tracedir/ + diff --git a/.markdownlint.yml b/.markdownlint.yml new file mode 100644 index 000000000..9e605fcfa --- /dev/null +++ b/.markdownlint.yml @@ -0,0 +1,14 @@ +# Markdownlint configuration file +default: true +line-length: false +ul-indent: + indent: 4 +no-duplicate-header: + siblings_only: true +no-inline-html: + allowed_elements: + - img + - p + - kbd + - details + - summary diff --git a/.nf-core.yml b/.nf-core.yml new file mode 100644 index 000000000..53c78d04d --- /dev/null +++ b/.nf-core.yml @@ -0,0 +1,27 @@ +lint: + files_exist: + - .github/workflows/branch.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml + - .travis.yml + - docs/usage.md + - docs/output.md + - docs/images/nf-core-autometa_logo.png + + files_unchanged: + - manifest + - .github/CONTRIBUTING.md + - .github/ISSUE_TEMPLATE/bug_report.md + - .github/ISSUE_TEMPLATE/feature_request.md + - .github/workflows/linting_comment.yml + - .github/workflows/linting.yml + - assets/email_template.html + - assets/email_template.txt + - bin/scrape_software_versions.py + - docs/README.md + - .gitignore + - LICENSE + - .github/PULL_REQUEST_TEMPLATE.md + - lib/NfcoreTemplate.groovy + template_strings: false + nextflow_config: false diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..e1fba1260 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,16 @@ +# nf-core/autometa: Changelog + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## v2.0.0 - [date] + +Second release of nf-core/autometa, created with the [nf-core](https://nf-co.re/) template. + +### `Added` + +### `Fixed` + +### `Dependencies` + +### `Deprecated` diff --git a/CITATIONS.md b/CITATIONS.md new file mode 100644 index 000000000..0191c5822 --- /dev/null +++ b/CITATIONS.md @@ -0,0 +1,48 @@ +# Autometa: + +## [Autometa](https://github.com/KwanLab/Autometa) + > Miller, I. J.; Rees, E. R.; Ross, J.; Miller, I.; Baxa, J.; Lopera, J.; Kerby, R. L.; Rey, F. E.; Kwan, J. C. Autometa: Automated extraction of microbial genomes from individual shotgun metagenomes. Nucleic Acids Research, 2019. DOI: https://doi.org/10.1093/nar/gkz148 + +## [python](https://www.python.org) + > Van Rossum, G., & Drake, F. L. (2009). Python 3 Reference Manual. Scotts Valley, CA: CreateSpace. + +## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) + +> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. + +## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/) + +> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. 
+ +## Pipeline tools + +* [Diamond](https://github.com/bbuchfink/diamond) + > Buchfink B, Reuter K, Drost HG, "Sensitive protein alignments at tree-of-life scale using DIAMOND", Nature Methods 18, 366–368 (2021). doi:10.1038/s41592-021-01101-x + +* [HMMER](http://hmmer.org) + > HMMER Web Server: Interactive Sequence Similarity Searching. R. D. Finn, J. Clements, S. R. Eddy. Nucleic Acids Research, 39:W29-37, 2011. + +* [Prodigal](https://github.com/hyattpd/Prodigal) + > Hyatt, D., Chen, GL., LoCascio, P.F. et al. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics 11, 119 (2010). https://doi.org/10.1186/1471-2105-11-119 + +* [Samtools](http://www.htslib.org) + > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H, Twelve years of SAMtools and BCFtools, GigaScience (2021) 10(2) giab008 [https://pubmed.ncbi.nlm.nih.gov/33590861] + +* [SeqKit](https://github.com/shenwei356/seqkit) + > W Shen, S Le, Y Li*, F Hu*. SeqKit: a cross-platform and ultrafast toolkit for FASTA/Q file manipulation. PLOS ONE. doi:10.1371/journal.pone.0163962. + +## Software packaging/containerisation tools + +* [Anaconda](https://anaconda.com) + > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. + +* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + +* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) + > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + +* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + +* [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f4fd052f1 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,111 @@ +# Code of Conduct at nf-core (v1.0) + +## Our Pledge + +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: + +- Age +- Body size +- Familial status +- Gender identity and expression +- Geographical location +- Level of experience +- Nationality and national origins +- Native language +- Physical and neurological ability +- Race or ethnicity +- Religion +- Sexual identity and orientation +- Socioeconomic status + +Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. 
+ +## Preamble + +> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. + +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. + +nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. + +We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. + +Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. + +We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. + +Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re + +## Our Responsibilities + +The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. + +The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. + +## When are where does this Code of Conduct apply? + +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: + +- Communicating with an official project email address. +- Communicating with community members within the nf-core Slack channel. +- Participating in hackathons organised by nf-core (both online and in-person events). +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Representing nf-core on social media. 
This includes both official and personal accounts. + +## nf-core cares 😊 + +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): + +- Ask for consent before sharing another community member’s personal information (including photographs) on social media. +- Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. +- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) +- Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) +- Focus on what is best for the team and the community. (When in doubt, ask) +- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Take breaks when you feel like you need them. +- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) + +## nf-core frowns on 😕 + +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. + +- Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. +- “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. +- Spamming or trolling of individuals on social media. +- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. + +### Online Trolling + +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. + +All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. 
+ +## Procedures for Reporting CoC violations + +If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. + +You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). + +Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. + +All reports will be handled with utmost discretion and confidentially. + +## Attribution and Acknowledgements + +- The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) +- The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) +- The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) +- The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) + +## Changelog + +### v1.0 - March 12th, 2021 + +- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/LICENSE.txt b/LICENSE similarity index 100% rename from LICENSE.txt rename to LICENSE diff --git a/Makefile b/Makefile index cb56b6452..83c5d0621 100644 --- a/Makefile +++ b/Makefile @@ -57,11 +57,11 @@ endif ## Install autometa from source install: setup.py - python setup.py install + python3 setup.py install ## Install dependencies for test environment test_environment: tests/requirements.txt - python -m pip install --requirement=tests/requirements.txt + python3 -m pip install --requirement=tests/requirements.txt ## Build docker image from Dockerfile (auto-taggged as jason-c-kwan/autometa:) image: Dockerfile @@ -78,19 +78,19 @@ unit_test_data_download: ## Build test_data.json file for unit testing (requires all files from https://drive.google.com/open?id=189C6do0Xw-X813gspsafR9r8m-YfbhTS be downloaded into tests/data/) unit_test_data_build: tests/data/records.fna - python make_test_data.py + python3 make_test_data.py ## Run all unit tests unit_test: tests/data/test_data.json test_environment - python -m pytest --durations=0 --cov=autometa --emoji --cov-report=html tests + python3 -m pytest --durations=0 --cov=autometa --emoji --cov-report=html tests ## Run unit tests marked with WIP unit_test_wip: tests/data/test_data.json test_environment - python -m pytest -m "wip" --durations=0 --cov=autometa --emoji --cov-report=html tests + python3 -m pytest -m "wip" --durations=0 --cov=autometa --emoji --cov-report=html tests ## Run unit tests marked with entrypoint unit_test_entrypoints: tests/data/test_data.json test_environment - python -m pytest -m "entrypoint" --durations=0 --cov=autometa --emoji --cov-report=html tests + python3 -m pytest -m "entrypoint" --durations=0 --cov=autometa --emoji --cov-report=html tests ################################################################################# diff --git a/README.md b/README.md index 4764992fc..afdc194ac 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,6 @@ [![Documentation Status](https://readthedocs.org/projects/autometa/badge/?version=latest)](https://autometa.readthedocs.io/en/latest/?badge=latest) [![Build 
Status](https://travis-ci.com/KwanLab/Autometa.svg?branch=dev)](https://travis-ci.com/KwanLab/Autometa) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](https://bioconda.github.io/) Autometa ========= diff --git a/VERSION b/VERSION index 0cadb8d32..227cea215 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0a0 +2.0.0 diff --git a/assets/dummy_file.txt b/assets/dummy_file.txt new file mode 100644 index 000000000..e69de29bb diff --git a/assets/email_template.html b/assets/email_template.html new file mode 100644 index 000000000..dcfaacb82 --- /dev/null +++ b/assets/email_template.html @@ -0,0 +1,67 @@ + + + + + + + + + autometa Pipeline Report + + + +
+<body>
+<div style="font-family: Helvetica, Arial, sans-serif; padding: 30px; max-width: 800px; margin: 0 auto;">
+
+<h1>autometa v${version}</h1>
+<h2>Run Name: $runName</h2>
+
+<% if (!success){
+    out << """
+    <div style="color: #a94442; background-color: #f2dede; padding: 15px; margin-bottom: 20px; border: 1px solid transparent; border-radius: 4px;">
+        <h4 style="margin-top: 0; color: inherit;">Autometa execution completed unsuccessfully!</h4>
+        <p>The exit status of the task that caused the workflow execution to fail was: <code>$exitStatus</code>.</p>
+        <p>The full error message was:</p>
+        <pre style="white-space: pre-wrap; overflow: visible; margin-bottom: 0;">${errorReport}</pre>
+    </div>
+    """
+} else {
+    out << """
+    <div style="color: #3c763d; background-color: #dff0d8; padding: 15px; margin-bottom: 20px; border: 1px solid transparent; border-radius: 4px;">
+        Autometa execution completed successfully!
+    </div>
+    """
+}
+%>
+
+<p>The workflow was completed at <strong>$dateComplete</strong> (duration: <strong>$duration</strong>)</p>
+
+<p>The command used to launch the workflow was as follows:</p>
+<pre style="white-space: pre-wrap; overflow: visible; background-color: #ededed; padding: 15px; border-radius: 4px;">$commandLine</pre>
+
+<h3>Pipeline Configuration:</h3>
+<table style="width: 100%; max-width: 100%; border-spacing: 0; border-collapse: collapse; border: 0; margin-bottom: 30px;">
+    <% out << summary.collect{ k,v -> "<tr><th style='text-align: left; padding: 8px 0; border-top: 1px solid #ddd;'>$k</th><td style='text-align: left; padding: 8px; border-top: 1px solid #ddd;'><pre style='white-space: pre-wrap; overflow: visible;'>$v</pre></td></tr>" }.join("\n") %>
+</table>
+
+<p>Autometa</p>
+<p><a href="https://github.com/KwanLab/Autometa">https://github.com/KwanLab/Autometa</a></p>
+
+ + + + diff --git a/assets/email_template.txt b/assets/email_template.txt new file mode 100644 index 000000000..81d935817 --- /dev/null +++ b/assets/email_template.txt @@ -0,0 +1,35 @@ +---------------------------------------------------- + autometa v${version} +---------------------------------------------------- + +Run Name: $runName + +<% if (success){ + out << "## autometa execution completed successfully! ##" +} else { + out << """#################################################### +## autometa execution completed unsuccessfully! ## +#################################################### +The exit status of the task that caused the workflow execution to fail was: $exitStatus. +The full error message was: + +${errorReport} +""" +} %> + + +The workflow was completed at $dateComplete (duration: $duration) + +The command used to launch the workflow was as follows: + + $commandLine + + + +Pipeline Configuration: +----------------------- +<% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> + +-- +Autometa +https://github.com/KwanLab/Autometa \ No newline at end of file diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml new file mode 100644 index 000000000..b6add2dfb --- /dev/null +++ b/assets/multiqc_config.yaml @@ -0,0 +1,11 @@ +report_comment: > + This report has been generated by the nf-core/autometa + analysis pipeline. For information about how to interpret these results, please see the + documentation. +report_section_order: + software_versions: + order: -1000 + nf-core-autometa-summary: + order: -1001 + +export_plots: true diff --git a/assets/nf-core-autometa_logo.png b/assets/nf-core-autometa_logo.png new file mode 100644 index 000000000..725d1dce0 Binary files /dev/null and b/assets/nf-core-autometa_logo.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv new file mode 100644 index 000000000..5f653ab7b --- /dev/null +++ b/assets/samplesheet.csv @@ -0,0 +1,3 @@ +sample,fastq_1,fastq_2 +SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz +SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json new file mode 100644 index 000000000..e705e6962 --- /dev/null +++ b/assets/schema_input.json @@ -0,0 +1,39 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/autometa/master/assets/schema_input.json", + "title": "nf-core/autometa pipeline - params.input schema", + "description": "Schema for the file provided with params.input", + "type": "array", + "items": { + "type": "object", + "properties": { + "sample": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Sample name must be provided and cannot contain spaces" + }, + "fastq_1": { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$", + "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + }, + "fastq_2": { + "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + } + }, + "required": [ + "sample", + "fastq_1" + ] + } +} diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt new file mode 100644 index 000000000..1feb177f4 --- /dev/null +++ b/assets/sendmail_template.txt @@ -0,0 +1,53 @@ 
+To: $email +Subject: $subject +Mime-Version: 1.0 +Content-Type: multipart/related;boundary="nfcoremimeboundary" + +--nfcoremimeboundary +Content-Type: text/html; charset=utf-8 + +$email_html + +--nfcoremimeboundary +Content-Type: image/png;name="nf-core-autometa_logo.png" +Content-Transfer-Encoding: base64 +Content-ID: +Content-Disposition: inline; filename="nf-core-autometa_logo.png" + +<% out << new File("$projectDir/assets/nf-core-autometa_logo.png"). + bytes. + encodeBase64(). + toString(). + tokenize( '\n' )*. + toList()*. + collate( 76 )*. + collect { it.join() }. + flatten(). + join( '\n' ) %> + +<% +if (mqcFile){ +def mqcFileObj = new File("$mqcFile") +if (mqcFileObj.length() < mqcMaxSize){ +out << """ +--nfcoremimeboundary +Content-Type: text/html; name=\"multiqc_report\" +Content-Transfer-Encoding: base64 +Content-ID: +Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" + +${mqcFileObj. + bytes. + encodeBase64(). + toString(). + tokenize( '\n' )*. + toList()*. + collate( 76 )*. + collect { it.join() }. + flatten(). + join( '\n' )} +""" +}} +%> + +--nfcoremimeboundary-- diff --git a/autometa/binning/recursive_dbscan.py b/autometa/binning/recursive_dbscan.py index bc8e331cf..a6add6a80 100644 --- a/autometa/binning/recursive_dbscan.py +++ b/autometa/binning/recursive_dbscan.py @@ -79,38 +79,34 @@ def add_metrics( if domain not in marker_sets: raise KeyError(f"{domain} is not bacteria or archaea!") expected_number = marker_sets[domain] - metrics = [] if "cluster" in df.columns: - clusters = dict(list(df.groupby("cluster"))) - for cluster, dff in clusters.items(): - pfam_counts = markers_df[markers_df.index.isin(dff.index)].sum() - is_present = pfam_counts >= 1 - is_single_copy = pfam_counts == 1 - nunique_markers = pfam_counts[is_present].count() - num_single_copy_markers = pfam_counts[is_single_copy].count() - completeness = nunique_markers / expected_number * 100 - # Protect from divide by zero - if nunique_markers == 0: - purity = pd.NA - else: - purity = num_single_copy_markers / nunique_markers * 100 - if dff.shape[0] <= 1: - coverage_stddev = 0.0 - gc_content_stddev = 0.0 - else: - coverage_stddev = dff.coverage.std() - gc_content_stddev = dff.gc_content.std() - metrics.append( - { - "cluster": cluster, - "completeness": completeness, - "purity": purity, - "coverage_stddev": coverage_stddev, - "gc_content_stddev": gc_content_stddev, - } - ) + # join cluster and marker data, group by cluster + temp = df.join(markers_df, how="outer").groupby("cluster") + # count present + nunique_markers = temp[list(markers_df.columns)].sum().ge(1).sum(axis=1) + # count single copy + num_single_copy_markers = temp[list(markers_df.columns)].sum().eq(1).sum(axis=1) + # calculate completeness/purity + completeness = nunique_markers / expected_number * 100 + purity = num_single_copy_markers / nunique_markers * 100 + coverage_stddev = temp["coverage"].std() + gc_content_stddev = temp["gc_content"].std() + completeness = completeness.to_frame() + purity = purity.to_frame() + coverage_stddev = coverage_stddev.to_frame() + gc_content_stddev = gc_content_stddev.to_frame() + metrics_df = pd.concat( + [completeness, purity, coverage_stddev, gc_content_stddev], axis=1 + ) + metrics_df.columns = [ + "completeness", + "purity", + "coverage_stddev", + "gc_content_stddev", + ] + merged_df = pd.merge(df, metrics_df, left_on="cluster", right_index=True) # Account for exceptions where clusters were not recovered - if not metrics or "cluster" not in df.columns: + else: metrics_df = pd.DataFrame( [ 
{ @@ -128,10 +124,6 @@ def add_metrics( merged_df = df.copy() for metric in metric_cols: merged_df[metric] = pd.NA - - else: - metrics_df = pd.DataFrame(metrics).set_index("cluster") - merged_df = pd.merge(df, metrics_df, left_on="cluster", right_index=True) return merged_df, metrics_df @@ -269,6 +261,7 @@ def recursive_dbscan( n_clusters = float("inf") best_median = float("-inf") best_df = pd.DataFrame() + while n_clusters > 1: binned_df = run_dbscan(table, eps) df, metrics_df = add_metrics(df=binned_df, markers_df=markers_df, domain=domain) @@ -1023,7 +1016,9 @@ def main(): "coverage_stddev", "gc_content_stddev", ] - main_out[outcols].to_csv(args.output_binning, sep="\t", index=True, header=True) + main_out[outcols].to_csv( + args.output_binning, sep="\t", index=True, header=True, float_format="%.5f" + ) logger.info(f"Wrote binning results to {args.output_binning}") if args.output_main: # First after binning relevant assignments/metrics place contig physical annotations diff --git a/autometa/binning/unclustered_recruitment.py b/autometa/binning/unclustered_recruitment.py index 07cd5c1cf..f2606b3b3 100644 --- a/autometa/binning/unclustered_recruitment.py +++ b/autometa/binning/unclustered_recruitment.py @@ -50,6 +50,7 @@ import warnings import numpy as np import pandas as pd +import sys from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestClassifier @@ -574,7 +575,8 @@ def main(): ) prev_num_unclustered = bin_df[bin_df.cluster.isnull()].shape[0] if not prev_num_unclustered: - raise BinningError("No unclustered contigs are available to recruit!") + logger.warning("No unclustered contigs are available to recruit!") + sys.exit(0) markers_df = load_markers(fpath=args.markers, format="wide") logger.debug( @@ -626,10 +628,14 @@ def main(): ) # Write unclustered recruitment results into main bin df # index = 'contig', cols = [..., 'cluster', 'recruited_cluster', ...] - main_df.to_csv(args.output_binning, sep="\t", index=True, header=True) + main_df.to_csv( + args.output_binning, sep="\t", index=True, header=True, float_format="%.5f" + ) if args.output_main: # Outputs features matrix used as input to recruitment algorithm - features.to_csv(args.output_main, sep="\t", index=True, header=True) + features.to_csv( + args.output_main, sep="\t", index=True, header=True, float_format="%.5f" + ) if __name__ == "__main__": diff --git a/autometa/common/external/hmmer.py b/autometa/common/external/hmmscan.py similarity index 99% rename from autometa/common/external/hmmer.py rename to autometa/common/external/hmmscan.py index 930c6ce0c..4a2fd6a70 100644 --- a/autometa/common/external/hmmer.py +++ b/autometa/common/external/hmmscan.py @@ -35,7 +35,6 @@ import pandas as pd from glob import glob -from Bio import SeqIO from autometa.common.external import prodigal @@ -124,7 +123,7 @@ def annotate_sequential(orfs, hmmdb, outfpath, cpus, seed=42): raise err -def hmmscan( +def run( orfs, hmmdb, outfpath, @@ -329,7 +328,7 @@ def main(): ): result = args.hmmscan else: - result = hmmscan( + result = run( orfs=args.orfs, hmmdb=args.hmmdb, outfpath=args.hmmscan, diff --git a/autometa/common/external/hmmsearch.py b/autometa/common/external/hmmsearch.py new file mode 100644 index 000000000..2e5b54f5f --- /dev/null +++ b/autometa/common/external/hmmsearch.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +COPYRIGHT +Copyright 2020 Ian J. Miller, Evan R. Rees, Kyle Wolf, Siddharth Uppal, +Shaurya Chanana, Izaak Miller, Jason C. Kwan + +This file is part of Autometa. 
+ +Autometa is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +Autometa is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with Autometa. If not, see . +COPYRIGHT + +Module to filter the domtbl file from hmmsearch --domtblout using provided cutoffs +""" + + +import os +import logging + +import pandas as pd + +from autometa.common.external import prodigal + + +logger = logging.getLogger(__name__) + + +def filter_domtblout( + infpath: str, cutoffs: str, orfs: str, outfpath: str = None +) -> pd.DataFrame: + # TODO::memo: Add docstring + for fp in [infpath, cutoffs]: + if not os.path.exists(fp): + raise FileNotFoundError(fp) + if os.path.exists(outfpath) and os.path.getsize(outfpath): + raise FileExistsError(f"{outfpath} already exists") + col_indices = [0, 3, 4, 7] + col_names = ["orf", "sname", "sacc", "score"] + df = pd.read_csv( + infpath, + sep=r"\s+", + usecols=col_indices, + names=col_names, + header=None, + comment="#", + ) + # Regex: \..*$ --> search for a '.' with 0 or more trailing characters until the end of the `sacc` string + # e.g. PF03946.9 --> PF03946 + df["cleaned_sacc"] = df["sacc"].str.replace(r"\..*$", "", regex=True) + logger.debug(f"{df.sacc.nunique()} unique accessions for {df.orf.nunique()} orfs") + dff = pd.read_csv(cutoffs, sep="\t", index_col="accession") + mdf = pd.merge(df, dff, how="left", left_on="cleaned_sacc", right_on="accession") + mdf = mdf[mdf["score"] >= mdf["cutoff"]] + logger.debug( + f"{mdf.orf.nunique()} orfs contained {mdf.shape[0]} markers ({mdf.sacc.nunique()} unique)" + ) + cols = ["orf", "sacc", "sname", "score", "cutoff"] + mdf = mdf[cols] + # Add orf to contig mapping using header descriptions from prodigal + if mdf.empty: + mdf["contig"] = pd.NA + else: + translations = prodigal.contigs_from_headers(orfs) + + def translater(x): + return translations.get(x, x.rsplit("_", 1)[0]) + + mdf["contig"] = mdf["orf"].map(translater) + + mdf.set_index("contig", inplace=True) + if outfpath: + mdf.to_csv(outfpath, sep="\t", index=True, header=True) + logger.debug(f"Wrote filtered markers table to: {outfpath}") + return mdf + + +def main(): + import argparse + import logging as logger + + logger.basicConfig( + format="[%(asctime)s %(levelname)s] %(name)s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + level=logger.DEBUG, + ) + parser = argparse.ArgumentParser( + description="Filters domtblout generated from hmmsearch using provided cutoffs" + ) + parser.add_argument( + "--domtblout", + help="Path to domtblout generated from hmmsearch -domtblout ... ", + required=True, + ) + parser.add_argument( + "--cutoffs", + help="Path to cutoffs corresponding to hmmfile used with hmmsearch ", + required=True, + ) + parser.add_argument( + "--seqdb", + help="Path to orfs seqdb used as input to hmmsearch ... 
", + required=True, + ) + parser.add_argument( + "--out", + help="Path to write table of markers passing provided cutoffs", + required=True, + ) + args = parser.parse_args() + + result = filter_domtblout( + infpath=args.infpath, + outfpath=args.markersout, + cutoffs=args.cutoffs, + orfs=args.orfs, + ) + + +if __name__ == "__main__": + main() diff --git a/autometa/common/markers.py b/autometa/common/markers.py index cb7e4df24..49dcabf6f 100644 --- a/autometa/common/markers.py +++ b/autometa/common/markers.py @@ -28,12 +28,12 @@ import logging import os -import multiprocessing as mp import pandas as pd -from autometa.common.external import hmmer +from autometa.common.external import hmmscan from autometa.config.utilities import DEFAULT_CONFIG + MARKERS_DIR = DEFAULT_CONFIG.get("databases", "markers") # For cases where autometa has not been configured, attempt to find the markers via source MARKERS_DIR = ( @@ -95,12 +95,14 @@ def load(fpath, format="wide"): def get( kingdom: str, orfs: str, - dbdir: str, + hmmdb: str, + cutoffs: str, + dbdir: str = MARKERS_DIR, scans: str = None, out: str = None, force: bool = False, format: str = "wide", - cpus: int = mp.cpu_count(), + cpus: int = 8, parallel: bool = True, gnu_parallel: bool = False, seed: int = 42, @@ -115,7 +117,11 @@ def get( orfs: str Path to amino-acid ORFs file dbdir: - Directory should contain hmmpressed marker genes database files. + Optional directory containing hmmdb and cutoffs files + hmmdb: + Path to marker genes database file, previously hmmpressed. + cutoffs: + Path to marker genes cutoff tsv. scans: str, optional Path to existing hmmscan table to filter by cutoffs out: str, optional @@ -150,15 +156,23 @@ def get( Why the exception is raised. """ kingdom = kingdom.lower() - hmmdb = os.path.join(dbdir, f"{kingdom}.single_copy.hmm") - cutoffs = os.path.join(dbdir, f"{kingdom}.single_copy.cutoffs") - hmmscan_fname = ".".join([kingdom, "hmmscan.tsv"]) + # if dbdir == MARKERS_DIR and hmmdb/cutoffs not set, use single dbdir + # else dbdir was set, so use it for both hmmdb and cutoffs + if dbdir == MARKERS_DIR: + if hmmdb is None: + hmmdb = os.path.join(dbdir, f"{kingdom}.single_copy.hmm") + if cutoffs is None: + cutoffs = os.path.join(dbdir, f"{kingdom}.single_copy.cutoffs") + else: + hmmdb = os.path.join(dbdir, f"{kingdom}.single_copy.hmm") + cutoffs = os.path.join(dbdir, f"{kingdom}.single_copy.cutoffs") + hmmscan_fname = f"{kingdom}.hmmscan.tsv" scans = ( os.path.join(os.path.dirname(os.path.abspath((orfs))), hmmscan_fname) if not scans else scans ) - markers_fname = ".".join([kingdom, "markers.tsv"]) + markers_fname = f"{kingdom}.markers.tsv" out = ( os.path.join(os.path.dirname(os.path.abspath((orfs))), markers_fname) if not out @@ -167,7 +181,7 @@ def get( kingdom = kingdom.lower() if not os.path.exists(scans) or not os.path.getsize(scans): - scans = hmmer.hmmscan( + scans = hmmscan.run( orfs=orfs, hmmdb=hmmdb, outfpath=scans, @@ -179,7 +193,7 @@ def get( ) if not os.path.exists(out) or not os.path.getsize(out): - out = hmmer.filter_markers( + out = hmmscan.filter_markers( infpath=scans, outfpath=out, cutoffs=cutoffs, @@ -225,6 +239,14 @@ def main(): help="Path to directory containing the single-copy marker HMM databases.", default=MARKERS_DIR, ) + parser.add_argument( + "--hmmdb", + help="Path to single-copy marker HMM databases.", + ) + parser.add_argument( + "--cutoffs", + help="Path to single-copy marker cutoff tsv.", + ) parser.add_argument( "--force", help="Whether to overwrite existing provided annotations.", @@ -246,7 
+268,7 @@ def main(): parser.add_argument( "--cpus", help=f"Number of cores to use for parallel execution.", - default=mp.cpu_count(), + default=8, type=int, ) parser.add_argument( @@ -260,7 +282,9 @@ def main(): get( kingdom=args.kingdom, orfs=args.orfs, + hmmdb=args.hmmdb, dbdir=args.dbdir, + cutoffs=args.cutoffs, scans=args.hmmscan, out=args.out, force=args.force, diff --git a/autometa/databases/markers/archaea.single_copy.hmm.md5 b/autometa/databases/markers/archaea.single_copy.hmm.md5 index 79f7742a2..eb7b2be3f 100644 --- a/autometa/databases/markers/archaea.single_copy.hmm.md5 +++ b/autometa/databases/markers/archaea.single_copy.hmm.md5 @@ -1 +1 @@ -d80561bc637bfe65ffb6a975bfa07b8e autometa/databases/markers/archaea.single_copy.hmm +d80561bc637bfe65ffb6a975bfa07b8e archaea.single_copy.hmm diff --git a/autometa/databases/markers/bacteria.single_copy.hmm.md5 b/autometa/databases/markers/bacteria.single_copy.hmm.md5 index ccbbc9d0a..6ee9e8343 100644 --- a/autometa/databases/markers/bacteria.single_copy.hmm.md5 +++ b/autometa/databases/markers/bacteria.single_copy.hmm.md5 @@ -1 +1 @@ -3a32f37b3e269b6cc4fc0e440984c3d8 autometa/databases/markers/bacteria.single_copy.hmm +3a32f37b3e269b6cc4fc0e440984c3d8 bacteria.single_copy.hmm diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py new file mode 100755 index 000000000..71394528c --- /dev/null +++ b/bin/check_samplesheet.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python + +# TODO nf-core: Update the script to check the samplesheet +# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + +import os +import sys +import errno +import argparse + + +def parse_args(args=None): + Description = "Reformat nf-core/autometa samplesheet file and check its contents." 
+ Epilog = "Example usage: python check_samplesheet.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument("FILE_IN", help="Input samplesheet file.") + parser.add_argument("FILE_OUT", help="Output file.") + return parser.parse_args(args) + + +def make_dir(path): + if len(path) > 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise exception + + +def print_error(error, context="Line", context_str=""): + error_str = "ERROR: Please check samplesheet -> {}".format(error) + if context != "" and context_str != "": + error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( + error, context.strip(), context_str.strip() + ) + print(error_str) + sys.exit(1) + + +# TODO nf-core: Update the check_samplesheet function +def check_samplesheet(file_in, file_out): + """ + This function checks that the samplesheet follows the following structure: + + sample,fastq_1,fastq_2 + SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz + SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz + SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, + + For an example see: + https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv + """ + + sample_mapping_dict = {} + with open(file_in, "r") as fin: + + ## Check header + MIN_COLS = 2 + # TODO nf-core: Update the column names for the input samplesheet + HEADER = ["sample", "fastq_1", "fastq_2"] + header = [x.strip('"') for x in fin.readline().strip().split(",")] + if header[: len(HEADER)] != HEADER: + print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) + sys.exit(1) + + ## Check sample entries + for line in fin: + lspl = [x.strip().strip('"') for x in line.strip().split(",")] + + # Check valid number of columns per row + if len(lspl) < len(HEADER): + print_error( + "Invalid number of columns (minimum = {})!".format(len(HEADER)), + "Line", + line, + ) + num_cols = len([x for x in lspl if x]) + if num_cols < MIN_COLS: + print_error( + "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), + "Line", + line, + ) + + ## Check sample name entries + sample, fastq_1, fastq_2 = lspl[: len(HEADER)] + sample = sample.replace(" ", "_") + if not sample: + print_error("Sample entry has not been specified!", "Line", line) + + ## Check FastQ file extension + for fastq in [fastq_1, fastq_2]: + if fastq: + if fastq.find(" ") != -1: + print_error("FastQ file contains spaces!", "Line", line) + if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): + print_error( + "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", + "Line", + line, + ) + + ## Auto-detect paired-end/single-end + sample_info = [] ## [single_end, fastq_1, fastq_2] + if sample and fastq_1 and fastq_2: ## Paired-end short reads + sample_info = ["0", fastq_1, fastq_2] + elif sample and fastq_1 and not fastq_2: ## Single-end short reads + sample_info = ["1", fastq_1, fastq_2] + else: + print_error("Invalid combination of columns provided!", "Line", line) + + ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } + if sample not in sample_mapping_dict: + sample_mapping_dict[sample] = [sample_info] + else: + if sample_info in sample_mapping_dict[sample]: + print_error("Samplesheet contains duplicate rows!", "Line", line) + else: + sample_mapping_dict[sample].append(sample_info) + + ## Write validated samplesheet with appropriate columns + 
if len(sample_mapping_dict) > 0: + out_dir = os.path.dirname(file_out) + make_dir(out_dir) + with open(file_out, "w") as fout: + fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") + for sample in sorted(sample_mapping_dict.keys()): + + ## Check that multiple runs of the same sample are of the same datatype + if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): + print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample)) + + for idx, val in enumerate(sample_mapping_dict[sample]): + fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") + else: + print_error("No entries to process!", "Samplesheet: {}".format(file_in)) + + +def main(args=None): + args = parse_args(args) + check_samplesheet(args.FILE_IN, args.FILE_OUT) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py new file mode 100755 index 000000000..7361e8ce5 --- /dev/null +++ b/bin/scrape_software_versions.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +from __future__ import print_function +import os + +results = {} +version_files = [x for x in os.listdir(".") if x.endswith(".version.txt")] +for version_file in version_files: + + software = version_file.replace(".version.txt", "") + if software == "pipeline": + software = "nf-core/autometa" + + with open(version_file) as fin: + version = fin.read().strip() + results[software] = version + +# Dump to YAML +print( + """ +id: 'software_versions' +section_name: 'Autometa Software Versions' +section_href: 'https://github.com/KwanLab/Autometa' +plot_type: 'html' +description: 'are collected at run time from the software output.' +data: | +
+""" +) +for k, v in sorted(results.items()): + print("
<dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v)) +print("    </dl>
") + +# Write out regexes as csv file: +with open("software_versions.tsv", "w") as f: + for k, v in sorted(results.items()): + f.write("{}\t{}\n".format(k, v)) diff --git a/conf/README.md b/conf/README.md new file mode 100644 index 000000000..4887e2287 --- /dev/null +++ b/conf/README.md @@ -0,0 +1,13 @@ +.config files are prefixed by numbers (e.g. "01_optional_autometa.config") + +This is the order in which the files are loaded from `~/Autometa/nextflow.config. + +If a parameters is defined more than once, the one loaded last will be used. + + +Four config files are not numbered and are nf-core based files + + - `base.config` + - `modules.config` + - `test_full.config` + - `test.config` diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 000000000..656467270 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,56 @@ +/* +======================================================================================== + Nextflow base config file +======================================================================================== + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. +---------------------------------------------------------------------------------------- +*/ + +process { + + // Check the defaults for all processes + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + // Process-specific resource requirements + // NOTE - Please try and re-use the labels below as much as possible. + // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. + // If possible, it would be nice to keep the same label naming convention when + // adding in your local modules too. + // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_low { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 8 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 16 * task.attempt, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } +} diff --git a/conf/igenomes.config b/conf/igenomes.config new file mode 100644 index 000000000..855948def --- /dev/null +++ b/conf/igenomes.config @@ -0,0 +1,432 @@ +/* +======================================================================================== + Nextflow config file for iGenomes paths +======================================================================================== + Defines reference genomes using iGenome paths. 
+ Can be used by any config that customises the base path using: + $params.igenomes_base / --igenomes_base +---------------------------------------------------------------------------------------- +*/ + +params { + // illumina iGenomes reference file paths + genomes { + 'GRCh37' { + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" + mito_name = "MT" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" + } + 'GRCh38' { + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" + mito_name = "chrM" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + } + 'GRCm38' { + fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" + mito_name = "MT" + macs_gsize = "1.87e9" + blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" + } + 'TAIR10' { + fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" + mito_name = "Mt" + } + 'EB2' { + fasta = 
"${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" + } + 'UMD3.1' { + fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" + mito_name = "MT" + } + 'WBcel235' { + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" + mito_name = "MtDNA" + macs_gsize = "9e7" + } + 'CanFam3.1' { + fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" + mito_name = "MT" + } + 'GRCz10' { + fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" + gtf = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" + mito_name = "MT" + } + 'BDGP6' { + fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" + mito_name = "M" + macs_gsize = "1.2e8" + } + 'EquCab2' { + fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" + mito_name = "MT" + } + 'EB1' { + fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" + } + 'Galgal4' { + fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" + mito_name = "MT" + } + 'Gm01' { + fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" + star = 
"${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" + } + 'Mmul_1' { + fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" + mito_name = "MT" + } + 'IRGSP-1.0' { + fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" + mito_name = "Mt" + } + 'CHIMP2.1.4' { + fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" + mito_name = "MT" + } + 'Rnor_5.0' { + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" + mito_name = "MT" + } + 'Rnor_6.0' { + fasta = 
"${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" + mito_name = "MT" + } + 'R64-1-1' { + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" + mito_name = "MT" + macs_gsize = "1.2e7" + } + 'EF2' { + fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" + mito_name = "MT" + macs_gsize = "1.21e7" + } + 'Sbi1' { + fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" + } + 'Sscrofa10.2' { + fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" 
+ gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" + mito_name = "MT" + } + 'AGPv3' { + fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" + mito_name = "Mt" + } + 'hg38' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" + mito_name = "chrM" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + } + 'hg19' { + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" + mito_name = "chrM" + macs_gsize = "2.7e9" + blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" + } + 'mm10' { + fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" + mito_name = "chrM" + macs_gsize = "1.87e9" + blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" + } + 'bosTau8' { + fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" + star = 
"${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" + mito_name = "chrM" + } + 'ce10' { + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" + mito_name = "chrM" + macs_gsize = "9e7" + } + 'canFam3' { + fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" + mito_name = "chrM" + } + 'danRer10' { + fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" + mito_name = "chrM" + macs_gsize = "1.37e9" + } + 'dm6' { + fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" + mito_name = "chrM" + macs_gsize = "1.2e8" + } + 'equCab2' { + fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/genome.fa" + 
bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" + mito_name = "chrM" + } + 'galGal4' { + fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" + mito_name = "chrM" + } + 'panTro4' { + fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" + mito_name = "chrM" + } + 'rn6' { + fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" + mito_name = "chrM" + } + 'sacCer3' { + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" + readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" + mito_name = "chrM" + macs_gsize = "1.2e7" + } + 'susScr3' { + fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" + bwa = 
"${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/genome.fa" + bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" + star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" + bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" + gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" + bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" + readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" + mito_name = "chrM" + } + } +} diff --git a/conf/modules.config b/conf/modules.config new file mode 100644 index 000000000..46d938cbc --- /dev/null +++ b/conf/modules.config @@ -0,0 +1,101 @@ +/* +======================================================================================== + Config file for defining DSL2 per module options +======================================================================================== + Available keys to override module options: + args = Additional arguments appended to command in module. + args2 = Second set of arguments appended to command in module (multi-tool modules). + args3 = Third set of arguments appended to command in module (multi-tool modules). + publish_dir = Directory to publish results. + publish_by_meta = Groovy list of keys available in meta map to append as directories to "publish_dir" path + If publish_by_meta = true - Value of ${meta['id']} is appended as a directory to "publish_dir" path + If publish_by_meta = ['id', 'custompath'] - If "id" is in meta map and "custompath" isn't then "${meta['id']}/custompath/" + is appended as a directory to "publish_dir" path + If publish_by_meta = false / null - No directories are appended to "publish_dir" path + publish_files = Groovy map where key = "file_ext" and value = "directory" to publish results for that file extension + The value of "directory" is appended to the standard "publish_dir" path as defined above. + If publish_files = null (unspecified) - All files are published. + If publish_files = false - No files are published. + suffix = File name suffix for output files. 
+---------------------------------------------------------------------------------------- +*/ +params { + modules { + 'analyze_kmers_options' { + publish_by_meta = ['id'] + publish_dir = "kmer_analysis" + } + 'binning_options' { + publish_by_meta = ['id'] + publish_dir = "binning_results/bins" + } + 'binning_summary_options' { + publish_by_meta = ['id'] + publish_dir = "binning_results/binning_summary" + } + 'diamond_blastp_options' { + args = "--evalue 1e-5 --max-target-seqs 200 -b 6 --outfmt 6" + publish_by_meta = ['id'] + publish_dir = "diamond_blastp_results" + } + 'hmmsearch_options' { + args = "-Z 150 --cpu 1 --seed 42" + args2 = "" + } + 'hmmsearch_filter_options' { + args = "" + } + 'merge_hmmsearch_options'{ + publish_by_meta = ['id'] + publish_dir = "hmmsearch" + } + 'majority_vote_options' { + publish_by_meta = ['id'] + } + 'merge_kmers_embedded_options'{ + publish_by_meta = ['id'] + publish_dir = "kmers_embedded" + } + 'merge_kmers_normalized_options'{ + publish_by_meta = ['id'] + publish_dir = "kmers_normalized" + } + 'prodigal_options' { + publish_by_meta = ['id'] + args = "-p meta -m" + publish_dir = "prodigal" + } + 'diamond_makedb_options' { + publish_by_meta = ['id'] + args = "" + } + 'samtools_viewsort_options' { + args = "" + args2 = "" + publish_by_meta = ['id'] + publish_dir = "samtools_sort" + } + 'seqkit_split_options' { + publish_by_meta = ['id'] + args = "-p ${params.num_splits}" + args2 = "--two-pass" + } + 'spades_kmer_coverage' { + publish_by_meta = ['id'] + publish_files = ['*.coverages.tsv':''] + publish_dir = "coverage" + } + 'split_kingdoms_options' { + publish_by_meta = ['id'] + } + 'taxon_assignment' { + publish_by_meta = ['id'] + } + 'unclustered_recruitment_options' { + publish_by_meta = ['id'] + publish_dir = "binning_results/unclustered_recruitment_results" + + } + } +} + diff --git a/conf/test.config b/conf/test.config new file mode 100644 index 000000000..571e13bab --- /dev/null +++ b/conf/test.config @@ -0,0 +1,31 @@ +// There is currently not a simple test profile. Remove this comment when there is. +// Below is the template for one. + +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. 
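+ + Note: the test profile is intended to be combined with a software profile when run, e.g. `-profile test,docker`.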
+ + Use as follows: + nextflow run autometa -profile test, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 2.h + + // Input data + // Specify the paths to your test data on nf-core/test-datasets + // Give any required params for the test so that command line flags are not needed + input = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/015/645/455/GCF_015645455.1_ASM1564545v1/GCF_015645455.1_ASM1564545v1_genomic.fna.gz' + mock_test = true + +} diff --git a/conf/test_full.config b/conf/test_full.config new file mode 100644 index 000000000..b57e635c4 --- /dev/null +++ b/conf/test_full.config @@ -0,0 +1,25 @@ +/* +======================================================================================== + Nextflow config file for running full-size tests +======================================================================================== + Defines input files and everything required to run a full-size pipeline test. + + Use as follows: + nextflow run autometa -profile test_full, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Full test profile' + config_profile_description = 'Full test dataset to check pipeline function' + max_cpus = 2 + max_memory = 6.GB + max_time = '0.5h' + + // Input data for full size test + // Specify the paths to your full test data (on nf-core/test-datasets or directly in repositories, e.g. SRA) + // Give any required params for the test so that command line flags are not needed + input = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/015/645/455/GCF_015645455.1_ASM1564545v1/GCF_015645455.1_ASM1564545v1_genomic.fna.gz' + parallel_split_fasta = false +} diff --git a/environment.yml b/environment.yml new file mode 100644 index 000000000..9d1fa1d87 --- /dev/null +++ b/environment.yml @@ -0,0 +1,12 @@ +name: autometa-nf + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - nextflow>=20.10 + - bioconda::nf-core>=2.1 + - python>=3.8 + diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy new file mode 100755 index 000000000..8d6920dd6 --- /dev/null +++ b/lib/NfcoreSchema.groovy @@ -0,0 +1,517 @@ +// +// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
+// + +import org.everit.json.schema.Schema +import org.everit.json.schema.loader.SchemaLoader +import org.everit.json.schema.ValidationException +import org.json.JSONObject +import org.json.JSONTokener +import org.json.JSONArray +import groovy.json.JsonSlurper +import groovy.json.JsonBuilder + +class NfcoreSchema { + + // + // Resolve Schema path relative to main workflow directory + // + public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { + return "${workflow.projectDir}/${schema_filename}" + } + + // + // Function to loop over all parameters defined in schema and check + // whether the given parameters adhere to the specifications + // + /* groovylint-disable-next-line UnusedPrivateMethodParameter */ + public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { + def has_error = false + //=====================================================================// + // Check for nextflow core params and unexpected params + def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text + def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') + def nf_params = [ + // Options for base `nextflow` command + 'bg', + 'c', + 'C', + 'config', + 'd', + 'D', + 'dockerize', + 'h', + 'log', + 'q', + 'quiet', + 'syslog', + 'v', + 'version', + + // Options for `nextflow run` command + 'ansi', + 'ansi-log', + 'bg', + 'bucket-dir', + 'c', + 'cache', + 'config', + 'dsl2', + 'dump-channels', + 'dump-hashes', + 'E', + 'entry', + 'latest', + 'lib', + 'main-script', + 'N', + 'name', + 'offline', + 'params-file', + 'pi', + 'plugins', + 'poll-interval', + 'pool-size', + 'profile', + 'ps', + 'qs', + 'queue-size', + 'r', + 'resume', + 'revision', + 'stdin', + 'stub', + 'stub-run', + 'test', + 'w', + 'with-charliecloud', + 'with-conda', + 'with-dag', + 'with-docker', + 'with-mpi', + 'with-notification', + 'with-podman', + 'with-report', + 'with-singularity', + 'with-timeline', + 'with-tower', + 'with-trace', + 'with-weblog', + 'without-docker', + 'without-podman', + 'work-dir' + ] + def unexpectedParams = [] + + // Collect expected parameters from the schema + def expectedParams = [] + for (group in schemaParams) { + for (p in group.value['properties']) { + expectedParams.push(p.key) + } + } + + for (specifiedParam in params.keySet()) { + // nextflow params + if (nf_params.contains(specifiedParam)) { + log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" + has_error = true + } + // unexpected params + def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' + def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } + def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() + def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) + if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { + // Temporarily remove camelCase/camel-case params #1035 + def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} + if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ + unexpectedParams.push(specifiedParam) + } + } + } + + //=====================================================================// + // Validate parameters against the schema + InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() + JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) + + // Remove anything that's in params.schema_ignore_params + raw_schema = removeIgnoredParams(raw_schema, params) + + Schema schema = SchemaLoader.load(raw_schema) + + // Clean the parameters + def cleanedParams = cleanParameters(params) + + // Convert to JSONObject + def jsonParams = new JsonBuilder(cleanedParams) + JSONObject params_json = new JSONObject(jsonParams.toString()) + + // Validate + try { + schema.validate(params_json) + } catch (ValidationException e) { + println '' + log.error 'ERROR: Validation of pipeline parameters failed!' + JSONObject exceptionJSON = e.toJSON() + printExceptions(exceptionJSON, params_json, log) + println '' + has_error = true + } + + // Check for unexpected parameters + if (unexpectedParams.size() > 0) { + Map colors = NfcoreTemplate.logColours(params.monochrome_logs) + println '' + def warn_msg = 'Found unexpected parameters:' + for (unexpectedParam in unexpectedParams) { + warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" + } + log.warn warn_msg + log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" + println '' + } + + if (has_error) { + System.exit(1) + } + } + + // + // Beautify parameters for --help + // + public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { + Map colors = NfcoreTemplate.logColours(params.monochrome_logs) + Integer num_hidden = 0 + String output = '' + output += 'Typical pipeline command:\n\n' + output += " ${colors.cyan}${command}${colors.reset}\n\n" + Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) + Integer max_chars = paramsMaxChars(params_map) + 1 + Integer desc_indent = max_chars + 14 + Integer dec_linewidth = 160 - desc_indent + for (group in params_map.keySet()) { + Integer num_params = 0 + String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (group_params.get(param).hidden && !params.show_hidden_params) { + num_hidden += 1 + continue; + } + def type = '[' + group_params.get(param).type + ']' + def description = group_params.get(param).description + def defaultValue = 
group_params.get(param).default ? " [default: " + group_params.get(param).default.toString() + "]" : '' + def description_default = description + colors.dim + defaultValue + colors.reset + // Wrap long description texts + // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap + if (description_default.length() > dec_linewidth){ + List olines = [] + String oline = "" // " " * indent + description_default.split(" ").each() { wrd -> + if ((oline.size() + wrd.size()) <= dec_linewidth) { + oline += wrd + " " + } else { + olines += oline + oline = wrd + " " + } + } + olines += oline + description_default = olines.join("\n" + " " * desc_indent) + } + group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' + num_params += 1 + } + group_output += '\n' + if (num_params > 0){ + output += group_output + } + } + if (num_hidden > 0){ + output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset + } + output += NfcoreTemplate.dashedLine(params.monochrome_logs) + return output + } + + // + // Groovy Map summarising parameters/workflow options used by the pipeline + // + public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { + // Get a selection of core Nextflow workflow options + def Map workflow_summary = [:] + if (workflow.revision) { + workflow_summary['revision'] = workflow.revision + } + workflow_summary['runName'] = workflow.runName + if (workflow.containerEngine) { + workflow_summary['containerEngine'] = workflow.containerEngine + } + if (workflow.container) { + workflow_summary['container'] = workflow.container + } + workflow_summary['launchDir'] = workflow.launchDir + workflow_summary['workDir'] = workflow.workDir + workflow_summary['projectDir'] = workflow.projectDir + workflow_summary['userName'] = workflow.userName + workflow_summary['profile'] = workflow.profile + workflow_summary['configFiles'] = workflow.configFiles.join(', ') + + // Get pipeline parameters defined in JSON Schema + def Map params_summary = [:] + def blacklist = ['hostnames'] + def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) + for (group in params_map.keySet()) { + def sub_params = new LinkedHashMap() + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (params.containsKey(param) && !blacklist.contains(param)) { + def params_value = params.get(param) + def schema_value = group_params.get(param).default + def param_type = group_params.get(param).type + if (schema_value != null) { + if (param_type == 'string') { + if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { + def sub_string = schema_value.replace('\$projectDir', '') + sub_string = sub_string.replace('\${projectDir}', '') + if (params_value.contains(sub_string)) { + schema_value = params_value + } + } + if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { + def sub_string = schema_value.replace('\$params.outdir', '') + sub_string = sub_string.replace('\${params.outdir}', '') + if ("${params.outdir}${sub_string}" == params_value) { + schema_value = params_value + } + } + } + } + + // We have a default in the schema, and this isn't it + if (schema_value != null && params_value != schema_value) { + sub_params.put(param, params_value) + } + // No default in the schema, and this isn't 
empty + else if (schema_value == null && params_value != "" && params_value != null && params_value != false) { + sub_params.put(param, params_value) + } + } + } + params_summary.put(group, sub_params) + } + return [ 'Core Nextflow options' : workflow_summary ] << params_summary + } + + // + // Beautify parameters for summary and return as string + // + public static String paramsSummaryLog(workflow, params) { + Map colors = NfcoreTemplate.logColours(params.monochrome_logs) + String output = '' + def params_map = paramsSummaryMap(workflow, params) + def max_chars = paramsMaxChars(params_map) + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + if (group_params) { + output += colors.bold + group + colors.reset + '\n' + for (param in group_params.keySet()) { + output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' + } + output += '\n' + } + } + output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" + output += NfcoreTemplate.dashedLine(params.monochrome_logs) + return output + } + + // + // Loop over nested exceptions and print the causingException + // + private static void printExceptions(ex_json, params_json, log) { + def causingExceptions = ex_json['causingExceptions'] + if (causingExceptions.length() == 0) { + def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ + // Missing required param + if (m.matches()) { + log.error "* Missing required parameter: --${m[0][1]}" + } + // Other base-level error + else if (ex_json['pointerToViolation'] == '#') { + log.error "* ${ex_json['message']}" + } + // Error with specific param + else { + def param = ex_json['pointerToViolation'] - ~/^#\// + def param_val = params_json[param].toString() + log.error "* --${param}: ${ex_json['message']} (${param_val})" + } + } + for (ex in causingExceptions) { + printExceptions(ex, params_json, log) + } + } + + // + // Remove an element from a JSONArray + // + private static JSONArray removeElement(json_array, element) { + def list = [] + int len = json_array.length() + for (int i=0;i<len;i++){ + list.add(json_array.get(i)) + } + list.remove(element) + JSONArray jsArray = new JSONArray(list) + return jsArray + } + + // + // Remove ignored parameters + // + private static JSONObject removeIgnoredParams(raw_schema, params) { + // Remove anything that's in params.schema_ignore_params + params.schema_ignore_params.split(',').each{ ignore_param -> + if(raw_schema.keySet().contains('definitions')){ + raw_schema.definitions.each { definition -> + for (key in definition.keySet()){ + if (definition[key].get("properties").keySet().contains(ignore_param)){ + // Remove the param to ignore + definition[key].get("properties").remove(ignore_param) + // If the param was required, change this + if (definition[key].has("required")) { + def cleaned_required = removeElement(definition[key].required, ignore_param) + definition[key].put("required", cleaned_required) + } + } + } + } + } + if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { + raw_schema.get("properties").remove(ignore_param) + } + if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { + def cleaned_required = removeElement(raw_schema.required, ignore_param) + raw_schema.put("required", cleaned_required) + } + } + return raw_schema + } + + // + // Clean and check parameters relative to Nextflow native classes + // + private static Map cleanParameters(params) { + def new_params = params.getClass().newInstance(params) + for (p in params) { + // remove anything evaluating to false + if (!p['value']) { + new_params.remove(p.key) + } + // Cast MemoryUnit to String + if (p['value'].getClass() == nextflow.util.MemoryUnit) { +
new_params.replace(p.key, p['value'].toString()) + } + // Cast Duration to String + if (p['value'].getClass() == nextflow.util.Duration) { + new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) + } + // Cast LinkedHashMap to String + if (p['value'].getClass() == LinkedHashMap) { + new_params.replace(p.key, p['value'].toString()) + } + } + return new_params + } + + // + // This function tries to read a JSON params file + // + private static LinkedHashMap paramsLoad(String json_schema) { + def params_map = new LinkedHashMap() + try { + params_map = paramsRead(json_schema) + } catch (Exception e) { + println "Could not read parameters settings from JSON. $e" + params_map = new LinkedHashMap() + } + return params_map + } + + // + // Method to actually read in JSON file using Groovy. + // Group (as Key), values are all parameters + // - Parameter1 as Key, Description as Value + // - Parameter2 as Key, Description as Value + // .... + // Group + // - + private static LinkedHashMap paramsRead(String json_schema) throws Exception { + def json = new File(json_schema).text + def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') + def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') + /* Tree looks like this in nf-core schema + * definitions <- this is what the first get('definitions') gets us + group 1 + title + description + properties + parameter 1 + type + description + parameter 2 + type + description + group 2 + title + description + properties + parameter 1 + type + description + * properties <- parameters can also be ungrouped, outside of definitions + parameter 1 + type + description + */ + + // Grouped params + def params_map = new LinkedHashMap() + schema_definitions.each { key, val -> + def Map group = schema_definitions."$key".properties // Gets the property object of the group + def title = schema_definitions."$key".title + def sub_params = new LinkedHashMap() + group.each { innerkey, value -> + sub_params.put(innerkey, value) + } + params_map.put(title, sub_params) + } + + // Ungrouped params + def ungrouped_params = new LinkedHashMap() + schema_properties.each { innerkey, value -> + ungrouped_params.put(innerkey, value) + } + params_map.put("Other parameters", ungrouped_params) + + return params_map + } + + // + // Get maximum number of characters across all parameter names + // + private static Integer paramsMaxChars(params_map) { + Integer max_chars = 0 + for (group in params_map.keySet()) { + def group_params = params_map.get(group) // This gets the parameters of that particular group + for (param in group_params.keySet()) { + if (param.size() > max_chars) { + max_chars = param.size() + } + } + } + return max_chars + } +} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy new file mode 100755 index 000000000..44551e0a3 --- /dev/null +++ b/lib/NfcoreTemplate.groovy @@ -0,0 +1,270 @@ +// +// This file holds several functions used within the nf-core pipeline template. +// + +import org.yaml.snakeyaml.Yaml + +class NfcoreTemplate { + + // + // Check AWS Batch related parameters have been specified correctly + // + public static void awsBatch(workflow, params) { + if (workflow.profile.contains('awsbatch')) { + // Check params.awsqueue and params.awsregion have been set if running on AWSBatch + assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 
+ // Check outdir paths to be S3 buckets if running on AWSBatch + assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + } + } + + // + // Check params.hostnames + // + public static void hostName(workflow, params, log) { + Map colors = logColours(params.monochrome_logs) + if (params.hostnames) { + try { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.info "=${colors.yellow}====================================================${colors.reset}=\n" + + "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + + " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + + "=${colors.yellow}====================================================${colors.reset}=" + } + } + } + } catch (Exception e) { + log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." + } + } + } + + // + // Construct and send completion email + // + public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) { + + // Set up the e-mail variables + def subject = "[$workflow.manifest.name] Successful: $workflow.runName" + if (!workflow.success) { + subject = "[$workflow.manifest.name] FAILED: $workflow.runName" + } + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['Date Started'] = workflow.start + misc_fields['Date Completed'] = workflow.complete + misc_fields['Pipeline script file path'] = workflow.scriptFile + misc_fields['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision + misc_fields['Nextflow Version'] = workflow.nextflow.version + misc_fields['Nextflow Build'] = workflow.nextflow.build + misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary << misc_fields + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { + if (mqc_report.size() > 1) { + log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" + } + mqc_report = mqc_report[0] + } + } + } catch (all) { + if (multiqc_report) { + log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" + } + } + + // Check if we are only sending emails on failure + def email_address = params.email + if (!params.email && 
params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$projectDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$projectDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] + def sf = new File("$projectDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + Map colors = logColours(params.monochrome_logs) + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + } + + // + // Print pipeline summary on completion + // + public static void summary(workflow, params, log) { + Map colors = logColours(params.monochrome_logs) + if (workflow.success) { + if (workflow.stats.ignoredCount == 0) { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" + } else { + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + } + } else { + hostName(workflow, params, log) + log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" + } + } + + // + // ANSII Colours used for terminal logging + // + public static Map logColours(Boolean monochrome_logs) { + Map colorcodes = [:] + + // Reset / Meta + colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" + colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" + colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" + colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" + colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" + colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" + colorcodes['hidden'] = monochrome_logs ? 
'' : "\033[8m" + + // Regular Colors + colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" + colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" + colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" + colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" + colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" + colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" + colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" + colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" + + // Bold + colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" + colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" + colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" + colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" + colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" + colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" + colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" + colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" + + // Underline + colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" + colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" + colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" + colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" + colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" + colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" + colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" + colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" + + // High Intensity + colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" + colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" + colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" + colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" + colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" + colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" + colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" + colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" + + // Bold High Intensity + colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" + colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" + colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" + colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" + colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" + colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" + colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" + colorcodes['biwhite'] = monochrome_logs ? 
'' : "\033[1;97m" + + return colorcodes + } + + // + // Does what is says on the tin + // + public static String dashedLine(monochrome_logs) { + Map colors = logColours(monochrome_logs) + return "-${colors.dim}----------------------------------------------------${colors.reset}-" + } + + // + // nf-core logo + // + public static String logo(workflow, monochrome_logs) { + Map colors = logColours(monochrome_logs) + String.format( + """\n + ${dashedLine(monochrome_logs)} + ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} + ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} + ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} + ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} + ${colors.green}`._,._,\'${colors.reset} + ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${dashedLine(monochrome_logs)} + """.stripIndent() + ) + } +} diff --git a/lib/Utils.groovy b/lib/Utils.groovy new file mode 100755 index 000000000..18173e985 --- /dev/null +++ b/lib/Utils.groovy @@ -0,0 +1,47 @@ +// +// This file holds several Groovy functions that could be useful for any Nextflow pipeline +// + +import org.yaml.snakeyaml.Yaml + +class Utils { + + // + // When running with -profile conda, warn if channels have not been set-up appropriately + // + public static void checkCondaChannels(log) { + Yaml parser = new Yaml() + def channels = [] + try { + def config = parser.load("conda config --show channels".execute().text) + channels = config.channels + } catch(NullPointerException | IOException e) { + log.warn "Could not verify conda channel configuration." + return + } + + // Check that all channels are present + def required_channels = ['conda-forge', 'bioconda', 'defaults'] + def conda_check_failed = !required_channels.every { ch -> ch in channels } + + // Check that they are in the right order + conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) + conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + + if (conda_check_failed) { + log.warn "=============================================================================\n" + + " There is a problem with your Conda configuration!\n\n" + + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + + " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + + " NB: The order of the channels matters!\n" + + "===================================================================================" + } + } + + // + // Join module args with appropriate spacing + // + public static String joinModuleArgs(args_list) { + return ' ' + args_list.join(' ') + } +} diff --git a/lib/WorkflowAutometa.groovy b/lib/WorkflowAutometa.groovy new file mode 100755 index 000000000..e66120fa0 --- /dev/null +++ b/lib/WorkflowAutometa.groovy @@ -0,0 +1,59 @@ +// +// This file holds several functions specific to the workflow/autometa.nf in the nf-core/autometa pipeline +// + +class WorkflowAutometa { + + // + // Check and validate parameters + // + public static void initialise(params, log) { + genomeExistsError(params, log) + + if (!params.fasta) { + log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
+ System.exit(1) + } + } + + // + // Get workflow summary for MultiQC + // + public static String paramsSummaryMultiqc(workflow, summary) { + String summary_section = '' + for (group in summary.keySet()) { + def group_params = summary.get(group) // This gets the parameters of that particular group + if (group_params) { + summary_section += "    <p style=\"font-size:110%\"><b>$group</b></p>\n" + summary_section += "    <dl class=\"dl-horizontal\">\n" + for (param in group_params.keySet()) { + summary_section += "        <dt>$param</dt><dd><samp>${group_params.get(param) ?: 'N/A'}</samp></dd>\n" + } + summary_section += "    </dl>
\n" + } + } + + String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" + yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" + yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" + yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" + yaml_file_text += "plot_type: 'html'\n" + yaml_file_text += "data: |\n" + yaml_file_text += "${summary_section}" + return yaml_file_text + } + + // + // Exit pipeline if incorrect --genome key provided + // + private static void genomeExistsError(params, log) { + if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { + log.error "=============================================================================\n" + + " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + + " Currently, the available genome keys are:\n" + + " ${params.genomes.keySet().join(", ")}\n" + + "===================================================================================" + System.exit(1) + } + } +} diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy new file mode 100755 index 000000000..e34547e0a --- /dev/null +++ b/lib/WorkflowMain.groovy @@ -0,0 +1,94 @@ +// +// This file holds several functions specific to the main.nf workflow in the nf-core/autometa pipeline +// + +class WorkflowMain { + + // + // Citation string for pipeline + // + public static String citation(workflow) { + return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + + // TODO nf-core: Add Zenodo DOI for pipeline after first release + //"* The pipeline\n" + + //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + + "* The nf-core framework\n" + + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + + "* Software dependencies\n" + + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" + } + + // + // Print help to screen if required + // + public static String help(workflow, params, log) { + def command = "nf-core launch KwanLab/Autometa" + def help_string = '' + help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) + help_string += NfcoreSchema.paramsHelp(workflow, params, command) + help_string += '\n' + citation(workflow) + '\n' + help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) + return help_string + } + + // + // Print parameter summary log to screen + // + public static String paramsSummaryLog(workflow, params, log) { + def summary_log = '' + summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) + summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) + summary_log += '\n' + citation(workflow) + '\n' + summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) + return summary_log + } + + // + // Validate parameters and print summary to screen + // + public static void initialise(workflow, params, log) { + // Print help to screen if required + if (params.help) { + log.info help(workflow, params, log) + System.exit(0) + } + + // Validate workflow parameters via the JSON schema + if (params.validate_params) { + NfcoreSchema.validateParameters(workflow, params, log) + } + + // Print parameter summary log to screen + log.info paramsSummaryLog(workflow, params, log) + + // Check that conda channels are set-up correctly + if (params.enable_conda) { + Utils.checkCondaChannels(log) + } + + // Check AWS batch settings + NfcoreTemplate.awsBatch(workflow, params) + + // Check the hostnames against configured 
profiles + NfcoreTemplate.hostName(workflow, params, log) + + // Check input has been provided + if (!params.input) { + log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" + System.exit(1) + } + } + + // + // Get attribute from genome config file e.g. fasta + // + public static String getGenomeAttribute(params, attribute) { + def val = '' + if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { + if (params.genomes[ params.genome ].containsKey(attribute)) { + val = params.genomes[ params.genome ][ attribute ] + } + } + return val + } +} diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar new file mode 100644 index 000000000..805c8bb5e Binary files /dev/null and b/lib/nfcore_external_java_deps.jar differ diff --git a/main.nf b/main.nf index 2dca3be87..2227c1795 100644 --- a/main.nf +++ b/main.nf @@ -1,85 +1,69 @@ #!/usr/bin/env nextflow + +/* +======================================================================================== + Autometa +======================================================================================== + Autometa's Nextflow Analysis Pipeline + Github : https://github.com/KwanLab/Autometa + Documentation : https://autometa.readthedocs.io/en/latest/ +---------------------------------------------------------------------------------------- +*/ + nextflow.enable.dsl = 2 -include { AUTOMETA } from './nextflow/autometa.nf' +/* +======================================================================================== + VALIDATE & PRINT PARAMETER SUMMARY +======================================================================================== +*/ + +WorkflowMain.initialise(workflow, params, log) -// Below listed parameters should be provided by the parameters.config file -// Available here: https://raw.githubusercontent.com/KwanLab/Autometa/dev/nextflow/parameters.config -// -// Check User data inputs -params.metagenome = null -if ( !params.metagenome || params.metagenome instanceof Boolean ) -error """ -You must supply the `metagenome` parameter in the config or on the command line! -e.g. -nextflow run main.nf -c parameters.config --metagenome "" -""" -// Where to store intermediate and final results: -params.interim = null -if ( !params.interim || params.interim instanceof Boolean ) -error """ -You must supply the `--interim` parameter in the config or on the command line! -e.g. -nextflow run main.nf -c parameters.config --interim """ -""" -params.processed = null -if ( !params.processed || params.processed instanceof Boolean ) -error """ -You must supply the `--processed` parameter in the config or on the command line! -e.g. 
-nextflow run main.nf -c parameters.config --processed """ -""" +//////////////////////////////////////////////////// +/* -- VALIDATE PARAMETERS -- */ +//////////////////////////////////////////////////// -log.info """ +if (params.use_run_name){ + params.interim_dir_internal = "${params.interim_dir}/autometa_interim_dir/${workflow.runName}/${workflow.sessionId}" // Intermediate results directory + params.outdir_internal = "${params.outdir}/autometa_outdir/${workflow.runName}/${workflow.sessionId}" // Final results directory +} else { + params.interim_dir_internal = "${params.interim_dir}/autometa_interim_dir/${workflow.sessionId}" // Intermediate results directory + params.outdir_internal = "${params.outdir}/autometa_outdir/${workflow.sessionId}" // Final results directory +} - Autometa - Automated Extraction of Genomes from Shotgun Metagenomes - ===================================================== - projectDir : ${workflow.projectDir} - ----------------------------------------------------- - Data - ----------------------------------------------------- - metagenome : ${params.metagenome} - interim : ${params.interim} - processed : ${params.processed} - ----------------------------------------------------- - Parameters - ----------------------------------------------------- - cpus : ${params.cpus} - length_cutoff : ${params.length_cutoff} - kmer_size : ${params.kmer_size} - norm_method : ${params.norm_method} - pca_dimensions : ${params.pca_dimensions} - embedding_method : ${params.embedding_method} - embedding_dimensions : ${params.embedding_dimensions} - clustering_method : ${params.clustering_method} - classification_kmer_pca_dimensions : ${params.classification_kmer_pca_dimensions} - classification_method : ${params.classification_method} - completeness : ${params.completeness} - purity : ${params.purity} - gc_stddev_limit : ${params.gc_stddev_limit} - cov_stddev_limit : ${params.cov_stddev_limit} - kingdom : ${params.kingdom} - ----------------------------------------------------- - Databases - ----------------------------------------------------- - ncbi_database : ${params.ncbi_database} - ----------------------------------------------------- +println """ +-------------------------------------------- +Output files will be found here: +Intermediate results directory: ${params.interim_dir_internal} +Binning results directory: ${params.outdir_internal} +-------------------------------------------- +\n """ -workflow { - Channel - .fromPath(params.metagenome, checkIfExists: true, type: 'file') - .set{unfiltered_metagenome_ch} +/* +======================================================================================== + NAMED WORKFLOW FOR PIPELINE +======================================================================================== +*/ - AUTOMETA(unfiltered_metagenome_ch) -} +include { AUTOMETA } from './workflows/autometa.nf' addParams(single_db_dir: params.single_db_dir) /* - * completion handler - */ -workflow.onComplete { - log.info ( workflow.success ? "\nDone!\n" : "Oops .. 
something went wrong" ) +======================================================================================== + Run Autometa +======================================================================================== +*/ + +workflow { + AUTOMETA() } + +/* +======================================================================================== + THE END +======================================================================================== +*/ diff --git a/modules.json b/modules.json new file mode 100644 index 000000000..b08257f69 --- /dev/null +++ b/modules.json @@ -0,0 +1,20 @@ +{ + "name": "nf-core/autometa", + "homePage": "https://github.com/nf-core/autometa", + "repos": { + "nf-core/modules": { + "prodigal": { + "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" + }, + "bowtie2/align": { + "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" + }, + "multiqc": { + "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" + }, + "fastqc": { + "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" + } + } + } +} \ No newline at end of file diff --git a/modules/local/analyze_kmers.nf b/modules/local/analyze_kmers.nf new file mode 100644 index 000000000..464743089 --- /dev/null +++ b/modules/local/analyze_kmers.nf @@ -0,0 +1,49 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process ANALYZE_KMERS { + tag "Counting kmers for ${meta.id}" + label 'process_medium' + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/autometa" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(metagenome) + + output: + tuple val(meta), path("${meta.id}.kmers.tsv") , emit: counts + tuple val(meta), path("${meta.id}.kmers.normalized.tsv"), emit: normalized + tuple val(meta), path("${meta.id}.kmers.embedded.tsv") , emit: embedded + path '*.version.txt' , emit: version + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + def software = getSoftwareName(task.process) + """ + autometa-kmers \\ + --fasta ${metagenome} \\ + --kmers "${meta.id}.kmers.tsv" \\ + --size "${params.kmer_size}" \\ + --norm-output "${meta.id}.kmers.normalized.tsv" \\ + --norm-method "${params.norm_method}" \\ + --pca-dimensions "${params.pca_dimensions}" \\ + --embedding-output "${meta.id}.kmers.embedded.tsv" \\ + --embedding-method "${params.embedding_method}" \\ + --embedding-dimensions "${params.embedding_dimensions}" \\ + --cpus "${task.cpus}" \\ + --seed 42 + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/bedtools_genomecov.nf b/modules/local/bedtools_genomecov.nf new file mode 100644 index 000000000..81253dceb --- /dev/null +++ b/modules/local/bedtools_genomecov.nf @@ -0,0 +1,40 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process BEDTOOLS_GENOMECOV { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::bedtools=2.30.0" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0" + } else { + container "quay.io/biocontainers/bedtools:2.30.0--hc088bd4_0" + } + + input: + tuple val(meta), path(bam), path(lengths) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + """ + bedtools \\ + genomecov \\ + -ibam ${bam} \\ + -g "${lengths}" \\ + $options.args > ${prefix}.bed + + bedtools --version | sed -e "s/bedtools v//g" > bedtools.version.txt + """ +} diff --git a/modules/local/bin_contigs.nf b/modules/local/bin_contigs.nf new file mode 100644 index 000000000..38a876072 --- /dev/null +++ b/modules/local/bin_contigs.nf @@ -0,0 +1,51 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process BIN_CONTIGS { + tag "Performing Autometa binning on ${meta.id}" + label 'process_high' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(kmers), path(coverage), path(gc_content), path(markers), path(taxonomy) + + output: + tuple val(meta), path("${meta.id}.${params.kingdom}.binning.tsv.gz"), emit: binning + tuple val(meta), path("${meta.id}.${params.kingdom}.main.tsv.gz") , emit: main + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + taxonomy_call = params.taxonomy_aware ? "--taxonomy $taxonomy" : "" // https://github.com/nextflow-io/nextflow/issues/1694#issuecomment-683272275 + """ + autometa-binning \\ + --kmers $kmers \\ + --coverages $coverage \\ + --gc-content $gc_content \\ + --markers $markers \\ + --output-binning ${meta.id}.${params.kingdom}.binning.tsv.gz \\ + --output-main ${meta.id}.${params.kingdom}.main.tsv.gz \\ + --clustering-method ${params.clustering_method} \\ + --completeness ${params.completeness} \\ + --purity ${params.purity} \\ + $taxonomy_call \\ + --cov-stddev-limit ${params.cov_stddev_limit} \\ + --gc-stddev-limit ${params.gc_stddev_limit} \\ + --starting-rank ${params.binning_starting_rank} \\ + --domain ${params.kingdom} + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/binning_summary.nf b/modules/local/binning_summary.nf new file mode 100644 index 000000000..4c096cc68 --- /dev/null +++ b/modules/local/binning_summary.nf @@ -0,0 +1,48 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +params.taxdump_tar_gz_dir = [:] + +process BINNING_SUMMARY { + tag "Gathering binning summary for ${meta.id}" + label 'process_high' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(binning_main), path(markers), path(metagenome) + val(binning_column) + + output: + tuple val(meta), path("${meta.id}_metabin_stats.tsv") , emit: stats + tuple val(meta), path("${meta.id}_metabins") , emit: metabins + tuple val(meta), path("${meta.id}_metabin_taxonomy.tsv"), emit: taxonomies, optional: true + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + autometa-binning-summary \\ + --ncbi ${params.taxdump_tar_gz_dir} \\ + --binning-main $binning_main \\ + --markers $markers \\ + --metagenome $metagenome \\ + --binning-column $binning_column \\ + --output-stats "${meta.id}_metabin_stats.tsv" \\ + --output-taxonomy "${meta.id}_metabin_taxonomy.tsv" \\ + --output-metabins "${meta.id}_metabins" + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/diamond_blastp.nf b/modules/local/diamond_blastp.nf new file mode 100644 index 000000000..28ce61c83 --- /dev/null +++ b/modules/local/diamond_blastp.nf @@ -0,0 +1,44 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process DIAMOND_BLASTP { + tag "Aligning ORFS in ${meta.id} against ${diamond_database}" + label 'process_high' + // Old diamond manual suggested *NOT* running in parallel... so we are setting maxForks to 1 here. + // TODO: There appears to be features for multiprocessing availabe now + // See: https://github.com/bbuchfink/diamond/wiki/6.-Distributed-computing + maxForks 1 + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "bioconda::diamond=2.0.9" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0" + } else { + container "quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0" + } + + input: + tuple val(meta), path(protein_fasta) + path(diamond_database) + + output: + tuple val(meta), path("${meta.id}.blastp.tsv"), emit: diamond_results + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + diamond blastp $options.args \\ + --query ${protein_fasta} \\ + --db ${diamond_database} \\ + --threads ${task.cpus} \\ + --out ${meta.id}.blastp.tsv + + diamond version | sed 's/^.*diamond version //' > diamond.version.txt + """ +} diff --git a/modules/local/diamond_makedb.nf b/modules/local/diamond_makedb.nf new file mode 100644 index 000000000..629d24e12 --- /dev/null +++ b/modules/local/diamond_makedb.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +params.nr_dmnd_dir = null +options = initOptions(params.options) + +process DIAMOND_MAKEDB { + tag ' Preparing Diamond database' + label 'process_high' + + storeDir "${params.nr_dmnd_dir}" + + conda (params.enable_conda ? 
"bioconda::diamond=2.0.9" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0" + } else { + container "quay.io/biocontainers/diamond:2.0.9--hdcc8f71_0" + } + + input: + path(fasta) + val(dbname) + + output: + path("*.dmnd"), emit: diamond_db + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + diamond makedb --in ${fasta} \\ + $options.args \\ + --threads ${task.cpus} \\ + --db ${dbname} + + diamond version | sed 's/^.*diamond version //' > diamond.version.txt + """ +} diff --git a/modules/local/functions.nf b/modules/local/functions.nf new file mode 100644 index 000000000..da9da093d --- /dev/null +++ b/modules/local/functions.nf @@ -0,0 +1,68 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf new file mode 100644 index 000000000..dcf4945a8 --- /dev/null +++ b/modules/local/get_software_versions.nf @@ -0,0 +1,45 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +/* +This file is left in from the template, that's mainly used for QUAST (http://cab.spbu.ru/software/quast/). 
+ There's a discussion that can be had later about incorporating that module fully or removing the remaining template that feeds into it +*/ + +process GET_SOFTWARE_VERSIONS { + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.8.3" + } else { + container "quay.io/biocontainers/python:3.8.3" + } + + cache false + + input: + path versions + + output: + path "software_versions.tsv" , emit: tsv + path 'software_versions_mqc.yaml', emit: yaml + path '*.version.txt' , emit: version + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + def software = getSoftwareName(task.process) + """ + echo $workflow.manifest.version > pipeline.version.txt + echo $workflow.nextflow.version > nextflow.version.txt + scrape_software_versions.py &> software_versions_mqc.yaml + + echo "make linter happy" > autometa.version.txt + """ +} diff --git a/modules/local/hmmer_hmmsearch.nf b/modules/local/hmmer_hmmsearch.nf new file mode 100644 index 000000000..26627c8fa --- /dev/null +++ b/modules/local/hmmer_hmmsearch.nf @@ -0,0 +1,53 @@ + +/* +======================= +TODO: Not yet implemented +This should speed up hmm searches, however as of now 2 things are needed: +1: cutoff values need to be downloaded/provided to the next process that reads +the results of this process +2: The cutoffs would need to be determined again using the -Z flag of hmmsearch +======================= +*/ + +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process HMMER_HMMSEARCH { + tag "Annotating ORFs in $meta.id" + label 'process_medium' + + // no publishdir + + conda (params.enable_conda ? "bioconda::hmmer=3.3.2" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/hmmer:3.3.2--h1b792b2_1" + } else { + container "quay.io/biocontainers/hmmer:3.3.2--h1b792b2_1" + } + + input: + tuple val(meta), path(fasta) + path(hmm) + + output: + tuple val(meta), path("*.domtblout"), emit: domtblout + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + def fastacmd = fasta.getExtension() == 'gz' ? 
"gunzip -c $fasta" : "cat $fasta" + """ + hmmsearch \\ + --domtblout "${meta.id}.domtblout" \\ + ${options.args} \\ + ${options.args2} \\ + $hmm \\ + $fasta > /dev/null 2>&1 + + echo \$(hmmalign -h | grep -o '^# HMMER [0-9.]*') | sed 's/^# HMMER *//' > HMMER.version.txt + """ +} diff --git a/modules/local/hmmer_hmmsearch_filter.nf b/modules/local/hmmer_hmmsearch_filter.nf new file mode 100644 index 000000000..da4efafd3 --- /dev/null +++ b/modules/local/hmmer_hmmsearch_filter.nf @@ -0,0 +1,55 @@ + +/* +======================= +TODO: Not yet implemented +"Cutoffs" need to be downloaded/provided to this process +======================= +*/ + + +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process HMMER_HMMSEARCH_FILTER { + tag "Filtering marker hmms in $meta.id" + label 'process_medium' + + if ( params.num_splits < 2 ) { + // if running in parallel, the results are published from the process + // that merges the individual results from this process + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + } + + conda (params.enable_conda ? "autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(domtblout), path(fasta) + path("bacteria.single_copy.cutoffs") + + output: + tuple val(meta), path("${meta.id}.markers.tsv"), emit: markers_tsv + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + """ + autometa-hmmsearch-filter \\ + --domtblout "$domtblout" \\ + --cutoffs TODO:"Cutoffs" need to be downloaded/provided to this process \\ + --seqdb "$fasta" \\ + --out "${meta.id}.markers.tsv" + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/lca.nf b/modules/local/lca.nf new file mode 100644 index 000000000..ed56e8159 --- /dev/null +++ b/modules/local/lca.nf @@ -0,0 +1,36 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process LCA { + tag "Finding LCA for ${meta.id}" + label 'process_high' + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(blast) + path(blastdb_dir) + + output: + tuple val(meta), path("${meta.id}.lca.tsv"), emit: lca + path '*.version.txt' , emit: version + + + script: + def software = getSoftwareName(task.process) + """ + autometa-taxonomy-lca --blast ${blast} --dbdir ${blastdb_dir} --output ${meta.id}.lca.tsv + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/length_table.nf b/modules/local/length_table.nf new file mode 100644 index 000000000..4a265b845 --- /dev/null +++ b/modules/local/length_table.nf @@ -0,0 +1,42 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process LENGTH_TABLE { + tag "$meta.id" + label 'process_low' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(metagenome) + + output: + tuple val(meta), path("${meta.id}.lengths.tsv"), emit: lengths + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + #!/usr/bin/env python + from Bio import SeqIO + import pandas as pd + + seqs = {record.id: len(record.seq) for record in SeqIO.parse(${metagenome}, "fasta")} + lengths = pd.Series(seqs, name="length") + lengths.index.name = "contig" + lengths.to_csv(${meta.id}.lengths.tsv, sep="\t", index=True, header=True) + + echo "TODO" > ${software}.version.txt + """ +} diff --git a/modules/local/majority_vote.nf b/modules/local/majority_vote.nf new file mode 100644 index 000000000..b5baa5dd9 --- /dev/null +++ b/modules/local/majority_vote.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + + +process MAJORITY_VOTE { + label 'process_medium' + + tag "Performing taxon majority vote on ${meta.id}" + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(lca) + path(ncbi_tax_dir) + + output: + tuple val(meta), path("${meta.id}.votes.tsv"), emit: votes + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + autometa-taxonomy-majority-vote --lca ${lca} --output ${meta.id}.votes.tsv --dbdir "${ncbi_tax_dir}" + + echo "TODO" > autometa.version.txt + """ +} + diff --git a/modules/local/markers.nf b/modules/local/markers.nf new file mode 100644 index 000000000..d7ac8bfd2 --- /dev/null +++ b/modules/local/markers.nf @@ -0,0 +1,49 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +// TODO: For faster results/ les I/O this could be replaced with hmmsearch +process MARKERS { + tag "Finding markers for ${meta.id}" + label "process_low" + + conda (params.enable_conda ? "autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(orfs) + //path(hmmdb) currently only inside docker + //path(cutoffs) currently only inside docker + + output: + tuple val(meta), path("${meta.id}.markers.tsv"), emit: markers_tsv + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + if (params.enable_conda) + """ + exit 1 + """ + else + """ + autometa-markers \\ + --orfs $orfs \\ + --hmmscan ${meta.id}.hmmscan.tsv \\ + --out ${meta.id}.markers.tsv \\ + --kingdom ${params.kingdom} \\ + --parallel \\ + --cpus ${task.cpus} \\ + --seed 42 \\ + --hmmdb "/scratch/dbs/markers/${params.kingdom}.single_copy.hmm" \\ + --cutoffs "/scratch/dbs/markers/${params.kingdom}.single_copy.cutoffs" + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/merge_fasta.nf b/modules/local/merge_fasta.nf new file mode 100644 index 000000000..d74ffe2b7 --- /dev/null +++ b/modules/local/merge_fasta.nf @@ -0,0 +1,37 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process MERGE_FASTA { + tag "Merging ${meta.id} FASTA" + label 'process_low' + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"bioconda::seqkit=0.16.1" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" + } else { + container "quay.io/biocontainers/seqkit:0.16.1--h9ee0642_0" + } + + input: + tuple val(meta), path("?") + val extension + + output: + tuple val(meta), path("${meta.id}.${extension}"), emit: merged + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + # If errors occur because of issues with symlinks, + # try: cat * | seqkit sort -n > "${meta.id}.${extension}" + seqkit sort -n * > "${meta.id}.${extension}" + seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + """ +} diff --git a/modules/local/merge_tsv.nf b/modules/local/merge_tsv.nf new file mode 100644 index 000000000..603008b02 --- /dev/null +++ b/modules/local/merge_tsv.nf @@ -0,0 +1,35 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process MERGE_TSV_WITH_HEADERS { + tag "Merging files from parallel split for ${meta.id}" + label 'process_low' + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::autometa" : null) + + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path("?.tsv") + val extension + + output: + tuple val(meta), path("${meta.id}.${extension}"), emit: merged_tsv + + + script: + def software = getSoftwareName(task.process) + """ + awk 'FNR==1 && NR!=1{next;}{print}' *.tsv > "${meta.id}.${extension}" + """ +} diff --git a/modules/local/parse_bed.nf b/modules/local/parse_bed.nf new file mode 100644 index 000000000..dc76b5013 --- /dev/null +++ b/modules/local/parse_bed.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process PARSE_BED { + tag "$meta.id" + label 'process_low' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 
"bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(bam), path(lengths), path(bed_out) + + output: + tuple val(meta), path("${meta.id}.coverage.tsv"), emit: coverage + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + autometa-parse-bed \\ + --ibam $bam \\ + --lengths $lengths \\ + --bed $bed_out \\ + --output ${meta.id}.coverage.tsv + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf new file mode 100644 index 000000000..602e6c99a --- /dev/null +++ b/modules/local/samplesheet_check.nf @@ -0,0 +1,34 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SAMPLESHEET_CHECK { + tag "$samplesheet" + label 'process_low' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/python:3.8.3" + } else { + container "quay.io/biocontainers/python:3.8.3" + } + + input: + path samplesheet + + output: + path '*.csv' + + script: + def software = getSoftwareName(task.process) + """ + check_samplesheet.py \\ + $samplesheet \\ + samplesheet.valid.csv + """ +} diff --git a/modules/local/samtools_view_sort.nf b/modules/local/samtools_view_sort.nf new file mode 100644 index 000000000..a8dec989a --- /dev/null +++ b/modules/local/samtools_view_sort.nf @@ -0,0 +1,37 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SAMTOOLS_VIEW_AND_SORT { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::samtools=1.13" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/samtools:1.12--hd5e65b6_0" + } else { + container "quay.io/biocontainers/samtools:1.12--hd5e65b6_0" + } + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? 
"${meta.id}${options.suffix}" : "${meta.id}" + """ + samtools view ${options.args} -@ ${task.cpus} ${bam} |\ + samtools sort ${options.args2} -@ ${task.cpus} -o ${prefix}.bam -T $prefix + + echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt + """ +} diff --git a/modules/local/seqkit_filter.nf b/modules/local/seqkit_filter.nf new file mode 100644 index 000000000..81c6a1bef --- /dev/null +++ b/modules/local/seqkit_filter.nf @@ -0,0 +1,52 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SEQKIT_FILTER { + tag "Removing contigs < ${params.length_cutoff} bp, from ${meta.id}" + label 'process_high' + + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::seqkit=0.16.1" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" + } else { + container "quay.io/biocontainers/seqkit:0.16.1--h9ee0642_0" + } + + input: + tuple val(meta), path(metagenome) + + output: + tuple val(meta), path("${meta.id}.filtered.fna") , emit: fasta + tuple val(meta), path("${meta.id}.gc_content.tsv"), emit: gc_content + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + def metagenomecmd = metagenome.getExtension() == 'gz' ? "gunzip -c $metagenome" : "cat $metagenome" + """ + # filter contigs by specified length + ${metagenomecmd} | \\ + seqkit seq -j ${task.cpus} -m ${params.length_cutoff} | \\ + seqkit sort -n > "${meta.id}.filtered.fna" + + # calculate gc content + seqkit fx2tab -j ${task.cpus} -n -lg "${meta.id}.filtered.fna" > temp + + # Extract columns, create tsv + awk '{FS="\\t"; OFS="\\t"; print \$1,\$3,\$2}' temp > temp2 + echo -e "contig\\tgc_content\\tlength" | cat - temp2 > "${meta.id}.gc_content.tsv" + + # Remove temporary files + rm temp + rm temp2 + + seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + """ +} diff --git a/modules/local/seqkit_split.nf b/modules/local/seqkit_split.nf new file mode 100644 index 000000000..f4b863088 --- /dev/null +++ b/modules/local/seqkit_split.nf @@ -0,0 +1,40 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SEQKIT_SPLIT { + tag "Splitting $meta.id for parallel processing" + label 'process_medium' + + // no publishdir + + conda (params.enable_conda ? "bioconda::seqkit=0.16.1" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" + } else { + container "quay.io/biocontainers/seqkit:0.16.1--h9ee0642_0" + } + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("outfolder/*") , emit: fasta + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + def prefix = options.suffix ? 
"${meta.id}${options.suffix}" : "${meta.id}" + """ + seqkit \\ + split \\ + ${fasta} \\ + ${options.args} \\ + ${options.args2} \\ + -O outfolder + + seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + """ +} diff --git a/modules/local/spades_kmer_coverage.nf b/modules/local/spades_kmer_coverage.nf new file mode 100644 index 000000000..5585ec33b --- /dev/null +++ b/modules/local/spades_kmer_coverage.nf @@ -0,0 +1,39 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SPADES_KMER_COVERAGE { + tag "Calculating k-mer coverage for ${meta.id}" + label 'process_low' + + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(metagenome) + + output: + tuple val(meta), path("${meta.id}.coverage") , emit: coverages + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + autometa-coverage \\ + --assembly ${metagenome} \\ + --from-spades \\ + --out "${meta.id}.coverage" + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/split_kingdoms.nf b/modules/local/split_kingdoms.nf new file mode 100644 index 000000000..0e0939eb6 --- /dev/null +++ b/modules/local/split_kingdoms.nf @@ -0,0 +1,45 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process SPLIT_KINGDOMS { + tag "Splitting votes into kingdoms for ${meta.id}" + label 'process_medium' + + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? "bioconda::autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(assembly), path(votes) + path(ncbi_tax_dir) + + output: + tuple val(meta), path("${meta.id}.taxonomy.tsv"), emit: taxonomy + tuple val(meta), path("${meta.id}.bacteria.fna"), emit: bacteria, optional: true + tuple val(meta), path("${meta.id}.archaea.fna") , emit: archaea, optional: true + path '*.version.txt' , emit: version + + script: + def software = getSoftwareName(task.process) + """ + autometa-taxonomy \\ + --votes "${votes}" \\ + --output . 
\\ + --prefix "${meta.id}" \\ + --split-rank-and-write superkingdom \\ + --assembly "${assembly}" \\ + --ncbi "${ncbi_tax_dir}" + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/local/unclustered_recruitment.nf b/modules/local/unclustered_recruitment.nf new file mode 100644 index 000000000..0b636365d --- /dev/null +++ b/modules/local/unclustered_recruitment.nf @@ -0,0 +1,63 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process RECRUIT { + tag "Performing Autometa unclustered recruitment on ${meta.id}" + label 'process_high' + + publishDir "${params.outdir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "autometa" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(kmers), path(coverage), path(binning), path(markers), path(taxonomy) + + output: + tuple val(meta), path ("${meta.id}.${params.kingdom}.recruitment.tsv.gz") , emit: binning, optional: true + tuple val(meta), path ("${meta.id}.${params.kingdom}.recruitment.main.tsv.gz"), emit: main, optional: true + path '*.version.txt' , emit: version + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + def software = getSoftwareName(task.process) + if (!params.taxonomy_aware) + """ + autometa-unclustered-recruitment \\ + --classifier ${params.classification_method} \\ + --kmer-dimensions ${params.classification_kmer_pca_dimensions} \\ + --seed 42 \\ + --kmers $kmers \\ + --coverage $coverage \\ + --binning $binning \\ + --markers $markers \\ + --output-binning ${meta.id}.${params.kingdom}.recruitment.tsv.gz \\ + --output-main ${meta.id}.${params.kingdom}.recruitment.main.tsv.gz + echo "TODO" > autometa.version.txt + """ + else + """ + autometa-unclustered-recruitment \\ + --classifier ${params.classification_method} \\ + --kmer-dimensions ${params.classification_kmer_pca_dimensions} \\ + --seed 42 \\ + --taxonomy $taxonomy \\ + --kmers $kmers \\ + --coverage $coverage \\ + --binning $binning \\ + --markers $markers \\ + --output-binning ${meta.id}.${params.kingdom}.recruitment.tsv.gz \\ + --output-main ${meta.id}.${params.kingdom}.recruitment.main.tsv.gz + + echo "TODO" > autometa.version.txt + """ +} diff --git a/modules/nf-core/modules/bowtie2/align/functions.nf b/modules/nf-core/modules/bowtie2/align/functions.nf new file mode 100644 index 000000000..da9da093d --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/functions.nf @@ -0,0 +1,68 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + 
options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/modules/bowtie2/align/main.nf b/modules/nf-core/modules/bowtie2/align/main.nf new file mode 100644 index 000000000..d43d479d0 --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/main.nf @@ -0,0 +1,73 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process BOWTIE2_ALIGN { + tag "$meta.id" + label 'process_high' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? 'bioconda::bowtie2=2.4.2 bioconda::samtools=1.11 conda-forge::pigz=2.3.4' : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0" + } else { + container "quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0" + } + + input: + tuple val(meta), path(reads) + path index + + output: + tuple val(meta), path('*.bam'), emit: bam + tuple val(meta), path('*.log'), emit: log + path '*.version.txt' , emit: version + tuple val(meta), path('*fastq.gz'), optional:true, emit: fastq + + script: + def split_cpus = Math.floor(task.cpus/2) + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + if (meta.single_end) { + def unaligned = params.save_unaligned ? 
"--un-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -U $reads \\ + --threads ${split_cpus} \\ + $unaligned \\ + $options.args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ ${split_cpus} $options.args2 -bhS -o ${prefix}.bam - + + echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//' > ${software}.version.txt + """ + } else { + def unaligned = params.save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` + bowtie2 \\ + -x \$INDEX \\ + -1 ${reads[0]} \\ + -2 ${reads[1]} \\ + --threads ${split_cpus} \\ + $unaligned \\ + $options.args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools view -@ ${split_cpus} $options.args2 -bhS -o ${prefix}.bam - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//' > ${software}.version.txt + """ + } +} diff --git a/modules/nf-core/modules/bowtie2/align/meta.yml b/modules/nf-core/modules/bowtie2/align/meta.yml new file mode 100644 index 000000000..9d9cd004b --- /dev/null +++ b/modules/nf-core/modules/bowtie2/align/meta.yml @@ -0,0 +1,50 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. 
+ - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Aligment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/modules/fastqc/functions.nf b/modules/nf-core/modules/fastqc/functions.nf new file mode 100644 index 000000000..da9da093d --- /dev/null +++ b/modules/nf-core/modules/fastqc/functions.nf @@ -0,0 +1,68 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? 
path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/modules/fastqc/main.nf new file mode 100644 index 000000000..39c327b26 --- /dev/null +++ b/modules/nf-core/modules/fastqc/main.nf @@ -0,0 +1,47 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process FASTQC { + tag "$meta.id" + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" + } else { + container "quay.io/biocontainers/fastqc:0.11.9--0" + } + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.zip") , emit: zip + path "*.version.txt" , emit: version + + script: + // Add soft-links to original FastQs for consistent naming in pipeline + def software = getSoftwareName(task.process) + def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + fastqc $options.args --threads $task.cpus ${prefix}.fastq.gz + fastqc --version | sed -e "s/FastQC v//g" > ${software}.version.txt + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + fastqc $options.args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + fastqc --version | sed -e "s/FastQC v//g" > ${software}.version.txt + """ + } +} diff --git a/modules/nf-core/modules/fastqc/meta.yml b/modules/nf-core/modules/fastqc/meta.yml new file mode 100644 index 000000000..8eb9953dc --- /dev/null +++ b/modules/nf-core/modules/fastqc/meta.yml @@ -0,0 +1,51 @@ +name: fastqc +description: Run FastQC on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: | + FastQC gives general quality metrics about your reads. + It provides information about the quality score distribution + across your reads, the per base sequence content (%A/C/G/T). + You get information about adapter contamination and other + overrepresented sequences. + homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ + documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/modules/multiqc/functions.nf b/modules/nf-core/modules/multiqc/functions.nf new file mode 100644 index 000000000..da9da093d --- /dev/null +++ b/modules/nf-core/modules/multiqc/functions.nf @@ -0,0 +1,68 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf new file mode 100644 index 000000000..da7808002 --- /dev/null +++ b/modules/nf-core/modules/multiqc/main.nf @@ -0,0 +1,35 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process MULTIQC { + label 'process_medium' + publishDir "${params.outdir}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } + + conda (params.enable_conda ? 
"bioconda::multiqc=1.10.1" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/multiqc:1.10.1--py_0" + } else { + container "quay.io/biocontainers/multiqc:1.10.1--py_0" + } + + input: + path multiqc_files + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + """ + multiqc -f $options.args . + multiqc --version | sed -e "s/multiqc, version //g" > ${software}.version.txt + """ +} diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/modules/multiqc/meta.yml new file mode 100644 index 000000000..532a8bb1e --- /dev/null +++ b/modules/nf-core/modules/multiqc/meta.yml @@ -0,0 +1,39 @@ +name: MultiQC +description: Aggregate results from bioinformatics analyses across many samples into a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. + homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ +input: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC +output: + - report: + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + - data: + type: dir + description: MultiQC data dir + pattern: "multiqc_data" + - plots: + type: file + description: Plots created by MultiQC + pattern: "*_data" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@abhi18av" + - "@bunop" + - "@drpatelh" diff --git a/modules/nf-core/modules/prodigal/functions.nf b/modules/nf-core/modules/prodigal/functions.nf new file mode 100644 index 000000000..da9da093d --- /dev/null +++ b/modules/nf-core/modules/prodigal/functions.nf @@ -0,0 +1,68 @@ +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def 
key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} diff --git a/modules/nf-core/modules/prodigal/main.nf b/modules/nf-core/modules/prodigal/main.nf new file mode 100644 index 000000000..2c7de63d4 --- /dev/null +++ b/modules/nf-core/modules/prodigal/main.nf @@ -0,0 +1,46 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +process PRODIGAL { + tag "Annotating $meta.id" + label 'process_low' + publishDir "${params.interim_dir_internal}", + mode: params.publish_dir_mode, + saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } + + conda (params.enable_conda ? "bioconda::prodigal=2.6.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/prodigal:2.6.3--h516909a_2" + } else { + container "quay.io/biocontainers/prodigal:2.6.3--h516909a_2" + } + + input: + tuple val(meta), path(genome) + val(output_format) + + output: + tuple val(meta), path("${prefix}.${output_format}"), emit: gene_annotations + tuple val(meta), path("${prefix}.fna"), emit: nucleotide_fasta + tuple val(meta), path("${prefix}.faa"), emit: amino_acid_fasta + tuple val(meta), path("${prefix}_all.txt"), emit: all_gene_annotations + path "*.version.txt" , emit: version + + script: + def software = getSoftwareName(task.process) + prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" + """ + prodigal -i ${genome} \\ + $options.args \\ + -f $output_format \\ + -d "${prefix}.fna" \\ + -o "${prefix}.${output_format}" \\ + -a "${prefix}.faa" \\ + -s "${prefix}_all.txt" + + echo \$(prodigal -v 2>&1) | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p' > ${software}.version.txt + """ +} diff --git a/modules/nf-core/modules/prodigal/meta.yml b/modules/nf-core/modules/prodigal/meta.yml new file mode 100644 index 000000000..f20d878e0 --- /dev/null +++ b/modules/nf-core/modules/prodigal/meta.yml @@ -0,0 +1,41 @@ +name: prodigal +description: write your description here +keywords: + - sort +tools: + - prodigal: + description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program + homepage: {} + documentation: {} + tool_dev_url: {} + doi: "" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + +authors: + - "@grst" diff --git a/nextflow.config b/nextflow.config index 4d878c765..b18a09590 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,79 +1,283 @@ -// Nextflow Autometa Pipeline Configuration. +// ***************** +// Autometa Nextflow Configuration +// Default config options for all compute environments +// ***************** + +manifest { + name = "autometa" + author = "Jason C. Kwan Lab" + homePage = "https://github.com/KwanLab/Autometa" + defaultBranch = "main" + description = "Autometa: Automated Extraction of Microbial Genomes from Shotgun Metagenomes" + doi = "https://doi.org/10.1093/nar/gkz148" + mainScript = "main.nf" + nextflowVersion = ">=21.04.0" + version = "2.0.0" +} + + + +// Global default params, used in configs +params { + /* - To choose one of the available profiles (standard, cluster, chtc) you must pass - in the the -profile argument. - i.e. - - ```bash - nextflow run main.nf -profile cluster -c parameters.config - ``` - - You may also specify multiple profiles by separating their names with a comma. - i.e. - - ```bash - nextflow run autometa.nf -profile standard,cluster -c parameters.config - ``` - - Note: Standard profile is implictly used if no profile is specified by the user. + * ------------------------------------------------- + * Change runtime + * ------------------------------------------------- + */ + use_run_name = false + debug = false + mock_test = false + autometa_image_tag = manifest.version //default docker image version is same as version in manifest, above +/* + * ------------------------------------------------- + * Parallelization + * ------------------------------------------------- */ + // Input options + input = null // Metagenome path (.fna) + outdir = "${baseDir}" + interim_dir = "${baseDir}" + tracedir = "${baseDir}/autometa_tracedir" + +/* + * ------------------------------------------------- + * Taxonomy-aware data subsetting + * ------------------------------------------------- +*/ + + taxonomy_aware = false + single_db_dir = null + nr_dmnd_dir = null + prot_accession2taxid_gz_dir = null + taxdump_tar_gz_dir = null + large_downloads_permission = false + binning_starting_rank = "superkingdom" // choices: "superkingdom", "phylum", "class", "order", "family", "genus", "species" + +/* + * ------------------------------------------------- + * Binning Parameters + * ------------------------------------------------- +*/ + + length_cutoff = 3000 + kmer_size = 5 + norm_method = "am_clr" + pca_dimensions = 50 + embedding_method = "bhsne" + embedding_dimensions = 2 + kingdom = "bacteria" + clustering_method = "dbscan" + classification_method = "decision_tree" + classification_kmer_pca_dimensions = 50 + completeness = 20.0 + purity = 90.0 + gc_stddev_limit = 5.0 + cov_stddev_limit = 25.0 + unclustered_recruitment = false +/* + * ------------------------------------------------- + * Boilerplate options + * ------------------------------------------------- +*/ + + publish_dir_mode = 'copy' + email = null + email_on_fail = null + plaintext_email = false + monochrome_logs = false + help = false + validate_params = true + show_hidden_params = false + schema_ignore_params = 'genomes,modules' + enable_conda = false + singularity_pull_docker_container = false + + // Config options + custom_config_version = 'master'
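+ // custom_config_version and custom_config_base point at the shared nf-core/configs repository that is loaded below via includeConfig to supply institutional profiles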
+ custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + hostnames = [:] + config_profile_description = null + config_profile_contact = null + config_profile_url = null + config_profile_name = null + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '16.GB' + max_cpus = 4 + max_time = '240.h' + + num_splits = 1 + +} + + +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' + +// Load modules.config for DSL2 module specific options +includeConfig 'conf/modules.config' + +// Load nf-core custom profiles from different Institutions +try { + includeConfig "${params.custom_config_base}/nfcore_custom.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +} + + + profiles { + standard { + // https://www.nextflow.io/docs/latest/executor.html#local + // simple profile that runs all processess locally + process.executor = "local" } - cluster { + + slurm { + // https://www.nextflow.io/docs/latest/executor.html#slurm + // simple profile for running jobs on slurm + process.executor = "slurm" + // queue is the slurm partition to use. // Set SLURM partition with queue directive. process.queue = "queue" - // See https://www.nextflow.io/docs/latest/executor.html#slurm for details. - } - chtc { - process.executor = "condor" - // See https://www.nextflow.io/docs/latest/executor.html#htcondor for details. - /* - ***IMPORTANT NOTES:*** - 1. The pipeline must be launched from a node where the `condor_submit` command is available. - 2. The HTCondor executor for Nextflow does not support at this time the HTCondor - ability to transfer input/output data to the corresponding job computing node. - Therefore the data needs to be made accessible to the computing nodes using a - shared file system directory from where the Nextflow workflow has to be executed - (or specified via the -w option). - */ + } + } -docker { - // See: https://www.nextflow.io/docs/latest/config.html#config-docker for more info. 
- enabled = true - fixOwnership = true + +profiles { + debug { process.beforeScript = 'echo $HOSTNAME' } + standard { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + slurm { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + conda { + params.enable_conda = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + docker { + docker.enabled = true + docker.userEmulation = true + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + singularity { + singularity.enabled = true + singularity.autoMounts = true + docker.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + podman { + podman.enabled = true + docker.enabled = false + singularity.enabled = false + shifter.enabled = false + charliecloud.enabled = false + } + shifter { + shifter.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + charliecloud.enabled = false + } + charliecloud { + charliecloud.enabled = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } +} + +// Export these variables to prevent local Python/R libraries from conflicting with those in the container +env { + PYTHONNOUSERSITE = 1 + R_PROFILE_USER = "/.Rprofile" + R_ENVIRON_USER = "/.Renviron" } + +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { - enabled = true - file = "pipeline_info/autometa-timeline.html" + enabled = true + file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" } report { - enabled = true - file = "pipeline_info/autometa-report.html" + enabled = true + file = "${params.tracedir}/execution_report_${trace_timestamp}.html" } trace { - enabled = true - file = "pipeline_info/autometa-trace.txt" + enabled = true + file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" } dag { - enabled = true - file = "pipeline_info/autometa-dag.dot" + enabled = true + file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" } -manifest { - author = "Jason C. Kwan Lab" - defaultBranch = "dev" - name = "autometa" - homePage = "https://github.com/KwanLab/Autometa" - description = "Autometa: Automated Extraction of Microbial Genomes from Shotgun Metagenomes" - mainScript = "main.nf" - doi = "https://doi.org/10.1093/nar/gkz148" - version = "2.0.0" - nextflowVersion = "20.10+" + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! 
Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" + return obj + } + } } diff --git a/nextflow/autometa.nf b/nextflow/autometa.nf deleted file mode 100644 index e088a6471..000000000 --- a/nextflow/autometa.nf +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -include { LENGTH_FILTER; KMERS; KMER_COVERAGE; READ_COVERAGE; ORFS; MARKERS } from './common-tasks.nf' -include { TAXON_ASSIGNMENT } from './taxonomy-tasks.nf' -include { BINNING; UNCLUSTERED_RECRUITMENT; BINNING_SUMMARY } from './binning-tasks.nf' - -workflow AUTOMETA { - take: - metagenome - - main: - // Perform various annotations on provided metagenome - LENGTH_FILTER(metagenome) - - // k-mer coverage vs. read coverage - KMER_COVERAGE(LENGTH_FILTER.out.fasta) - // READ_COVERAGE(LENGTH_FILTER.out.fasta, fwd_reads, rev_reads, se_reads) - - ORFS(LENGTH_FILTER.out.fasta) - MARKERS(ORFS.out.prots) - // Perform taxon assignment with filtered metagenome - TAXON_ASSIGNMENT(LENGTH_FILTER.out.fasta, ORFS.out.prots) - // Now perform binning with all of our annotations. - KMERS(TAXON_ASSIGNMENT.out.bacteria) - // KMERS(TAXON_ASSIGNMENT.out.archaea) ... for case of performing binning on archaea - BINNING(KMERS.out.embedded, KMER_COVERAGE.out, LENGTH_FILTER.out.gc_content, MARKERS.out, TAXON_ASSIGNMENT.out.taxonomy) - // BINNING(KMERS.out.normalized, READ_COVERAGE.out.coverage, LENGTH_FILTER.out.gc_content, MARKERS.out, TAXON_ASSIGNMENT.out.taxonomy) - // Then unclustered recruitment of any unclustered contigs using binning assignments from above. 
- UNCLUSTERED_RECRUITMENT(KMERS.out.normalized, KMER_COVERAGE.out, BINNING.out.binning, MARKERS.out, TAXON_ASSIGNMENT.out.taxonomy) - // UNCLUSTERED_RECRUITMENT(KMERS.out.normalized, READ_COVERAGE.out.coverage, BINNING.out.binning, MARKERS.out, TAXON_ASSIGNMENT.out.taxonomy) - - // Summary for Binning - BINNING_SUMMARY(BINNING.out.main, MARKERS.out, LENGTH_FILTER.out.fasta, "cluster") - // Summary for unclustered recruitment - // BINNING_SUMMARY(UNCLUSTERED_RECRUITMENT.out.main, MARKERS.out, LENGTH_FILTER.out.fasta, "recruited_cluster") - - emit: - binning = BINNING.out.binning - binning_main = BINNING.out.main - recruitment = UNCLUSTERED_RECRUITMENT.out.binning - recruitment_main = UNCLUSTERED_RECRUITMENT.out.main - all_binning_results = BINNING.out.binning | mix(UNCLUSTERED_RECRUITMENT.out) | collect - summary_stats = BINNING_SUMMARY.out.stats - summary_taxa = BINNING_SUMMARY.out.taxonomies - metabins = BINNING_SUMMARY.out.metabins -} diff --git a/nextflow/benchmarking-tasks.nf b/nextflow/benchmarking-tasks.nf deleted file mode 100644 index acab91455..000000000 --- a/nextflow/benchmarking-tasks.nf +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl = 2 - -params.community_type = "Community type that was used for clustering or classification" //choices: synthetic, simulated, all -params.community_size = "Community size that was used for clustering or classification" //choices: 78Mbp,156Mbp,312Mbp,625Mbp,1250Mbp,2500Mbp,5000Mbp,10000Mbp,all, etc. -params.processed = "Path to store final results" -params.ncbi_database = "Path to ncbi databases directory" - - -process BENCHMARK_CLUSTERING { - tag "benchmarking clustering on ${community_type}: ${community_size}" - container = 'jason-c-kwan/autometa:dev' - publishDir params.processed, pattern: "*.clustering_benchmarks.*" - - input: - tuple path(binning), val(community_type), val(community_size) - - output: - path "${community_type}.${community_size}.clustering_benchmarks.*.tsv.gz" - - script: - """ - # First retrieve reference assignments for provided community - autometa-download-dataset \ - --community-type ${community_type} \ - --community-sizes ${community_size} \ - --file-names reference_assignments.tsv.gz,binning.tsv.gz \ - --dir-path . - - # Now benchmark inputs and previous gold-standard against community reference assignments - autometa-benchmark \ - --benchmark clustering \ - --predictions $binning ${community_type}/${community_size}/binning.tsv.gz \ - --reference ${community_type}/${community_size}/reference_assignments.tsv.gz \ - --output-wide ${community_type}.${community_size}.clustering_benchmarks.wide.tsv.gz \ - --output-long ${community_type}.${community_size}.clustering_benchmarks.long.tsv.gz - """ -} - -process BENCHMARK_CLASSIFICATION { - tag "benchmarking classification on ${community_type}: ${community_size}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v ${params.ncbi_database}:/ncbi:ro" - publishDir params.processed, pattern: "*.classification_benchmarks.*" - - input: - tuple path(taxonomy), val(community_type), val(community_size) - - output: - path "${community_type}.${community_size}.classification_benchmarks.*.tsv.gz" - - script: - """ - # First retrieve reference assignments for provided community - autometa-download-dataset \ - --community-type ${community_type} \ - --community-sizes ${community_size} \ - --file-names reference_assignments.tsv.gz,taxonomy.tsv.gz \ - --dir-path . 
- - # Now benchmark inputs and previous gold-standard against community reference assignments - autometa-benchmark \ - --benchmark classification \ - --predictions $taxonomy ${community_type}/${community_size}/taxonomy.tsv.gz \ - --reference ${community_type}/${community_size}/reference_assignments.tsv.gz \ - --output-wide ${community_type}.${community_size}.classification_benchmarks.wide.tsv.gz \ - --ncbi /ncbi - """ -} diff --git a/nextflow/binning-tasks.nf b/nextflow/binning-tasks.nf deleted file mode 100644 index aa7787b99..000000000 --- a/nextflow/binning-tasks.nf +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -// Data inputs -params.interim = "" -params.processed = "" -// Binning parameters -params.kingdom = "bacteria" -params.clustering_method = "dbscan" // choices: "dbscan", "hdbscan" -params.binning_starting_rank = "superkingdom" // choices: "superkingdom", "phylum", "class", "order", "family", "genus", "species" -params.completeness = 20.0 -params.purity = 95.0 -params.cov_stddev_limit = 25.0 -params.gc_stddev_limit = 5.0 -// Unclustered recruitment parameters -params.classification_method = "decision_tree" // choices: "decision_tree", "random_forest" -params.classification_kmer_pca_dimensions = 50 -// Summary parameters -params.ncbi_database = "Path to user ncbi databases directory" - -process BINNING { - tag "Performing Autometa binning" - container = 'jason-c-kwan/autometa:dev' - publishDir params.processed, pattern: "${coverage.simpleName}.${params.kingdom}.*.tsv.gz" - - input: - path kmers - path coverage - path gc_content - path markers - path taxonomy - - output: - path "${coverage.simpleName}.${params.kingdom}.binning.tsv.gz", emit: binning - path "${coverage.simpleName}.${params.kingdom}.main.tsv.gz", emit: main - - """ - autometa-binning \ - --kmers $kmers \ - --coverages $coverage \ - --gc-content $gc_content \ - --markers $markers \ - --output-binning ${coverage.simpleName}.${params.kingdom}.binning.tsv.gz \ - --output-main ${coverage.simpleName}.${params.kingdom}.main.tsv.gz \ - --clustering-method ${params.clustering_method} \ - --completeness ${params.completeness} \ - --purity ${params.purity} \ - --cov-stddev-limit ${params.cov_stddev_limit} \ - --gc-stddev-limit ${params.gc_stddev_limit} \ - --taxonomy $taxonomy \ - --starting-rank ${params.binning_starting_rank} \ - --domain ${params.kingdom} - """ -} - -process UNCLUSTERED_RECRUITMENT { - tag "Performing Autometa unclustered recruitment" - container = 'jason-c-kwan/autometa:dev' - publishDir params.processed, pattern: "${coverage.simpleName}.${params.kingdom}.recruitment.tsv.gz" - - input: - path kmers - path coverage - path binning - path markers - path taxonomy - - output: - path "${coverage.simpleName}.${params.kingdom}.recruitment.tsv.gz", emit: binning - path "${coverage.simpleName}.${params.kingdom}.recruitment.main.tsv.gz", emit: main - - """ - autometa-unclustered-recruitment \ - --classifier ${params.classification_method} \ - --kmer-dimensions ${params.classification_kmer_pca_dimensions} \ - --seed 42 \ - --taxonomy $taxonomy \ - --kmers $kmers \ - --coverage $coverage \ - --binning $binning \ - --markers $markers \ - --output-binning ${coverage.simpleName}.${params.kingdom}.recruitment.tsv.gz \ - --output-main ${coverage.simpleName}.${params.kingdom}.recruitment.main.tsv.gz - """ -} - -process BINNING_SUMMARY { - tag "Binning summary for ${binning_main.simpleName}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v 
${params.ncbi_database}:/ncbi:ro" - - input: - path binning_main - path markers - path metagenome - val binning_column - - output: - path 'metabin_stats.tsv', emit: stats - path 'metabin_taxonomy.tsv', emit: taxonomies - path 'metabins', emit: metabins - - script: - """ - autometa-binning-summary \ - --ncbi /ncbi \ - --binning-main $binning_main \ - --markers $markers \ - --metagenome $metagenome \ - --binning-column $binning_column \ - --output-stats metabin_stats.tsv \ - --output-taxonomy metabin_taxonomy.tsv \ - --output-metabins metabins - """ -} diff --git a/nextflow/common-tasks.nf b/nextflow/common-tasks.nf deleted file mode 100644 index 6dc5bb74c..000000000 --- a/nextflow/common-tasks.nf +++ /dev/null @@ -1,262 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -// Data inputs -params.metagenome = "" -params.interim = "" -params.processed = "" -params.length_cutoff = 3000 -// kmer parameters -params.kmer_size = 5 -params.norm_method = "am_clr" // choices: "am_clr", "clr", "ilr" -params.pca_dimensions = 50 -params.embedding_method = "bhsne" // choices: "sksne", "bhsne", "umap", "densmap", "trimap" -params.embedding_dimensions = 2 -// Marker annotation parameters -params.kingdom = "bacteria" -// Runtime parameters -params.cpus = 2 - - -process LENGTH_FILTER { - tag "filtering metagenome ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - publishDir params.interim, pattern: "${metagenome.simpleName}.*" - - input: - path metagenome - - output: - path "${metagenome.simpleName}.filtered.fna", emit: fasta - path "${metagenome.simpleName}.stats.tsv", emit: stats - path "${metagenome.simpleName}.gc_content.tsv", emit: gc_content - - """ - autometa-length-filter \ - --assembly $metagenome \ - --cutoff ${params.length_cutoff} \ - --output-fasta ${metagenome.simpleName}.filtered.fna \ - --output-stats ${metagenome.simpleName}.stats.tsv \ - --output-gc-content ${metagenome.simpleName}.gc_content.tsv - """ -} - -process KMERS { - tag "counting kmers for ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - publishDir params.interim, pattern: "*.kmers.*" - - input: - path metagenome - - output: - path "*.kmers.tsv", emit: counts - path "*.kmers.normalized.tsv", emit: normalized - path "*.kmers.embedded.tsv", emit: embedded - - """ - autometa-kmers \ - --fasta $metagenome \ - --kmers ${metagenome.simpleName}.kmers.tsv \ - --size ${params.kmer_size} \ - --norm-output ${metagenome.simpleName}.kmers.normalized.tsv \ - --norm-method ${params.norm_method} \ - --pca-dimensions ${params.pca_dimensions} \ - --embedding-output ${metagenome.simpleName}.kmers.embedded.tsv \ - --embedding-method ${params.embedding_method} \ - --embedding-dimensions ${params.embedding_dimensions} \ - --cpus ${task.cpus} \ - --seed 42 - """ -} - -process KMER_COVERAGE { - tag "Calculating k-mer coverage for ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - publishDir params.interim, pattern: "${metagenome.simpleName}.coverages.tsv" - - input: - path metagenome - - output: - path "${metagenome.simpleName}.coverages.tsv" - - """ - autometa-coverage \ - --assembly $metagenome \ - --cpus ${task.cpus} \ - --from-spades \ - --out ${metagenome.simpleName}.coverages.tsv - """ -} - -process MARKERS { - tag "Finding markers for ${orfs.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - // copying orfs via stageInMode is required to run hmmscan (does not handle symlinks) - stageInMode 'copy' - publishDir 
params.interim, pattern: "${orfs.simpleName}.markers.tsv" - publishDir params.interim, pattern: "${orfs.simpleName}.hmmscan.tsv" - - input: - path orfs - - output: - path "${orfs.simpleName}.markers.tsv" - - """ - autometa-markers \ - --orfs $orfs \ - --hmmscan ${orfs.simpleName}.hmmscan.tsv \ - --out ${orfs.simpleName}.markers.tsv \ - --kingdom ${params.kingdom} \ - --parallel \ - --cpus ${task.cpus} \ - --seed 42 - """ -} - -process ORFS { - tag "Calling orfs for ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - // Hardcoding cpus here b/c prodigal is limited to only using single core - cpus 1 - publishDir params.interim, pattern: "${metagenome.simpleName}.orfs.f*" - - input: - path metagenome - - output: - path "${metagenome.simpleName}.orfs.fna", emit: nucls - path "${metagenome.simpleName}.orfs.faa", emit: prots - - """ - prodigal \ - -i $metagenome \ - -d ${metagenome.simpleName}.orfs.fna \ - -a ${metagenome.simpleName}.orfs.faa \ - -p meta \ - -q \ - -m - """ -} - -process ALIGN_READS { - tag "Aligning reads to ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - - input: - path metagenome - path fwd_reads - path rev_reads - path se_reads - - output: - path "${metagenome.simpleName}.sam" - - """ - bowtie2-build \ - --threads ${task.cpus} - ${metagenome} \ - ${metagenome.simpleName}.db - - bowtie2 \ - -x ${metagenome.simpleName}.db \ - -q \ - --phred33 \ - --very-sensitive \ - --no-unal \ - -p ${task.cpus} \ - -S ${metagenome.simpleName}.sam \ - -1 $fwd_reads \ - -2 $rev_reads \ - -U $se_reads - """ -} - -process SORT_READS { - tag "Sorting reads to ${sam.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - - input: - path sam - - output: - path "${sam.simpleName}.bam" - - """ - samtools view -@${task.cpus} -bS ${sam} \ - | samtools sort -@${task.cpus} -o ${sam.simpleName}.bam - """ -} - -process LENGTH_TABLE { - tag "length table for ${metagenome.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - - input: - path metagenome - - output: - path "${metagenome.simpleName}.lengths.tsv" - - """ - #!/usr/bin/env python - - from Bio import SeqIO - import pandas as pd - - seqs = {record.id: len(record) for record in SeqIO.parse($metagenome, "fasta")} - lengths = pd.Series(seqs, name="length") - lengths.index.name = "contig" - lengths.to_csv(${metagenome.simpleName}.lengths.tsv, sep="\t", index=True, header=True) - """ -} - -process GENOMECOV { - tag "Computing genome coverage for ${bam.simpleName}" - container = 'jason-c-kwan/autometa:dev' - cpus params.cpus - - input: - path bam - path lengths - - output: - path "${bam.simpleName}.bed.tsv", emit: bed - path "${bam.simpleName}.coverage.tsv", emit: coverage - - """ - bedtools genomecov -ibam $bam -g $lengths > ${bam.simpleName}.bed.tsv - autometa-parse-bed \ - --ibam $bam \ - --lengths $lengths \ - --bed ${bam.simpleName}.bed.tsv \ - --output ${bam.simpleName}.coverage.tsv - """ -} - -workflow READ_COVERAGE { - take: - metagenome - fwd_reads - rev_reads - se_reads - - main: - LENGTH_TABLE(metagenome) - ALIGN_READS(metagenome, fwd_reads, rev_reads, se_reads) - SORT_READS(ALIGN_READS.out) - GENOMECOV(SORT_READS.out, LENGTH_TABLE.out) - - emit: - bed = GENOMECOV.out.bed - coverage = GENOMECOV.out.coverage -} diff --git a/nextflow/parameters.config b/nextflow/parameters.config deleted file mode 100644 index d3cec534e..000000000 --- a/nextflow/parameters.config +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -/* 
- REQUIRED RESULTS PARAMETERS - ======== - -NOTE ----- - - + You MUST wrap these data inputs in 'single quotes' or "double quotes" - - + "double quotes" are required for the use of wildcard characters - e.g. params.metagenome = "$HOME/metagenomes/raw/metagenome.fna" - - + Nextflow will create the interim and processed directories if they do not exist - -*/ - -params.metagenome = null - -// Where to store intermediate and final results: -// e.g. params.interim = "$HOME/metagenomes/interim/" -params.interim = null - -// e.g. params.processed = "$HOME/metagenomes/processed/" -params.processed = null - -/* - REQUIRED DATABASE PARAMETERS - ======== - -NOTE ----- - -Directory path must contain: - - diamond formatted nr file => nr.dmnd - - Extracted files from tarball taxdump.tar.gz - - prot.accession2taxid.gz - -*/ - -params.ncbi_database = "$HOME/Autometa/autometa/databases/ncbi" - -/* - OPTIONAL PARAMETERS - - + Metagenome length cutoff - + K-mer size/normalization/embedding - + Binning kingdom to consider - + Clustering method - + Binning metrics thresholds -*/ - -params.cpus = 2 // Additional runtime settings -params.length_cutoff = 3000 // Metagenome Length filtering -// Kmer counting/normalization/embedding parameters -params.kmer_size = 5 -params.norm_method = "am_clr" // choices: "am_clr", "clr", "ilr" -params.pca_dimensions = 50 // Must be below the number of columns in kmer counts table -params.embedding_method = "bhsne" // choices: "bhsne", "sksne", "umap", "densmap", "trimap" -params.embedding_dimensions = 2 // Must be below `kmer_pca_dimensions` -// Binning parameters -params.kingdom = "bacteria" // choices: "bacteria", "archaea" -> Also used during marker annotation/filtering -params.clustering_method = "dbscan" // choices: "dbscan", "hdbscan" -params.binning_starting_rank = "superkingdom" // choices: "superkingdom", "phylum", "class", "order", "family", "genus", "species" -params.completeness = 20.0 // Will keep clusters over 20% complete -params.purity = 95.0 // Will keep clusters over 95% pure -params.cov_stddev_limit = 25.0 // Will keep clusters less than 25% coverage std.dev. between contigs in cluster -params.gc_stddev_limit = 5.0 // Will keep clusters less than 5% GC% std.dev. 
between contigs in cluster -// Unclustered Recruitment paramters -params.classification_method = "decision_tree" // choices: "decision_tree", "random_forest" -params.classification_kmer_pca_dimensions = 50 diff --git a/nextflow/taxonomy-tasks.nf b/nextflow/taxonomy-tasks.nf deleted file mode 100644 index 54085ddcd..000000000 --- a/nextflow/taxonomy-tasks.nf +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env nextflow - -nextflow.enable.dsl=2 - -params.interim = "" -params.processed = "" -params.ncbi_database = "$HOME/Autometa/autometa/databases/ncbi" -params.cpus = 2 - -process DIAMOND { - tag "diamond blastp on ${orfs.simpleName}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v ${params.ncbi_database}:/ncbi:ro" - cpus params.cpus - publishDir params.interim, pattern: "${orfs.simpleName}.blastp.tsv" - - input: - path orfs - - output: - path "${orfs.simpleName}.blastp.tsv" - - """ - diamond blastp \ - --query ${orfs} \ - --db /ncbi/nr.dmnd \ - --evalue 1e-5 \ - --max-target-seqs 200 \ - --threads ${task.cpus} \ - --outfmt 6 \ - --out ${orfs.simpleName}.blastp.tsv - """ -} - -process LCA { - tag "Assigning LCA to ${blast.simpleName}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v ${params.ncbi_database}:/ncbi:rw" - publishDir params.interim, pattern: "${blast.simpleName}.lca.tsv" - - input: - path blast - - output: - path "${blast.simpleName}.lca.tsv" - - """ - autometa-taxonomy-lca --blast ${blast} --dbdir /ncbi --output ${blast.simpleName}.lca.tsv - """ -} - -process MAJORITY_VOTE { - tag "Performing taxon majority vote on ${lca.simpleName}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v ${params.ncbi_database}:/ncbi:rw" - publishDir params.interim, pattern: "${lca.simpleName}.votes.tsv" - - input: - path lca - - output: - path "${lca.simpleName}.votes.tsv" - - """ - autometa-taxonomy-majority-vote --lca ${lca} --output ${lca.simpleName}.votes.tsv --dbdir /ncbi - """ -} - -process SPLIT_KINGDOMS { - tag "Splitting votes into kingdoms for ${assembly.simpleName}" - container = 'jason-c-kwan/autometa:dev' - containerOptions = "-v ${params.ncbi_database}:/ncbi:rw" - publishDir params.interim, pattern: "${assembly.simpleName}.taxonomy.tsv" - publishDir params.interim, pattern: '*.{bacteria,archaea}.fna' - - input: - path votes - path assembly - - output: - path "${assembly.simpleName}.taxonomy.tsv", emit: taxonomy - path "${assembly.simpleName}.bacteria.fna", emit: bacteria - path "${assembly.simpleName}.archaea.fna", emit: archaea - - """ - autometa-taxonomy \ - --votes ${votes} \ - --output . \ - --prefix ${assembly.simpleName} \ - --split-rank-and-write superkingdom \ - --assembly ${assembly} \ - --ncbi /ncbi - # Handling case where no archaea were recovered... - if [[ ! 
-f ${assembly.simpleName}.archaea.fna ]] - then touch ${assembly.simpleName}.archaea.fna - fi - """ -} - -// Autometa taxon assignment workflow -workflow TAXON_ASSIGNMENT { - take: - assembly - orfs - - main: - DIAMOND(orfs) - LCA(DIAMOND.out) - MAJORITY_VOTE(LCA.out) - SPLIT_KINGDOMS(MAJORITY_VOTE.out, assembly) - - emit: - taxonomy = SPLIT_KINGDOMS.out.taxonomy - bacteria = SPLIT_KINGDOMS.out.bacteria - archaea = SPLIT_KINGDOMS.out.archaea - orf_votes = LCA.out - contig_votes = MAJORITY_VOTE.out -} diff --git a/nextflow_schema.json b/nextflow_schema.json new file mode 100644 index 000000000..37e83be09 --- /dev/null +++ b/nextflow_schema.json @@ -0,0 +1,384 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/autometa/main/nextflow_schema.json", + "title": "autometa pipeline parameters", + "description": "Autometa: Automated Extraction of Microbial Genomes from Shotgun Metagenomes", + "type": "object", + "definitions": { + "input_output_options": { + "title": "Input/output options", + "type": "object", + "fa_icon": "fas fa-terminal", + "description": "Define where the pipeline should find input data and save output data.", + "required": [ + "input", + "publish_dir_mode" + ], + "properties": { + "input": { + "type": "string", + "fa_icon": "fas fa-dna", + "description": "Absolute (full) path of the metagenome assembly FASTA (.fna) file(s)", + "help_text": "Use this to specify the location of your input FASTA files. For example:\n\nA single FASTA file: \n```bash\n--input path/to/data/sample.fna\n```\n\nMultiple FASTA files: \n\n```bash\n--input path/to/data/sample_*.fna\n```\n\nNote:\nDo not surround with quotes", + "default": "null" + }, + "interim_dir": { + "type": "string", + "description": "Absolute (full) path of directory for intermediate files", + "default": "~/where/pipeline/is/launched", + "fa_icon": "fas fa-folder-open", + "help_text": "Directory for storing files created during intermediate steps but which are interesting enough to keep for debugging or analyzing with other tools" + }, + "outdir": { + "type": "string", + "description": "Absolute (full) path of directory where the results will be saved.", + "default": "~/where/pipeline/is/launched", + "fa_icon": "fas fa-folder-open" + }, + "tracedir": { + "type": "string", + "description": "Absolute (full) path of directory to keep pipeline Nextflow logs and reports.", + "default": "~/where/pipeline/is/launched", + "fa_icon": "fas fa-cogs" + }, + "publish_dir_mode": { + "type": "string", + "default": "copy", + "description": "How should output files be stored?", + "help_text": "\nImportant note: Using a linking method (anything ending in 'link') and then deleting the nextflow run's working/cache directory will result in the loss of all output files. Links will remain but will only point to non-existent file(s).\n\nUse one of the following\ncopy\n Copies the output files into the published directory. (default)\nsymlink\n Creates an absolute symbolic link in the published directory for each process output file.\nrellink\n Creates a relative symbolic link in the published directory for each process output file.\nlink\n Creates a hard link in the published directory for each process output file.\ncopyNoFollow\n Copies the output files into the published directory without following symlinks ie. copies the links themselves.\nmove\n Moves the output files into the published directory. Note: this is only supposed to be used for a terminating process i.e. 
a process whose output is not consumed by any other downstream process.\n\nFor more information see:\nhttps://www.nextflow.io/docs/latest/process.html#publishdir" + } + } + }, + "autometa_binning_parameters": { + "title": "Autometa Binning Parameters", + "type": "object", + "default": "", + "properties": { + "length_cutoff": { + "type": "integer", + "default": 3000, + "description": "Minimum contig length to use as input to Autometa" + }, + "norm_method": { + "type": "string", + "default": "am_clr", + "description": "Kmer count normalization transformation method to use. Choices are \"am_clr\", \"clr\", and \"ilr\"" + }, + "pca_dimensions": { + "type": "integer", + "default": 50, + "description": "Number of dimensions to which to reduce the initial k-mer frequencies matrix" + }, + "embedding_method": { + "type": "string", + "default": "bhsne", + "description": "Embedding method to use. Choices are \"sksne\", \"bhsne\", \"umap\", \"densmap\", \"trimap\"" + }, + "embedding_dimensions": { + "type": "integer", + "default": 2, + "description": "Final dimensions of the kmer frequencies matrix" + }, + "kmer_size": { + "type": "integer", + "default": 5, + "description": "kmer length to use during kmer counting" + }, + "clustering_method": { + "type": "string", + "default": "dbscan", + "description": "Cluster contigs using specified clustering method. Choices are \"dbscan\" and \"hdbscan\"" + }, + "classification_method": { + "type": "string", + "default": "decision_tree", + "description": "Classification method to use for unclustered recruitment step. Choices are \"decision_tree\" and \"random_forest\"" + }, + "classification_kmer_pca_dimensions": { + "type": "integer", + "default": 50, + "description": "Number of dimensions to which to reduce the initial k-mer frequencies matrix by PCA" + }, + "completeness": { + "type": "number", + "default": 20, + "description": "Minimum completeness needed to keep a cluster (default is at least 20% complete)" + }, + "purity": { + "type": "number", + "default": 95, + "description": "Minimum purity needed to keep a cluster (default is at least 95% pure)" + }, + "gc_stddev_limit": { + "type": "number", + "default": 5, + "description": "Maximum GC% standard deviation under which a cluster is kept (default is 5%)" + }, + "cov_stddev_limit": { + "type": "number", + "default": 25, + "description": "Maximum coverage standard deviation under which a cluster is kept (default is 25%)" + }, + "unclustered_recruitment": { + "type": "boolean", + "description": "Set to true for unclustered recruitment" + } + }, + "required": [ + "length_cutoff", + "norm_method", + "pca_dimensions", + "embedding_method", + "embedding_dimensions", + "kmer_size", + "clustering_method", + "classification_method", + "classification_kmer_pca_dimensions", + "completeness", + "purity", + "gc_stddev_limit", + "cov_stddev_limit" + ] + }, + "autometa_taxonomy_aware_binning_parameters": { + "title": "Autometa Taxonomy-Aware Binning Parameters", + "type": "object", + "description": "", + "default": "", + "properties": { + "taxonomy_aware": { + "type": "boolean", + "description": "Turn taxonomy-aware clustering on/off" + }, + "single_db_dir": { + "type": "string", + "default": "null", + "fa_icon": "fas fa-folder-open", + "description": "Directory containing databases required for taxonomy assignment" + }, + "nr_dmnd_dir": { + "type": "string", + "description": "Currently not used do not set", + "default": "Currently not used do not set", + "hidden": true + }, + "prot_accession2taxid_gz_dir": { + "type":
"string", + "description": "Currently not used do not set", + "default": "Currently not used do not set", + "hidden": true + }, + "taxdump_tar_gz_dir": { + "type": "string", + "description": "Currently not used do not set", + "default": "Currently not used do not set", + "hidden": true + }, + "binning_starting_rank": { + "type": "string", + "default": "superkingdom", + "description": "Which taxonomic rank to start the binning from. Choices are \"superkingdom\", \"phylum\", \"class\", \"order\", \"family\", \"genus\", \"species\"" + }, + "kingdom": { + "type": "string", + "default": "bacteria", + "description": "Bin contigs belonging to this kingdom. Choices are \"bacteria\" and \"archaea\"" + }, + "large_downloads_permission": { + "type": "boolean", + "description": "Can Autometa download nr.gz if needed? (It's >100Gb)" + } + } + }, + "autometa_nextflow_parameters": { + "title": "Autometa Nextflow Parameters", + "type": "object", + "description": "These parameters control how the pipeline is executed", + "default": "", + "properties": { + "num_splits": { + "type": "integer", + "default": 1, + "description": "How many parallel splits should be created. Should never be more than the total number of CPUs", + "help_text": "Values over 2 will result in splitting the input metagenome into X-number of FASTA files, to be processed in parallel where applicable. This will result in a doubling of disk space occupied by your input metagenome." + }, + "max_cpus": { + "type": "integer", + "default": 16, + "description": "Max cpus to use/request" + }, + "max_memory": { + "type": "string", + "default": "16 GB", + "description": "Max RAM to use/request" + }, + "max_time": { + "type": "string", + "default": "2d", + "description": "Max time a *single* process is allowed to run" + }, + "enable_conda": { + "type": "boolean", + "description": "Use conda?" + }, + "use_run_name": { + "type": "boolean", + "hidden": true, + "description": "If TRUE, run-name will be used in the output directory structure" + }, + "debug": { + "type": "boolean", + "description": "Run pipeline with small defaults (e.g. not the entire nr.gz download)", + "hidden": true + }, + "mock_test": { + "type": "boolean", + "description": "Run with minimal dataset", + "hidden": true + }, + "autometa_image_tag": { + "type": "string", + "default": "latest", + "description": "Change which tag of the autometa docker image is used", + "help_text": "Appends input to `jason-c-kwan/autometa`\n\njason-c-kwan/autometa:${params.autometa_image_tag}" + } + }, + "required": [ + "max_cpus", + "max_memory", + "max_time" + ] + }, + "generic_nf_core_options": { + "title": "Generic nf-core options", + "type": "object", + "fa_icon": "fas fa-university", + "description": "Parameters used to describe centralised config profiles. These should not be edited.", + "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", + "properties": { + "validate_params": { + "type": "boolean", + "default": true, + "description": "Whether to validate parameters on initiation", + "hidden": true + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits.
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", + "hidden": true + }, + "help": { + "type": "boolean", + "description": "Display help text.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "plaintext_email": { + "type": "string", + "hidden": true + }, + "email_on_fail": { + "type": "string", + "description": "Institutional configs hostname.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "config_profile_name": { + "type": "string", + "description": "Institutional config name.", + "hidden": true, + "fa_icon": "fas fa-users-cog" + }, + "monochrome_logs": { + "type": "boolean", + "description": "Do not use coloured log outputs.", + "fa_icon": "fas fa-users-cog", + "hidden": true + }, + "config_profile_url": { + "type": "string", + "description": "Only used for institutional-based profiles. See https://nf-co.re/usage/tutorials/step_by_step_institutional_profile#params-scope for more information", + "default": "false", + "hidden": true + }, + "config_profile_description": { + "type": "string", + "hidden": true, + "description": "Only used for institutional-based profiles. See https://nf-co.re/usage/tutorials/step_by_step_institutional_profile#params-scope for more information" + }, + "config_profile_contact": { + "type": "string", + "hidden": true, + "description": "Only used for institutional-based profiles. See https://nf-co.re/usage/tutorials/step_by_step_institutional_profile#params-scope for more information" + }, + "custom_config_version": { + "type": "string", + "default": "master", + "hidden": true + }, + "custom_config_base": { + "type": "string", + "default": "https://raw.githubusercontent.com/nf-core/configs/master", + "hidden": true + }, + "hostnames": { + "type": "string", + "default": "[binac:['.binac.uni-tuebingen.de'], cbe:['.cbe.vbc.ac.at'], cfc:['.hpc.uni-tuebingen.de'], crick:['.thecrick.org'], icr_davros:['.davros.compute.estate'], imperial:['.hpc.ic.ac.uk'], imperial_mb:['.hpc.ic.ac.uk'], genotoul:['.genologin1.toulouse.inra.fr', '.genologin2.toulouse.inra.fr'], genouest:['.genouest.org'], uppmax:['.uppmax.uu.se'], utd_ganymede:['ganymede.utdallas.edu'], utd_sysbio:['sysbio.utdallas.edu']]", + "hidden": true + }, + "show_hidden_params": { + "type": "string", + "hidden": true + }, + "singularity_pull_docker_container": { + "type": "string", + "hidden": true + } + }, + "required": [ + "validate_params" + ] + }, + "parameters_set_internally": { + "title": "Parameters set internally", + "type": "object", + "description": "These parameters are determined during runtime", + "default": "", + "properties": { + "outdir_internal": { + "type": "string", + "hidden": true, + "description": "These parameters are determined during runtime" + }, + "interim_dir_internal": { + "type": "string", + "hidden": true, + "description": "These parameters are determined during runtime" + } + } + } + }, + "allOf": [ + { + "$ref": "#/definitions/input_output_options" + }, + { + "$ref": "#/definitions/autometa_binning_parameters" + }, + { + "$ref": "#/definitions/autometa_taxonomy_aware_binning_parameters" + }, + { + "$ref": "#/definitions/autometa_nextflow_parameters" + }, + { + "$ref": "#/definitions/generic_nf_core_options" + }, + { + "$ref": "#/definitions/parameters_set_internally" + } + ] +} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f7919f4aa..2d217fe8e 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -4,22 +4,23 @@ attrs # test-data requirement bedtools biopython bowtie2 -diamond==2.* +diamond>=2.0 gdown hdbscan hmmer +libiconv make nextflow>=20.10 -numba>=0.34 numpy>=1.13 pandas>=1.1 parallel prodigal>=2.5 # 2.5 and 2.6 output format is different for sequence headers requests rsync -samtools==1.11 +samtools>=1.11 scikit-bio scikit-learn>=0.18 tqdm tsne +numba>=0.47 umap-learn>=0.5 # https://github.com/lmcinnes/umap/releases/tag/0.5.0 : umap, densmap diff --git a/setup.py b/setup.py index b9346014e..a3bb33a83 100644 --- a/setup.py +++ b/setup.py @@ -27,10 +27,11 @@ def read(fname): "autometa-config = autometa.config.utilities:main", "autometa-kmers = autometa.common.kmers:main", "autometa-coverage = autometa.common.coverage:main", - "autometa-parse-bed = autometa.common.external.bedtools:main", - "autometa-orfs = autometa.common.external.prodigal:main", - "autometa-markers = autometa.common.markers:main", "autometa-length-filter = autometa.common.metagenome:main", + "autometa-markers = autometa.common.markers:main", + "autometa-orfs = autometa.common.external.prodigal:main", + "autometa-parse-bed = autometa.common.external.bedtools:main", + "autometa-hmmsearch-filter = autometa.common.external.hmmsearch:main", "autometa-taxonomy = autometa.taxonomy.vote:main", "autometa-taxonomy-lca = autometa.taxonomy.lca:main", "autometa-taxonomy-majority-vote = autometa.taxonomy.majority_vote:main", diff --git a/subworkflows/local/align_reads.nf b/subworkflows/local/align_reads.nf new file mode 100644 index 000000000..ad00ccc4d --- /dev/null +++ b/subworkflows/local/align_reads.nf @@ -0,0 +1,59 @@ +process ALIGN_READS { + tag "Aligning reads to ${metagenome.simpleName}" + label "python_cpus" + + input: + path metagenome + path fwd_reads + path rev_reads + path se_reads + + output: + path "${metagenome.simpleName}.sam" + + script: + """ + bowtie2-build \\ + --threads ${task.cpus} \\ + ${metagenome} \\ + ${metagenome.simpleName}.db + + bowtie2 \\ + -x ${metagenome.simpleName}.db \\ + -q \\ + --phred33 \\ + --very-sensitive \\ + --no-unal \\ + -p ${task.cpus} \\ + -S ${metagenome.simpleName}.sam \\ + -1 $fwd_reads \\ + -2 $rev_reads \\ + -U $se_reads + """ +} + +params.bedtools_genomecov_options = [:] + +include { BOWTIE2_BUILD } from './../../modules/nf-core/modules/bowtie2/build/main.nf' addParams( options: [:] ) +include { BOWTIE2_ALIGN } from './../../modules/nf-core/modules/bowtie2/align/main.nf' addParams( options: params.bedtools_genomecov_options ) +include { BEDTOOLS_GENOMECOV } from './../../modules/nf-core/modules/bedtools/genomecov.nf' addParams( options: params.bedtools_genomecov_options ) + +workflow ALIGN_READS { + take: + metagenome + reads + + main: + BOWTIE2_BUILD ( + metagenome + ) // currently waiting to see if nf-core will update to include a meta map input + BOWTIE2_BUILD.out.index + .combine(reads) + .set{bowtie2_align_input_ch} + BOWTIE2_ALIGN ( + bowtie2_align_input_ch + ) // currently waiting to see if nf-core will update to include a meta map input + + emit: + sam = BOWTIE2_ALIGN.out.bed +} + diff --git a/subworkflows/local/bin_contigs.nf b/subworkflows/local/bin_contigs.nf new file mode 100644 index 000000000..6faf76ffc --- /dev/null +++ b/subworkflows/local/bin_contigs.nf @@ -0,0 +1,105 @@ +params.binning_options = [:] +params.unclustered_recruitment_options = [:] +params.binning_summary_options = [:] +params.taxdump_tar_gz_dir = [:] + +include { BINNING } from './../../modules/local/binning.nf' addParams( options: params.binning_options ) +include { UNCLUSTERED_RECRUITMENT } from './../../modules/local/unclustered_recruitment.nf' addParams( 
options: params.unclustered_recruitment_options ) +include { BINNING_SUMMARY } from './../../modules/local/binning_summary.nf' addParams( options: params.binning_summary_options, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir ) + + +workflow BIN_CONTIGS { + + take: + metagenome + kmers_embedded + kmers_normalized + coverage + gc_content + markers + taxon_assignments + binning_column + + main: + kmers_embedded + .join( + coverage + ) + .join( + gc_content + ) + .join( + markers + ) + .set{metagenome_annotations} + + if (params.taxonomy_aware) { + metagenome_annotations + .join( + taxon_assignments + ) + .set{binning_ch} + } else { + metagenome_annotations + .combine( + taxon_assignments + ) + .set{binning_ch} + } + + BINNING ( + binning_ch + ) + + kmers_normalized + .join( + coverage + ).join( + BINNING.out.binning + ).join( + markers + ) + .set{coverage_binningout_markers} + + if (params.taxonomy_aware) { + coverage_binningout_markers + .join( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } else { + coverage_binningout_markers + .combine( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } + + UNCLUSTERED_RECRUITMENT ( + unclustered_recruitment_ch + ) + + BINNING.out.main + .join( + markers + ).join( + metagenome + ) + .set{binning_summary_ch} + + BINNING_SUMMARY ( + binning_summary_ch, + binning_column + ) + + emit: + binning = BINNING.out.binning + binning_main = BINNING.out.main + recruitment = UNCLUSTERED_RECRUITMENT.out.binning + recruitment_main = UNCLUSTERED_RECRUITMENT.out.main + all_binning_results = BINNING.out.binning | mix(UNCLUSTERED_RECRUITMENT.out) | collect + summary_stats = BINNING_SUMMARY.out.stats + summary_taxa = BINNING_SUMMARY.out.taxonomies + metabins = BINNING_SUMMARY.out.metabins + +} diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf new file mode 100644 index 000000000..08a5df7e2 --- /dev/null +++ b/subworkflows/local/binning.nf @@ -0,0 +1,94 @@ +params.binning_options = [:] +params.binning_summary_options = [:] +params.taxdump_tar_gz_dir = [:] + +include { BIN_CONTIGS } from './../../modules/local/bin_contigs.nf' addParams( options: params.binning_options ) +include { BINNING_SUMMARY } from './../../modules/local/binning_summary.nf' addParams( options: params.binning_summary_options, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir ) + + +workflow BINNING { + + take: + metagenome + kmers_embedded + coverage + gc_content + markers + taxon_assignments + binning_column + + main: + kmers_embedded + .join( + coverage + ) + .join( + gc_content + ) + .join( + markers + ) + .set{metagenome_annotations} + + if (params.taxonomy_aware) { + metagenome_annotations + .join( + taxon_assignments + ) + .set{binning_ch} + } else { + metagenome_annotations + .combine( + taxon_assignments + ) + .set{binning_ch} + } + + BIN_CONTIGS ( + binning_ch + ) + + kmers_embedded + .join( + coverage + ).join( + BIN_CONTIGS.out.binning + ).join( + markers + ) + .set{coverage_binningout_markers} + + if (params.taxonomy_aware) { + coverage_binningout_markers + .join( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } else { + coverage_binningout_markers + .combine( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } + + BIN_CONTIGS.out.main + .join( + markers + ).join( + metagenome + ) + .set{binning_summary_ch} + + BINNING_SUMMARY ( + binning_summary_ch, + binning_column + ) + + emit: + binning = BIN_CONTIGS.out.binning + binning_main = BIN_CONTIGS.out.main + summary_stats = BINNING_SUMMARY.out.stats + summary_taxa = 
BINNING_SUMMARY.out.taxonomies + metabins = BINNING_SUMMARY.out.metabins +} diff --git a/subworkflows/local/contig_coverage.nf b/subworkflows/local/contig_coverage.nf new file mode 100644 index 000000000..250ecba0a --- /dev/null +++ b/subworkflows/local/contig_coverage.nf @@ -0,0 +1,50 @@ + +/* +======================= +TODO: Not yet implemented +======================= +*/ + + +params.rev_reads = null +params.fwd_reads = null + +params.length_table_options = [:] +params.align_reads_options = [:] +params.samtools_viewsort_options = [:] +params.genome_coverage_options = [:] + +include { LENGTH_TABLE } from './../../modules/local/length_table.nf' addParams( options: params.length_table_options ) +include { ALIGN_READS } from './../../modules/local/align_reads.nf' addParams( options: params.align_reads_options ) +include { SAMTOOLS_VIEW_AND_SORT } from './../../modules/local/samtools_view_sort.nf' addParams( samtools_viewsort_options: params.samtools_viewsort_options ) +include { GENOMECOV } from './../../subworkflows/local/genome_coverage.nf' addParams( options: params.genome_coverage_options ) + +workflow CONTIG_COVERAGE { + take: + metagenome + fwd_reads + rev_reads + se_reads + + main: + LENGTH_TABLE( + metagenome + ) + ALIGN_READS( + metagenome, + fwd_reads, + rev_reads, + se_reads + ) + SAMTOOLS_VIEW_AND_SORT( + ALIGN_READS.out + ) + GENOMECOV( + SAMTOOLS_VIEW_AND_SORT.out, + LENGTH_TABLE.out + ) + + emit: + bed = GENOMECOV.out.bed + coverage = GENOMECOV.out.coverage +} diff --git a/subworkflows/local/functions.nf b/subworkflows/local/functions.nf new file mode 100644 index 000000000..4492f839c --- /dev/null +++ b/subworkflows/local/functions.nf @@ -0,0 +1,99 @@ +/* +MIT License + +Copyright (c) 2018 nf-core + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +// +// Utility functions used in nf-core DSL2 module files +// + +// +// Extract name of software tool from process name using $task.process +// +def getSoftwareName(task_process) { + return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() +} + +// +// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules +// +def initOptions(Map args) { + def Map options = [:] + options.args = args.args ?: '' + options.args2 = args.args2 ?: '' + options.args3 = args.args3 ?: '' + options.publish_by_meta = args.publish_by_meta ?: [] + options.publish_dir = args.publish_dir ?: '' + options.publish_files = args.publish_files + options.suffix = args.suffix ?: '' + return options +} + +// +// Tidy up and join elements of a list to return a path string +// +def getPathFromList(path_list) { + def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries + paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes + return paths.join('/') +} + +// +// Function to save/publish module results +// +def saveFiles(Map args) { + if (!args.filename.endsWith('.version.txt')) { + def ioptions = initOptions(args.options) + def path_list = [ ioptions.publish_dir ?: args.publish_dir ] + if (ioptions.publish_by_meta) { + def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta + for (key in key_list) { + if (args.meta && key instanceof String) { + def path = key + if (args.meta.containsKey(key)) { + path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] + } + path = path instanceof String ? path : '' + path_list.add(path) + } + } + } + if (ioptions.publish_files instanceof Map) { + for (ext in ioptions.publish_files) { + if (args.filename.endsWith(ext.key)) { + def ext_list = path_list.collect() + ext_list.add(ext.value) + return "${getPathFromList(ext_list)}/$args.filename" + } + } + } else if (ioptions.publish_files == null) { + return "${getPathFromList(path_list)}/$args.filename" + } + } +} + +/* + * Check file extension + */ +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} diff --git a/subworkflows/local/genome_coverage.nf b/subworkflows/local/genome_coverage.nf new file mode 100644 index 000000000..4843e04b0 --- /dev/null +++ b/subworkflows/local/genome_coverage.nf @@ -0,0 +1,29 @@ +params.bedtools_genomecov_options = [:] + +include { BEDTOOLS_GENOMECOV } from './../../modules/nf-core/modules/bedtools/genomecov.nf' addParams( options: params.bedtools_genomecov_options ) + +workflow GENOME_COVERAGE { + take: + bam // channel: [ val(meta), path(bam) ] + lengths // channel: [ val(meta), path(lengths) ] // https://bedtools.readthedocs.io/en/latest/content/general-usage.html#genome-file-format + + main: + bedtools_input_ch = bam.combine(lengths) + + BEDTOOLS_GENOMECOV ( + bedtools_input_ch + ) + + bam.out.bed + .combine(lengths) + .combine(BEDTOOLS_GENOMECOV.out.bed) + .set{parse_bed_input_ch} + + PARSE_BED ( + parse_bed_input_ch + ) + + emit: + bed = BEDTOOLS_GENOMECOV.out.bed + coverage = PARSE_BED.out.coverage +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf new file mode 100644 index 000000000..23706f0c6 --- /dev/null +++ b/subworkflows/local/input_check.nf @@ -0,0 +1,58 @@ +// +// Check input samplesheet and get read channels +// + +params.options = [:] + +include { SAMPLESHEET_CHECK } from 
'../../modules/local/samplesheet_check' addParams( options: params.options ) + +workflow INPUT_CHECK { + take: + samplesheet // file: /path/to/samplesheet.csv + + main: + SAMPLESHEET_CHECK ( samplesheet ) + .splitCsv ( header:true, sep:',' ) + .map { create_fastq_channels(it) } + .set { reads } + + emit: + reads // channel: [ val(meta), [ reads ] ] +} + +// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] +def create_fastq_channels(LinkedHashMap row) { + def meta = [:] + meta.id = row.sample + meta.single_end = row.single_end.toBoolean() + + def array = [] + if (!file(row.fastq_1).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" + } + if (meta.single_end) { + array = [ meta, [ file(row.fastq_1) ] ] + } else { + if (!file(row.fastq_2).exists()) { + exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + } + array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + } + return array +} + + +workflow INPUT_CONTIGS { + main: + Channel + .fromPath(params.input) + .ifEmpty { exit 1, "Cannot find contig fasta file(s)." } + .map { row -> + def meta = [:] + meta.id = row.simpleName + return [ meta, row ] + } + .set { ch_fasta } + emit: + metagenome = ch_fasta +} diff --git a/subworkflows/local/mock_data.nf b/subworkflows/local/mock_data.nf new file mode 100644 index 000000000..62daf0dce --- /dev/null +++ b/subworkflows/local/mock_data.nf @@ -0,0 +1,88 @@ +process GET_ASSEMBLY_SUMMARY { + + output: + path "assembly_summary_refseq.txt" + + """ + curl -s https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt > assembly_summary_refseq.txt + """ +} + + +process GET_FTP_DIRS { + + input: + path assembly_summary_refseq + val x + + output: + path "outfile" + """ + cat "${assembly_summary_refseq}" |\ + grep "${x}" |\ + awk -F '\\t' '{print \$20}' > outfile + """ + +} + + +process DOWNLOAD_MOCK_DATA { + + input: + path url + + output: + path "**_genomic.fna.gz", emit: nucleotide + path "**_protein.faa.gz", emit: protein + + """ + cat outfile | sed 's,ftp://,rsync://,' | xargs -n 1 -I {} rsync -am --exclude='*_rna_from_genomic.fna.gz' --exclude='*_cds_from_genomic.fna.gz' --include="*_genomic.fna.gz" --include="*_protein.faa.gz" --include='*/' --exclude='*' {} . 
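+ # Note: rsync applies --include/--exclude filters in order, so the RNA- and CDS-from-genomic
+ # FASTA files are excluded before the broader *_genomic.fna.gz pattern can match them.
+ # --include='*/' keeps directory recursion alive and the final --exclude='*' skips every
+ # other file, while -m prunes empty directories from the mirrored tree.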
+ """ +} + +process WRITE_FILE_APPEND_COV { + + input: + path x + val y + + output: + path "${y}" , emit: fasta + + """ + cat "${x}" | awk '/^>/ {\$0=\$1} 1' | sed 's/>.*/&_length_1_cov_1/' > "${y}" + """ +} + +assemblies = Channel.fromList( + [ + "GCF_008124965.1" + ] +) + +workflow CREATE_MOCK { + + main: + GET_ASSEMBLY_SUMMARY() + GET_FTP_DIRS( + GET_ASSEMBLY_SUMMARY.out, + assemblies.flatten() + ) + DOWNLOAD_MOCK_DATA(GET_FTP_DIRS.out) + WRITE_FILE_APPEND_COV( + DOWNLOAD_MOCK_DATA.out.nucleotide.splitFasta(by:1).collectFile(), + "mock_metagenome.fna" + ) + + WRITE_FILE_APPEND_COV.out.fasta + .map { row -> + def meta = [:] + meta.id = row.simpleName + return [ meta, row ] + } + .set { ch_fasta } + + emit: + fasta = ch_fasta +} + diff --git a/subworkflows/local/prepare_ncbi_taxinfo.nf b/subworkflows/local/prepare_ncbi_taxinfo.nf new file mode 100644 index 000000000..48ceb8da7 --- /dev/null +++ b/subworkflows/local/prepare_ncbi_taxinfo.nf @@ -0,0 +1,136 @@ +// this file probably needs to be reevaluated, but from a python-first +// perspective since the python code assumes file/directory structure +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +params.taxdump_tar_gz_dir = [:] +params.prot_accession2taxid_gz_dir = [:] +options = initOptions(params.options) + +process TEST_DOWNLOAD { + // For development work so you don't download the entire prot.accession2taxid.gz database + tag "Downloading first 10,000 lines of prot.accession2taxid.gz" + label 'process_low' + storeDir "${params.prot_accession2taxid_gz_dir}" + + conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + output: + path("prot.accession2taxid"), emit: singlefile + + script: + """ + # https://github.com/nextflow-io/nextflow/issues/1564 + trap 'echo OK; exit 0;' EXIT + curl -s ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz | zcat | head -n 1000 > prot.accession2taxid + """ +} + +process DOWNLOAD_ACESSION2TAXID { + tag "Downloading prot.accession2taxid.gz" + label 'process_low' + storeDir "${params.prot_accession2taxid_gz_dir}" + + conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + output: + // hack nf-core options.args3 and use for output name + path "prot.accession2taxid.gz" , emit: accession2taxid + path "*.version.txt" , emit: version + script: + """ + rsync -a \\ + --quiet \\ + 'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz' 'prot.accession2taxid.gz' + + rsync -a \\ + --quiet \\ + 'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.gz.md5' 'prot.accession2taxid.gz.md5' + + md5sum -c *.md5 + + rsync --version | head -n1 > rsync.version.txt + """ +} + + +process DOWNLOAD_TAXDUMP { + tag "Downloading taxdump.tar.gz" + label 'process_low' + storeDir "${params.taxdump_tar_gz_dir}" + + conda (params.enable_conda ? 
"conda-forge::rsync=3.2.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + output: + path "*" , emit: taxdump_files + path "*.version.txt" , emit: version + + script: + """ + rsync -a \ + --quiet \ + 'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz' 'taxdump.tar.gz' + + rsync -a \ + --quiet \ + 'rsync://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz.md5' 'taxdump.tar.gz.md5' + + md5sum -c *.md5 + rm 'taxdump.tar.gz.md5' + tar -xf taxdump.tar.gz + rm taxdump.tar.gz + + rsync --version | head -n1 > rsync.version.txt + """ +} + + +workflow PREPARE_TAXONOMY_DATABASES { + main: + taxdump_dir = file(params.taxdump_tar_gz_dir) + taxdump_dir_files = taxdump_dir.list() + expected_files = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'names.dmp', 'nodes.dmp'] + + if (taxdump_dir_files.containsAll(expected_files)){ + taxdump_files = taxdump_dir_files + } else { + DOWNLOAD_TAXDUMP() + DOWNLOAD_TAXDUMP.out.taxdump_files + .set{taxdump_files} + } + + accession2taxid_dir = file(params.prot_accession2taxid_gz_dir) + accession2taxid_dir_files = accession2taxid_dir_files.list() + expected_files = ['prot.accession2taxid'] + + if (accession2taxid_dir_files.containsAll(expected_files)){ + prot_accession2taxid_ch = accession2taxid_dir_files + } else if (params.debug){ + TEST_DOWNLOAD().singlefile + .set{prot_accession2taxid_ch} + } else { + DOWNLOAD_ACESSION2TAXID().accession2taxid + .set{prot_accession2taxid_ch} + } + + emit: + taxdump = taxdump_files + prot_accession2taxid = prot_accession2taxid_ch + +} + diff --git a/subworkflows/local/prepare_nr.nf b/subworkflows/local/prepare_nr.nf new file mode 100644 index 000000000..ba0436e27 --- /dev/null +++ b/subworkflows/local/prepare_nr.nf @@ -0,0 +1,92 @@ +// Import generic module functions +include { initOptions; saveFiles; getSoftwareName } from './functions' + +params.options = [:] +options = initOptions(params.options) + +params.diamond_makedb_options = [:] +params.nr_dmnd_dir = [:] + +include { DIAMOND_MAKEDB } from './../../modules/local/diamond_makedb.nf' addParams( options: params.diamond_makedb_options, nr_dmnd_dir: params.nr_dmnd_dir) + +process DOWNLOAD_NR { + tag "Downloading nr.gz (>100GB download. May take some time.)" + label 'process_low' + storeDir "${params.nr_dmnd_dir}" + + conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + output: + path("nr.gz"), emit: singlefile + + script: + """ + rsync -a \\ + --quiet \\ + 'rsync://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz' 'nr.gz' + + rsync -a \\ + --quiet \\ + 'rsync://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5' 'nr.gz.md5' + + md5sum -c *.md5 + """ +} + + +process TEST_DOWNLOAD { + // For development work so you don't download the entire nr.gz database + tag "Downloading first 10,000 lines of nr.gz" + label 'process_low' + + conda (params.enable_conda ? 
"conda-forge::rsync=3.2.3" : null) + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" + } else { + container "jason-c-kwan/autometa:${params.autometa_image_tag}" + } + + output: + path("nr.gz"), emit: singlefile + + script: + """ + # https://github.com/nextflow-io/nextflow/issues/1564 + trap 'echo OK; exit 0;' EXIT + curl -s ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz | zcat | head -n 10000 | gzip > nr.gz + """ +} + +workflow PREPARE_NR_DB { + + main: + if (file("${params.nr_dmnd_dir}/nr.dmnd").exists()){ + // skip huge download and db creation if nr.dmnd already exists + out_ch = file("${params.nr_dmnd_dir}/nr.dmnd") + } else if (file("${params.nr_dmnd_dir}/nr.gz").exists()){ + // skip huge download if nr.gz already exists + DIAMOND_MAKEDB(file("${params.nr_dmnd_dir}/nr.gz"), "nr") + DIAMOND_MAKEDB.out.diamond_db + .set{out_ch} + } else if (params.debug){ + TEST_DOWNLOAD().singlefile + .set{nr_db_ch} + DIAMOND_MAKEDB(nr_db_ch, "nr") + DIAMOND_MAKEDB.out.diamond_db + .set{out_ch} + } else { + DOWNLOAD_NR().singlefile + .set{nr_db_ch} + DIAMOND_MAKEDB(nr_db_ch, "nr") + DIAMOND_MAKEDB.out.diamond_db + .set{out_ch} + } + + emit: + diamond_db = out_ch +} diff --git a/subworkflows/local/taxon_assignment.nf b/subworkflows/local/taxon_assignment.nf new file mode 100644 index 000000000..dcc5daf35 --- /dev/null +++ b/subworkflows/local/taxon_assignment.nf @@ -0,0 +1,82 @@ +params.lca_options = [:] +params.majority_vote_options = [:] +params.split_kingdoms_options = [:] +params.nr_dmnd_dir = [:] +params.taxdump_tar_gz_dir = [:] +params.prot_accession2taxid_gz_dir = [:] +params.diamond_blastp_options = [:] + +params.debug = [:] +params.diamond_makedb_options = [:] +params.large_downloads_permission = [:] + + +include { PREPARE_NR_DB } from './prepare_nr.nf' addParams( debug: params.debug, diamond_makedb_options: params.diamond_makedb_options, nr_dmnd_dir: params.nr_dmnd_dir ) +include { PREPARE_TAXONOMY_DATABASES } from './prepare_ncbi_taxinfo.nf' addParams( debug: params.debug, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir, prot_accession2taxid_gz_dir: params.prot_accession2taxid_gz_dir ) +include { LCA } from './../../modules/local/lca.nf' addParams( options: params.lca_options ) +include { MAJORITY_VOTE } from './../../modules/local/majority_vote.nf' addParams( options: params.majority_vote_options ) +include { SPLIT_KINGDOMS } from './../../modules/local/split_kingdoms.nf' addParams( options: params.split_kingdoms_options ) +include { DIAMOND_BLASTP } from './../../modules/local/diamond_blastp.nf' addParams( options: params.diamond_blastp_options ) + + +// Autometa taxon assignment workflow +workflow TAXON_ASSIGNMENT { + take: + metagenome + merged_prodigal + + main: + // check if user has given permission for large downloads + if (params.large_downloads_permission) { + // Download and prep necessary databases + PREPARE_NR_DB() + PREPARE_NR_DB.out.diamond_db + .set{diamond_db} + PREPARE_TAXONOMY_DATABASES() + PREPARE_TAXONOMY_DATABASES.out.taxdump + .set{ncbi_taxdump} + PREPARE_TAXONOMY_DATABASES.out.prot_accession2taxid + .set{prot_accession2taxid} + } else { + diamond_db = file("${params.nr_dmnd_dir}/nr.dmnd") + ncbi_taxdump = file("${params.taxdump_tar_gz_dir}/taxdump.tar.gz") + prot_accession2taxid = file("${params.prot_accession2taxid_gz_dir}/prot.accession2taxid.gz") + } + + DIAMOND_BLASTP ( + merged_prodigal, + diamond_db + ) + + ncbi_tax_dir = 
file(params.taxdump_tar_gz_dir) + + LCA ( + DIAMOND_BLASTP.out.diamond_results, + ncbi_tax_dir + ) // output '${blast.simpleName}.lca.tsv' + + MAJORITY_VOTE ( + LCA.out.lca, + ncbi_tax_dir + ) //output ${lca.simpleName}.votes.tsv + + metagenome + .join( + MAJORITY_VOTE.out.votes + ) + .set{split_kingdoms_input} + + SPLIT_KINGDOMS ( + split_kingdoms_input, + ncbi_tax_dir + ) + + emit: + taxonomy = SPLIT_KINGDOMS.out.taxonomy + bacteria = SPLIT_KINGDOMS.out.bacteria + archaea = SPLIT_KINGDOMS.out.archaea + orf_votes = LCA.out.lca + contig_votes = MAJORITY_VOTE.out.votes +} + + diff --git a/subworkflows/local/unclustered_recruitment.nf b/subworkflows/local/unclustered_recruitment.nf new file mode 100644 index 000000000..afd656f21 --- /dev/null +++ b/subworkflows/local/unclustered_recruitment.nf @@ -0,0 +1,70 @@ +params.binning_options = [:] +params.unclustered_recruitment_options = [:] +params.binning_summary_options = [:] +params.taxdump_tar_gz_dir = [:] + +include { RECRUIT } from './../../modules/local/unclustered_recruitment.nf' addParams( options: params.unclustered_recruitment_options ) +include { BINNING_SUMMARY as UNCLUSTERED_BINNING_SUMMARY } from './../../modules/local/binning_summary.nf' addParams( options: params.binning_summary_options, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir ) + + +workflow UNCLUSTERED_RECRUITMENT { + + take: + metagenome + kmers_normalized + coverage + markers + taxon_assignments + binning + + main: + + kmers_normalized + .join( + coverage + ).join( + binning //BINNING.out.binning + ).join( + markers + ) + .set{coverage_binningout_markers} + + if (params.taxonomy_aware) { + coverage_binningout_markers + .join( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } else { + coverage_binningout_markers + .combine( + taxon_assignments + ) + .set{unclustered_recruitment_ch} + } + + RECRUIT ( + unclustered_recruitment_ch + ) + + RECRUIT.out.main + .join( + markers + ).join( + metagenome + ) + .set{unclustered_recruitment_summary_ch} + + // UNCLUSTERED_BINNING_SUMMARY ( + // unclustered_recruitment_summary_ch, + // "recruited_cluster" + // ) + + emit: + recruitment = RECRUIT.out.binning + recruitment_main = RECRUIT.out.main + all_binning_results = binning | mix(RECRUIT.out) | collect + // unclustered_recruitment_summary_stats = UNCLUSTERED_BINNING_SUMMARY.out.stats + // unclustered_recruitment_summary_taxa = UNCLUSTERED_BINNING_SUMMARY.out.taxonomies + // unclustered_recruitment_metabins = UNCLUSTERED_BINNING_SUMMARY.out.metabins +} diff --git a/workflows/autometa.nf b/workflows/autometa.nf new file mode 100644 index 000000000..29a3d61ff --- /dev/null +++ b/workflows/autometa.nf @@ -0,0 +1,238 @@ +/* + * ------------------------------------------------- + * Autometa workflow + * ------------------------------------------------- +*/ + +def modules = params.modules.clone() + +def check_for_file(path) { + return +} + +// check if user wants to separate contigs based on taxonomy before binning + +if (params.single_db_dir) { + internal_nr_dmnd_dir = params.single_db_dir + internal_prot_accession2taxid_gz_dir = params.single_db_dir + internal_taxdump_tar_gz_dir = params.single_db_dir +} +// TODO: when implementing the ability to set individual DB dirs +// just override e.g. 'internal_nr_dmnd_location' here so users can set +// 'single_db_dir' but also set individual other db paths if they have them +// e.g. if they have nr.dmnd but not the other files. 
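One possible shape for the TODO above, sketched here rather than implemented in this diff; it assumes the currently hidden params.nr_dmnd_dir, params.prot_accession2taxid_gz_dir and params.taxdump_tar_gz_dir would default to null instead of their present placeholder strings:

    // Prefer an explicitly supplied per-database directory; otherwise fall back to single_db_dir
    internal_nr_dmnd_dir                 = params.nr_dmnd_dir                 ?: params.single_db_dir
    internal_prot_accession2taxid_gz_dir = params.prot_accession2taxid_gz_dir ?: params.single_db_dir
    internal_taxdump_tar_gz_dir          = params.taxdump_tar_gz_dir          ?: params.single_db_dir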
+ +if (params.large_downloads_permission) { + // TODO: check if files already exist, if they don't fail the pipeline early at this stage +} else { + // TODO: check if files exist, if they don't fail the pipeline early at this stage +} + + +// if these are still null then it means they weren't set, so make them null. +// this only works because the markov models are inside the docker image. +// that needs to be changed in future versions + +if (!params.taxonomy_aware) { + single_db_dir = null + internal_nr_dmnd_dir = null + internal_prot_accession2taxid_gz_dir = null + internal_taxdump_tar_gz_dir = null +} + + + + + +/* + * ------------------------------------------------- + * Import local modules + * ------------------------------------------------- +*/ + +include { ANALYZE_KMERS } from '../modules/local/analyze_kmers' addParams( options: modules['analyze_kmers_options'] ) +include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['csv':'']] ) +include { HMMER_HMMSEARCH } from '../modules/local/hmmer_hmmsearch' addParams( options: modules['hmmsearch_options'] ) +include { HMMER_HMMSEARCH_FILTER } from '../modules/local/hmmer_hmmsearch_filter' addParams( options: modules['hmmsearch_filter_options'] ) +include { SEQKIT_FILTER } from '../modules/local/seqkit_filter' addParams( options: [publish_files : ['*':'']] ) +include { MERGE_TSV_WITH_HEADERS as MERGE_SPADES_COVERAGE_TSV } from '../modules/local/merge_tsv' addParams( options: modules['spades_kmer_coverage'] ) +include { MERGE_TSV_WITH_HEADERS as MERGE_HMMSEARCH } from '../modules/local/merge_tsv' addParams( options: modules['merge_hmmsearch_options'] ) +include { SEQKIT_SPLIT } from '../modules/local/seqkit_split' addParams( options: modules['seqkit_split_options'], num_splits: params.num_splits ) +include { SPADES_KMER_COVERAGE } from '../modules/local/spades_kmer_coverage' addParams( options: modules['spades_kmer_coverage'] ) +include { MERGE_FASTA as MERGE_PRODIGAL } from '../modules/local/merge_fasta' addParams( ) +include { MARKERS } from '../modules/local/markers' addParams( options: modules['seqkit_split_options'] ) + +/* + * ------------------------------------------------- + * Import nf-core modules + * ------------------------------------------------- +*/ +// https://github.com/nf-core/modules/tree/master/modules +// https://nf-co.re/tools/#modules +// nf-core modules --help +include { PRODIGAL } from './../modules/nf-core/modules/prodigal/main' addParams( options: modules['prodigal_options'] ) + +/* + * ------------------------------------------------- + * Import local subworkflows + * ------------------------------------------------- +*/ + +include { BINNING } from '../subworkflows/local/binning' addParams( binning_options: modules['binning_options'], unclustered_recruitment_options: modules['unclustered_recruitment_options'], binning_summary_options: modules['binning_summary_options'], taxdump_tar_gz_dir: internal_taxdump_tar_gz_dir ) +include { UNCLUSTERED_RECRUITMENT } from '../subworkflows/local/unclustered_recruitment' addParams( binning_options: modules['binning_options'], unclustered_recruitment_options: modules['unclustered_recruitment_options'], binning_summary_options: modules['binning_summary_options'], taxdump_tar_gz_dir: internal_taxdump_tar_gz_dir ) +include { INPUT_CONTIGS } from '../subworkflows/local/input_check' addParams( ) +include { CREATE_MOCK } from '../subworkflows/local/mock_data' addParams( ) +include { TAXON_ASSIGNMENT } from 
'../subworkflows/local/taxon_assignment' addParams( options: modules['taxon_assignment'], majority_vote_options: modules['majority_vote_options'], split_kingdoms_options: modules['split_kingdoms_options'], nr_dmnd_dir: internal_nr_dmnd_dir, taxdump_tar_gz_dir: internal_taxdump_tar_gz_dir, prot_accession2taxid_gz_dir: internal_prot_accession2taxid_gz_dir, diamond_blastp_options: modules['diamond_blastp_options'], large_downloads_permission: params.large_downloads_permission ) + +workflow AUTOMETA { + ch_software_versions = Channel.empty() + + if (params.mock_test){ + CREATE_MOCK() + CREATE_MOCK.out.fasta + .set{input_ch} + } else { + INPUT_CONTIGS() + INPUT_CONTIGS.out.metagenome + .set{input_ch} + } + + + SEQKIT_FILTER( + input_ch + ) + + // Split contigs FASTA if running in parallel + if ( params.num_splits > 1 ) { + SEQKIT_SPLIT ( + SEQKIT_FILTER.out.fasta + ) + fasta_ch = SEQKIT_SPLIT.out.fasta.transpose() + } else { + fasta_ch = SEQKIT_FILTER.out.fasta + } + +/* + * ------------------------------------------------- + * Find coverage, currently only pulling from SPADES output + * ------------------------------------------------- +*/ + + SPADES_KMER_COVERAGE ( + fasta_ch + ) + +/* + * ------------------------------------------------- + * Find open reading frames with Prodigal + * ------------------------------------------------- +*/ + + PRODIGAL ( + fasta_ch, + "gbk" + ) + +/* + * ------------------------------------------------- + * If running in parallel, merge Prodigal results + * ------------------------------------------------- +*/ + + if ( params.num_splits > 0 ) { + MERGE_PRODIGAL ( + PRODIGAL.out.amino_acid_fasta.groupTuple(), + "faa" + ) + MERGE_PRODIGAL.out.merged + .set{merged_prodigal} + } else { + PRODIGAL.out.amino_acid_fasta + .set{merged_prodigal} + } + +/* + * ------------------------------------------------- + * OPTIONAL: Run Diamond BLASTp and split contigs into taxonomic groups + * ------------------------------------------------- +*/ + + if (params.taxonomy_aware) { + TAXON_ASSIGNMENT ( + SEQKIT_FILTER.out.fasta, + merged_prodigal + ) + TAXON_ASSIGNMENT.out.taxonomy + .set{taxonomy_results} + + TAXON_ASSIGNMENT.out.bacteria + ANALYZE_KMERS ( + TAXON_ASSIGNMENT.out.bacteria + ) + } else { + ANALYZE_KMERS ( SEQKIT_FILTER.out.fasta ) + taxonomy_results = file( "$baseDir/assets/dummy_file.txt", checkIfExists: true ) + taxonomy_results = Channel.fromPath( taxonomy_results ) + } + + ANALYZE_KMERS.out.embedded + .set{kmers_embedded_merged_tsv_ch} + + ANALYZE_KMERS.out.normalized + .set{kmers_normalized_tsv_ch} + +// -------------------------------------------------------------------------------- +// Run hmmsearch and look for marker genes in contig orfs +// -------------------------------------------------------------------------------- + MARKERS(PRODIGAL.out.amino_acid_fasta) + // To move to hmmsearch instead of hmmscan: + // HMMER_HMMSEARCH.out.domtblout + // .join(PRODIGAL.out.amino_acid_fasta) + // .set{hmmsearch_out} + // HMMER_HMMSEARCH_FILTER(hmmsearch_out) + + // Before binning we need to merge back everything that was run in parallel + if ( params.num_splits > 0 ) { + MERGE_SPADES_COVERAGE_TSV ( + SPADES_KMER_COVERAGE.out.coverages.groupTuple(), + "coverage" + ) + MERGE_SPADES_COVERAGE_TSV.out.merged_tsv + .set{spades_coverage_merged_tsv_ch} + + MERGE_HMMSEARCH ( + MARKERS.out.markers_tsv.groupTuple(), + "markers.tsv" + ) + MERGE_HMMSEARCH.out.merged_tsv + .set{markers_tsv_merged_tsv_ch} + } else { + fasta_ch = SEQKIT_FILTER.out.fasta + 
SPADES_KMER_COVERAGE.out.coverages + .set{spades_coverage_merged_tsv_ch} + MARKERS.out.markers_tsv + .set{markers_tsv_merged_tsv_ch} + } + + BINNING( + SEQKIT_FILTER.out.fasta, + kmers_embedded_merged_tsv_ch, + spades_coverage_merged_tsv_ch, + SEQKIT_FILTER.out.gc_content, + markers_tsv_merged_tsv_ch, + taxonomy_results, + "cluster" + ) + + if (params.unclustered_recruitment) { + UNCLUSTERED_RECRUITMENT( + SEQKIT_FILTER.out.fasta, + kmers_normalized_tsv_ch, + spades_coverage_merged_tsv_ch, + markers_tsv_merged_tsv_ch, + taxonomy_results, + BINNING.out.binning + ) + } + +}
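For orientation, a minimal params block in nextflow.config syntax that would drive this workflow through the taxonomy-aware binning and unclustered-recruitment branches; the paths are placeholders and, apart from length_cutoff, the values shown are illustrative rather than schema defaults:

    params {
        input                      = '/path/to/metagenome.fna'  // placeholder assembly FASTA
        length_cutoff              = 3000                       // schema default
        num_splits                 = 4                          // values above 1 split the FASTA for parallel ORF calling and coverage
        taxonomy_aware             = true                       // run DIAMOND blastp, LCA and kingdom splitting before binning
        single_db_dir              = '/path/to/databases'       // placeholder; expects nr.dmnd, taxdump and prot.accession2taxid files
        large_downloads_permission = false                      // databases above must already exist locally
        unclustered_recruitment    = true                       // run the recruitment step after initial binning
    }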