Merge pull request #20 from NBISweden/develop
Merge development changes for release v2.2
johnne committed Oct 15, 2020
2 parents a22d0c9 + eabf34f commit 4247197
Showing 46 changed files with 454 additions and 357 deletions.
1 change: 0 additions & 1 deletion .editorconfig
@@ -13,4 +13,3 @@ indent_size = 4
[*.{yml,yaml}]
indent_style = space
indent_size = 2

1 change: 0 additions & 1 deletion .gitattributes
@@ -1,3 +1,2 @@
*.smk linguist-language=Python
Snakefile linguist-language=Python

30 changes: 21 additions & 9 deletions .github/workflows/main.yml
@@ -13,6 +13,8 @@ on:
- 'LICENSE'
branches:
- master
env:
TMPDIR: /tmp

jobs:
test:
@@ -25,44 +27,50 @@ jobs:
- uses: s-weigand/setup-conda@v1
- name: Install the conda environment
run: conda env update -n base -f environment.yml
# First dry-run on full workflow
- name: Dry run
run: snakemake --use-conda -j 4 --configfile .test/config/dry-run.yaml -n
# Test cutadapt (10k reads/sample)
- name: Cutadapt
run: |
snakemake --use-conda -j 4 --configfile .test/config/cutadapt.yaml -p --notemp qc
snakemake -j 1 --configfile .test/config/cutadapt.yaml --report report.html qc
# Upload sample report
- name: Upload cutadapt report
uses: actions/upload-artifact@v1
with:
name: ${{ runner.os }}-cutadapt.html
path: report.html
# Test all other preprocessing software except cutadapt (10k reads/sample)
- name: Preprocess
run: |
rm -rf results report.html
snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p qc
snakemake --use-conda -j 4 --configfile .test/config/preprocess.yaml -p --report report.html qc
# Upload samples report
- name: Upload samples report
uses: actions/upload-artifact@v1
with:
name: ${{ runner.os }}-samples_report.html
path: report.html
# Run kraken for sample1 (paired-end) and sample4 (single-end) (10k reads/sample)
- name: Kraken
run: |
snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml -p results/kraken/sample1_1_pe.kreport results/kraken/sample4_1_se.kreport
- name: Metaspades
rm -rf results examples/data/sample*
# Test annotations + normalizations with metaspades (100k reads/sample)
- name: Annotation
run: |
snakemake --use-conda -j 4 --configfile .test/config/metaspades.yaml -p assemble
rm -r results/assembly results/report examples/data/sample*
- name: Prepare taxonomy
run: bash .test/scripts/prep_taxonomy.sh
- name: Taxonomy
run: |
snakemake --use-conda -j 4 --configfile .test/config/taxonomy.yaml -p taxonomy
rm -r results examples/data/sample*
bash .test/scripts/prep_eggnog.sh
bash .test/scripts/prep_taxonomy.sh
snakemake --use-conda -j 2 --configfile .test/config/annotate.yaml -p annotate
rm -r results/assembly examples/data/sample*
# Test Metabat2 with Megahit (200k reads/sample)
- name: Metabat
run: |
snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p bin assemble
snakemake --use-conda -j 4 --configfile .test/config/metabat.yaml -p --report report.html bin assemble
# Upload report
- name: Upload snakemake report
uses: actions/upload-artifact@v1
with:
@@ -86,16 +94,20 @@ jobs:
restore-keys: |
${{ runner.os }}-${{ env.cache-name }}
${{ runner.os }}-
# Run kraken and produce final output reports (linux only) (10k reads/sample)
- name: Kraken
run: |
snakemake --use-conda -j 4 --configfile .test/config/kraken.yaml --notemp -p classify
rm -r examples/data/sample* results
# Run binning including also checkm (200k reads/sample)
- name: Binning
run: |
snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p assemble bin
# Create report for assembly and binning
- name: Snakemake report
run: |
snakemake --use-conda -j 4 --configfile .test/config/binning.yaml -p --report report.html assemble bin
# Upload report
- name: Upload snakemake report
uses: actions/upload-artifact@v1
with:
11 changes: 5 additions & 6 deletions .test/README.md
@@ -1,24 +1,23 @@
# Testing

This directory contains config files, resources and scripts used to test the
This directory contains config files, resources and scripts used to test the
workflow.

**config/**

Files under `config/` are used by the different test steps of the github
Files under `config/` are used by the different test steps of the github
actions testing.

**data/**

The fasta file at `data/uniref100.fasta` was created by:

1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate
1. searching [uniprot](https://uniprot.org) for the 5 taxids used to generate
the [synthetic metagenome](https://zenodo.org/record/3737112#.XsUQncZ8LOQ) that
this workflow uses for testing. This resulted in 8,122 identified sequences.
2. mapping the sequences to their UniRef100 id via the Retrieve/ID mapping tool
at uniprot, followed by downloading of the reference sequences.
3. subsampling 100 sequences from the downloaded fasta file using `seqtk`.

During testing the fasta file is used to build and query a diamond database
using `tango`.

During testing the fasta file is used to build and query a diamond database
using `tango`.
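The README's step 3 above subsamples sequences from a fasta file with `seqtk` (the exact command is not shown in the diff). A minimal Python sketch of that subsampling step, purely for illustration — `parse_fasta` and `subsample` are hypothetical helpers, not part of the workflow:

```python
import random

def parse_fasta(text):
    """Split FASTA text into (header, sequence) records."""
    records = []
    for chunk in text.strip().split(">")[1:]:
        header, _, rest = chunk.partition("\n")
        records.append((header, rest.replace("\n", "")))
    return records

def subsample(records, n, seed=100):
    """Randomly pick n records, analogous to subsampling with a fixed seed."""
    return random.Random(seed).sample(records, n)

# Example: pick 2 of 4 sequences (the test data picks 100 of 8,122).
fasta = ">a\nACGT\n>b\nGGCC\n>c\nTTAA\n>d\nCCGG\n"
picked = subsample(parse_fasta(fasta), 2)
print(len(picked))  # 2
```

The fixed seed makes the subset reproducible across test runs, which matters when the same fasta is used to build and query the diamond database.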
22 changes: 11 additions & 11 deletions .test/config/metaspades.yaml → .test/config/annotate.yaml
@@ -14,15 +14,15 @@ preprocessing:
# if no preprocessing is done, fastqc will be run on the raw reads
fastqc: False
# trim reads with trimmomatic? (quality and adapter trimming)
trimmomatic: True
trimmomatic: False
# trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
cutadapt: False
# run fastuniq (removes duplicates from paired-end samples)
fastuniq: True
fastuniq: False
# map reads against the phix genome and keep only reads that do not map concordantly
phix_filter: True
phix_filter: False
# run SortMeRNA to identify (and filter) rRNA sequences
sortmerna: True
sortmerna: False

# parameters for trimmomatic
trimmomatic:
@@ -91,7 +91,7 @@ remove_duplicates: True

assembly:
# run Megahit assembler?
megahit: True
megahit: False
# Use Metaspades instead of Megahit for assembly?
metaspades: True

@@ -117,15 +117,15 @@ annotation:
# run tRNAscan-SE?
tRNAscan: False
# run infernal for rRNA identification?
infernal: True
infernal: False
# run eggnog-mapper to infer KEGG orthologs, pathways and modules?
eggnog: False
eggnog: True
# run PFAM-scan to infer protein families from PFAM?
pfam: True
pfam: False
# run Resistance gene identifier?
rgi: False
rgi: True
# run taxonomic annotation of assembled contigs (using tango + sourmash)?
taxonomy: False
taxonomy: True

# params for taxonomic annotation of contigs/orfs
taxonomy:
@@ -262,4 +262,4 @@ metaphlan:
# this sets the number of reads to generate for example input files
# the files are generated based on the config/samples.tsv file and stored
# under examples/data
example_dataset_size: 10000
example_dataset_size: 100000
4 changes: 2 additions & 2 deletions .test/config/metabat.yaml
@@ -16,7 +16,7 @@ preprocessing:
# trim reads with trimmomatic? (quality and adapter trimming)
trimmomatic: False
# trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
cutadapt: True
cutadapt: False
# run fastuniq (removes duplicates from paired-end samples)
fastuniq: False
# map reads against the phix genome and keep only reads that do not map concordantly
@@ -87,7 +87,7 @@ sortmerna:
extra_settings: "--num_alignments 1"

# remove duplicates from bam files?
remove_duplicates: True
remove_duplicates: False

assembly:
# run Megahit assembler?
10 changes: 5 additions & 5 deletions .test/config/taxonomy.yaml
@@ -12,9 +12,9 @@ paths:
preprocessing:
# run fastqc on (preprocessed) input?
# if no preprocessing is done, fastqc will be run on the raw reads
fastqc: True
fastqc: False
# trim reads with trimmomatic? (quality and adapter trimming)
trimmomatic: True
trimmomatic: False
# trim reads with cutadapt? (runs instead of trimmomatic, no quality trimming)
cutadapt: False
# run fastuniq (removes duplicates from paired-end samples)
@@ -87,13 +87,13 @@ sortmerna:
extra_settings: "--num_alignments 1"

# remove duplicates from bam files?
remove_duplicates: True
remove_duplicates: False

assembly:
# run Megahit assembler?
megahit: False
megahit: True
# Use Metaspades instead of Megahit for assembly?
metaspades: True
metaspades: False

megahit:
# maximum threads for megahit
8 changes: 8 additions & 0 deletions .test/scripts/prep_eggnog.sh
@@ -0,0 +1,8 @@
#!/bin/bash

base="https://github.com/eggnogdb/eggnog-mapper/blob/master/tests/fixtures"

mkdir -p resources/eggnog-mapper
curl -L -o resources/eggnog-mapper/eggnog.db "${base}/eggnog.db?raw=true"
curl -L -o resources/eggnog-mapper/eggnog_proteins.dmnd "${base}/eggnog_proteins.dmnd?raw=true"
touch resources/eggnog-mapper/eggnog.version
4 changes: 2 additions & 2 deletions README.md
@@ -5,7 +5,7 @@ A workflow for metagenomic projects
[![Snakemake 5.11.2](https://img.shields.io/badge/snakemake-5.11.2-brightgreen.svg)](https://img.shields.io/badge/snakemake-5.11.2)
![CI](https://github.com/NBISweden/nbis-meta/workflows/CI/badge.svg?branch=master)
![Docker](https://img.shields.io/docker/pulls/nbisweden/nbis-meta)

[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)

## Overview
A [snakemake](http://snakemake.readthedocs.io/en/stable/) workflow for
@@ -19,7 +19,7 @@ You can use this workflow for _e.g._:
- **functional and taxonomic annotation**
- **metagenomic binning**

See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for
See the [Wiki-pages](https://github.com/NBISweden/nbis-meta/wiki) for
instructions on how to run the workflow.

## Installation
4 changes: 2 additions & 2 deletions slurm/slurm_utils.py
@@ -59,7 +59,7 @@ def format(_pattern, _quote_all=False, **kwargs):
# adapted from Job.format_wildcards in snakemake.jobs
def format_wildcards(string, job_properties):
""" Format a string with variables from the job. """

class Job(object):
def __init__(self, job_properties):
for key in job_properties:
@@ -100,7 +100,7 @@ def format_values(dictionary, job_properties):
)
raise WorkflowError(msg, e)
return formatted

def convert_job_properties(job_properties, resource_mapping={}):
options = {}
resources = job_properties.get("resources", {})
2 changes: 1 addition & 1 deletion workflow/.cfg/multiqc_preprocess_config.yaml
@@ -9,4 +9,4 @@ extra_fn_clean_exts:
- '_PHIX'
- '.sortmerna'
- '_R1'
- '_R2'
- '_R2'
2 changes: 1 addition & 1 deletion workflow/envs/annotation.yml
@@ -8,4 +8,4 @@ dependencies:
- pfam_scan=1.6
- eggnog-mapper=2.0.1
- infernal=1.1.2
- trnascan-se=2.0.5
- trnascan-se=2.0.5
2 changes: 1 addition & 1 deletion workflow/envs/barrnap.yml
@@ -4,4 +4,4 @@ channels:
- conda-forge
- defaults
dependencies:
- barrnap=0.9
- barrnap=0.9
2 changes: 1 addition & 1 deletion workflow/envs/centrifuge.yml
@@ -2,4 +2,4 @@ channels:
- bioconda
- defaults
dependencies:
- centrifuge=1.0.4_beta
- centrifuge=1.0.4_beta
2 changes: 1 addition & 1 deletion workflow/envs/checkm.yml
@@ -5,4 +5,4 @@ channels:
- defaults
dependencies:
- python=3.7.6
- checkm-genome=1.1.2
- checkm-genome=1.1.2
2 changes: 1 addition & 1 deletion workflow/envs/concoct.yml
@@ -5,4 +5,4 @@ channels:
dependencies:
- python=3.7.6
- biopython=1.76
- concoct=1.1.0
- concoct=1.1.0
2 changes: 1 addition & 1 deletion workflow/envs/cookiecutter.yml
@@ -2,4 +2,4 @@ name: cookiecutter
channels:
- conda-forge
dependencies:
- cookiecutter
- cookiecutter
9 changes: 9 additions & 0 deletions workflow/envs/edger.yml
@@ -0,0 +1,9 @@
name: edger
channels:
- conda-forge
- bioconda
- r
- defaults
dependencies:
- r-base
- bioconductor-edger=3.30.0
2 changes: 1 addition & 1 deletion workflow/envs/graphlan.yml
@@ -2,4 +2,4 @@ channels:
- bioconda
dependencies:
- graphlan=1.1.3
- export2graphlan=0.20
- export2graphlan=0.20
2 changes: 1 addition & 1 deletion workflow/envs/kraken.yml
@@ -3,4 +3,4 @@ channels:
- conda-forge
- defaults
dependencies:
- kraken2=2.0.8_beta
- kraken2=2.0.8_beta
2 changes: 1 addition & 1 deletion workflow/envs/krona.yml
@@ -3,4 +3,4 @@ channels:
- defaults
dependencies:
- krona=2.7.1
- make=4.2.1
- make=4.2.1
2 changes: 1 addition & 1 deletion workflow/envs/maxbin.yml
@@ -2,4 +2,4 @@ channels:
- bioconda
dependencies:
- maxbin2=2.2.7
- perl-lwp-simple=6.15
- perl-lwp-simple=6.15
2 changes: 1 addition & 1 deletion workflow/envs/megahit.yml
@@ -4,4 +4,4 @@ channels:
- defaults
dependencies:
- python=3.8.1
- megahit=1.2.9
- megahit=1.2.9
2 changes: 1 addition & 1 deletion workflow/envs/metabat.yml
@@ -4,4 +4,4 @@ channels:
- conda-forge
- defaults
dependencies:
- metabat2=2.14
- metabat2=2.14
9 changes: 9 additions & 0 deletions workflow/envs/metagenomeseq.yml
@@ -0,0 +1,9 @@
name: metagenomeseq
channels:
- conda-forge
- bioconda
- r
- defaults
dependencies:
- r-base
- bioconductor-metagenomeseq
2 changes: 1 addition & 1 deletion workflow/envs/metaphlan.yml
@@ -5,4 +5,4 @@ channels:
- defaults
dependencies:
- python=3.7.6
- metaphlan=3.0
- metaphlan=3.0
