Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
executable file 396 lines (354 sloc) 14.2 KB
##
## Snakefile_5Binning - Binning rules
##
## Knutt.org/KnuttReads2Bins
# This Snakefile uses metaBAT2 for binning and checkM for bin analysis.
# Each bin receives classification and depth data.
localrules: checkm_data, filter_cat_proteins, binmap
# Binning results
basedir_binning = config["output_dir"]+"/Binning"
basedir_bench_binning = basedir_bench + "/Binning"
basedir_data_binning = basedir_data + "/Binning"
# Run metabat2
rule metabat:
version: "1.0"
input:
contigs = rules.megahit_assembly.output.contigs,
depth = rules.metabat_depth.output
params:
prefix = basedir_binning + "/METABAT/{sample}/bins/{sample}"
output:
dir = directory(basedir_binning + "/METABAT/{sample}/bins"),
unbinned = basedir_binning + "/METABAT/{sample}/{sample}.unbinned.fa",
lowdepth = basedir_binning + "/METABAT/{sample}/{sample}.lowDepth.fa",
tooshort = basedir_binning + "/METABAT/{sample}/{sample}.tooShort.fa"
log:
basedir_binning + "/METABAT/{sample}/{sample}_metabat2.log"
benchmark:
basedir_bench_binning + "/metabat_{sample}.tsv"
threads:
16
conda:
"envs/KnuttReads2Bins.yml"
message:
"Running METABAT2 for {wildcards.sample}"
shell:
("{{ metabat2 -i {input.contigs} -a {input.depth} "
"-m {config[min_contiglen_binning]} -x {config[min_contigcov]} "
"-s {config[min_binsize]} -t {threads} -v -o {params.prefix} "
"--unbinned --seed 42 && "
"mv {params.prefix}.unbinned.fa {params.prefix}.lowDepth.fa "
"{params.prefix}.tooShort.fa {output.dir}/.. ; }} &> {log}")
# Download checkM data and set data root
rule checkm_data:
version: "1.0"
params:
dir = basedir_dbs + "/CheckM/"
output:
dat = basedir_dbs + "/CheckM/taxon_marker_sets.tsv",
conda:
"envs/KnuttReads2Bins.yml"
message:
"Downloading CheckM data"
shell:
("wget -qO- https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz | "
"tar xzf - -C {params.dir} && checkm data setRoot {params.dir}")
# Run checkM on a sample
rule checkm_sample:
version: "1.0"
input:
rules.checkm_data.output,
indir = rules.metabat.output.dir
params:
datadir = rules.checkm_data.params.dir,
newheader = "bin\tmarker_lineage\tlineage_genomes\tlineage_markers\tlineage_marker_sets\t0_sets\t1_sets\t2_sets\t3_sets\t4_sets\t5_or_more_sets\tcompleteness\tcontamination\tstrain_heterogeneity"
output:
sreport = basedir_data_binning + "/{sample}_checkm.tsv",
outdir = directory(basedir_binning + "/CheckM/{sample}/{sample}_checkm_data/")
log:
basedir_binning + "/CheckM/{sample}/{sample}_checkm_wf.log"
benchmark:
basedir_bench_binning + "/checkm_wf_{sample}.tsv"
threads:
16
conda:
"envs/KnuttReads2Bins.yml"
message:
"Running CheckM lineage workflow for {wildcards.sample}"
shell:
("checkm lineage_wf -f {output.sreport} --tab_table -x fa "
" -t {threads} {input.indir} {output.outdir} &> {log} && "
"sed -i '1c{params.newheader}' {output.sreport}")
# Calculate checkm help files
rule checkm_extra_data:
version: "1.0"
input:
rules.checkm_data.output,
indir = rules.checkm_sample.input.indir,
contigs = rules.metabat.input.contigs,
mapped = rules.map_reads.output.bam,
mappedi = rules.map_reads.output.bai,
params:
datadir = rules.checkm_data.params.dir,
covheader = "contigid\tbin\tlen\tbamfile\tavgcov\treads",
profileheader = "bin\tbinsize_Mbp\treads\treads_perc\tofbinnedpop_perc\tofcommunity_perc"
output:
tetras = basedir_data_binning + "/{sample}_checkm_tetras.tsv",
cov = basedir_data_binning + "/{sample}_checkm_cov.tsv",
profile = basedir_data_binning + "/{sample}_checkm_profile.tsv",
log:
basedir_binning + "/CheckM/{sample}/{sample}_checkm_helperfiles.log"
benchmark:
basedir_bench_binning + "/checkm_extra_{sample}.tsv"
threads:
16
conda:
"envs/KnuttReads2Bins.yml"
message:
"Running CheckM utilities for {wildcards.sample}"
shell:
("{{ checkm tetra -t {threads} {input.contigs} {output.tetras} && "
"checkm coverage -x fa -t {threads} {input.indir} {output.cov} {input.mapped} && "
"checkm profile --tab_table -f {output.profile} {output.cov} ; }} &> {log} && "
"sed -i '1 s/Sequence Id/contigid/' {output.tetras} && sed -i '1c{params.covheader}' {output.cov} && "
"sed -i '1c{params.profileheader}' {output.profile}")
rule sourmash_bins:
version: "1.0"
input:
rules.checkm_sample.input.indir
output:
basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.sig"
log:
basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.log"
benchmark:
basedir_bench_binning + "/sourmash_bins_k{k}_{sample}.tsv"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Calculating sourmash {wildcards.k}-mer hashes for the bins from {wildcards.sample}"
shell:
"sourmash compute -k {wildcards.k} --track-abundance --scaled {config[sourmash_scaled]} --seed 42 -f -o {output} {input}/*.fa &> {log}"
rule sourmash_bins_describe:
version: "1.0"
input:
expand(basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.sig", k=sourmash_readclass_k, allow_missing=True)
output:
basedir_data_binning + "/{sample}_bins_sourmash_signature.tsv"
log:
basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_descr.log"
benchmark:
basedir_bench_binning + "/sourmash_bins_description_{sample}.tsv"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Describing sourmash signature for the bins from {wildcards.sample}"
shell:
"sourmash sig describe --csv {output} {input} &> {log} && sed -i -E 's/(\"([^\"]*)\")?,/\\2\t/g' {output}"
rule sourmash_bins_search:
version: "1.1"
input:
sig = expand(basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.sig", k=sourmash_readclass_k, allow_missing=True)[0],
db = basedir_dbs + "/Sourmash/" + sourmash_lca_name
params:
params = ['sourmash', 'search', '--containment'],
header = 'bin\tcontainment\tname\tfilename\tmd5'
output:
basedir_data_binning + "/{sample}_bins_sourmash_search.tsv"
log:
basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_search.log"
benchmark:
basedir_bench_binning + "/sourmash_bins_search_{sample}.tsv"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Searching the bin signatures from {wildcards.sample}"
script:
"scripts/Pipeline/iterateSourmashSearch.py"
# Run the LCA
rule sourmash_bins_lca:
version: "1.0"
input:
sig = expand(basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.sig", k=sourmash_readclass_k, allow_missing=True),
db = basedir_dbs + "/Sourmash/" + sourmash_lca_name
output:
basedir_data_binning + "/{sample}_bins_sourmash_classification.tsv"
log:
basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_classification.log"
benchmark:
basedir_bench_binning + "/sourmash_classification_{sample}.tsv"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Classifying the bins from {wildcards.sample} with Sourmash"
shell:
"sourmash lca classify --db {input.db} --query {input.sig} > {output} 2> {log} && sed -i -E 's/(\"([^\"]*)\")?,/\\2\t/g' {output}"
rule sourmash_compare_bins:
version: "1.1"
input:
expand(basedir_binning + "/sourmash/{sample}/{sample}_bins_sourmash_k{k}.sig", k=sourmash_readclass_k, sample=sample_names),
wildcard_constraints:
comptype = "|".join(samplecomptypes)
params:
regex = r'^.+/bins/(.+)\.fa$',
k = sourmash_readclass_k,
dir = basedir_data_binning,
typearg = lambda w: "--containment " if w["comptype"]==samplecomptypes[0] else ""
output:
tsv = basedir_data_binning + "/sourmash_bin_comparison_{comptype}.tsv",
np = basedir_data_binning + "/sourmash_bin_comparison_{comptype}",
labels = basedir_data_binning + "/sourmash_bin_comparison_{comptype}.labels.txt",
png = expand(basedir_data_binning + "/sourmash_bin_comparison_{comptype}.{type}.png", type=["dendro","hist","matrix"], allow_missing=True)
log:
basedir_readclass + "/sourmash/sourmash_bin_comparison_{comptype}.log"
benchmark:
basedir_bench_binning + "/sourmash_bin_comparison_{comptype}.tsv"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Comparing the bins using sourmash based on {wildcards.comptype}."
shell:
("{{ sourmash compare {params.typearg}-k {params.k} {input} --csv {output.tsv} --output {output.np} && "
"scripts/Pipeline/sourmashlabels.py {output.labels} '{params.regex}' && "
"sourmash plot --labels --output-dir {params.dir} {output.np} ; }} &> {log} && "
"sed -i -e $(echo 1c$(cat {output.labels} | tr '\n' ',')) {output.tsv} && "
"sed -i -E 's/(\"([^\"]*)\")?,/\\2\t/g' {output.tsv}")
# Filter and add bins names to CAT proteins for BAT
rule filter_cat_proteins:
version: "1.0"
input:
indir = rules.metabat.output.dir,
proteins = rules.classify_contigs.output[2],
aligned = rules.classify_contigs.output[3]
output:
proteins = basedir_binning + "/BAT/{sample}/{sample}.bins.predicted_proteins.faa",
aligned = basedir_binning + "/BAT/{sample}/{sample}.bins.alignment.diamond"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Reformatting CAT results from {wildcards.sample} for BAT"
shell:
("scripts/Pipeline/addBinsCATfileForBAT.py -b {input.indir} -s .fa "
"-p {input.proteins} -d {input.aligned} -P {output.proteins} "
"-D {output.aligned}")
# Classify the bins
rule classify_bins:
version: "1.0"
input:
rules.index_cat_bat.input,
indir = rules.metabat.output.dir,
db = rules.index_cat_bat.output,
proteins = rules.filter_cat_proteins.output.proteins,
aligned = rules.filter_cat_proteins.output.aligned,
params:
taxdir = rules.download_ncbi_tax.params.dir,
prefix = basedir_binning + "/BAT/{sample}/{sample}"
output:
binclass = basedir_binning + "/BAT/{sample}/{sample}.bin2classification.txt",
orflca = basedir_binning + "/BAT/{sample}/{sample}.ORF2LCA.txt"
log:
basedir_binning + "/BAT/{sample}/{sample}_BAT.log"
benchmark:
basedir_bench_binning + "/bat_{sample}.tsv"
threads: # Uses existing data, no DIAMOND call
1
conda:
"envs/KnuttReads2Bins.yml"
message:
"Running BAT on {wildcards.sample}"
shell:
("CAT bins -b {input.indir} -s .fa -d {input.db} -t {params.taxdir} "
"-o {params.prefix} -n {threads} -p {input.proteins} -a {input.aligned} &> {log}")
rule binmap:
version: "1.0"
input:
bindir = rules.metabat.output.dir,
additional = expand(basedir_binning + "/METABAT/{sample}/{sample}.{type}.fa", type=["unbinned", "lowDepth", "tooShort"], allow_missing=True),
assembly = rules.metabat.input.contigs
output:
basedir_data_binning + "/{sample}_binmap.tsv"
conda:
"envs/R.yml"
message:
"Creating bin map for {wildcards.sample}"
script:
"scripts/DataExtraction/binMap.R"
rule formatbat:
version: "1.0"
input:
cat = rules.formatcat.output.dat,
bat = basedir_binning + "/BAT/{sample}/{sample}.bin2classification.named.txt",
binmap = rules.binmap.output
output:
dat = basedir_data_binning + "/{sample}_bat.tsv",
krona = basedir_data_binning + "/{sample}_bat_krona.tsv"
conda:
"envs/R.yml"
message:
"Formatting BAT output from {wildcards.sample}"
script:
"scripts/DataExtraction/formatBATData.R"
# Create binning report
rule binningReport:
version: "1.0"
input:
checkmprofile = expand(basedir_data_binning + "/{sample}_checkm_profile.tsv", sample=sample_names),
checkmlineage = expand(basedir_data_binning + "/{sample}_checkm.tsv", sample=sample_names),
tetras = expand(basedir_data_binning + "/{sample}_checkm_tetras.tsv", sample=sample_names),
checkmcov = expand(basedir_data_binning + "/{sample}_checkm_cov.tsv", sample=sample_names),
sourmashclass = expand(basedir_data_binning + "/{sample}_bins_sourmash_classification.tsv", sample=sample_names),
commons = "scripts/Reports/commonReport.R",
params:
samples = sample_names
output:
basedir_reporting + "/8binning.html"
benchmark:
basedir_bench_binning + "/binning_report.tsv"
conda:
"envs/R.yml"
message:
"Generating binning report"
script:
"scripts/Reports/binning.Rmd"
rule binning:
version: "1.0"
input:
expand(basedir_data_binning + "/{sample}_checkm.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_checkm_profile.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_binmap.tsv", sample=sample_names)
message:
"Ran binning and CheckM"
rule binningSourmash:
version: "1.0"
input:
expand(basedir_data_binning + "/{sample}_bins_sourmash_classification.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_bins_sourmash_signature.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_bins_sourmash_search.tsv", sample=sample_names),
expand(basedir_data_binning + "/sourmash_bin_comparison_{comptype}.tsv", comptype=samplecomptypes)
rule binningRefData:
version: "1.0"
input:
expand(basedir_data_binning + "/{sample}_checkm.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_checkm_profile.tsv", sample=sample_names),
expand(basedir_data_binning + "/{sample}_binmap.tsv", sample=sample_names)
message:
"Ran binning and CheckM"
rule bat:
version: "1.0"
input:
expand(basedir_data_binning + "/{sample}_bat.tsv", sample=sample_names)
message:
"Ran BAT"
rule batKrona:
version: "1.0"
input:
expand(basedir_data_binning + "/{sample}_bat_krona.tsv", sample=sample_names),
params:
pairs = [file + "," + name for file, name in zip(expand(basedir_data_binning + "/{sample}_bat_krona.tsv", sample=sample_names), sample_names)]
output:
basedir_reporting + "/BAT_krona.html"
conda:
"envs/KnuttReads2Bins.yml"
message:
"Creating BAT Krona report"
shell:
"ktImportText -o {output} -n All {params.pairs}"