workflows/autometa.sh

#!/usr/bin/env bash
#SBATCH --partition=queue
#SBATCH -t 48:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=16
#SBATCH --error=autometa.%J.err
#SBATCH --output=autometa.%J.out

# NOTE: To create the conda environment for autometa, run the Makefile target:
# make create_environment

# Then activate the newly created conda env:
# conda activate autometa

# NOTE: To install autometa into the activated conda environment, run the Makefile target:
# make install

# ---- Filepaths (replace every placeholder below with a real path) ----
assembly='Path to metagenome assembly fasta file'
bam='Path to metagenome read alignments.bam'
orfs='Path to orfs used as input to diamond blast'
# Diamond output must be tabular (outfmt 6); BlastP should be done against the NCBI `nr` database.
blast='Path to diamond output file (outfmt 6).'
# For more info see: https://autometa.readthedocs.io/en/latest/databases.html#ncbi
ncbi='Path to NCBI databases directory'
# (Optional) For more info see: https://autometa.readthedocs.io/en/latest/databases.html#genome-taxonomy-database-gtdb
gtdb='Path to GTDB databases directory'

# ---- Autometa parameters ----
# Contig length cutoff, in bp
length_cutoff=3000

# Taxon assignment: choices are "ncbi" or "ncbi_gtdb"
# NOTE: With "ncbi_gtdb", blastP is performed against the GTDB database using the
# kingdom-specific ORFs retrieved from the NCBI taxon-assignment sub-workflow.
taxa_routine='ncbi'

# K-mer counting, normalization and embedding
kmer_size=5
norm_method='am_clr'  # choices: am_clr, clr, ilr
pca_dimensions=50     # NOTE: must be greater than $embed_dimensions
embed_dimensions=2    # NOTE: must be less than $pca_dimensions
embed_method='bhsne'  # choices: bhsne, sksne, umap, densmap, trimap

# Binning (clustering method and MAG quality thresholds)
cluster_method='hdbscan'  # choices: hdbscan, dbscan
completeness=20.0         # Accept MAGs greater than this value
purity=95.0               # Accept MAGs greater than this value
cov_stddev_limit=25.0     # Accept MAGs less than this value
gc_stddev_limit=5.0       # Accept MAGs less than this value

# Runtime
cpus=16
seed=42

# Abort early when the selected taxon-assignment routine is not one of the supported choices
case "$taxa_routine" in
    ncbi | ncbi_gtdb)
        ;; # supported value, nothing to do
    *)
        echo "ERROR: Invalid Taxonomic routine value. Please choose between ncbi or ncbi_gtdb. Current selection: ${taxa_routine}"
        exit 1
        ;;
esac

# Step 0: Define the sample name and output directory used to build output filepaths in later steps
simple_name="TemplateAssemblyName"
outdir="AutometaOutdir"
# Create the output directory only when it is not already present
[[ -d "$outdir" ]] || mkdir -p "$outdir"

######### BEGIN #########


# Step 00: Report autometa version
# Tracing is switched on so the command itself is echoed to stderr before it runs.
set -x
autometa --version
# Switch tracing back off; stderr of the group is discarded so the `set +x` itself is not echoed.
{ set +x; } 2>/dev/null

# Step 1: Filter assembly by length and retrieve contig lengths as well as GC content

# input:
# $assembly --> User input
# $length_cutoff --> User input (in bp)

# output:
filtered_assembly="${outdir}/${simple_name}.filtered.fna"
gc_content="${outdir}/${simple_name}.gc_content.tsv"

# script:
# NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
set -x
autometa-length-filter \
    --assembly "$assembly" \
    --cutoff "$length_cutoff" \
    --output-fasta "$filtered_assembly" \
    --output-gc-content "$gc_content"
{ set +x; } 2>/dev/null

# Step 2: Determine coverages from assembly read alignments

# input:
# NOTE: $bam is defined at top and the rest of the inputs are generated by autometa

# output:
bed="${outdir}/${simple_name}.coverages.bed.tsv"
coverages="${outdir}/${simple_name}.coverages.tsv"

# script:
# NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
set -x
autometa-bedtools-genomecov \
    --ibam "$bam" \
    --bed "$bed" \
    --output "$coverages"
{ set +x; } 2>/dev/null

# Step 3: Annotate and filter markers
# input:
# $orfs --> User input
# $cpus --> User input
# $seed --> User input
kingdoms=(bacteria archaea)

# NOTE: We iterate through both sets of markers for binning both bacterial and archaeal kingdoms
for kingdom in "${kingdoms[@]}"; do
    # kingdom-specific output:
    hmmscan="${outdir}/${simple_name}.${kingdom}.hmmscan.tsv"
    markers="${outdir}/${simple_name}.${kingdom}.markers.tsv"

    # script:
    # NOTE: --cpus follows the user-supplied $cpus (declared as an input of this step)
    # rather than a hard-coded value, so it stays in sync with the requested resources.
    set -x
    autometa-markers \
        --orfs "$orfs" \
        --hmmscan "$hmmscan" \
        --out "$markers" \
        --kingdom "$kingdom" \
        --parallel \
        --cpus "$cpus" \
        --seed "$seed"
    { set +x; } 2>/dev/null
done

# Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits

# input:
# $blast --> User Input
# $ncbi --> User Input
# $dbtype --> Updated according to $taxa_routine
dbtype="ncbi"
# $prefix is prepended to the output filenames of the taxon-assignment sub-workflow
prefix="${simple_name}.${dbtype}"

# output:
lca="${outdir}/${prefix}.orfs.lca.tsv"
sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv"
error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv"

# script:
# NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
set -x
autometa-taxonomy-lca \
    --blast "$blast" \
    --dbdir "$ncbi" \
    --dbtype "$dbtype" \
    --lca-output "$lca" \
    --sseqid2taxid-output "$sseqid_to_taxid" \
    --lca-error-taxids "$error_taxids"
{ set +x; } 2>/dev/null

# Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search

# input:
# $lca --> Generated by step 4.1
# $ncbi --> User Input
# $dbtype --> Updated according to $taxa_routine

# output:
votes="${outdir}/${prefix}.taxids.tsv"

# script:
# NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
set -x
autometa-taxonomy-majority-vote \
    --lca "$lca" \
    --output "$votes" \
    --dbdir "$ncbi" \
    --dbtype "$dbtype"
{ set +x; } 2>/dev/null

# Step 4.3: Split assigned taxonomies into kingdoms

# input:
# $votes --> Generated by step 4.2
# $outdir --> Generated by step 0
# $prefix --> Generated by step 4.1
# $ncbi --> User Input
# $assembly --> User Input
# NOTE(review): step 5.5 passes $filtered_assembly to the same command whereas this step
# passes the unfiltered $assembly -- confirm which one is intended here.

# output:
# Will write recovered superkingdoms to ${outdir}
# e.g. ${outdir}/${prefix}.bacteria.fna
# e.g. ${outdir}/${prefix}.archaea.fna
# e.g. ${outdir}/${prefix}.taxonomy.tsv

# script:
# NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
set -x
autometa-taxonomy \
    --votes "$votes" \
    --output "$outdir" \
    --prefix "$prefix" \
    --split-rank-and-write superkingdom \
    --assembly "$assembly" \
    --dbdir "$ncbi" \
    --dbtype "$dbtype"
{ set +x; } 2>/dev/null

# Step 5: Taxon-assignment using the GTDB database
# NOTE: only performed if `taxa_routine` is 'ncbi_gtdb'

# Step 5.1: Extract bacterial and archaeal ORFs to use as the GTDB blastp query
# input:
# $kingdom_fasta --> Generated by step 4.3
# $orfs --> User Input

# output:
# orf_prefixes --> text file containing metagenome contig IDs classified within NCBI bacteria and archaea
# orf_ids --> text file containing contig ORF IDs classified within NCBI bacteria and archaea
# kingdom_orfs --> fasta file containing metagenome ORFs classified within NCBI bacteria or archaea
# gtdb_input_orfs --> metagenome orfs classified within NCBI bacteria *and* archaea

if [[ "$taxa_routine" == "ncbi_gtdb" ]]
then
    echo "Running GTDB taxon assignment step."
    # output
    gtdb_input_orfs="${outdir}/${prefix}.orfs.faa"
    # Start from an empty file: kingdom ORFs are appended below, so a file left over from
    # a previous run would otherwise lead to duplicated ORFs in the blastp query.
    : > "$gtdb_input_orfs"

    for kingdom in "${kingdoms[@]}"; do

        kingdom_fasta="${outdir}/${prefix}.${kingdom}.fna"

        orf_prefixes="${outdir}/${prefix}.${kingdom}.contigIDs.txt"
        orf_ids="${outdir}/${prefix}.${kingdom}.orfIDs.txt"
        kingdom_orfs="${outdir}/${prefix}.${kingdom}.orfs.faa"

        if [[ ! -f "$kingdom_fasta" ]]
        then
            echo "${kingdom_fasta} does not exist, skipping..."
            continue
        fi

        # Retrieve contig IDs from kingdom fasta file
        # (a trailing "_" is appended to each contig ID to build the ORF ID prefix)
        set -x
        grep ">" "$kingdom_fasta" | \
            sed 's/^>//' | \
            cut -f1 -d" " | \
            sed 's/$/_/' > "$orf_prefixes"
        # Retrieve ORF IDs from contig IDs
        # -F: contig IDs are matched as literal strings, not as regular expressions
        grep -F -f "$orf_prefixes" "$orfs" | cut -f1 -d" " | sed 's/^>//' > "$orf_ids"
        # Retrieve ORF seqs from ORF IDs
        seqkit grep \
            --pattern-file "$orf_ids" \
            --out-file "$kingdom_orfs" \
            "$orfs"
        # Concatenate kingdom ORFs to single file for GTDB blastp
        cat "$kingdom_orfs" >> "$gtdb_input_orfs"
        { set +x; } 2>/dev/null
    done
    # From here on, outputs are written with the GTDB prefix (also used by the steps after step 5)
    dbtype="gtdb"
    prefix="${simple_name}.${dbtype}"

    # Step 5.2: Run blastp
    # input:
    # $gtdb_input_orfs --> Generated from step 5.1
    # gtdb.dmnd is generated using autometa-setup-gtdb (Must be performed prior to using this script)
    # Only the first match is kept so that exactly one database path is passed to diamond.
    gtdb_dmnd_db=$(find "$gtdb" -name "gtdb.dmnd" | head -n 1)
    if [[ -z "$gtdb_dmnd_db" ]]
    then
        echo "ERROR: gtdb.dmnd was not found in ${gtdb}. Please run autometa-setup-gtdb prior to using this script." >&2
        exit 1
    fi
    # output
    blast="${outdir}/${prefix}.blastp.tsv"

    # script
    set -x
    diamond blastp \
        --query "$gtdb_input_orfs" \
        --db "$gtdb_dmnd_db" \
        --evalue 1e-5 \
        --max-target-seqs 200 \
        --threads "$cpus" \
        --outfmt 6 \
        --out "$blast"
    { set +x; } 2>/dev/null

    # Step 5.3: Determine LCA
    # input:
    # $blast --> Generated from step 5.2
    # $gtdb --> User Input

    # output:
    lca="${outdir}/${prefix}.orfs.lca.tsv"
    sseqid_to_taxid="${outdir}/${prefix}.orfs.sseqid2taxid.tsv"
    error_taxids="${outdir}/${prefix}.orfs.errortaxids.tsv"

    # script:
    set -x
    autometa-taxonomy-lca \
        --blast "$blast" \
        --dbdir "$gtdb" \
        --dbtype "$dbtype" \
        --lca-output "$lca" \
        --sseqid2taxid-output "$sseqid_to_taxid" \
        --lca-error-taxids "$error_taxids"
    { set +x; } 2>/dev/null

    # Step 5.4: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search
    # input:
    # $lca --> Generated from step 5.3
    # $gtdb --> User Input

    # output:
    votes="${outdir}/${prefix}.taxids.tsv"

    # script:
    set -x
    autometa-taxonomy-majority-vote \
        --lca "$lca" \
        --output "$votes" \
        --dbdir "$gtdb" \
        --dbtype "$dbtype"
    { set +x; } 2>/dev/null

    # Step 5.5: Split assigned taxonomies into kingdoms
    # input:
    # $votes --> Generated from step 5.4
    # $outdir --> Generated from step 0
    # $prefix --> Updated after step 5.1
    # $filtered_assembly --> Generated from step 1
    # $gtdb --> User Input

    # output:
    # Will write recovered superkingdoms to $outdir
    # e.g. ${outdir}/${prefix}.bacteria.fna
    # e.g. ${outdir}/${prefix}.archaea.fna
    # e.g. ${outdir}/${prefix}.taxonomy.tsv

    # script:
    set -x
    autometa-taxonomy \
        --votes "$votes" \
        --output "$outdir" \
        --prefix "$prefix" \
        --split-rank-and-write superkingdom \
        --assembly "$filtered_assembly" \
        --dbdir "$gtdb" \
        --dbtype "$dbtype"
    { set +x; } 2>/dev/null
fi

# Step 6: Perform k-mer counting on respective kingdoms

# input:
# $kmer_size --> User input
# $norm_method --> User input
# $pca_dimensions --> User input
# $embed_method --> User input
# $embed_dimensions --> User input
# $cpus --> User input
# $seed --> User input

kingdoms=(bacteria archaea)

for kingdom in "${kingdoms[@]}"; do
    # kingdom-specific input:
    fasta="${outdir}/${prefix}.${kingdom}.fna" # NOTE: $prefix is updated according to taxa_routine above
    # kingdom-specific output:
    counts="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.tsv"
    normalized="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.tsv"
    embedded="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv"

    # A kingdom without a fasta file is skipped rather than treated as an error
    if [[ ! -f "$fasta" ]]
    then
        echo "${fasta} does not exist, skipping..."
        continue
    fi
    # script:
    # NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
    set -x
    autometa-kmers \
        --fasta "$fasta" \
        --kmers "$counts" \
        --size "$kmer_size" \
        --norm-output "$normalized" \
        --norm-method "$norm_method" \
        --pca-dimensions "$pca_dimensions" \
        --embedding-output "$embedded" \
        --embedding-method "$embed_method" \
        --embedding-dimensions "$embed_dimensions" \
        --cpus "$cpus" \
        --seed "$seed"
    { set +x; } 2>/dev/null
done

# Step 7: Perform binning on each set of bacterial and archaeal contigs

# input:
# $cpus --> User input
# $seed --> User input
# $gc_content --> Generated by step 1
# $coverages --> Generated by step 2
taxonomy="${outdir}/${prefix}.taxonomy.tsv" # NOTE: $prefix is updated according to taxa_routine above
# $taxonomy is generated by either steps 4.3 or 5.5 depending on whether taxa_routine is 'ncbi' or 'ncbi_gtdb', respectively

kingdoms=(bacteria archaea)

for kingdom in "${kingdoms[@]}"; do
    # kingdom-specific input:
    kmers="${outdir}/${prefix}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 6
    markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3 (before taxon-assignment sub-workflows)

    # kingdom-specific output:
    output_binning="${outdir}/${prefix}.${kingdom}.${cluster_method}.tsv"
    output_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv"

    # Do not redo the binning when a non-empty result file is already present
    if [[ -f "$output_main" && -s "$output_main" ]]; then
        echo "$(basename "$output_main") already exists. continuing..."
        continue
    fi

    # A kingdom without embedded k-mers (see step 6) is skipped
    if [[ ! -f "$kmers" ]]; then
        echo "${kingdom} file not found: kmers: ${kmers}, skipping."
        continue
    fi

    # script:
    # NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
    set -x
    autometa-binning \
        --kmers "$kmers" \
        --coverages "$coverages" \
        --gc-content "$gc_content" \
        --markers "$markers" \
        --output-binning "$output_binning" \
        --output-main "$output_main" \
        --clustering-method "$cluster_method" \
        --completeness "$completeness" \
        --purity "$purity" \
        --cov-stddev-limit "$cov_stddev_limit" \
        --gc-stddev-limit "$gc_stddev_limit" \
        --taxonomy "$taxonomy" \
        --starting-rank superkingdom \
        --rank-filter superkingdom \
        --rank-name-filter "$kingdom"
    { set +x; } 2>/dev/null
done

# Step 8: Create binning summary files

# input:
# $ncbi -> User input
# $gtdb -> User input
# $assembly -> User input
# $dbtype, $prefix -> Updated according to taxa_routine above

kingdoms=(bacteria archaea)

# The database directory only depends on taxa_routine, so it is selected once for all kingdoms
if [[ "$taxa_routine" == "ncbi_gtdb" ]]
then
    dbdir=$gtdb
else
    dbdir=$ncbi
fi

for kingdom in "${kingdoms[@]}"; do

    # kingdom-specific input:
    binning_main="${outdir}/${prefix}.${kingdom}.${cluster_method}.main.tsv" # Generated by step 7
    markers="${outdir}/${simple_name}.${kingdom}.markers.tsv" # Generated by step 3

    # kingdom-specific output:
    output_stats="${outdir}/${prefix}_${kingdom}_metabin_stats.tsv"
    output_taxonomy="${outdir}/${prefix}_${kingdom}_metabin_taxonomy.tsv"
    output_metabins="${outdir}/${prefix}_${kingdom}_metabins"

    # A kingdom without binning results (see step 7) is skipped
    if [[ ! -f "$binning_main" ]]
    then
        echo "${binning_main} does not exist, skipping..."
        continue
    fi

    # script:
    # NOTE: Expansions are quoted so that filepaths containing whitespace are passed as single arguments
    set -x
    autometa-binning-summary \
        --binning-main "$binning_main" \
        --markers "$markers" \
        --metagenome "$assembly" \
        --dbdir "$dbdir" \
        --dbtype "$dbtype" \
        --output-stats "$output_stats" \
        --output-taxonomy "$output_taxonomy" \
        --output-metabins "$output_metabins"
    { set +x; } 2>/dev/null

done

#########  END  #########