#!/usr/bin/bash
#SBATCH --nodes=1
#SBATCH --time=2-2:25:00
#SBATCH --job-name=autometa
#SBATCH --partition=defq
#SBATCH -o dlt_gtdbtk_%j.out
#SBATCH -e dlt_gtdbtk_%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=v.berriosfarias@gmail.com
#
# Autometa metagenome binning workflow (length filter -> coverage -> markers ->
# taxonomy -> k-mer embedding -> binning -> summary) for the SLF cluster2 coassembly.

# Fail fast: abort on unhandled command failure, unset variables, and pipeline errors,
# so a failed early step does not let later steps run on missing inputs.
set -euo pipefail

module load miniconda/miniconda3
source activate /GWSPH/home/ecastron/miniconda3/envs/autometa
# NOTE: To create the conda environment for autometa you can supply the Makefile command:
# make create_environment
# Now activate the created conda env
# conda activate autometa
# NOTE: To install autometa in the created conda environment you can supply the Makefile command:
# make install

# Filepaths (user inputs)
assembly="/lustre/groups/cbi/Users/ecastron/valentin/sea_lions/filtered/no_host/cluster2/coassembly/final.contigs.fa"
bam="/lustre/groups/cbi/Users/ecastron/valentin/sea_lions/filtered/no_host/cluster2/bam_files/sorted/merged/merged_sorted.bam"
# NOTE(review): "metagneome" looks like a typo for "metagenome" — confirm this matches
# the actual filename on disk before renaming anything.
orfs="/lustre/groups/cbi/Users/ecastron/valentin/sea_lions/filtered/no_host/cluster2/autometa/metagneome.orfs.faa"
blast="/lustre/groups/cbi/Users/ecastron/valentin/sea_lions/filtered/no_host/cluster2/autometa/blastp.tsv"
ncbi="/GWSPH/home/ecastron/Databases/autometa_db"

# Autometa Parameters
length_cutoff=2500 # in bp
# kmer counting, normalization transform and embedding method
kmer_size=5
norm_method="am_clr"   # choices: am_clr, clr, ilr
pca_dimensions=50      # NOTE: must be greater than $embed_dimensions
embed_dimensions=2     # NOTE: must be less than $pca_dimensions
embed_method="bhsne"   # choices: bhsne, sksne, umap, densne, trimap
# Binning clustering method
cluster_method="hdbscan" # choices: hdbscan, dbscan
# Binning metrics cutoffs
completeness=20.0
purity=95.0
cov_stddev_limit=25.0
gc_stddev_limit=5.0
cpus=32
seed=42

# Step 0: Do some Path handling with the provided `assembly` filepath
simpleName="SLF_CL2"
outdir="/lustre/groups/cbi/Users/ecastron/valentin/sea_lions/filtered/no_host/cluster2/autometa_output"
# mkdir -p is idempotent, so no existence check is needed.
mkdir -p "$outdir"

######### BEGIN #########

# Step 00: Report autometa version
autometa --version

# Step 1: filter assembly by length and retrieve contig lengths as well as GC content
# input:
#   $assembly       --> User input
#   $length_cutoff  --> User input
# output:
filtered_assembly="${outdir}/${simpleName}.filtered.fna"
gc_content="${outdir}/${simpleName}.gc_content.tsv"
# script:
autometa-length-filter \
  --assembly "$assembly" \
  --cutoff "$length_cutoff" \
  --output-fasta "$filtered_assembly" \
  --output-gc-content "$gc_content"

# Step 2: Determine coverages from assembly read alignments
# input:
#   NOTE: $bam is defined at top and the rest of the inputs are generated by autometa
# output:
bed="${outdir}/${simpleName}.coverages.bed.tsv"
coverages="${outdir}/${simpleName}.coverages.tsv"
# script:
autometa-bedtools-genomecov --ibam "$bam" --bed "$bed" --output "$coverages"

# Step 3: Annotate and filter markers
# input:
#   $orfs --> User input
#   $cpus --> User input
#   $seed --> User input
kingdoms=(bacteria archaea)
# NOTE: We iterate through both sets of markers for binning both bacterial and archeal kingdoms
for kingdom in "${kingdoms[@]}"; do
  # kingdom-specific output:
  hmmscan="${outdir}/${simpleName}.${kingdom}.hmmscan.tsv"
  markers="${outdir}/${simpleName}.${kingdom}.markers.tsv"
  # script:
  # NOTE: --cpus now honors the user-supplied $cpus (was hardcoded to 4, contradicting
  # the declared inputs above).
  autometa-markers \
    --orfs "$orfs" \
    --hmmscan "$hmmscan" \
    --out "$markers" \
    --kingdom "$kingdom" \
    --parallel \
    --cpus "$cpus" \
    --seed "$seed"
done

# Step 4.1: Determine ORF lowest common ancestor (LCA) amongst top hits
# input:
#   $blast --> User Input
#   $ncbi  --> User Input
# output:
lca="${outdir}/${simpleName}.orfs.lca.tsv"
sseqid2taxid="${outdir}/${simpleName}.orfs.sseqid2taxid.tsv"
errorTaxids="${outdir}/${simpleName}.orfs.errortaxids.tsv"
# script:
autometa-taxonomy-lca \
  --blast "$blast" \
  --dbdir "$ncbi" \
  --lca-output "$lca" \
  --sseqid2taxid-output "$sseqid2taxid" \
  --lca-error-taxids "$errorTaxids"

# Step 4.2: Perform Modified Majority vote of ORF LCAs for all contigs that returned hits in blast search
# input:
#   $lca  --> Generated by step 4.1
#   $ncbi --> User Input
# output:
votes="${outdir}/${simpleName}.taxids.tsv"
# script:
autometa-taxonomy-majority-vote --lca "$lca" --output "$votes" --dbdir "$ncbi"

# Step 4.3: Split assigned taxonomies into kingdoms
# input:
#   $votes    --> Generated by step 4.2
#   $outdir   --> Generated by step 0
#   $ncbi     --> User Input
#   $assembly --> User Input
# output:
#   Will write recovered superkingdoms to ${outdir}
#   e.g. ${outdir}/${simpleName}.bacteria.fna
#   e.g. ${outdir}/${simpleName}.archaea.fna
#   e.g. ${outdir}/${simpleName}.taxonomy.tsv
# script:
autometa-taxonomy \
  --votes "$votes" \
  --output "$outdir" \
  --prefix "$simpleName" \
  --split-rank-and-write superkingdom \
  --assembly "$assembly" \
  --ncbi "$ncbi"

# Step 5: Perform k-mer counting on respective kingdoms
# input:
#   $kmer_size        --> User input
#   $norm_method      --> User input
#   $embed_method     --> User input
#   $embed_dimensions --> User input
#   $cpus             --> User input
#   $seed             --> User input
kingdoms=(bacteria archaea)
for kingdom in "${kingdoms[@]}"; do
  # kingdom-specific input:
  # NOTE: $fasta --> Generated by step 4.3
  fasta="${outdir}/${simpleName}.${kingdom}.fna"
  # kingdom-specific output:
  counts="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.tsv"
  normalized="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.tsv"
  embedded="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv"
  # Step 4.3 only writes the kingdoms it recovered; skip absent ones.
  if [[ ! -f "$fasta" ]]; then
    echo "${fasta} does not exist, skipping..."
    continue
  fi
  # script:
  autometa-kmers \
    --fasta "$fasta" \
    --kmers "$counts" \
    --size "$kmer_size" \
    --norm-output "$normalized" \
    --norm-method "$norm_method" \
    --pca-dimensions "$pca_dimensions" \
    --embedding-output "$embedded" \
    --embedding-method "$embed_method" \
    --embedding-dimensions "$embed_dimensions" \
    --cpus "$cpus" \
    --seed "$seed"
done

# Step 6: Perform binning on each set of bacterial and archaeal contigs
# input:
#   $cpus --> User input
#   $seed --> User input
taxonomy="${outdir}/${simpleName}.taxonomy.tsv" # Generated by step 4.3
#   $gc_content --> Generated by step 1
kingdoms=(bacteria archaea)
for kingdom in "${kingdoms[@]}"; do
  # kingdom-specific input:
  kmers="${outdir}/${simpleName}.${kingdom}.${kmer_size}mers.${norm_method}.${embed_method}.tsv" # Generated by step 5
  markers="${outdir}/${simpleName}.${kingdom}.markers.tsv"                                       # Generated by step 3
  # kingdom-specific output:
  output_binning="${outdir}/${simpleName}.${kingdom}.${cluster_method}.tsv"
  output_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv"
  # Resume support: skip kingdoms whose main binning output already exists (non-empty).
  if [[ -f "$output_main" && -s "$output_main" ]]; then
    echo "$(basename "$output_main") already exists. continuing..."
    continue
  fi
  # script:
  autometa-binning \
    --kmers "$kmers" \
    --coverages "$coverages" \
    --gc-content "$gc_content" \
    --markers "$markers" \
    --output-binning "$output_binning" \
    --output-main "$output_main" \
    --clustering-method "$cluster_method" \
    --completeness "$completeness" \
    --purity "$purity" \
    --cov-stddev-limit "$cov_stddev_limit" \
    --gc-stddev-limit "$gc_stddev_limit" \
    --taxonomy "$taxonomy" \
    --starting-rank superkingdom \
    --rank-filter superkingdom \
    --rank-name-filter "$kingdom"
done

# Step 7: Create binning summary files
# input:
#   $ncbi     -> User input
#   $assembly -> User input
kingdoms=(bacteria archaea)
for kingdom in "${kingdoms[@]}"; do
  # kingdom-specific input:
  binning_main="${outdir}/${simpleName}.${kingdom}.${cluster_method}.main.tsv" # Generated by step 6
  markers="${outdir}/${simpleName}.${kingdom}.markers.tsv"                     # Generated by step 3
  # kingdom-specific output:
  output_stats="${outdir}/${simpleName}_${kingdom}_metabin_stats.tsv"
  output_taxonomy="${outdir}/${simpleName}_${kingdom}_metabin_taxonomy.tsv"
  output_metabins="${outdir}/${simpleName}_${kingdom}_metabins"
  if [[ ! -f "$binning_main" ]]; then
    echo "${binning_main} does not exist, skipping..."
    continue
  fi
  # script:
  autometa-binning-summary \
    --binning-main "$binning_main" \
    --markers "$markers" \
    --metagenome "$assembly" \
    --ncbi "$ncbi" \
    --output-stats "$output_stats" \
    --output-taxonomy "$output_taxonomy" \
    --output-metabins "$output_metabins"
done

######### END #########