This part of the pipeline extracts the core gene sequences from the pangenome sequence collection and then employs these as marker genes to construct a bootstrapped core genome phylogeny for the entire genome set. Finally, it splits this phylogeny into subtrees for each rRNA cluster.

### Checking dependencies

In [None]:
# Enter the phylophlan conda environment, print the installed PhyloPhlAn
# version as a sanity check, then leave the environment again.
conda activate phylophlan
phylophlan --version
conda deactivate

### Paths and parameters

#### Pipeline input folders

In [None]:
# Pangenome folder; the merged pangenome is its "merge" subfolder.
pangenomes="./05-pangenomes"
merged_pangenome="${pangenomes}/merge"
matrix="${merged_pangenome}/matrix.csv"
prot_fams="${merged_pangenome}/all_protein_families.faa"

# Genome files (copied into the PhyloPhlAn input folder further down).
genomes="./02-QC/data/genomes"

# Index folder; the group1/group4/group14a/group14b entries are read from here
# when the tree is split into subtrees.
indices="./02-QC/indices"

# Metadata location handed to the tree annotation script.
metadata="./genomes_metadata"

#### Pipeline output folders

In [None]:
# Everything this notebook writes goes below task_root.
task_root="./08-core-phylogeny"
markers="${task_root}/core_gene_markers"
input="${task_root}/input"
output="${task_root}/output"
subtrees="${task_root}/subtrees"
roary_plots="${task_root}/roary_plots"

# Create the whole output layout up front (-p: no error if it already exists).
mkdir -p \
    "$task_root" \
    "$markers" \
    "$input" \
    "$output" \
    "$subtrees" \
    "$roary_plots"

#### Tool pointers and parameters

In [None]:
# Helper scripts and the outgroup genome all live in the same utils folder.
utils_dir="./utils"
extract_core_genes="${utils_dir}/extract_core_genes.py"
extract_tree="${utils_dir}/extract_tree.R"
annotate_tree="${utils_dir}/annotateContrees.py"
roary_plotter="${utils_dir}/roary_plots.py"
remove_outgroup="${utils_dir}/remove_outgroup.R"
outgroup="${utils_dir}/Bsubtilis_outgroup.fna"

# First argument of extract_core_genes.py.
# NOTE(review): presumably the minimum fraction of genomes a gene family must
# occur in to count as "core" - confirm against the script.
core_threshold=0.99

### Extract core genes

In [None]:
# Run the core gene extraction helper with the threshold, the merged pangenome
# folder, and the marker folder as its three positional arguments.
python "$extract_core_genes" \
    "$core_threshold" \
    "$merged_pangenome" \
    "$markers"

### Build the marker database

In [None]:
# Switch to the phylophlan environment for the PhyloPhlAn commands that follow.
conda activate phylophlan

In [None]:
# Turn the extracted core gene sequences into a PhyloPhlAn marker database
# named "core_gene_markers" and keep a copy of the console output as a log.
mkdir -p "$markers/core_gene_markers"
phylophlan_setup_database \
    -i "$markers/core_gene_seqs.faa" \
    -o "$markers/core_gene_markers" \
    -d core_gene_markers \
    -t a \
    | tee "$task_root/build_database.log"

### Build the phylogeny

#### Getting config and input files ready

In [None]:
# Let PhyloPhlAn generate its configuration file: diamond for the database and
# mapping steps, mafft for alignment, trimal for trimming, iqtree for the tree.
# The console output is duplicated into config.log.
phylophlan_write_config_file \
    -d a \
    -o "$output/config.conf" \
    --db_aa diamond \
    --map_dna diamond \
    --map_aa diamond \
    --msa mafft \
    --trim trimal \
    --tree1 iqtree \
    --verbose \
    | tee "$output/config.log"

Manually rewrite the `[tree1]` section of the generated config file so that IQ-TREE is asked for 10,000 bootstrap replicates (`-B 10000`).

In [None]:
# Replace the tail of config.conf with a hand-written [tree1] section.
#   * `cut -z -d '[' -f -6` treats the whole file as a single NUL-terminated
#     record, splits it on '[' and keeps fields 1-6, i.e. everything before the
#     sixth '['; `tr -d '\0'` strips the NUL terminator that -z appends.
#   * The echoed text (read by the outer `cat` as "-") is appended after that
#     prefix and the result is written to a temporary _config.conf.
# Assumes [tree1] is the sixth bracketed section of the generated file and that
# no other '[' occurs before it - TODO confirm if the config options change.
# $(which iqtree) is expanded when this cell runs.
echo \
"[tree1]
program_name = $(which iqtree)
params = -quiet -nt AUTO -m LG -B 10000
input = -s
output = -pre
command_line = #program_name# #params# #input# #output#
" \
| cat <(cat $output/config.conf | cut -z -d '[' -f -6 | tr -d '\0') - > $output/_config.conf
# Swap the patched file in place of the generated one.
mv $output/_config.conf $output/config.conf

Copy the taxa as well as the outgroup to the input folder

In [None]:
# Stage every genome plus the outgroup sequence in the PhyloPhlAn input folder.
mkdir -p "$input"
cp "$genomes"/* "$outgroup" "$input"/

#### Build!

In [None]:
# Build the phylogeny from the staged genomes using the custom marker database
# (looked up by name inside $markers) and the patched config; the console
# output is duplicated into run.log.
phylophlan \
    -i "$input" \
    -d core_gene_markers \
    -f "$output/config.conf" \
    -t a \
    --diversity medium \
    -o "$output" \
    --databases_folder "$markers" \
    --nproc 20 \
    --verbose \
    | tee "$output/run.log"

In [None]:
# Leave the phylophlan environment now that the PhyloPhlAn run is done.
conda deactivate

In [None]:
# Delete the tmp folder inside the output directory.
# ${output:?...} aborts the command when `output` is unset or empty (e.g. this
# cell is run on a fresh kernel before the paths cell); without the guard the
# path would collapse to /tmp and rm -rf would wipe the system temp directory.
rm -rf "${output:?output folder is not set - run the paths cell first}/tmp"

### Remove outgroup

In [None]:
# Give the consensus tree a shorter name before post-processing it.
mv "$output/input.tre.contree" "$output/input.tree"

# Outgroup label = outgroup file name without folder and .fna extension.
outgroup_name=$(basename "$outgroup" .fna)

# Arguments: tree file, output folder, outgroup label.
# NOTE(review): the next section reads $output/merge.contree, so the script
# presumably writes that file - confirm in remove_outgroup.R.
Rscript "$remove_outgroup" "$output/input.tree" "$output" "$outgroup_name"

### Split the tree into rRNA-based subtrees

In [None]:
# Keep a copy of the full tree next to the subtrees.
cp "$output/merge.contree" "$subtrees/merge.contree"

# Extract one subtree per group: extract_tree.R receives the full tree, the
# index entry of the group, and the path of the subtree file to write.
for group in group1 group4 group14a group14b
do
    Rscript "$extract_tree" "$output/merge.contree" "$indices/$group" "$subtrees/$group.contree"
done

### Annotate subtrees

Replacing the RefSeq accession IDs with the taxa's names.

In [None]:
# Run the annotation script once per tree file in the subtrees folder.
# A glob on the literal ".contree" suffix replaces `dir | grep .contree`, whose
# unescaped, unanchored pattern also matched any name merely containing
# "<any char>contree". The glob is expanded once, before the loop starts, so
# files created by the script during the loop are not picked up.
for tree in "$subtrees"/*.contree
do
    # An unmatched glob stays literal; skip it instead of calling the script.
    [ -e "$tree" ] || continue
    python "$annotate_tree" "$tree" "$metadata" "contree"
done

### Roary plots

In [None]:
# roary_plotting NAME
#
# Runs the roary plotting script for one pangenome and files the resulting
# pangenome_*.pdf plots under $roary_plots/NAME.
#
#   NAME  base name shared by the subtree ($subtrees/NAME.contree) and the
#         pangenome folder ($pangenomes/NAME/matrix.csv).
#
# Returns non-zero without moving anything when NAME is missing, an input file
# does not exist, or the plotting script fails - otherwise PDFs left in the
# working directory by an earlier run could be filed under the wrong pangenome.
function roary_plotting () {
    local name="$1"
    if [ -z "$name" ]; then
        echo "roary_plotting: missing pangenome name" >&2
        return 1
    fi

    local tree="$subtrees/$name.contree"
    local matrix_csv="$pangenomes/$name/matrix.csv"
    if [ ! -f "$tree" ] || [ ! -f "$matrix_csv" ]; then
        echo "roary_plotting: skipping '$name' (need $tree and $matrix_csv)" >&2
        return 1
    fi

    # The plotter writes its PDFs into the current working directory.
    "$roary_plotter" "$tree" "$matrix_csv" || return
    mkdir -p "$roary_plots/$name"
    mv pangenome_*.pdf "$roary_plots/$name"
}

In [None]:
# Plot every pangenome folder. Globbing on "*/" visits sub-directories only and
# is safe for unusual names, unlike word-splitting the output of `dir -1`,
# which also iterated over plain files.
for pangenome_dir in "$pangenomes"/*/
do
    # An unmatched glob stays literal; skip it.
    [ -d "$pangenome_dir" ] || continue
    roary_plotting "$(basename "$pangenome_dir")"
done