This part of the pipeline carries out a fastANI analysis and immediately processes the results into a clustermap. It also collects the pairs of assemblies whose ANI similarity is at or above the recommended threshold of 95% used to delineate species.

### Checking dependencies

In [None]:
# Print the version of the fastANI executable found on the shell's PATH.
!fastANI --version

### Paths and parameters

#### Pipeline input folders

In [None]:
# Folder handed to the fastANI runner script as its first argument.
genomes = "./02-QC/data/genomes"
# Tab-delimited table read later with pd.read_table; it must provide a
# 'Genome_accession' column, which becomes the index of the group lookup.
metadata = "./genomes_metadata"

#### Pipeline output folders

In [None]:
task_root = "./03-fastANI"

!mkdir -p $task_root

#### Tool pointers and parameters

In [None]:
# Shell script that is run with bash, receiving the genomes folder and the
# task output folder as its two arguments.
runner_script = "./utils/run_fastANI.sh"

### Run the fastANI analysis

In [None]:
!bash $runner_script $genomes $task_root

### Process the results

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Raw fastANI result table, located directly inside the task folder.
fastani_file = f"{task_root}/out"

In [None]:
# ANI assigned to genome pairs that have no row in the fastANI output.
# NOTE(review): fastANI's README says pairs far below ~80 % ANI are not
# reported; 70 acts as a "very dissimilar" placeholder — confirm it suits the data.
ANI_FILL_VALUE = 70

# Only the first three columns of the header-less, tab-separated table are used.
fastani_long = pd.read_table(
    fastani_file,
    usecols=[0, 1, 2],
    header=None,
    names=['Genome 1', 'Genome 2', 'ANI'],
)

# Reduce every path to its file name and remove the final extension, whatever
# its length (".fna", ".fa", ".fasta", ...), instead of cutting a fixed number
# of characters.
for genome_col in ('Genome 1', 'Genome 2'):
    fastani_long[genome_col] = (
        fastani_long[genome_col]
        .str.rsplit('/', n=1).str[-1]
        .str.replace(r'\.[^.]*$', '', regex=True)
    )

# Long -> wide: one row per 'Genome 1', one column per 'Genome 2'.
fastani_wide = fastani_long.pivot(index='Genome 1', columns='Genome 2', values='ANI')

# Force the same genomes onto both axes before filling. Otherwise a genome that
# only ever appears on one side of the table would turn into NaN rows/columns
# when the matrix is added to its transpose.
all_genomes = fastani_wide.index.union(fastani_wide.columns)
fastani_wide = (
    fastani_wide
    .reindex(index=all_genomes, columns=all_genomes)
    .fillna(ANI_FILL_VALUE)
)

# Average the two directions of each comparison so the matrix is symmetric.
# The axis names are restored because later cells derive column headers from them.
fastani = ((fastani_wide + fastani_wide.T) / 2).rename_axis(
    index='Genome 1', columns='Genome 2'
)

In [None]:
# Display the symmetrised genome-by-genome ANI matrix.
fastani

In [None]:
# Store the symmetrised ANI matrix as a tab-separated file in the task folder.
processed_path = f"{task_root}/processed_out.tsv"
fastani.to_csv(processed_path, sep="\t")

### Making ANI clustermap

In [None]:
# Matplotlib single-letter colour code used for each group label.
colour_by_group = {'1': 'r', '4': 'b', '14a': 'g', '14b': 'k'}

# Second and third columns of the metadata table, collapsed into a Series that
# is indexed by genome accession.
groups = (
    pd.read_table(metadata, usecols=[1, 2])
    .set_index('Genome_accession')
    .squeeze()
)

# One colour per genome; labels missing from the lookup map to NaN. The name is
# blanked — presumably so the colour strip in the clustermap carries no label.
group_colours = groups.map(colour_by_group).rename("")

In [None]:
# Display the colour assigned to each genome accession.
group_colours

In [None]:
# Disabled global font scaling, kept for reference:
#sns.set(font_scale=1)

# Reversed "magma" palette as a continuous colour map.
ani_cmap = sns.color_palette('magma_r', as_cmap=True)

# Hierarchically clustered heatmap of the ANI matrix with a per-genome colour
# strip on the rows. Note that `fig` is the seaborn grid object (it exposes
# ax_cbar / ax_heatmap), not a bare matplotlib Figure.
fig = sns.clustermap(
    fastani,
    row_colors=group_colours,
    cmap=ani_cmap,
    dendrogram_ratio=0.15,
    figsize=(8, 8),
    cbar_kws={'orientation': 'horizontal'},
    cbar_pos=(0.01, 0.89, 0.16, 0.05),
    xticklabels=False,
    yticklabels=False,
)
fig.ax_cbar.set_title('ANI (%)')

# Remove the axis labels from the heatmap panel.
ax = fig.ax_heatmap
ax.set(xlabel=None, ylabel=None)

# Disabled manual annotations of the ANI groups (coordinates are in heatmap cells):
# ax.text(65,65, "ANI group 1", fontsize = 14, ha = "center", va = "center")
# ax.text(435,435, "ANI group 2", fontsize = 14, ha = "center", va = "center")
# ax.text(340,340, "ANI group 3", fontsize = 14, ha = "center", va = "center")
# ax.text(215,215, "ANI group 4", fontsize = 14, ha = "center", va = "center")

# Save the current figure as SVG in the task folder, then render it inline.
plt.savefig(task_root + "/ANI_heatmap.svg")
plt.show()

### Super-threshold ANIs

In [None]:
# Boolean mask selecting the strict upper triangle (k=1 excludes the diagonal),
# so every unordered genome pair is listed once and self-comparisons are skipped.
upper_triangle = np.triu(np.ones(fastani.shape, dtype=bool), k=1)

# Wide -> long. The masked cells become NaN and are dropped explicitly:
# the legacy stack() removed them as a side effect, but the new implementation
# (future_stack=True, the default from pandas 3) keeps them.
fastani_molten = (
    fastani.where(upper_triangle)
    .stack()
    .dropna()
    .reset_index()
    .rename(columns={0: 'ANI'})
)
fastani_molten

In [None]:
# Genome pairs at or above the 95 % threshold, leaving out pairs whose ANI is
# exactly 100 (identical genomes).
ani_values = fastani_molten['ANI']
within_species_range = ani_values.ge(95) & ani_values.lt(100)
super_anis = fastani_molten[within_species_range]

In [None]:
# Write the super-threshold pairs as a tab-separated table without the row
# index, then display them.
super_threshold_path = f"{task_root}/super_threshold"
super_anis.to_csv(super_threshold_path, sep="\t", index=False)
super_anis