This part of the pipeline processes the raw output generated by the MicrobeAnnotator tool and draws a clustermap of KEGG module completeness across the genomes.

### Paths and parameters

#### Pipeline input folders

In [None]:
# Folder containing the raw output tables that this notebook parses.
output_folder = "./04-KEGGCompleteness/output"
# Path to the tab-separated genome metadata table (loaded with pandas further down).
metadata = "./genomes_metadata"

#### Pipeline output folders

In [None]:
# Root folder of this pipeline step; the versatile / non-versatile ID lists are written here.
task_root = "./04-KEGGCompleteness"
# Folder receiving the figures and tables produced by this notebook.
results_folder = "./04-KEGGCompleteness/processed_output"

#### Tool pointers and parameters

In [None]:
# A module is plotted only if its highest completeness (in %) over all genomes is at least this value.
inclusion_threshold = 50

In [None]:
# Colour palette (hex codes) addressed by one-letter keys.
my_color_palette = {
    'b': '#4477aa',
    'g': '#228833',
    'r': '#ee6677',
    'k': '#ccbb44',
}


def cp(x):
    """Return the hex colour stored in ``my_color_palette`` under key *x*.

    Raises ``KeyError`` if *x* is not a key of the palette.
    """
    return my_color_palette[x]

### Processing results

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as ppt
import seaborn as sns
import os
import pandas as pd
from colorcet import glasbey
from scipy.cluster import hierarchy

In [None]:
# Create the results folder (including missing parents); no error is raised if it already exists.
os.makedirs(results_folder, exist_ok=True)

#### Parsing output

In [None]:
# Load the tab-separated module completeness summary from the tool's output folder.
completeness = pd.read_csv(
    output_folder + '/metabolic_summary__module_completeness.tab',
    sep='\t',
)

In [None]:
# Keep only the KEGG modules whose highest completeness over all genomes reaches
# `inclusion_threshold` (in %).
# Column labels are first cut down to their first two dot-separated parts.
completeness.columns = ['.'.join(label.split('.')[:2]) for label in completeness.columns]
completeness['maximum_presence'] = completeness.max(axis=1, numeric_only=True)
passes_threshold = completeness['maximum_presence'] >= inclusion_threshold
completeness_toplot = completeness.loc[passes_threshold].drop(columns=['maximum_presence'])
completeness_toplot

#### Load cluster annotations

In [None]:
# Read the genome metadata table, keeping the columns at positions 0, 1, 2, 4 and 5.
# NOTE: `metadata` is rebound here from the file path to the loaded DataFrame, so this
# cell cannot be re-run without first re-running the cell that defines the path.
metadata_columns = ['name', 'assemblyID', 'cluster', 'size', 'no. genes']
metadata = pd.read_csv(metadata, sep='\t', usecols=[0, 1, 2, 4, 5])
metadata.columns = metadata_columns
# Two-column view used to build the assembly -> cluster lookup.
cluster_annotations_0 = metadata.loc[:, ['assemblyID', 'cluster']]

In [None]:
# Build a Series named "Cluster" that maps each assembly ID to its cluster label.
# Going through a dict means a repeated assembly ID keeps only its last cluster value.
assembly_to_cluster = dict(zip(cluster_annotations_0['assemblyID'].tolist(),
                               cluster_annotations_0['cluster'].tolist()))
cluster_annotations = pd.Series(assembly_to_cluster, name="Cluster")
cluster_annotations

#### Defining colour groups for rRNA cluster metadata

In [None]:
# Columns whose label contains 'GC' are taken to be the genome assembly columns.
assembly_IDs = [column for column in completeness.columns if 'GC' in column]

In [None]:
# Cluster label of every assembly column, in column order.
clusters = cluster_annotations.loc[assembly_IDs]

# Colour assigned to each cluster label; labels missing from this mapping become NaN.
cluster_to_colour = {
    '1': cp('r'),
    '4': cp('b'),
    '14a': cp('g'),
    '14b': cp('k'),
}
groupColour = clusters.map(cluster_to_colour)
# An empty name keeps the colour strip in the clustermap unlabelled.
groupColour.name = ""

In [None]:
# Display the per-assembly colour assignment for a visual check.
groupColour

#### Defining colour groups for KEGG pathway modules

In [None]:
## Make a pathway to colour mapping via the pathway group
# All distinct pathway groups among the modules that will be plotted.
pathway_groups = list(completeness_toplot['pathway group'].unique())

# One glasbey colour per pathway group. `as_cmap` is intentionally not passed: the
# result is zipped with the groups below, which needs a sequence of colours rather
# than a colormap object. `n_colors` fixes the palette length to the number of groups.
palette = sns.color_palette(glasbey, n_colors=len(pathway_groups))
pathway_colours = dict(zip(pathway_groups, palette))

# Module name -> pathway group.
name_pathway = dict(zip(completeness_toplot['name'], completeness_toplot['pathway group']))

# Module name -> colour of its pathway group (the two mappings above chained together).
group_colours = {name: pathway_colours[group] for name, group in name_pathway.items()}

#### Clustering the completenesses

In [None]:
# Keep only the numeric columns; these form the matrix that is clustered and plotted.
completeness_numeric = completeness_toplot.select_dtypes(include = "number")

In [None]:
# Column tree: the matrix is transposed so that each column becomes one observation.
# Optimal leaf ordering is switched on to help visualise any lower-level pattern.
linkage = hierarchy.linkage(
    completeness_numeric.T,
    method='average',
    metric='euclidean',
    optimal_ordering=True,
)

In [None]:
# Row tree: each row of the matrix is one observation; same settings as the column tree.
# Optimal leaf ordering is switched on to help visualise any lower-level pattern.
row_linkage = hierarchy.linkage(
    completeness_numeric,
    method='average',
    metric='euclidean',
    optimal_ordering=True,
)

#### Clustermap

In [None]:
# Labelled clustermap: rows and columns are both ordered by the precomputed linkages,
# and the colour strip above the heatmap shows the cluster of every assembly.
h = sns.clustermap(
    completeness_numeric,
    row_cluster=True, row_linkage=row_linkage,
    col_cluster=True, col_linkage=linkage,
    col_colors=groupColour,
    cmap="magma_r",
    figsize=(9, 20),
    dendrogram_ratio=(0.2, 0.05),
    xticklabels=False, yticklabels=True,
)
heatmap_ax = h.ax_heatmap
h.ax_row_dendrogram.set_visible(False)

# Replace the default row tick labels with module names, in the order of the clustered rows.
module_names = completeness_toplot['name'].iloc[h.dendrogram_row.reordered_ind]
heatmap_ax.set_yticklabels(module_names)
heatmap_ax.set_xlabel('Genome assemblies', size=18)
heatmap_ax.set_ylabel('KEGG module', size=18)
heatmap_ax.collections[0].colorbar.set_label('Module completeness (%)', size=16)

# The legend is assembled while colouring the tick labels, because the label colours
# are defined per pathway group rather than per module.
legend_by_group = {}
for tick in heatmap_ax.get_yticklabels():
    module = tick.get_text()
    group = name_pathway[module]
    colour = group_colours[module]
    # paint the module's tick label in the colour of its pathway group
    tick.set_color(colour)
    # one legend entry per pathway group, added the first time the group is seen
    if group not in legend_by_group:
        legend_by_group[group] = ppt.Patch(color=colour, label=group)

legend_patches = list(legend_by_group.values())
pathways_covered = list(legend_by_group)

plt.legend(handles=legend_patches, ncol=1, bbox_to_anchor=(42, 1, 1, 0))
plt.savefig(results_folder + '/' + 'module_completeness.svg')
plt.show()

In [None]:
# Assembly metadata, listed in the order in which the columns appear in the clustermap.
column_order = h.dendrogram_col.reordered_ind
clustered_ids = pd.DataFrame({'assemblyID': completeness_numeric.columns[column_order]})
# The merge keeps only assemblies that are present in the metadata table.
clustered = clustered_ids.merge(metadata, on='assemblyID')
clustered

In [None]:
# Save the cluster-ordered metadata as a tab-separated file without the row index.
clustered.to_csv(results_folder + '/' + 'clustered_strains.tsv', sep = "\t", index = False)

#### Clustermap without pathway labels

In [None]:
# Compact clustermap without row tick labels.
# The precomputed `row_linkage` is passed explicitly. Without it seaborn builds its own
# row tree with no optimal leaf ordering, so the row order could differ from the one
# given by `row_linkage`.
h = sns.clustermap(
    completeness_numeric,
    row_cluster=True, row_linkage=row_linkage,
    col_cluster=True, col_linkage=linkage,
    col_colors=groupColour,
    cmap='magma_r',
    figsize=(6, 6),
    dendrogram_ratio=(0.2, 0.05),
    cbar_pos=(0.01, 0.5, 0.05, 0.4),
    xticklabels=False, yticklabels=False,
)
h.ax_row_dendrogram.set_visible(False)
h.ax_heatmap.set_xlabel('Genome assemblies', size=12)
h.ax_heatmap.set_ylabel('KEGG module', size=12)
h.ax_heatmap.collections[0].colorbar.set_label('Module completeness (%)', size=11)

plt.savefig(results_folder + '/' + 'module_completeness_noTicks.svg')
plt.show()

#### Extract metabolically versatile and non-versatile group members

In [None]:
# Column positions in the left-to-right order of the plotted column dendrogram.
reordered_inds = h.dendrogram_col.reordered_ind
# Cut the column tree into at most two flat groups, then list the group label of
# every column in dendrogram order.
two_group_labels = hierarchy.fcluster(linkage, 2, criterion="maxclust")
versatile_ids = list(two_group_labels[reordered_inds])

In [None]:
# There are two groups, so the position of the first member of group 2 marks the
# boundary between them in dendrogram order.
# NOTE(review): this assumes group 2 forms one contiguous run at the end of the ordering
# and that it is the versatile group - confirm against the clustermap.
edge = versatile_ids.index(2)
ordered_assemblies = completeness_numeric.columns[reordered_inds]
nonversatile = ordered_assemblies[:edge].to_series(name="non-versatile")
versatile = ordered_assemblies[edge:].to_series(name="versatile")

In [None]:
# Write each group's assembly IDs to a file in the task root: one ID per line,
# with no header and no index column.
for group_members, file_suffix in ((versatile, '/versatile'),
                                   (nonversatile, '/non-versatile')):
    group_members.to_csv(task_root + file_suffix, header=False, index=False)