This part of the pipeline processes the raw COG annotation tables and produces count and frequency tables of COG annotations.

### Importing the packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it
from bidict import bidict
from collections import Counter
import os

### Paths and parameters

#### Pipeline input folders

In [None]:
# Pangenome output folder
pangenomes = "./05-pangenomes"
# Folder holding the raw eggNOG-mapper annotation tables (one per genome set)
eggnog_mapper = "./07-PangenomeAnnotation/COG/mapper"
# Tab-separated COG category table
COG_cats = "./utils/COG_cats.tsv"
# Folder that is scanned for the '.list' index files defining the genome sets
indices = "02-QC/indices"

#### Pipeline output folders

In [None]:
# Output locations of this pipeline step
task_root = "./07-PangenomeAnnotation/COG"
processed_output = task_root+"/processed_output"
parent = "./07-PangenomeAnnotation"

# Create the output folders from Python rather than through the `!mkdir -p` shell magic,
# so the cell also runs outside IPython and on systems without a POSIX shell.
# exist_ok mirrors `-p`: missing parents are created and existing folders are not an error.
os.makedirs(task_root, exist_ok = True)
os.makedirs(processed_output, exist_ok = True)

#### Tool pointers and parameters

In [None]:
# Thresholds (in % of genomes carrying a gene family) delimiting the pangenome partitions
core_acc_threshold = 99
acc_unique_threshold = 15

# Determine which genome sets there are: every '.list' index file defines one set.
# Sorted because os.listdir() returns entries in arbitrary order.
files = sorted(filter(lambda x: '.list' in x, os.listdir(indices)))
# Number of rows in each index file, keyed by genome set name (file name up to the first dot)
set_sizes = {file.split('.')[0]: pd.read_table(indices + "/" + file, usecols = [0], header = None).shape[0] for file in files}
sets = list(set_sizes.keys())

# Collect the input paths and index them by genome set
# (loop variable is not called `set`, to avoid shadowing the builtin)
pangenome_matrices = {name: pangenomes + "/" + name + "/matrix.csv" for name in sets}
mapper_tables = {name: eggnog_mapper + "/all_protein_families_" + name + ".emapper.annotations" for name in sets}

In [None]:
# Setting the naming conventions for both taxonomic clusters and pangenome partitions.
# NOTE: cluster_labels is matched by position to the per-set lists built further down
# (merge, group1, group4, group14a, group14b), so the order must be kept in sync.
cluster_labels = ['Merged', 'I', 'IV', 'XIVa', 'XIVb']
n_clusters = len(cluster_labels)

# Same order as the list returned by split_pangenome(): core, accessory, unique
pangenome_partition_labels = ["core", "accessory", "unique"]
n_partitions = len(pangenome_partition_labels)

### Importing the metadata

In [None]:
# COG category index: two-way lookup between the first and second column of the category table
cog_cats = pd.read_table(COG_cats, sep='\t', header = None)
cog_cats = bidict({key: value for key, value in zip(cog_cats[0], cog_cats[1])})

### Defining the pangenome partitions

#### Auxiliary functions

In [None]:
## Reads the presence/absence matrix and splits it out into pangenome partitions
##
## PARAMS
## path              file path to the comma-separated presence/absence matrix
## ca_threshold      core-accessory threshold, in % of strains (default: 99%)
## au_threshold      accessory-unique threshold, in % of strains (default: 15%)
## print_size        flag to print the shape of each genome partition (default: false)
## n_metadata_cols   number of metric columns preceding the per-strain P/A columns (default: 14)
##
## OUTPUT
## a list [core, accessory, unique] of dataframes, one per partition, containing the gene family ID
## and the absolute ("No. isolates") and relative ("Presence_ratio", in %) presence of that family
##
## RAISES
## ValueError if no strain columns are left after the metric columns
##
def split_pangenome(path, ca_threshold = 99, au_threshold = 15, print_size = False, n_metadata_cols = 14):
    # Only the header row is needed to count the strain columns
    header = pd.read_table(path, sep = ",", nrows = 1)
    n_strains = header.shape[1] - n_metadata_cols
    if n_strains <= 0:
        raise ValueError("No strain columns found in " + str(path) + " after "
                         + str(n_metadata_cols) + " metric columns")

    # Columns 0 and 3 are referred to below as 'Gene' (family ID) and 'No. isolates'
    clusters = pd.read_table(path, sep = ",", usecols = [0, 3]).rename(columns = {'Gene': 'ID'})

    # Define the presence ratio as the percentage of strains in which this gene family is present
    clusters["Presence_ratio"] = clusters["No. isolates"].astype(int) / n_strains * 100
    ratio = clusters["Presence_ratio"]

    # Thresholding to define the pangenome partitions; copies keep the slices independent of `clusters`
    core = clusters[ratio >= ca_threshold].copy()
    acc = clusters[(ratio >= au_threshold) & (ratio < ca_threshold)].copy()
    unique = clusters[ratio < au_threshold].copy()

    if print_size:
        print("Core:\t" + str(core.shape))
        print("Accessory:\t" + str(acc.shape))
        print("Unique:\t" + str(unique.shape))

    return [core, acc, unique]

In [None]:
## Reads the COG annotation from a raw eggNOG output file
##
## PARAMS
## path       filepath of the raw, tab-separated output file
##
## OUTPUT
## a dataframe with as columns the gene family ID ('ID') and the associated COG category ('COG')
##
def read_eggnog_annotations(path):
    # Skip the 4 lines preceding the column header and the 3 trailing lines; keep columns 0 and 6.
    # skipfooter is only supported by the python parser engine: request it explicitly instead of
    # relying on pandas' fallback from the C engine, which emits a ParserWarning on every call.
    eggnog_annots = pd.read_table(path, sep = "\t", usecols = [0,6], skiprows = 4, skipfooter = 3,
                                  header = 0, engine = "python")
    eggnog_annots.columns = ['ID', 'COG']
    return eggnog_annots

In [None]:
## Builds one COG category count table per pangenome partition
##
## PARAMS
## annots        the full eggNOG annotation table for this genome set (columns 'ID' and 'COG')
## distr         list of per-partition gene family tables as returned by split_pangenome()
##
## OUTPUT
## a list of COG category count tables, in the same order as @distr
##
def split_eggnog_counts(annots, distr):
    counts_by_partition = []
    for partition in distr:
        # Left join on the family ID: families without an annotation stay in with a missing COG
        partition_annots = partition.merge(annots, how = "left", on = "ID")
        counts_by_partition.append(count_eggnog_annotations(partition_annots))
    return counts_by_partition

In [None]:
## Counts the COG categories in the supplied annotation set and returns a count table
##
## PARAMS
## full_annots     dataframe of family IDs and COG annotations (column 'COG') for a genome set
## relative        flag to scale the category counts to fractions summing to 1 (default: true);
##                 note that these are fractions, not percentages
##
## OUTPUT
## a dataframe indexed by COG category (sorted) with a single column 'COG' holding its count,
## or its fraction of all counted category letters when @relative is set
##
def count_eggnog_annotations(full_annots, relative = True):
    # gene families that were not present in the annotation table are unannotated as well
    labels = full_annots['COG'].fillna('-')

    # counting COG categories letter by letter, attributing plural annotations (e.g. BE) to all categories
    tally = Counter(it.chain.from_iterable(labels))

    # convert into a one-column dataframe and sort the categories
    counts = pd.DataFrame({'COG': pd.Series(dict(tally), dtype = int)}).sort_index()

    # scale if requested
    if relative:
        counts = counts / counts.sum()

    return counts

In [None]:
## Saves each annotation table contained within the partitioned pangenome array
##
## PARAMS
## path       path of a file; the tables are written into the directory of this file
##            (the current directory when @path has no directory part)
## distr      list of COG category frequency tables in the order core, accessory, unique
## prefix     prefix to distinguish the counted annotation tables (e.g. clusterI)
##
## OUTPUT
## nothing; writes the tab-separated files [<prefix>_]core_annot, [<prefix>_]acc_annot and
## [<prefix>_]unique_annot
##
def save_eggnog_counts(path, distr, prefix = ''):
    # dirname/join instead of manual '/' splitting: a bare file name must map to the
    # current directory, not to the filesystem root
    out_dir = os.path.dirname(path)
    name_prefix = prefix + "_" if prefix else ""
    for position, suffix in enumerate(("core_annot", "acc_annot", "unique_annot")):
        out_file = os.path.join(out_dir, name_prefix + suffix)
        distr[position].to_csv(out_file, sep = '\t', index_label = "Annotation")
In [None]:
## Joins the per-partition COG category count tables into a single wide table
##
## PARAMS
## distr_counts    list of dataframes of COG category and frequency, one for each pangenome partition
## labels          row labels for the result, one per table and in the same order as @distr_counts
##
## OUTPUT
## a dataframe with one row per pangenome partition and one (sorted) column per COG category;
## categories missing from a partition are set to 0
##
def concat_counts(distr_counts, labels):
    # Side by side on the category index, then flip so that partitions become rows
    side_by_side = pd.concat(distr_counts, axis = 1)
    wide = side_by_side.fillna(0).transpose()
    ordered = wide.loc[:, sorted(wide.columns)]
    ordered.index = labels
    return ordered

In [None]:
## Returns the COG category frequencies by genome set, pangenome partition of that genome set and COG category
## by concatenating count tables for different genome sets on a "stitching column" (e.g. the species cluster tied to a genome set)
##
## PARAMS
## data_array       list of to be stitched concatenated count tables produced for different genome sets by concat_counts();
##                  the tables are left unmodified
## group_index      name of the stitching column
## group_values     list of group labels to be used as value for the stitching column (e.g. name of genome sets),
##                  one per table and in the same order as @data_array
##
## OUTPUT
## a long-format dataframe with the columns @group_index, "Partition", "Annotation" and "Fraction"
##
## RAISES
## ValueError if @data_array and @group_values differ in length
##
def stitch_data(data_array, group_index, group_values):
    group_values = list(group_values)
    if len(data_array) != len(group_values):
        raise ValueError("Expected one group value per count table, got "
                         + str(len(group_values)) + " values for " + str(len(data_array)) + " tables")

    molten = []
    for table, value in zip(data_array, group_values):
        # Add the stitching column on a copy, so the caller's count tables are not altered
        labelled = table.assign(**{group_index: value})
        # Move the partition index into a regular column and melt the categories into rows
        molten.append(pd.melt(labelled.rename_axis("Partition").reset_index(),
                              id_vars = [group_index, "Partition"],
                              var_name = "Annotation",
                              value_name = "Fraction"))
    return pd.concat(molten, ignore_index = True)

#### Partitioning the pangenomes

In [None]:
# Partition the 'merge' pangenome; the slices come back in core, accessory, unique order
distr_m = split_pangenome(pangenome_matrices['merge'], core_acc_threshold, acc_unique_threshold, print_size = True)
panm_core, panm_acc, panm_unique = distr_m

In [None]:
# Partition the 'group1' pangenome; the slices come back in core, accessory, unique order
distr_1 = split_pangenome(pangenome_matrices['group1'], core_acc_threshold, acc_unique_threshold, print_size = True)
pan1_core, pan1_acc, pan1_unique = distr_1

In [None]:
# Partition the 'group4' pangenome; the slices come back in core, accessory, unique order
distr_4 = split_pangenome(pangenome_matrices['group4'], core_acc_threshold, acc_unique_threshold, print_size = True)
pan4_core, pan4_acc, pan4_unique = distr_4

In [None]:
# Partition the 'group14a' pangenome; the slices come back in core, accessory, unique order
distr_14a = split_pangenome(pangenome_matrices['group14a'], core_acc_threshold, acc_unique_threshold, print_size = True)
pan14a_core, pan14a_acc, pan14a_unique = distr_14a

In [None]:
# Partition the 'group14b' pangenome; the slices come back in core, accessory, unique order
distr_14b = split_pangenome(pangenome_matrices['group14b'], core_acc_threshold, acc_unique_threshold, print_size = True)
pan14b_core, pan14b_acc, pan14b_unique = distr_14b

#### Number of gene families split by taxonomic cluster and pangenome partition

In [None]:
# Number of rows in each partition table: partitions as rows, taxonomic clusters as columns.
# The labels are paired with the partitioned sets by position.
cluster_counts = pd.DataFrame(
    {label: [partition.shape[0] for partition in partitions]
     for label, partitions in zip(cluster_labels, [distr_m, distr_1, distr_4, distr_14a, distr_14b])},
    index = pangenome_partition_labels,
)
cluster_counts

In [None]:
# Scale every column by its own total, so each cluster's partitions sum to 1
cluster_counts_relative = cluster_counts.div(cluster_counts.sum(axis = 0), axis = "columns")
cluster_counts_relative

In [None]:
# Long-format export of the absolute counts: one row per (partition, cluster) pair.
# The row index is written as well (to_csv default).
(cluster_counts.stack()
               .rename_axis(['Partition', 'Cluster'])
               .rename('No genes')
               .reset_index()
               .to_csv(parent+"/pangenome_partition_sizes.tsv", sep = "\t"))

#### Processing the COG assignments

**Full set**

In [None]:
# Load the family ID / COG category table of the 'merge' genome set from its raw mapper output
eggnog_annots_m = read_eggnog_annotations(mapper_tables['merge'])

In [None]:
# Display the parsed annotation table of the 'merge' set
eggnog_annots_m

In [None]:
# COG category frequency tables of the 'merge' set, in core, accessory, unique order
panm_eggnog_distr_annots_counts = split_eggnog_counts(eggnog_annots_m, distr_m)
(panm_eggnog_core_annots_counts,
 panm_eggnog_acc_annots_counts,
 panm_eggnog_unique_annots_counts) = panm_eggnog_distr_annots_counts

In [None]:
# Write the three tables into the folder of the mapper output, as merge_core_annot, merge_acc_annot and merge_unique_annot
save_eggnog_counts(mapper_tables['merge'], panm_eggnog_distr_annots_counts, prefix = 'merge')

In [None]:
# Display the core-partition COG frequencies of the 'merge' set
panm_eggnog_core_annots_counts

In [None]:
# Display the accessory-partition COG frequencies of the 'merge' set
panm_eggnog_acc_annots_counts

In [None]:
# Display the unique-partition COG frequencies of the 'merge' set
panm_eggnog_unique_annots_counts

**Cluster 1**

In [None]:
# Load the family ID / COG category table of the 'group1' genome set from its raw mapper output
eggnog_annots_1 = read_eggnog_annotations(mapper_tables['group1'])

In [None]:
# Display the parsed annotation table of the 'group1' set
eggnog_annots_1

In [None]:
# COG category frequency tables of the 'group1' set, in core, accessory, unique order
pan1_eggnog_distr_annots_counts = split_eggnog_counts(eggnog_annots_1, distr_1)
(pan1_eggnog_core_annots_counts,
 pan1_eggnog_acc_annots_counts,
 pan1_eggnog_unique_annots_counts) = pan1_eggnog_distr_annots_counts

In [None]:
# Write the three tables into the folder of the mapper output, as group1_core_annot, group1_acc_annot and group1_unique_annot
save_eggnog_counts(mapper_tables['group1'], pan1_eggnog_distr_annots_counts, prefix = 'group1')

In [None]:
# Display the core-partition COG frequencies of the 'group1' set
pan1_eggnog_core_annots_counts

In [None]:
# Display the accessory-partition COG frequencies of the 'group1' set
pan1_eggnog_acc_annots_counts

In [None]:
# Display the unique-partition COG frequencies of the 'group1' set
pan1_eggnog_unique_annots_counts

**Cluster 4**

In [None]:
# Load the family ID / COG category table of the 'group4' genome set from its raw mapper output
eggnog_annots_4 = read_eggnog_annotations(mapper_tables['group4'])

In [None]:
# Display the parsed annotation table of the 'group4' set
eggnog_annots_4

In [None]:
# COG category frequency tables of the 'group4' set, in core, accessory, unique order
pan4_eggnog_distr_annots_counts = split_eggnog_counts(eggnog_annots_4, distr_4)
(pan4_eggnog_core_annots_counts,
 pan4_eggnog_acc_annots_counts,
 pan4_eggnog_unique_annots_counts) = pan4_eggnog_distr_annots_counts

In [None]:
# Write the three tables into the folder of the mapper output, as group4_core_annot, group4_acc_annot and group4_unique_annot
save_eggnog_counts(mapper_tables['group4'], pan4_eggnog_distr_annots_counts, prefix = 'group4')

In [None]:
# Display the core-partition COG frequencies of the 'group4' set
pan4_eggnog_core_annots_counts

In [None]:
# Display the accessory-partition COG frequencies of the 'group4' set
pan4_eggnog_acc_annots_counts

In [None]:
# Display the unique-partition COG frequencies of the 'group4' set
pan4_eggnog_unique_annots_counts

**Cluster 14a**

In [None]:
# Load the family ID / COG category table of the 'group14a' genome set from its raw mapper output
eggnog_annots_14a = read_eggnog_annotations(mapper_tables['group14a'])

In [None]:
# Display the parsed annotation table of the 'group14a' set
eggnog_annots_14a

In [None]:
# COG category frequency tables of the 'group14a' set, in core, accessory, unique order
pan14a_eggnog_distr_annots_counts = split_eggnog_counts(eggnog_annots_14a, distr_14a)
(pan14a_eggnog_core_annots_counts,
 pan14a_eggnog_acc_annots_counts,
 pan14a_eggnog_unique_annots_counts) = pan14a_eggnog_distr_annots_counts

In [None]:
# Write the three tables into the folder of the mapper output, as group14a_core_annot, group14a_acc_annot and group14a_unique_annot
save_eggnog_counts(mapper_tables['group14a'], pan14a_eggnog_distr_annots_counts, prefix = 'group14a')

In [None]:
# Display the core-partition COG frequencies of the 'group14a' set
pan14a_eggnog_core_annots_counts

In [None]:
# Display the accessory-partition COG frequencies of the 'group14a' set
pan14a_eggnog_acc_annots_counts

In [None]:
# Display the unique-partition COG frequencies of the 'group14a' set
pan14a_eggnog_unique_annots_counts

**Cluster 14b**

In [None]:
# Load the family ID / COG category table of the 'group14b' genome set from its raw mapper output
eggnog_annots_14b = read_eggnog_annotations(mapper_tables['group14b'])

In [None]:
# Display the parsed annotation table of the 'group14b' set
eggnog_annots_14b

In [None]:
# COG category frequency tables of the 'group14b' set, in core, accessory, unique order
pan14b_eggnog_distr_annots_counts = split_eggnog_counts(eggnog_annots_14b, distr_14b)
(pan14b_eggnog_core_annots_counts,
 pan14b_eggnog_acc_annots_counts,
 pan14b_eggnog_unique_annots_counts) = pan14b_eggnog_distr_annots_counts

In [None]:
# Write the three tables into the folder of the mapper output, as group14b_core_annot, group14b_acc_annot and group14b_unique_annot
save_eggnog_counts(mapper_tables['group14b'], pan14b_eggnog_distr_annots_counts, prefix = 'group14b')

In [None]:
# Display the core-partition COG frequencies of the 'group14b' set
pan14b_eggnog_core_annots_counts

In [None]:
# Display the accessory-partition COG frequencies of the 'group14b' set
pan14b_eggnog_acc_annots_counts

In [None]:
# Display the unique-partition COG frequencies of the 'group14b' set
pan14b_eggnog_unique_annots_counts

**Data export**

In [None]:
# One wide table per genome set (partitions as rows, COG categories as columns).
# The fractions are converted to percentages here; the values stay percentages from this point on,
# even though stitch_data() later stores them in a column named "Fraction".
concat_data_m = concat_counts(panm_eggnog_distr_annots_counts, pangenome_partition_labels).mul(100)
concat_data_1 = concat_counts(pan1_eggnog_distr_annots_counts, pangenome_partition_labels).mul(100)
concat_data_4 = concat_counts(pan4_eggnog_distr_annots_counts, pangenome_partition_labels).mul(100)
concat_data_14a = concat_counts(pan14a_eggnog_distr_annots_counts, pangenome_partition_labels).mul(100)
concat_data_14b = concat_counts(pan14b_eggnog_distr_annots_counts, pangenome_partition_labels).mul(100)

In [None]:
# Display the percentage table of the 'merge' set
concat_data_m

In [None]:
# Display the percentage table of the 'group1' set
concat_data_1

In [None]:
# Display the percentage table of the 'group4' set
concat_data_4

In [None]:
# Display the percentage table of the 'group14a' set
concat_data_14a

In [None]:
# Display the percentage table of the 'group14b' set
concat_data_14b

In [None]:
# Stitch the per-set tables into one long table with a "Cluster" column.
# cluster_labels is matched to the tables by position, so both lists must be in the same order.
concat_data = stitch_data([concat_data_m, concat_data_1, concat_data_4, concat_data_14a, concat_data_14b], 
                          "Cluster", cluster_labels)

In [None]:
# Export the long table as TSV without the row index; the values were multiplied by 100 above
concat_data.to_csv(processed_output + '/COG_fractions.tsv', sep='\t', index = False)

In [None]:
# Display the exported long-format table
concat_data