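This part of the pipeline parses the raw KAAS annotation files (JSON exports of the BRITE hierarchy), splits each pangenome into core, accessory and unique gene families, and produces count and frequency tables of the KOG annotations for each partition.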

### Importing the packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import Counter
import os

### Paths and parameters

#### Pipeline input folders

In [None]:
# Root folder of the pangenome outputs; a "<set>/matrix.csv" is expected per genome set
pangenomes = "./05-pangenomes"
# Folder holding the raw KAAS JSON exports, one "<set>_KAAS.json" per genome set
kaas_mapper = "./07-PangenomeAnnotation/KOG/mapper"
# Folder holding the "*.list" index files; the file name before the first dot is used as genome set name
indices = "02-QC/indices"

#### Pipeline output folders

In [None]:
task_root = "./07-PangenomeAnnotation/KOG"
processed_output = task_root+"/processed_output"

# Create the output folders (including missing parents) from Python instead of shelling out,
# so the cell does not depend on IPython shell magic or on a `mkdir -p` binary
os.makedirs(task_root, exist_ok = True)
os.makedirs(processed_output, exist_ok = True)

#### Tool pointers and parameters

In [None]:
# Pangenome partition thresholds, expressed as the percentage of strains carrying a gene family:
# families at or above core_acc_threshold are core, families below acc_unique_threshold are unique,
# and everything in between is accessory
core_acc_threshold = 99
acc_unique_threshold = 15

# Determine which genome sets there are from the index lists;
# the number of rows in an index list is taken as the size of that genome set
files = [entry for entry in os.listdir(indices) if '.list' in entry]
set_sizes = {file.split('.')[0]: pd.read_table(indices + "/" + file, usecols = [0], header = None).shape[0] for file in files}
sets = list(set_sizes.keys())

# Collect the input paths and index them by genome set
# (the comprehension variable is not called `set`, so the builtin is not shadowed)
pangenome_matrices = {set_name: pangenomes + "/" + set_name + "/matrix.csv" for set_name in sets}
mapper_tables = {set_name: kaas_mapper + "/" + set_name + "_KAAS.json" for set_name in sets}

# Determine number of strains
# There are 14 columns with metrics preceding the P/A matrix
ns_strains = {set_name: pd.read_table(path, sep = ",", nrows = 1).shape[1] - 14 for set_name, path in pangenome_matrices.items()}

In [None]:
# Setting the naming conventions for both taxonomic clusters and pangenome partitions
# The cluster labels are matched by position to the genome sets listed in later cells
# (merged set, cluster 1, cluster 4, cluster 14a, cluster 14b), so the order matters
cluster_labels = ['Merged', 'I', 'IV', 'XIVa', 'XIVb']
n_clusters = len(cluster_labels)

# The partition labels are matched by position to the list returned by split_pangenome()
pangenome_partition_labels = ["core", "accessory", "unique"]
n_partitions = len(pangenome_partition_labels)

### Defining the pangenome partitions

#### Auxiliary functions

In [None]:
## Loads the gene family table of a pangenome and divides it into core, accessory and unique partitions,
## based on the percentage of strains in which each gene family occurs
##
## PARAMS
## path           file path to the comma-separated presence/absence matrix; only columns 0 and 3 are read,
##                which are expected to be named 'Gene' and 'No. isolates'
## n_strains      number of strains in the genome set, used as the denominator of the presence ratio
## ca_threshold   core-accessory threshold; families at or above it are core (default: 99%)
## au_threshold   accessory-unique threshold; families below it are unique (default: 15%)
## print_size     flag to print the shape of each partition (default: false)
##
## OUTPUT
## a list [core, accessory, unique] of dataframes holding the gene family ID ('ID'),
## the absolute presence ('No. isolates') and the relative presence in percent ('Presence_ratio')
##
def split_pangenome(path, n_strains, ca_threshold = 99, au_threshold = 15, print_size = False):
    families = pd.read_table(path, sep = ",", usecols=[0,3])

    # Percentage of strains in which a gene family is present
    def presence_percentage(n_isolates):
        return int(n_isolates) / n_strains * 100

    families["Presence_ratio"] = families["No. isolates"].map(presence_percentage)

    # One boolean mask per partition, in the order core, accessory, unique
    ratio = families["Presence_ratio"]
    is_core = ratio >= ca_threshold
    is_accessory = (ratio >= au_threshold) & (ratio < ca_threshold)
    is_unique = ratio < au_threshold

    partitions = [families[mask].rename(columns = {'Gene': 'ID'})
                  for mask in (is_core, is_accessory, is_unique)]

    if print_size:
        for title, partition in zip(("Core", "Accessory", "Unique"), partitions):
            print(title + ":\t" + str(partition.shape))

    return partitions

In [None]:
## Reads and parses the raw KAAS webserver output file and returns an annotation table in the format
## 'cluster ID; KOG-A; KOG-B; KOG-C; KOG-D', as well as a loosely coupled dictionary-like hierarchy of the included KEGG annotation terms
##
## You can find the json file at the KAAS HTML output > BRITE hierarchies > KEGG Orthology (KO) > download json
##
## PARAMS
## path      file path to the raw KAAS json file
##
## OUTPUT
## a tuple containing a dataframe (columns 'ID', 'A', 'B', 'C', 'D') with the hierarchical KOG annotations of each
## gene family, restricted to a fixed list of A-level categories, and a dictionary of dictionaries
## {'A': {A-term: [B-terms]}, 'B': {B-term: [C-terms]}, 'C': {C-term: [D-terms]}} reproducing the present
## fraction of the BRITE hierarchy (not restricted by A-level category; the order within each list is arbitrary)
##
## NOTES
## Nodes without a 'children' key simply end the drill-down at that level. A node without a 'name' key, or a
## D-level name that does not follow the 'ID; KO; description' pattern, raises instead of being skipped silently.
##
def read_kaas_annotations(path):
    # JSON is UTF-8 encoded by specification, so do not rely on the platform default encoding
    with open(path, "r", encoding = "utf-8") as handle:
        cont = json.load(handle)

    # Strips the leading numeric code and any trailing ' [...]' reference from an A/B/C-level name
    def term_name(raw_name):
        return ' '.join(raw_name.split(' ')[1:]).split(' [')[0]

    kaas_annots = [] # will contain the hierarchical annotation of each gene family
    a_children = {} # will contain the children terms (B-terms) for each A-level term
    b_children = {} # will contain the children terms (C-terms) for each B-level term
    c_children = {} # will contain the children terms (D-terms) for each C-level term

    for a in cont["children"]:
        an = term_name(a["name"])
        # setdefault keeps the children collected so far if a term name occurs more than once
        a_children.setdefault(an, set())
        # A missing 'children' key means there is no deeper annotation level below this term
        for b in a.get("children", []):
            bn = term_name(b["name"])
            a_children[an].add(bn)
            b_children.setdefault(bn, set())
            for c in b.get("children", []):
                cn = term_name(c["name"])
                b_children[bn].add(cn)
                c_children.setdefault(cn, set())
                for d in c.get("children", []):
                    # D-level names look like '<family ID>; <KO>; <description> [<reference>]'
                    dn = d["name"].split('; ')[2].split(' [')[0]
                    c_children[cn].add(dn)
                    # A full 4-level KEGG annotation record for this gene family
                    kaas_annots.append({'ID': d["name"].split(';')[0],
                                        "A": an,
                                        "B": bn,
                                        "C": cn,
                                        "D": dn})

    # Convert list of dictionary-like records to a dataframe; the explicit columns keep the table
    # well-formed (and the filter below working) when the file holds no D-level annotation at all
    kaas_annots = pd.DataFrame(kaas_annots, columns = ['ID', 'A', 'B', 'C', 'D'])

    # Filter at the A level
    A_levels = ['Metabolism',
                'Genetic Information Processing',
                'Environmental Information Processing',
                'Not Included in Pathway or Brite',
                'Cellular Processes']
    kaas_annots = kaas_annots[kaas_annots['A'].isin(A_levels)]
    kaas_annots = kaas_annots.reset_index(drop=True)

    # Convert each entry at each level of the KEGG hierarchy into a human-readable dictionary of lists
    # in the form 'parent annotation: list of children annotations' for each annotation level
    hierarchies = {level: {term: list(children) for term, children in cats.items()}
                   for level, cats in zip(['A', 'B', 'C'], [a_children, b_children, c_children])}

    return (kaas_annots, hierarchies)

In [None]:
## Annotates the gene families of each pangenome partition and returns a KOG category count table per partition
##
## PARAMS
## annots        the full KAAS annotation table for this genome set
## distr         the set of gene family IDs split out by pangenome partition using split_pangenome()
##
## OUTPUT
## a list of KOG category count tables by pangenome partition, as produced by count_kaas_annotations()
## with its default settings
##
def split_kaas_counts(annots, distr):
    annotated_partitions = []
    for partition in distr:
        # Left join keeps the gene families without any annotation; their empty levels are filled with '-'
        annotated = pd.merge(partition, annots, how = 'left', on = 'ID')
        annotated = annotated.drop(['No. isolates', 'Presence_ratio'], axis=1)
        annotated_partitions.append(annotated.fillna('-'))

    return [count_kaas_annotations(annotated) for annotated in annotated_partitions]

In [None]:
## Counts the frequencies of KOG categories at each requested hierarchical level in the supplied annotation set,
## and returns them as a single count table with one column per hierarchical level
##
## PARAMS
## full_annots     dataframe of family IDs (column 'ID') and KOG annotations (one column per level)
## level           limit the KOG annotation to these levels (a single level, or a list/array/Series of
##                 'A','B','C' and/or 'D'; or None to use every column except 'ID') (default: None)
## relative        flag to scale the category counts of each level to fractions that sum to 1 (default: true)
##
## OUTPUT
## a dataframe indexed by KOG category, with one column per level holding the count (or fraction) of that category;
## categories that do not occur at a level get 0 in that column
##
def count_kaas_annotations(full_annots, level = None, relative = True):
    # Identity check: `level == None` would be evaluated element-wise for an array or Series
    if level is None:
        levels = full_annots.columns.drop("ID")
    else:
        levels = pd.Series(level)

    annots_per_level = {}
    for l in levels:
        # Annotations that have a different annotation at lower KOG levels, but not at higher levels, are deduplicated.
        # This may result in annotation counts at different KOG levels not adding up.
        # Example:
        # group_5022	Metabolism	Amino acid metabolism	Valine, leucine and isoleucine degradation	dihydrolipoyl dehydrogenase
        # group_5022	Metabolism	Amino acid metabolism	Lysine degradation	dihydrolipoyl dehydrogenase
        # => will yield a single KOG-B annotation, but a double KOG-C annotation:
        # group_5022	Amino acid metabolism    (B-level)
        # group_5022	Metabolism	Amino acid metabolism	Valine, leucine and isoleucine degradation    (C-level #1)
        # group_5022	Metabolism	Amino acid metabolism	Lysine degradation    (C-level #2)

        unique_pairs = full_annots[['ID', l]].drop_duplicates()
        annots_per_level[l] = dict(Counter(unique_pairs[l].to_list()))

    # Categories missing at a level show up as NaN after the union of the indices; they count as 0
    annots = pd.DataFrame(annots_per_level).fillna(0).astype(int)

    if relative:
        annots = annots.apply(lambda column: column / column.sum())

    return annots

In [None]:
## Saves the KOG category tables of the three pangenome partitions as tab-separated files
## named '[<prefix>_]core_annot', '[<prefix>_]acc_annot' and '[<prefix>_]unique_annot'
##
## PARAMS
## path       path of a file whose parent folder receives the tables (only the folder part is used;
##            a bare file name without folder writes to the current working directory)
## distr      list of KOG category frequency tables, one for each pangenome partition
##            (in the order core, accessory, unique)
## prefix     prefix to distinguish the counted annotation tables (e.g. clusterI)
##
def save_kaas_counts(path, distr, prefix = ""):
    # dirname() yields '' for a bare file name, where manual splitting on '/' ended up at the root folder '/'
    out_dir = os.path.dirname(path)
    name_prefix = prefix + "_" if len(prefix) > 0 else ""

    for position, suffix in enumerate(("core_annot", "acc_annot", "unique_annot")):
        distr[position].to_csv(os.path.join(out_dir, name_prefix + suffix), sep = '\t', index_label = "Annotation")

In [None]:
## Combines the KOG category count tables of the separate pangenome partitions into a single table
##
## PARAMS
## distr_counts    list of dataframes/series of KOG category and frequency, one for each pangenome partition
## labels          row labels of the result, one per entry of @distr_counts and in the same order
##
## OUTPUT
## a dataframe of KOG category frequencies with one row per pangenome partition and the categories
## as alphabetically sorted columns; categories missing from a partition are set to 0
##
def concat_counts(distr_counts, labels):
    side_by_side = pd.concat(distr_counts, axis = 1)
    per_partition = side_by_side.fillna(0).transpose()
    ordered_categories = sorted(per_partition.columns)
    per_partition = per_partition[ordered_categories]
    per_partition.index = labels
    return per_partition

In [None]:
## Builds one long-format table of KOG category frequencies by genome set, pangenome partition and KOG category,
## by tagging the count table of each genome set with a "stitching column" (e.g. the species cluster tied to
## a genome set) and stacking the melted tables
##
## PARAMS
## data_array       list of to be stitched concatenated count tables produced for different genome sets by concat_counts()
##                  NOTE: the stitching column is written into these dataframes themselves
## group_index      name of the stitching column
## group_values     list of possible group labels to be used as value for the stitching column (e.g. name of genome sets),
##                  in the same order as @data_array
##
## OUTPUT
## a dataframe with the columns @group_index, 'Partition', 'Annotation' and 'Fraction'
##
def stitch_data(data_array, group_index, group_values):
    # Tag every count table with the label of its group
    for position, group_label in enumerate(group_values):
        data_array[position][group_index] = group_label

    # Bring every table into long format: the index becomes the 'Partition' column and
    # each remaining column becomes an ('Annotation', 'Fraction') pair
    long_tables = []
    for table in data_array:
        with_partition = table.reset_index(names = "Partition")
        long_tables.append(pd.melt(with_partition,
                                   id_vars=[group_index, "Partition"],
                                   var_name = "Annotation",
                                   value_name = "Fraction"))

    return pd.concat(long_tables, ignore_index=True)

#### Partitioning the genomes

In [None]:
# Partition the merged genome set and keep a handle on each partition
distr_m = split_pangenome(pangenome_matrices['merge'], set_sizes['merge'],
                          ca_threshold = core_acc_threshold, au_threshold = acc_unique_threshold,
                          print_size = True)
panm_core, panm_acc, panm_unique = distr_m

In [None]:
# Partition genome set 'group1' and keep a handle on each partition
distr_1 = split_pangenome(pangenome_matrices['group1'], set_sizes['group1'],
                          ca_threshold = core_acc_threshold, au_threshold = acc_unique_threshold,
                          print_size = True)
pan1_core, pan1_acc, pan1_unique = distr_1

In [None]:
# Partition genome set 'group4' and keep a handle on each partition
distr_4 = split_pangenome(pangenome_matrices['group4'], set_sizes['group4'],
                          ca_threshold = core_acc_threshold, au_threshold = acc_unique_threshold,
                          print_size = True)
pan4_core, pan4_acc, pan4_unique = distr_4

In [None]:
# Partition genome set 'group14a' and keep a handle on each partition
distr_14a = split_pangenome(pangenome_matrices['group14a'], set_sizes['group14a'],
                            ca_threshold = core_acc_threshold, au_threshold = acc_unique_threshold,
                            print_size = True)
pan14a_core, pan14a_acc, pan14a_unique = distr_14a

In [None]:
# Partition genome set 'group14b' and keep a handle on each partition
distr_14b = split_pangenome(pangenome_matrices['group14b'], set_sizes['group14b'],
                            ca_threshold = core_acc_threshold, au_threshold = acc_unique_threshold,
                            print_size = True)
pan14b_core, pan14b_acc, pan14b_unique = distr_14b

#### Number of clusters split by taxonomic cluster and pangenome partition

In [None]:
# Number of gene families per pangenome partition (rows) and taxonomic cluster (columns)
cluster_counts = pd.DataFrame({label: [partition.shape[0] for partition in partitions]
                               for label, partitions in zip(cluster_labels,
                                                            [distr_m, distr_1, distr_4, distr_14a, distr_14b])},
                              index = pangenome_partition_labels)
cluster_counts

In [None]:
# Share of each partition within its taxonomic cluster (every column sums to 1)
cluster_counts_relative = cluster_counts.div(cluster_counts.sum(), axis = "columns")
cluster_counts_relative

#### Processing the KOG counts

**Full set**

In [None]:
# Parse the KAAS JSON of the merged genome set into an annotation table and the BRITE hierarchy found in that file
kaas_annots_m, cats_kaas_m = read_kaas_annotations(mapper_tables['merge'])

In [None]:
kaas_annots_m

In [None]:
# Export the full annotation table of the merged genome set as a tab-separated file without the row index
kaas_annots_m.to_csv(processed_output + "/merged_full_annot", sep = "\t", index = False)

In [None]:
# KOG category tables of the merged genome set, one per pangenome partition
panm_distr_kaas_annots_counts = split_kaas_counts(kaas_annots_m, distr_m)
(panm_core_kaas_annots_counts,
 panm_acc_kaas_annots_counts,
 panm_unique_kaas_annots_counts) = panm_distr_kaas_annots_counts

In [None]:
# Only the folder part of the given path is used by save_kaas_counts()
# NOTE(review): the tables therefore land next to the KAAS JSON in the mapper folder, not in processed_output — confirm this is intended
save_kaas_counts(mapper_tables['merge'], panm_distr_kaas_annots_counts, prefix = "merge")

In [None]:
panm_core_kaas_annots_counts

In [None]:
panm_acc_kaas_annots_counts

In [None]:
panm_unique_kaas_annots_counts

**Cluster 1**

In [None]:
# Parse the KAAS JSON of genome set 'group1' into an annotation table and the BRITE hierarchy found in that file
kaas_annots_1, cats_kaas_1 = read_kaas_annotations(mapper_tables['group1'])

In [None]:
kaas_annots_1

In [None]:
# Export the full annotation table of genome set 'group1' as a tab-separated file without the row index
kaas_annots_1.to_csv(processed_output + "/group1_full_annot", sep = "\t", index = False)

In [None]:
# KOG category tables of genome set 'group1', one per pangenome partition
pan1_distr_kaas_annots_counts = split_kaas_counts(kaas_annots_1, distr_1)
(pan1_core_kaas_annots_counts,
 pan1_acc_kaas_annots_counts,
 pan1_unique_kaas_annots_counts) = pan1_distr_kaas_annots_counts

In [None]:
# Only the folder part of the given path is used by save_kaas_counts()
# NOTE(review): the tables therefore land next to the KAAS JSON in the mapper folder, not in processed_output — confirm this is intended
save_kaas_counts(mapper_tables['group1'], pan1_distr_kaas_annots_counts, prefix = "group1")

In [None]:
pan1_core_kaas_annots_counts

In [None]:
pan1_acc_kaas_annots_counts

In [None]:
pan1_unique_kaas_annots_counts

**Cluster 4**

In [None]:
# Parse the KAAS JSON of genome set 'group4' into an annotation table and the BRITE hierarchy found in that file
kaas_annots_4, cats_kaas_4 = read_kaas_annotations(mapper_tables['group4'])

In [None]:
kaas_annots_4

In [None]:
# Export the full annotation table of genome set 'group4' as a tab-separated file without the row index
kaas_annots_4.to_csv(processed_output + "/group4_full_annot", sep = "\t", index = False)

In [None]:
# KOG category tables of genome set 'group4', one per pangenome partition
pan4_distr_kaas_annots_counts = split_kaas_counts(kaas_annots_4, distr_4)
(pan4_core_kaas_annots_counts,
 pan4_acc_kaas_annots_counts,
 pan4_unique_kaas_annots_counts) = pan4_distr_kaas_annots_counts

In [None]:
# Only the folder part of the given path is used by save_kaas_counts()
# NOTE(review): the tables therefore land next to the KAAS JSON in the mapper folder, not in processed_output — confirm this is intended
save_kaas_counts(mapper_tables['group4'], pan4_distr_kaas_annots_counts, prefix = "group4")

In [None]:
pan4_core_kaas_annots_counts

In [None]:
pan4_acc_kaas_annots_counts

In [None]:
pan4_unique_kaas_annots_counts

**Cluster 14a**

In [None]:
# Parse the KAAS JSON of genome set 'group14a' into an annotation table and the BRITE hierarchy found in that file
kaas_annots_14a, cats_kaas_14a = read_kaas_annotations(mapper_tables['group14a'])

In [None]:
kaas_annots_14a

In [None]:
# Export the full annotation table of genome set 'group14a' as a tab-separated file without the row index
kaas_annots_14a.to_csv(processed_output + "/group14a_full_annot", sep = "\t", index = False)

In [None]:
# KOG category tables of genome set 'group14a', one per pangenome partition
pan14a_distr_kaas_annots_counts = split_kaas_counts(kaas_annots_14a, distr_14a)
(pan14a_core_kaas_annots_counts,
 pan14a_acc_kaas_annots_counts,
 pan14a_unique_kaas_annots_counts) = pan14a_distr_kaas_annots_counts

In [None]:
# Only the folder part of the given path is used by save_kaas_counts()
# NOTE(review): the tables therefore land next to the KAAS JSON in the mapper folder, not in processed_output — confirm this is intended
save_kaas_counts(mapper_tables['group14a'], pan14a_distr_kaas_annots_counts, prefix = "group14a")

In [None]:
pan14a_core_kaas_annots_counts

In [None]:
pan14a_acc_kaas_annots_counts

In [None]:
pan14a_unique_kaas_annots_counts

**Cluster 14b**

In [None]:
# Parse the KAAS JSON of genome set 'group14b' into an annotation table and the BRITE hierarchy found in that file
kaas_annots_14b, cats_kaas_14b = read_kaas_annotations(mapper_tables['group14b'])

In [None]:
kaas_annots_14b

In [None]:
# Export the full annotation table of genome set 'group14b' as a tab-separated file without the row index
kaas_annots_14b.to_csv(processed_output + "/group14b_full_annot", sep = "\t", index = False)

In [None]:
# KOG category tables of genome set 'group14b', one per pangenome partition
pan14b_distr_kaas_annots_counts = split_kaas_counts(kaas_annots_14b, distr_14b)
(pan14b_core_kaas_annots_counts,
 pan14b_acc_kaas_annots_counts,
 pan14b_unique_kaas_annots_counts) = pan14b_distr_kaas_annots_counts

In [None]:
# Only the folder part of the given path is used by save_kaas_counts()
# NOTE(review): the tables therefore land next to the KAAS JSON in the mapper folder, not in processed_output — confirm this is intended
save_kaas_counts(mapper_tables['group14b'], pan14b_distr_kaas_annots_counts, prefix = "group14b")

In [None]:
pan14b_core_kaas_annots_counts

In [None]:
pan14b_acc_kaas_annots_counts

In [None]:
pan14b_unique_kaas_annots_counts

#### Data export

**Auxiliary functions**

In [None]:
## Extracts the KOG annotation counts of one hierarchical BRITE level, optionally keeping only the categories that are
## children of a certain category at another BRITE level, using the local KEGG hierarchy as obtained through
## the read_kaas_annotations() function.
##
## PARAMS
## full_counts         the KOG category count table of a genome set or partition, indexed by category with
##                     one column per BRITE level (as produced by count_kaas_annotations())
## level               the level from which counts should be extracted ('A','B','C' or 'D')
## filter_level        the level at which an additional filter is applied, should be one up from @level,
##                     yet this is not enforced ('A','B', or 'C'). Set at None for no filter. (default: None)
## filter_criterion    the criterion of any filter to be applied; ignored if @filter_level is None. (default: None)
## full_cats           the full present BRITE hierarchy as obtained from read_kaas_annotations();
##                     ignored if @filter_level is None. (default: None)
## OUTPUT
## a series of the non-zero counts for the requested BRITE level, after potential filtering;
## children of @filter_criterion that do not occur in the counts are left out
##
def extract_level_counts(full_counts, level, filter_level = None, filter_criterion = None, full_cats = None):
    # Categories of other levels are present as rows with a zero count at this level; drop them
    extracted_counts = full_counts[full_counts[level] != 0]

    if filter_level is None:
        return extracted_counts[level]

    # Children of the filter criterion according to the local hierarchy, limited to those actually counted
    cats = pd.Index(full_cats[filter_level][filter_criterion])
    extracted_counts = extracted_counts.loc[cats.intersection(extracted_counts.index)]
    return extracted_counts[level]

In [None]:
## Writes, for every pangenome partition, a headerless tsv file from which a KRONA plot can be directly generated
## Each file is saved as '<path>/<distr_label>_<partition label>_KRONA.tsv' with the columns count, A, B, C, D
##
## PARAMS
## kaas_annots       the full KOG annotation table for a genome set
## distr             the distribution of gene families by pangenome partition as obtained from split_pangenome()
## distr_label       the label to identify this genome set in the name of the exported files
## partition_labels  the labels of the pangenome partitions to identify the files for the different partitions,
##                   in the same order as @distr
## path              the folder in which these tsv files will be saved
##
def export_krona(kaas_annots, distr, distr_label, partition_labels, path):
    partition_by_label = dict(zip(partition_labels, distr))
    for label in partition_by_label:
        member_ids = partition_by_label[label]['ID'].to_list()
        in_partition = kaas_annots['ID'].isin(member_ids)

        # KRONA expects an absolute frequency per category: count the family IDs per full A-B-C-D annotation
        per_category = kaas_annots[in_partition].groupby(['A','B','C','D']).count()
        per_category = per_category.reset_index().rename(columns = {'ID': 'counts'})

        target = path + '/' + distr_label + '_' + label + '_KRONA' + '.tsv'
        per_category[['counts','A','B','C','D']].to_csv(target, sep = '\t', index = False, header = False)

**KRONA**

In [None]:
# One KRONA input file per genome set and pangenome partition
for _krona_annots, _krona_distr, _krona_label in [(kaas_annots_m, distr_m, 'merge'),
                                                   (kaas_annots_1, distr_1, 'group1'),
                                                   (kaas_annots_4, distr_4, 'group4'),
                                                   (kaas_annots_14a, distr_14a, 'group14a'),
                                                   (kaas_annots_14b, distr_14b, 'group14b')]:
    export_krona(_krona_annots, _krona_distr, _krona_label, pangenome_partition_labels, processed_output)

**Level A**

In [None]:
# A-level category fractions per pangenome partition, converted to percentages
concat_data_kog_A_m = concat_counts([extract_level_counts(counts, 'A') for counts in panm_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_A_1 = concat_counts([extract_level_counts(counts, 'A') for counts in pan1_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_A_4 = concat_counts([extract_level_counts(counts, 'A') for counts in pan4_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_A_14a = concat_counts([extract_level_counts(counts, 'A') for counts in pan14a_distr_kaas_annots_counts],
                                      pangenome_partition_labels) * 100
concat_data_kog_A_14b = concat_counts([extract_level_counts(counts, 'A') for counts in pan14b_distr_kaas_annots_counts],
                                      pangenome_partition_labels) * 100

In [None]:
concat_data_kog_A_m.T

In [None]:
concat_data_kog_A_1.T

In [None]:
concat_data_kog_A_4.T

In [None]:
concat_data_kog_A_14a.T

In [None]:
concat_data_kog_A_14b.T

In [None]:
# Stack the per-set tables into one long table; the labels in cluster_labels are matched by position
# Side effect: stitch_data() also writes the "Cluster" column into each of the listed dataframes
concat_data_kog_A = stitch_data([concat_data_kog_A_m, concat_data_kog_A_1, 
                                 concat_data_kog_A_4, concat_data_kog_A_14a,
                                 concat_data_kog_A_14b],
                                "Cluster", cluster_labels)

In [None]:
# The "Fraction" column of this table holds percentages, since the fractions were multiplied by 100 above
concat_data_kog_A.to_csv(processed_output + "/KOG_A_fractions.tsv", sep = '\t', index = False)

In [None]:
concat_data_kog_A

**Level B**

In [None]:
# B-level category fractions per pangenome partition, converted to percentages
concat_data_kog_B_m = concat_counts([extract_level_counts(counts, 'B')
                                     for counts in panm_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_B_1 = concat_counts([extract_level_counts(counts, 'B')
                                     for counts in pan1_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_B_4 = concat_counts([extract_level_counts(counts, 'B')
                                     for counts in pan4_distr_kaas_annots_counts],
                                    pangenome_partition_labels) * 100
concat_data_kog_B_14a = concat_counts([extract_level_counts(counts, 'B')
                                       for counts in pan14a_distr_kaas_annots_counts],
                                      pangenome_partition_labels) * 100
concat_data_kog_B_14b = concat_counts([extract_level_counts(counts, 'B')
                                       for counts in pan14b_distr_kaas_annots_counts],
                                      pangenome_partition_labels) * 100

In [None]:
concat_data_kog_B_m.T

In [None]:
concat_data_kog_B_1.T

In [None]:
concat_data_kog_B_4.T

In [None]:
concat_data_kog_B_14a.T

In [None]:
concat_data_kog_B_14b.T

In [None]:
# Stack the per-set tables into one long table; the labels in cluster_labels are matched by position
# Side effect: stitch_data() also writes the "Cluster" column into each of the listed dataframes
concat_data_kog_B = stitch_data([concat_data_kog_B_m, concat_data_kog_B_1, 
                                 concat_data_kog_B_4, concat_data_kog_B_14a,
                                 concat_data_kog_B_14b],
                                "Cluster", cluster_labels)

In [None]:
# The "Fraction" column of this table holds percentages, since the fractions were multiplied by 100 above
concat_data_kog_B.to_csv(processed_output + "/KOG_B_fractions.tsv", sep = '\t', index=False)

In [None]:
concat_data_kog_B

**Level B filtered for A = Not Included in Pathway or Brite**

In [None]:
# B-level category fractions below the A-level term "Not Included in Pathway or Brite",
# per pangenome partition and converted to percentages
concat_data_kog_B_ninc_m = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                               filter_criterion = "Not Included in Pathway or Brite",
                                                               full_cats = cats_kaas_m)
                                          for counts in panm_distr_kaas_annots_counts],
                                         pangenome_partition_labels) * 100
concat_data_kog_B_ninc_1 = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                               filter_criterion = "Not Included in Pathway or Brite",
                                                               full_cats = cats_kaas_1)
                                          for counts in pan1_distr_kaas_annots_counts],
                                         pangenome_partition_labels) * 100
concat_data_kog_B_ninc_4 = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                               filter_criterion = "Not Included in Pathway or Brite",
                                                               full_cats = cats_kaas_4)
                                          for counts in pan4_distr_kaas_annots_counts],
                                         pangenome_partition_labels) * 100
concat_data_kog_B_ninc_14a = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                                 filter_criterion = "Not Included in Pathway or Brite",
                                                                 full_cats = cats_kaas_14a)
                                            for counts in pan14a_distr_kaas_annots_counts],
                                           pangenome_partition_labels) * 100
concat_data_kog_B_ninc_14b = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                                 filter_criterion = "Not Included in Pathway or Brite",
                                                                 full_cats = cats_kaas_14b)
                                            for counts in pan14b_distr_kaas_annots_counts],
                                           pangenome_partition_labels) * 100

In [None]:
concat_data_kog_B_ninc_m.T

In [None]:
concat_data_kog_B_ninc_1.T

In [None]:
concat_data_kog_B_ninc_4.T

In [None]:
concat_data_kog_B_ninc_14a.T

In [None]:
concat_data_kog_B_ninc_14b.T

In [None]:
# Stack the per-set tables into one long table; the labels in cluster_labels are matched by position
# Side effect: stitch_data() also writes the "Cluster" column into each of the listed dataframes
concat_data_kog_B_ninc = stitch_data([concat_data_kog_B_ninc_m, concat_data_kog_B_ninc_1,
                                      concat_data_kog_B_ninc_4, concat_data_kog_B_ninc_14a,
                                      concat_data_kog_B_ninc_14b],
                                "Cluster", cluster_labels)

In [None]:
# The "Fraction" column of this table holds percentages, since the fractions were multiplied by 100 above
concat_data_kog_B_ninc.to_csv(processed_output + "/KOG_B_A_NINC_fractions.tsv", sep = '\t', index=False)

In [None]:
concat_data_kog_B_ninc

**Level B filtered for A = Metabolism**

In [None]:
# B-level category fractions below the A-level term "Metabolism",
# per pangenome partition and converted to percentages
concat_data_kog_B_met_m = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                              filter_criterion = "Metabolism",
                                                              full_cats = cats_kaas_m)
                                         for counts in panm_distr_kaas_annots_counts],
                                        pangenome_partition_labels) * 100
concat_data_kog_B_met_1 = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                              filter_criterion = "Metabolism",
                                                              full_cats = cats_kaas_1)
                                         for counts in pan1_distr_kaas_annots_counts],
                                        pangenome_partition_labels) * 100
concat_data_kog_B_met_4 = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                              filter_criterion = "Metabolism",
                                                              full_cats = cats_kaas_4)
                                         for counts in pan4_distr_kaas_annots_counts],
                                        pangenome_partition_labels) * 100
concat_data_kog_B_met_14a = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                                filter_criterion = "Metabolism",
                                                                full_cats = cats_kaas_14a)
                                           for counts in pan14a_distr_kaas_annots_counts],
                                          pangenome_partition_labels) * 100
concat_data_kog_B_met_14b = concat_counts([extract_level_counts(counts, 'B', filter_level = 'A',
                                                                filter_criterion = "Metabolism",
                                                                full_cats = cats_kaas_14b)
                                           for counts in pan14b_distr_kaas_annots_counts],
                                          pangenome_partition_labels) * 100

In [None]:
# Including the unclassified fraction
## Appends the unclassified fraction held in one table to another table as an extra column
##
## PARAMS
## met      table to extend (presumably a pandas DataFrame of level B fractions -- TODO confirm)
## ninc     table that may hold the unclassified column (looked up as ninc[column])
## column   label of the column to take from ninc (default: 'Unclassified: metabolism')
##
## RETURNS
## met with ninc[column] concatenated along axis 1, or met itself (unchanged)
## when ninc has no column with that label
def add_unclassified(met, ninc, column = 'Unclassified: metabolism'):
    try:
        unclassified = ninc[column]
    except KeyError:
        # Nothing to append: ninc does not hold the requested column
        return met
    # The lookup is kept apart from the concatenation so that only a missing
    # column is tolerated, not a KeyError raised while concatenating
    return pd.concat([met, unclassified], axis = 1)

# Extend every genome set's Metabolism table with its unclassified fraction,
# pairing each table with the matching `ninc` table of the same set
(concat_data_kog_B_met_m,
 concat_data_kog_B_met_1,
 concat_data_kog_B_met_4,
 concat_data_kog_B_met_14a,
 concat_data_kog_B_met_14b) = [
    add_unclassified(fractions, not_included)
    for fractions, not_included in [
        (concat_data_kog_B_met_m, concat_data_kog_B_ninc_m),
        (concat_data_kog_B_met_1, concat_data_kog_B_ninc_1),
        (concat_data_kog_B_met_4, concat_data_kog_B_ninc_4),
        (concat_data_kog_B_met_14a, concat_data_kog_B_ninc_14a),
        (concat_data_kog_B_met_14b, concat_data_kog_B_ninc_14b),
    ]
]

In [None]:
# Notebook output: transposed (`.T`) view of the Metabolism table for set `m`
concat_data_kog_B_met_m.T

In [None]:
# Notebook output: transposed (`.T`) view of the Metabolism table for set `1`
concat_data_kog_B_met_1.T

In [None]:
# Notebook output: transposed (`.T`) view of the Metabolism table for set `4`
concat_data_kog_B_met_4.T

In [None]:
# Notebook output: transposed (`.T`) view of the Metabolism table for set `14a`
concat_data_kog_B_met_14a.T

In [None]:
# Notebook output: transposed (`.T`) view of the Metabolism table for set `14b`
concat_data_kog_B_met_14b.T

In [None]:
# Combine the five per-set Metabolism tables into one table with stitch_data.
# NOTE(review): presumably the tables are labelled, in order, with
# cluster_labels under a "Cluster" column -- confirm against stitch_data.
concat_data_kog_B_met = stitch_data(
    [
        concat_data_kog_B_met_m,
        concat_data_kog_B_met_1,
        concat_data_kog_B_met_4,
        concat_data_kog_B_met_14a,
        concat_data_kog_B_met_14b,
    ],
    "Cluster",
    cluster_labels,
)

In [None]:
# Write the stitched Metabolism table to the processed-output folder as a
# tab-separated file, without the row index
concat_data_kog_B_met.to_csv(
    f"{processed_output}/KOG_B_A_MET_fractions.tsv",
    index = False,
    sep = '\t',
)

In [None]:
# Notebook output: display the stitched Metabolism table
concat_data_kog_B_met

**Level B filtered for A = Environmental Information Processing**

In [None]:
# Level B fractions restricted to A = "Environmental Information Processing",
# one table per genome set. Every table is multiplied by 100 (converting to
# percentages).
concat_data_kog_B_eip_m = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Environmental Information Processing",
            full_cats = cats_kaas_m,
        )
        for entry in panm_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_eip_1 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Environmental Information Processing",
            full_cats = cats_kaas_1,
        )
        for entry in pan1_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_eip_4 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Environmental Information Processing",
            full_cats = cats_kaas_4,
        )
        for entry in pan4_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_eip_14a = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Environmental Information Processing",
            full_cats = cats_kaas_14a,
        )
        for entry in pan14a_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_eip_14b = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Environmental Information Processing",
            full_cats = cats_kaas_14b,
        )
        for entry in pan14b_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100

In [None]:
# Notebook output: transposed (`.T`) view of the Environmental Information Processing table for set `m`
concat_data_kog_B_eip_m.T

In [None]:
# Notebook output: transposed (`.T`) view of the Environmental Information Processing table for set `1`
concat_data_kog_B_eip_1.T

In [None]:
# Notebook output: transposed (`.T`) view of the Environmental Information Processing table for set `4`
concat_data_kog_B_eip_4.T

In [None]:
# Notebook output: transposed (`.T`) view of the Environmental Information Processing table for set `14a`
concat_data_kog_B_eip_14a.T

In [None]:
# Notebook output: transposed (`.T`) view of the Environmental Information Processing table for set `14b`
concat_data_kog_B_eip_14b.T

In [None]:
# Combine the five per-set Environmental Information Processing tables into
# one table with stitch_data.
# NOTE(review): presumably the tables are labelled, in order, with
# cluster_labels under a "Cluster" column -- confirm against stitch_data.
concat_data_kog_B_eip = stitch_data(
    [
        concat_data_kog_B_eip_m,
        concat_data_kog_B_eip_1,
        concat_data_kog_B_eip_4,
        concat_data_kog_B_eip_14a,
        concat_data_kog_B_eip_14b,
    ],
    "Cluster",
    cluster_labels,
)

In [None]:
# Write the stitched Environmental Information Processing table to the
# processed-output folder as a tab-separated file, without the row index
concat_data_kog_B_eip.to_csv(
    f"{processed_output}/KOG_B_A_EIP_fractions.tsv",
    index = False,
    sep = '\t',
)

In [None]:
# Notebook output: display the stitched Environmental Information Processing table
concat_data_kog_B_eip

**Level B filtered for A = Genetic Information Processing**

In [None]:
# Level B fractions restricted to A = "Genetic Information Processing", one
# table per genome set. Every table is multiplied by 100 (converting to
# percentages).
concat_data_kog_B_gip_m = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Genetic Information Processing",
            full_cats = cats_kaas_m,
        )
        for entry in panm_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_gip_1 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Genetic Information Processing",
            full_cats = cats_kaas_1,
        )
        for entry in pan1_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_gip_4 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Genetic Information Processing",
            full_cats = cats_kaas_4,
        )
        for entry in pan4_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_gip_14a = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Genetic Information Processing",
            full_cats = cats_kaas_14a,
        )
        for entry in pan14a_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_gip_14b = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Genetic Information Processing",
            full_cats = cats_kaas_14b,
        )
        for entry in pan14b_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100

In [None]:
# Including the unclassified fraction
## Appends the unclassified fraction held in one table to another table as an extra column
##
## PARAMS
## gip      table to extend (presumably a pandas DataFrame of level B fractions -- TODO confirm)
## ninc     table that may hold the unclassified column (looked up as ninc[column])
## column   label of the column to take from ninc
##          (default: 'Unclassified: genetic information processing')
##
## RETURNS
## gip with ninc[column] concatenated along axis 1, or gip itself (unchanged)
## when ninc has no column with that label
def add_unclassified(gip, ninc, column = 'Unclassified: genetic information processing'):
    try:
        unclassified = ninc[column]
    except KeyError:
        # Nothing to append: ninc does not hold the requested column
        return gip
    # The lookup is kept apart from the concatenation so that only a missing
    # column is tolerated, not a KeyError raised while concatenating
    return pd.concat([gip, unclassified], axis = 1)

# Extend every genome set's Genetic Information Processing table with its
# unclassified fraction, pairing each table with the matching `ninc` table
(concat_data_kog_B_gip_m,
 concat_data_kog_B_gip_1,
 concat_data_kog_B_gip_4,
 concat_data_kog_B_gip_14a,
 concat_data_kog_B_gip_14b) = [
    add_unclassified(fractions, not_included)
    for fractions, not_included in [
        (concat_data_kog_B_gip_m, concat_data_kog_B_ninc_m),
        (concat_data_kog_B_gip_1, concat_data_kog_B_ninc_1),
        (concat_data_kog_B_gip_4, concat_data_kog_B_ninc_4),
        (concat_data_kog_B_gip_14a, concat_data_kog_B_ninc_14a),
        (concat_data_kog_B_gip_14b, concat_data_kog_B_ninc_14b),
    ]
]

In [None]:
# Notebook output: transposed (`.T`) view of the Genetic Information Processing table for set `m`
concat_data_kog_B_gip_m.T

In [None]:
# Notebook output: transposed (`.T`) view of the Genetic Information Processing table for set `1`
concat_data_kog_B_gip_1.T

In [None]:
# Notebook output: transposed (`.T`) view of the Genetic Information Processing table for set `4`
concat_data_kog_B_gip_4.T

In [None]:
# Notebook output: transposed (`.T`) view of the Genetic Information Processing table for set `14a`
concat_data_kog_B_gip_14a.T

In [None]:
# Notebook output: transposed (`.T`) view of the Genetic Information Processing table for set `14b`
concat_data_kog_B_gip_14b.T

In [None]:
# Combine the five per-set Genetic Information Processing tables into one
# table with stitch_data.
# NOTE(review): presumably the tables are labelled, in order, with
# cluster_labels under a "Cluster" column -- confirm against stitch_data.
concat_data_kog_B_gip = stitch_data(
    [
        concat_data_kog_B_gip_m,
        concat_data_kog_B_gip_1,
        concat_data_kog_B_gip_4,
        concat_data_kog_B_gip_14a,
        concat_data_kog_B_gip_14b,
    ],
    "Cluster",
    cluster_labels,
)

In [None]:
# Write the stitched Genetic Information Processing table to the
# processed-output folder as a tab-separated file, without the row index
concat_data_kog_B_gip.to_csv(
    f"{processed_output}/KOG_B_A_GIP_fractions.tsv",
    index = False,
    sep = '\t',
)

In [None]:
# Notebook output: display the stitched Genetic Information Processing table
concat_data_kog_B_gip

**Level B filtered for A = Cellular Processes**

In [None]:
# Level B fractions restricted to A = "Cellular Processes", one table per
# genome set. Every table is multiplied by 100 (converting to percentages).
concat_data_kog_B_cps_m = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Cellular Processes",
            full_cats = cats_kaas_m,
        )
        for entry in panm_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_cps_1 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Cellular Processes",
            full_cats = cats_kaas_1,
        )
        for entry in pan1_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_cps_4 = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Cellular Processes",
            full_cats = cats_kaas_4,
        )
        for entry in pan4_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_cps_14a = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Cellular Processes",
            full_cats = cats_kaas_14a,
        )
        for entry in pan14a_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100
concat_data_kog_B_cps_14b = concat_counts(
    [
        extract_level_counts(
            entry, 'B',
            filter_level = 'A',
            filter_criterion = "Cellular Processes",
            full_cats = cats_kaas_14b,
        )
        for entry in pan14b_distr_kaas_annots_counts
    ],
    pangenome_partition_labels,
) * 100

In [None]:
# Including the unclassified fraction
## Appends the unclassified fraction held in one table to another table as an extra column
##
## PARAMS
## cps      table to extend (presumably a pandas DataFrame of level B fractions -- TODO confirm)
## ninc     table that may hold the unclassified column (looked up as ninc[column])
## column   label of the column to take from ninc
##          (default: 'Unclassified: signaling and cellular processes')
##
## RETURNS
## cps with ninc[column] concatenated along axis 1, or cps itself (unchanged)
## when ninc has no column with that label
def add_unclassified(cps, ninc, column = 'Unclassified: signaling and cellular processes'):
    try:
        unclassified = ninc[column]
    except KeyError:
        # Nothing to append: ninc does not hold the requested column
        return cps
    # The lookup is kept apart from the concatenation so that only a missing
    # column is tolerated, not a KeyError raised while concatenating
    return pd.concat([cps, unclassified], axis = 1)

# Extend every genome set's Cellular Processes table with its unclassified
# fraction, pairing each table with the matching `ninc` table of the same set
(concat_data_kog_B_cps_m,
 concat_data_kog_B_cps_1,
 concat_data_kog_B_cps_4,
 concat_data_kog_B_cps_14a,
 concat_data_kog_B_cps_14b) = [
    add_unclassified(fractions, not_included)
    for fractions, not_included in [
        (concat_data_kog_B_cps_m, concat_data_kog_B_ninc_m),
        (concat_data_kog_B_cps_1, concat_data_kog_B_ninc_1),
        (concat_data_kog_B_cps_4, concat_data_kog_B_ninc_4),
        (concat_data_kog_B_cps_14a, concat_data_kog_B_ninc_14a),
        (concat_data_kog_B_cps_14b, concat_data_kog_B_ninc_14b),
    ]
]

In [None]:
# Notebook output: transposed (`.T`) view of the Cellular Processes table for set `m`
concat_data_kog_B_cps_m.T

In [None]:
# Notebook output: transposed (`.T`) view of the Cellular Processes table for set `1`
concat_data_kog_B_cps_1.T

In [None]:
# Notebook output: transposed (`.T`) view of the Cellular Processes table for set `4`
concat_data_kog_B_cps_4.T

In [None]:
# Notebook output: transposed (`.T`) view of the Cellular Processes table for set `14a`
concat_data_kog_B_cps_14a.T

In [None]:
# Notebook output: transposed (`.T`) view of the Cellular Processes table for set `14b`
concat_data_kog_B_cps_14b.T

In [None]:
# Combine the five per-set Cellular Processes tables into one table with
# stitch_data.
# NOTE(review): presumably the tables are labelled, in order, with
# cluster_labels under a "Cluster" column -- confirm against stitch_data.
concat_data_kog_B_cps = stitch_data(
    [
        concat_data_kog_B_cps_m,
        concat_data_kog_B_cps_1,
        concat_data_kog_B_cps_4,
        concat_data_kog_B_cps_14a,
        concat_data_kog_B_cps_14b,
    ],
    "Cluster",
    cluster_labels,
)

In [None]:
# Write the stitched Cellular Processes table to the processed-output folder
# as a tab-separated file, without the row index
concat_data_kog_B_cps.to_csv(
    f"{processed_output}/KOG_B_A_CPS_fractions.tsv",
    index = False,
    sep = '\t',
)

In [None]:
# Notebook output: display the stitched Cellular Processes table
concat_data_kog_B_cps