# eggNOG-Roary
Summary of [eggNOG-Roary](link) results from project: `[{{ project().name }}]` 

## Description
This report summarizes the functional annotation of the Roary pangenome results produced with the eggNOG mapper. The eggNOG mapper assigns functional annotations to genes based on the eggNOG database, which includes information on orthologous groups, functional descriptions, and additional metabolic and pathway data.

In [None]:
import pandas as pd
from pathlib import Path
from IPython.display import display, Markdown, HTML
import json
import altair as alt

import warnings
warnings.filterwarnings('ignore')

import altair as alt

from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

from itables import to_html_datatable as DT
import itables.options as opt
# Module-level itables options, set once here for the tables rendered below.
# Custom CSS: smaller font, italic body cells and oblique header cells.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""
# CSS classes put on the rendered table.
opt.classes = ["display", "compact"]
# Page sizes offered in the table's length menu.
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

In [None]:
def merge_eggnog_and_pangenome(emapper_tsv_file, pangene_csv_file):
    """
    Combine the eggNOG-mapper annotations with the pangenome summary table.

    Both tables are joined on their locus tag with an outer join, so a locus
    tag that is present in only one of the files is still kept.

    Parameters:
    emapper_tsv_file (str): The path to the emapper TSV file.
    pangene_csv_file (str): The path to the pangenome CSV file. It has to
        provide the `locus_tag`, `Gene` and `pangenome_class_2` columns.

    Returns:
    DataFrame: Indexed by `Gene`, with the columns `locus_tag`,
        `pangenome_class_2` and `COG_category`.
    """
    # Row 4 of the TSV holds the column names; the last three parsed rows are
    # discarded (presumably the summary lines eggNOG-mapper appends).
    annotations = pd.read_csv(
        emapper_tsv_file, sep="\t", header=4, index_col="#query"
    )
    annotations = annotations.iloc[:-3, :]

    pangenes = pd.read_csv(pangene_csv_file).set_index("locus_tag")

    combined = pangenes.merge(
        annotations, left_index=True, right_index=True, how="outer"
    )

    wanted_columns = ["Gene", "pangenome_class_2", "COG_category"]
    cog_table = combined.loc[:, wanted_columns].reset_index()
    cog_table = cog_table.set_index("Gene")
    # An unnamed joined index comes back from reset_index() as "index".
    return cog_table.rename(columns={"index": "locus_tag"})

def filter_by_pangenome_class(df_cog_mapping, gene_presence_binary_csv_file, category):
    """
    Attach the COG category to the gene presence matrix for one pangenome class.

    Parameters:
    df_cog_mapping (DataFrame): Gene-indexed table with the columns
        `pangenome_class_2` and `COG_category`.
    gene_presence_binary_csv_file (str): The path to the gene presence binary
        CSV file; it must contain a `Gene` column.
    category (str): The pangenome class to keep.

    Returns:
    DataFrame: The presence matrix rows whose gene belongs to `category`,
        with an additional `COG_category` column.
    """
    in_class = df_cog_mapping.pangenome_class_2 == category
    cog_of_class = df_cog_mapping.loc[in_class, "COG_category"]

    presence = pd.read_csv(gene_presence_binary_csv_file).set_index("Gene")

    # Inner join on the gene name: genes outside the class drop out here.
    return presence.merge(cog_of_class, left_index=True, right_index=True)

def collapse_and_correct_cog_category(df_matrix_cog):
    """
    Sum the counts per single-letter COG category.

    The rows are first summed per `COG_category` value. A multi-letter value
    such as "EG" stands for several categories at once, so its counts are
    added to the row of every letter it contains, and the multi-letter rows
    themselves are left out of the result.

    Parameters:
    df_matrix_cog (DataFrame): Count columns plus a `COG_category` column
        holding strings of one or more category letters.

    Returns:
    DataFrame: Integer counts indexed by single-letter `COG_category`, with
        the same count columns as the input. Categories already present as a
        single letter come first, followed by letters that only occur inside
        multi-letter values, in order of first appearance.
    """
    grouped = df_matrix_cog.groupby("COG_category").sum()

    single_letter = [label for label in grouped.index if len(label) <= 1]
    multi_letter = [label for label in grouped.index if len(label) > 1]

    # Collect letters that have no row of their own yet.
    row_labels = list(single_letter)
    known = set(row_labels)
    for category in multi_letter:
        for cog in category:
            if cog not in known:
                known.add(cog)
                row_labels.append(cog)

    # Create every target row up front, zero-filled across ALL columns.
    # Adding rows one cell at a time would leave the other cells as NaN, and
    # any count added to them afterwards would be lost.
    collapsed = grouped.reindex(
        index=pd.Index(row_labels, name=grouped.index.name), fill_value=0
    )

    # Add each multi-letter row to the row of every letter it is made of.
    for category in multi_letter:
        counts = grouped.loc[category]
        for cog in category:
            collapsed.loc[cog] = collapsed.loc[cog] + counts

    return collapsed.fillna(0).astype(int)

def create_heatmap(df_matrix, sort_x=None, sort_y=None, x_labels=None, title="", subtitle=""):
    """
    Build an Altair heatmap from a COG count matrix.

    Parameters:
    df_matrix (DataFrame): Count matrix whose index is named `COG_category`;
        every column becomes one position on the x-axis (labelled
        "Genome IDs").
    sort_x (list, optional): Order of the x-axis entries. Defaults to the
        column order of `df_matrix`.
    sort_y (list, optional): Order of the y-axis entries. Defaults to the
        index order of `df_matrix`.
    x_labels: Currently unused; kept so existing callers keep working.
    title (str): Chart title.
    subtitle (str): Chart subtitle.

    Returns:
    alt.Chart: Rect-mark chart colored by count.
    """
    # Identity test on purpose: `== None` is evaluated element-wise when an
    # array-like (e.g. a pandas Index) is passed, which breaks the `if`.
    if sort_x is None:
        sort_x = df_matrix.columns.to_list()
    if sort_y is None:
        sort_y = df_matrix.index.to_list()

    # Long format: one row per (COG_category, column) pair, with the column
    # name in `variable` and the count in `value`.
    source = df_matrix.reset_index().melt("COG_category")
    base = alt.Chart(
        source,
        title=alt.Title(title, subtitle=subtitle),
    )

    heatmap = base.mark_rect().encode(
        alt.X('variable:O', title="Genome IDs", sort=sort_x),
        alt.Y('COG_category:O', title="COG_category", sort=sort_y),
        alt.Color('value:Q', title="Counts")
        .scale(scheme="viridis", reverse=True)
        .legend(direction="vertical"),
    )
    return heatmap

# Source - https://github.com/SBRG/pymodulon/blob/master/src/pymodulon/gene_util.py
def get_cog_dict():
    """
    Return the lookup table from COG category letter to its description.

    Returns:
    dict: One entry per letter "A" to "Z" plus "-" for "Not found in COG".
        A new dict is built on every call.
    """
    letter_descriptions = (
        ("A", "RNA processing and modification"),
        ("B", "Chromatin structure and dynamics"),
        ("C", "Energy production and conversion"),
        ("D", "Cell cycle control, cell division, chromosome partitioning"),
        ("E", "Amino acid transport and metabolism"),
        ("F", "Nucleotide transport and metabolism"),
        ("G", "Carbohydrate transport and metabolism"),
        ("H", "Coenzyme transport and metabolism"),
        ("I", "Lipid transport and metabolism"),
        ("J", "Translation, ribosomal structure and biogenesis"),
        ("K", "Transcription"),
        ("L", "Replication, recombination and repair"),
        ("M", "Cell wall/membrane/envelope biogenesis"),
        ("N", "Cell motility"),
        ("O", "Post-translational modification, protein turnover, and chaperones"),
        ("P", "Inorganic ion transport and metabolism"),
        ("Q", "Secondary metabolites biosynthesis, transport, and catabolism"),
        ("R", "General function prediction only"),
        ("S", "Function unknown"),
        ("T", "Signal transduction mechanisms"),
        ("U", "Intracellular trafficking, secretion, and vesicular transport"),
        ("V", "Defense mechanisms"),
        ("W", "Extracellular structures"),
        ("X", "Mobilome: prophages, transposons"),
        ("Y", "Nuclear structure"),
        ("Z", "Cytoskeleton"),
        ("-", "Not found in COG"),
    )
    return dict(letter_descriptions)

In [None]:
# All input files are looked up relative to the parent of the working directory.
report_dir = Path("../")
# Last path component of the resolved report directory.
project_name = report_dir.resolve().stem

In [None]:
# Input files of this report, all located below `report_dir`.
pangene_summary = report_dir / "tables/df_roary_pangene_summary_reassigned.csv"
emapper_annotations = report_dir / "eggnog_roary/emapper.annotations"
gene_presence_binary = report_dir / "roary/df_gene_presence_binary.csv"
automlst_tree = report_dir / "automlst_wrapper/df_genomes_tree.csv"

# COG letter -> description lookup used for the chart labels and the legend.
cog_dict = get_cog_dict()

In [None]:
# GTDB metadata table, indexed by its first column.
# NOTE(review): not referenced again in the visible part of this notebook.
df_gtdb = pd.read_csv(report_dir/ "tables" / "df_gtdb_meta.csv", index_col=0)

## Annotation Table

In [None]:
# Load the eggNOG-mapper annotations: row 4 holds the column names and the
# last three parsed rows are dropped (presumably the file's summary lines).
df_eggnog = pd.read_csv(emapper_annotations, sep="\t", header=4, index_col="#query").iloc[:-3,:]
df_eggnog.index.name = "locus_tag"

# Columns shown in the interactive table; the commented-out entries are
# deliberately left out of the display.
columns_to_show = ["seed_ortholog","evalue","score",#"eggNOG_OGs",
                   "max_annot_lvl","COG_category","Description","Preferred_name",#"GOs",
                   "EC","KEGG_ko","KEGG_Pathway","KEGG_Module","KEGG_Reaction","KEGG_rclass","BRITE","KEGG_TC","CAZy","BiGG_Reaction","PFAMs"]

# Render as a left-aligned, horizontally scrollable itables datatable.
display(HTML(DT(df_eggnog.loc[:, columns_to_show].reset_index(), columnDefs=[{"className": "dt-left", "targets": "_all"}], scrollX = True)))

## COG Category Distribution

In [None]:
# Genome table; `genome_id` is used as index and also kept as a column.
df_tree = pd.read_csv(automlst_tree).set_index("genome_id", drop=False)

df_cog_mapping = merge_eggnog_and_pangenome(emapper_annotations, pangene_summary)

# One COG-category-by-genome count matrix per pangenome class, with the rows
# ordered by their total count (largest first).
matrix_cogs = {}
# Rows without a pangenome class (NaN after the outer merge) are skipped:
# comparing against NaN never matches, so they would only yield an empty
# matrix that cannot be clustered or plotted later on.
for pangene_cat in df_cog_mapping.pangenome_class_2.dropna().unique():
    df_matrix_cog = filter_by_pangenome_class(df_cog_mapping, gene_presence_binary, pangene_cat)
    df_matrix_cog = collapse_and_correct_cog_category(df_matrix_cog)
    # Temporary helper column, used for sorting only.
    df_matrix_cog['row_sum'] = df_matrix_cog.sum(axis=1)
    df_sorted = df_matrix_cog.sort_values('row_sum', ascending=False)
    df_sorted = df_sorted.drop('row_sum', axis=1)
    matrix_cogs[pangene_cat] = df_sorted

In [None]:
# Short display label per genome: the first word of the species name is
# abbreviated to its initial (e.g. "s__Streptomyces griseus" -> "S. griseus").
# Genomes without a species name fall back to "<genus> sp.".
# The rank prefixes are removed as prefixes; str.strip("s__") would treat its
# argument as a set of characters and also eat a trailing "s" or "_".
mapping = {}
for genome_id, organism in df_tree["organism"].to_dict().items():
    species = organism[len("s__"):] if organism.startswith("s__") else organism
    words = species.split()
    if words:
        mapping[genome_id] = ". ".join([words[0][0], *words[1:]])
    else:
        genus = df_tree.loc[genome_id, "genus"]
        if genus.startswith("g__"):
            genus = genus[len("g__"):]
        mapping[genome_id] = f'{genus} sp.'

In [None]:
# Perform hierarchical clustering on the rows and draw the matrix as a heatmap
def reorder_heatmap(df_heatmap, title="", subtitle=""):
    """
    Cluster the rows of a count matrix and render it with `create_heatmap`.

    Parameters:
    df_heatmap (DataFrame): Count matrix whose index is named `COG_category`.
    title (str): Chart title.
    subtitle (str): Chart subtitle.

    Returns:
    alt.Chart: The heatmap returned by `create_heatmap`.
    """
    if len(df_heatmap) > 1:
        linkage_matrix = linkage(pdist(df_heatmap.values))
        df_matrix_ordered = df_heatmap.iloc[leaves_list(linkage_matrix)]
    else:
        # linkage() needs at least two observations; nothing to reorder.
        df_matrix_ordered = df_heatmap.copy()

    # NOTE(review): `sort_y` is the row order of the INPUT matrix, so the
    # y-axis follows that order and not the clustered one. This keeps the axis
    # aligned with charts sorted by the same index - confirm this is intended.
    heatmap = create_heatmap(
        df_matrix_ordered,
        sort_y=df_heatmap.index.to_list(),
        title=title,
        subtitle=subtitle,
    )
    return heatmap

# One heatmap per pangenome class, stored under the same keys as `matrix_cogs`.
heatmaps = {}
for k, v in matrix_cogs.items():
    heatmaps[k] = reorder_heatmap(v, title=f"{k} Pangenes")

### Core Pangene

In [None]:
category = "Core"

# Long format (one row per COG category and matrix column), then the mean
# count per category description over all columns.
source = matrix_cogs[category].reset_index().melt("COG_category")
source['description'] = source['COG_category'].map(cog_dict)
source = source.groupby('description')['value'].mean().reset_index()

# Single-column strip of the mean counts. Its rows follow the order of the
# matrix index so that they line up with the heatmap next to it.
# NOTE(review): cog_dict[i] raises KeyError for a letter missing from cog_dict.
rect = alt.Chart(source).mark_rect().encode(
        alt.Y('description:O', title="", 
              sort=[cog_dict[i] for i in matrix_cogs[category].index.to_list()], 
              axis=alt.Axis(orient='right')),
        alt.Color('value:Q')
    )

# Cell output: heatmap and mean strip placed side by side.
heatmaps[category] | rect

### Accessory Pangene

In [None]:
category = "Accessory"

# Long format (one row per COG category and matrix column), then the mean
# count per category description over all columns.
source = matrix_cogs[category].reset_index().melt("COG_category")
source['description'] = source['COG_category'].map(cog_dict)
source = source.groupby('description')['value'].mean().reset_index()

# Single-column strip of the mean counts. Its rows follow the order of the
# matrix index so that they line up with the heatmap next to it.
# NOTE(review): cog_dict[i] raises KeyError for a letter missing from cog_dict.
rect = alt.Chart(source).mark_rect().encode(
        alt.Y('description:O', title="", 
              sort=[cog_dict[i] for i in matrix_cogs[category].index.to_list()], 
              axis=alt.Axis(orient='right')),
        #alt.X("value", title="Average count"),
        alt.Color('value:Q')
    )

# Cell output: heatmap and mean strip placed side by side.
heatmaps[category] | rect

### Rare Pangene

In [None]:
category = "Rare"

# Long format (one row per COG category and matrix column), then the mean
# count per category description over all columns.
source = matrix_cogs[category].reset_index().melt("COG_category")
source['description'] = source['COG_category'].map(cog_dict)
source = source.groupby('description')['value'].mean().reset_index()

# Single-column strip of the mean counts. Its rows follow the order of the
# matrix index so that they line up with the heatmap next to it.
# NOTE(review): cog_dict[i] raises KeyError for a letter missing from cog_dict.
rect = alt.Chart(source).mark_rect().encode(
        alt.Y('description:O', title="", 
              sort=[cog_dict[i] for i in matrix_cogs[category].index.to_list()], 
              axis=alt.Axis(orient='right')),
        alt.Color('value:Q')
    )

# Cell output: heatmap and mean strip placed side by side.
heatmaps[category] | rect

### COG Categories 

In [None]:
# Build the legend text: one "**letter**: *description* |" entry per COG
# category, joined with single newlines.
md_string = "\n".join([f"**{key}**: *{value}* |" for key, value in cog_dict.items()])

# Cell output: the legend rendered as Markdown.
Markdown(md_string)

## References

<font size="2">

{% for i in project().rule_used['eggnog-roary']['references'] %}
- *{{ i }}*
{% endfor %}

</font>