# eggNOG-mapper
Summary of [eggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper) results from project: `[{{ project().name }}]`

## Description
Fast functional annotation of novel sequences using precomputed orthologous groups and phylogenies from the [eggNOG database](http://eggnog5.embl.de).

In [1]:
import pandas as pd
from pathlib import Path
from IPython.display import display, Markdown, HTML
import json
import altair as alt

import warnings
warnings.filterwarnings('ignore')

from itables import to_html_datatable as DT
import itables.options as opt
# Custom CSS for itables output: italic body cells and oblique header cells,
# both rendered at 0.8em.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""
# Class names applied through the itables options module.
opt.classes = ["display", "compact"]
# Page-length choices — presumably the entries of the "rows per page" menu;
# confirm against the itables documentation.
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

In [2]:
#### UPDATE
# Project-specific inputs: adjust these paths before running for another project.
# NOTE(review): these are absolute, machine-specific paths; the GTDB-Tk table is
# read from a different project folder (qc_saccharopolyspora) than report_dir
# (hq_saccharopolyspora) — confirm this is intended.

# Processed output directory of the project being reported.
report_dir = Path("/datadrive/bgcflow/data/processed/hq_saccharopolyspora/")
# Directory holding the eggNOG-mapper results.
eggnog_interim = Path("/datadrive/bgcflow/data/interim/eggnog/")
# Metadata table loaded with its first CSV column as the index.
df_gtdb_gtdbtk = pd.read_csv("/datadrive/bgcflow/data/processed/qc_saccharopolyspora/tables/df_gtdb_gtdbtk_meta.csv", index_col=0)

In [3]:
# Load the project's GTDB metadata table from <report_dir>/tables, using the
# first CSV column as the index.
df_gtdb = pd.read_csv(report_dir/ "tables" / "df_gtdb_meta.csv", index_col=0)

In [4]:
#### UPDATE
# Replace df_gtdb with the rows of df_gtdb_gtdbtk that share its index labels
# (all columns kept). NOTE(review): .loc with a list of labels is expected to
# raise KeyError if a label is missing from df_gtdb_gtdbtk — confirm every
# genome is present there.
df_gtdb = df_gtdb_gtdbtk.loc[df_gtdb.index, :]

In [5]:
# Source - https://github.com/SBRG/pymodulon/blob/master/src/pymodulon/gene_util.py
def get_cog_dict():
    """
    Build the lookup from one-letter COG category codes to their descriptions.

    The "-" key labels entries that were not found in COG. Keys are inserted
    in the order listed below.
    """
    code_description_pairs = (
        ("A", "RNA processing and modification"),
        ("B", "Chromatin structure and dynamics"),
        ("C", "Energy production and conversion"),
        ("D", "Cell cycle control, cell division, chromosome partitioning"),
        ("E", "Amino acid transport and metabolism"),
        ("F", "Nucleotide transport and metabolism"),
        ("G", "Carbohydrate transport and metabolism"),
        ("H", "Coenzyme transport and metabolism"),
        ("I", "Lipid transport and metabolism"),
        ("J", "Translation, ribosomal structure and biogenesis"),
        ("K", "Transcription"),
        ("L", "Replication, recombination and repair"),
        ("M", "Cell wall/membrane/envelope biogenesis"),
        ("N", "Cell motility"),
        ("O", "Post-translational modification, protein turnover, and chaperones"),
        ("P", "Inorganic ion transport and metabolism"),
        ("Q", "Secondary metabolites biosynthesis, transport, and catabolism"),
        ("R", "General function prediction only"),
        ("S", "Function unknown"),
        ("T", "Signal transduction mechanisms"),
        ("U", "Intracellular trafficking, secretion, and vesicular transport"),
        ("V", "Defense mechanisms"),
        ("W", "Extracellular structures"),
        ("X", "Mobilome: prophages, transposons"),
        ("Y", "Nuclear structure"),
        ("Z", "Cytoskeleton"),
        ("-", "Not found in COG"),
    )
    return dict(code_description_pairs)


# Module-level copy of the COG lookup.
cog_dict = get_cog_dict()

In [39]:
def get_enggnog_stats(df_genomes, eggnog_dir, cog_dict):
    """
    Summarise eggNOG-mapper annotations for a set of genomes.

    For every genome id in ``df_genomes.index`` the file
    ``<eggnog_dir>/<genome_id>/<genome_id>.emapper.annotations`` is parsed if it
    exists; genomes without such a file keep all-zero rows in every table.

    Parameters
    ----------
    df_genomes : pandas.DataFrame
        Only its index is used; the labels must be strings (genome ids).
    eggnog_dir : pathlib.Path
        Directory containing one sub-directory per genome id.
    cog_dict : dict
        Maps one-letter COG category codes to descriptions. A category letter
        found in an annotation file that is not a key raises ``KeyError``.

    Returns
    -------
    tuple of five pandas.DataFrame, all indexed like ``df_genomes``
        df_cog              genes per COG letter; a gene annotated with several
                            letters is counted once in each of them
        df_cog_names        df_cog with columns renamed through ``cog_dict``
        df_cog_unique       genes per COG letter, single-letter genes only
        df_cog_unique_names df_cog_unique with columns renamed through ``cog_dict``
        df_eggNOG_PA        genes per orthologous group, using the group listed
                            at the gene's ``max_annot_lvl``
        COG columns are ordered by decreasing total of df_cog, OG columns by
        decreasing total count.
    """
    cog_ids = list(cog_dict.keys())
    df_cog = pd.DataFrame(0, index=df_genomes.index, columns=cog_ids)
    df_cog_unique = pd.DataFrame(0, index=df_genomes.index, columns=cog_ids)

    # genome_id -> {orthologous group -> gene count}; converted to a frame once
    # after the loop instead of inserting columns one at a time.
    og_counts = {}

    for genome_id in df_genomes.index:
        tsv_file = genome_id + ".emapper.annotations"
        eggnog_table_path = eggnog_dir / genome_id / tsv_file
        if not eggnog_table_path.is_file():
            continue

        # The column header is on the fifth line of the file; the last three
        # parsed rows are dropped as non-data trailer lines.
        df_eggnog = pd.read_csv(
            eggnog_table_path, sep="\t", header=4, index_col="#query"
        ).iloc[:-3, :]
        df_eggnog.index.name = "locus_tag"

        genome_cog = dict.fromkeys(cog_ids, 0)
        genome_cog_unique = dict.fromkeys(cog_ids, 0)
        genome_ogs = og_counts.setdefault(genome_id, {})

        # Iterate over rows positionally so duplicated locus tags cannot turn
        # a scalar lookup into a Series.
        annotation_rows = zip(
            df_eggnog["COG_category"],
            df_eggnog["max_annot_lvl"],
            df_eggnog["eggNOG_OGs"],
        )
        for cog_value, max_annot_lvl, eggnog_ogs in annotation_rows:
            # Missing cells are parsed as NaN (float); they carry no category.
            if isinstance(cog_value, str):
                for cog_id in cog_value:
                    genome_cog[cog_id] += 1
                if len(cog_value) == 1:
                    genome_cog_unique[cog_value] += 1

            if not isinstance(eggnog_ogs, str):
                continue

            # Entries look like "<OG>@<annotation level>"; map level -> OG.
            og_by_level = {}
            for entry in eggnog_ogs.split(","):
                parts = entry.split("@")
                if len(parts) < 2:
                    continue
                og_by_level[parts[1]] = parts[0]

            # Skip genes whose max_annot_lvl is not among their OGs rather than
            # re-using the group selected for the previous gene.
            selected_og = og_by_level.get(max_annot_lvl)
            if selected_og is None:
                continue
            genome_ogs[selected_og] = genome_ogs.get(selected_og, 0) + 1

        for cog_id in cog_ids:
            df_cog.at[genome_id, cog_id] = genome_cog[cog_id]
            df_cog_unique.at[genome_id, cog_id] = genome_cog_unique[cog_id]

    # Both COG tables share the column order given by the totals of df_cog.
    cog_order = df_cog.sum().sort_values(ascending=False).index
    df_cog = df_cog.reindex(columns=cog_order)
    df_cog_unique = df_cog_unique.reindex(columns=cog_order)

    df_cog_names = df_cog.rename(columns=cog_dict)
    df_cog_unique_names = df_cog_unique.rename(columns=cog_dict)

    # Genomes without annotations and groups absent from a genome become 0.
    df_eggNOG_PA = (
        pd.DataFrame.from_dict(og_counts, orient="index")
        .reindex(index=df_genomes.index)
        .fillna(0)
        .astype("int64")
    )
    og_order = df_eggNOG_PA.sum().sort_values(ascending=False).index
    df_eggNOG_PA = df_eggNOG_PA.reindex(columns=og_order)

    return df_cog, df_cog_names, df_cog_unique, df_cog_unique_names, df_eggNOG_PA

# Compute the COG category tables and the orthologous-group count matrix for
# the genomes listed in df_gtdb, reading results from eggnog_interim.
df_cog, df_cog_names, df_cog_unique, df_cog_unique_names, df_eggNOG_PA = get_enggnog_stats(df_gtdb, eggnog_interim, cog_dict)

## Summary Table

Number of genes assigned to each COG category in each genome. A gene annotated with several categories is counted once in each of them.

[Download Table]({{ project().file_server() }}/tables/df_antismash_6.1.1_summary.csv){:target="_blank" .md-button}

In [None]:
# Render the per-genome COG category counts as an itables HTML table with
# centred cells.
display(HTML(DT(df_cog, columnDefs=[{"className": "dt-center", "targets": "_all"}],)))

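Presence/absence matrix of the eggNOG orthologous groups at the widest annotation level (e.g. at phylum level). Each value is the number of genes in the genome assigned to that group; 0 means the group is absent.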

In [None]:
# Render the genome-by-orthologous-group matrix as an itables HTML table with
# centred cells.
display(HTML(DT(df_eggNOG_PA, columnDefs=[{"className": "dt-center", "targets": "_all"}],)))

## References

<font size="2">

{% for i in project().rule_used['eggnog']['references'] %}
- *{{ i }}*
{% endfor %}

</font>