# Import

In [3]:
import numpy as np
import json
from scipy.sparse import load_npz,save_npz,diags,csr_matrix
import scipy.sparse as sp
import pandas as pd
import os
import requests
from io import BytesIO
from tqdm import tqdm
from scipy.sparse.linalg import eigsh
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib.backends.backend_pdf import PdfPages
from pypdf import PdfReader, PdfWriter
from tempfile import NamedTemporaryFile
import networkx as nx
import pickle
import gseapy as gp
import mygene
from IPython.display import display, HTML
import re
from collections import deque
from goatools.obo_parser import GODag
import math
from itertools import combinations
from collections import Counter
from gseapy.parser import read_gmt
import time
import random

In [4]:
pd.set_option('display.width', None)      # No line-wrapping
pd.set_option('display.max_columns', None)  # Show all columns

# Prep

## Loading variables

In [5]:
DISEASE = "BIPOLAR"
DISEASE_FOLDER = f"../output/{DISEASE}/"
RESULT_FOLDER = DISEASE_FOLDER + "leiden_results"
DGIDB_DIRECTORY = f"../../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
MSIGDB_DIRECTORY = "../../Gen_Hypergraph/output/MSigDB_Full/"
RESULT_GRAPH = "result_graph_agg"

with open(DISEASE_FOLDER + "gene_to_index_distinct.json", "r") as file:
    gene_to_index_distinct = json.load(file)
    
try:
    with open(DGIDB_DIRECTORY + f"gene_to_index_{DISEASE}.json", "r") as file:
        DGIDB_gene_to_index = json.load(file)
except FileNotFoundError:
    DGIDB_gene_to_index = {}
    print("File not found. Setting DGIDB_gene_to_index to be {}.")
    
    
sim_mat = load_npz(f"{DISEASE_FOLDER}/agg_sim_mat.npz")

File not found. Setting DGIDB_gene_to_index to be {}.


In [6]:
## ORIGINAL
index_to_gene_distinct = {v: k for k, v in gene_to_index_distinct.items()}

In [None]:
# Loading result graph and communities
with open(f"{RESULT_FOLDER}/result_communities_selected.pkl", "rb") as f:
    communities_selected = pickle.load(f)
with open(f"{RESULT_FOLDER}/result_communities.pkl", "rb") as f:
    communities = pickle.load(f)
with open(f"{RESULT_FOLDER}/{RESULT_GRAPH}.pkl", "rb") as f:
    graph = pickle.load(f)

## Helpful functions (big object, drop NAN)

In [8]:
# Helpful functions
def drop_nan_from_communities(communities):
    cleaned_communities = []
    total_dropped = 0

    for i, community in enumerate(communities):
        cleaned = []
        dropped = 0
        for g in community:
            if g is None or (isinstance(g, float) and math.isnan(g)):
                dropped += 1
            else:
                cleaned.append(g)
        cleaned_communities.append(cleaned)
        total_dropped += dropped
        print(f"Community {i}: dropped {dropped} NaN entries")

    print(f"\nTotal dropped across all communities: {total_dropped}")
    return cleaned_communities

def big_objects(n=10, min_mb=1):
    """
    Show the largest objects currently in memory.
    
    Parameters
    ----------
    n : int
        Number of top objects to show.
    min_mb : float
        Minimum size (in MB) to include.
    """
    import sys
    import numpy as np
    import pandas as pd
    import scipy.sparse as sp
    from IPython import get_ipython

    def get_size(obj):
        try:
            if isinstance(obj, np.ndarray):
                return obj.nbytes
            elif isinstance(obj, pd.DataFrame) or isinstance(obj, pd.Series):
                return obj.memory_usage(deep=True).sum()
            elif sp.issparse(obj):
                return (obj.data.nbytes +
                        obj.indptr.nbytes +
                        obj.indices.nbytes)
            else:
                return sys.getsizeof(obj)
        except Exception:
            return 0

    ip = get_ipython()
    if ip is None:
        ns = globals()
    else:
        ns = ip.user_ns

    items = []
    for name, val in ns.items():
        if name.startswith('_'):
            continue  # skip internals
        size = get_size(val)
        if size > min_mb * 1024 ** 2:
            items.append((name, type(val).__name__, size))

    items.sort(key=lambda x: x[2], reverse=True)

    print(f"{'Variable':30s} {'Type':25s} {'Size (MB)':>10s}")
    print("-" * 70)
    for name, t, size in items[:n]:
        print(f"{name:30s} {t:25s} {size / 1024 ** 2:10.2f}")

## Index to HGNC

In [9]:
# Convert index to ncbi
def index_to_ncbi(comms,index_to_ncbi = index_to_gene_distinct):
    comms_ncbi = [list(map(index_to_ncbi.get, c)) for c in comms]
    return comms_ncbi

In [10]:
communities_ncbi = index_to_ncbi(communities_selected,index_to_gene_distinct)
print(communities_ncbi)
print(len(communities_ncbi))

[['137', '400966', '101927994', '645832', '654817', '653691', '26810', '160824', '107133522', '112806053', '441410', '392275', '145978', '387990', '400798', '158062', '400579', '649159', '400658', '91181', '8587', '389672', '100507588', '147920', '148145', '28584', '729305', '440925', '26231', '642366', '100506555', '101928673', '106479989', '100506835', '728773', '100132707', '101927943', '128653', '54522', '101954267', '645212', '3542', '200159', '391012', '105369921', '114795', '26776', '407022', '285708', '259234', '28660', '374387', '118568801', '400027', '100506119', '79927', '728855', '619505', '100534612', '26798', '574042', '169436', '100847004', '159162', '115482686', '376132', '100129550', '56344', '100421681', '26786', '100129827', '388579', '401630', '54435', '105375624', '151477', '105373958', '83', '228', '646996', '729708', '267020', '391075', '647310', '399972', '104564224', '152641', '150590', '147965', '101927862', '100873614', '106480075', '101927978', '100302278', 

In [11]:
# NCBI to HGNC symbol
def ncbi_to_HGNC(comms_ncbi):
    comms_HGNC = []
    for community in comms_ncbi:
        mg = mygene.MyGeneInfo()
        entrez_ids = [str(e) for e in community]

        results = mg.querymany(
            entrez_ids,
            scopes="entrezgene",
            fields="symbol",
            species="human"
        )

        # Build a mapping: input ID -> symbol (or None)
        id_to_symbol = {}
        for r in results:
            q = str(r.get("query"))
            id_to_symbol[q] = r.get("symbol") if not r.get("notfound") else None

        # Preserve original order
        symbols = [id_to_symbol.get(str(e), None) for e in entrez_ids]
        comms_HGNC.append(symbols)
    return comms_HGNC


In [12]:
COMMUNITIES_HGNC = ncbi_to_HGNC(communities_ncbi)

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

In [13]:
print(len(COMMUNITIES_HGNC[7]))

754


In [14]:
COMMUNITIES_HGNC = drop_nan_from_communities(COMMUNITIES_HGNC)

Community 0: dropped 0 NaN entries
Community 1: dropped 0 NaN entries
Community 2: dropped 0 NaN entries
Community 3: dropped 0 NaN entries
Community 4: dropped 0 NaN entries
Community 5: dropped 0 NaN entries
Community 6: dropped 0 NaN entries
Community 7: dropped 0 NaN entries
Community 8: dropped 0 NaN entries
Community 9: dropped 0 NaN entries
Community 10: dropped 0 NaN entries
Community 11: dropped 0 NaN entries
Community 12: dropped 0 NaN entries
Community 13: dropped 0 NaN entries
Community 14: dropped 0 NaN entries

Total dropped across all communities: 0


In [15]:
num_selected_comm = len(COMMUNITIES_HGNC)

In [16]:
print(num_selected_comm)

15


# Categoization Prep

### GO-slim

In [17]:
DATA_DIRECTORY = "../../data"
GO_OBO = f"{DATA_DIRECTORY}/GO/go-basic.obo"            # put the file in your working dir (or give full path)
GOSLIM_OBO = f"{DATA_DIRECTORY}/GO/goslim_generic.obo"  # swap to another slim if you prefer
GOSLIM_PIR_OBO = f"{DATA_DIRECTORY}/GO/goslim_pir.obo"  # swap to another slim if you prefer
GOSLIM_YEAST_OBO = f"{DATA_DIRECTORY}/GO/goslim_yeast.obo"
GOSLIM_AGR_OBO = f"{DATA_DIRECTORY}/GO/goslim_agr.obo"

In [18]:
# GO library
go = GODag(GO_OBO)

# SLIM libraries
slim = GODag(GOSLIM_OBO)
slim_pir = GODag(GOSLIM_PIR_OBO)
slim_yeast = GODag(GOSLIM_YEAST_OBO)
slim_agr = GODag(GOSLIM_AGR_OBO)

slim_ids = set(slim.keys())
slim_pir_ids = set(slim_pir.keys())
slim_yeast_ids = set(slim_yeast.keys())
slim_agr_ids = set(slim_agr.keys())

../../data/GO/go-basic.obo: fmt(1.2) rel(2025-10-10) 42,666 Terms
../../data/GO/goslim_generic.obo: fmt(1.2) rel(go/2025-10-10/subsets/goslim_generic.owl) 205 Terms
../../data/GO/goslim_pir.obo: fmt(1.2) rel(go/2025-10-10/subsets/goslim_pir.owl) 617 Terms
../../data/GO/goslim_yeast.obo: fmt(1.2) rel(go/2025-10-10/subsets/goslim_yeast.owl) 295 Terms
../../data/GO/goslim_agr.obo: fmt(1.2) rel(go/2025-10-10/subsets/goslim_agr.owl) 94 Terms


In [19]:
GO_RE = re.compile(r"(GO:\d{7})")

def get_goid(term: str):
    if isinstance(term, str):
        m = GO_RE.search(term)
        if m:
            return m.group(1)
    raise RuntimeError("Term not found!!")

def get_go_ancestors(go_id):
    """Return a list of ancestor GO term IDs for the given GO ID using QuickGO."""
    url = f"https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/{go_id}/ancestors"
    headers = {"Accept": "application/json"}

    r = requests.get(url, headers=headers)
    r.raise_for_status()

    data = r.json()
    results = data.get("results", [])
    if not results:
        return []

    # Ancestors come back as a simple list of GO IDs (strings)
    ancestors = results[0].get("ancestors", [])
    return set(ancestors)


def get_go_ancestors_in_slim(go_id):
    ancestors = get_go_ancestors(go_id)
    return slim_ids & ancestors

In [20]:
def get_go_ancestors_at_depth(go_id, depth, include_relations=("is_a", "part_of")):
    """
    Return the set of GO term IDs that are ancestors of `go_id` and have
    absolute depth == `depth` in the GO DAG.

    Parameters
    ----------
    go_id : str
        Starting GO term (e.g., "GO:0051310").
    depth : int
        Absolute depth in the GO DAG (e.g., 3 means all ancestors at depth=3).
    include_relations : tuple[str]
        Relation types to traverse upward, e.g. ("is_a", "part_of", "regulates", ...).

    Returns
    -------
    set[str]
        Ancestor GO IDs whose term.depth == `depth`. Empty set if none.
    """
    if depth < 0:
        return set()
    if go_id not in go:
        return set()

    # One-hop function honoring relation filter
    def parent_ids(term):
        ids = set()
        if "is_a" in include_relations:
            # GOATOOLS usually puts is_a parents here (and sometimes part_of merged)
            ids.update(p.id for p in term.parents)

        rel = getattr(term, "relationship", {}) or {}
        for r in include_relations:
            # relationship entries are already GO IDs
            ids.update(rel.get(r, []))

        # ensure IDs exist in DAG
        return {pid for pid in ids if pid in go}

    result = set()
    frontier = {go_id}
    visited = {go_id}

    # BFS upwards, but pruning branches that are already above the target depth
    while frontier:
        next_frontier = set()
        for node in frontier:
            for pid in parent_ids(go[node]):
                if pid in visited:
                    continue
                visited.add(pid)
                d = go[pid].depth  # absolute depth in DAG

                if d == depth:
                    # ancestor at the exact target depth
                    result.add(pid)
                elif d > depth:
                    # still "below" target depth (further from root),
                    # its parents might reach the target depth
                    next_frontier.add(pid)
                # if d < depth: this branch has gone above the target,
                # and all further ancestors will have depth <= d, so we can skip
        frontier = next_frontier

    return result


### KEGG

In [21]:
def build_kegg_name_to_id(species="hsa"):
    """Map KEGG pathway name -> 'hsaXXXXX' (species-specific)."""
    lines = requests.get(f"https://rest.kegg.jp/list/pathway/{species}").text.strip().splitlines()
    name_to_id = {}
    for ln in lines:
        pid, raw = ln.split("\t")
        pid = pid.replace("path:", "")  # e.g. hsa03010
        # strip " - Homo sapiens (human)" suffix
        name = re.sub(r"\s*-\s*Homo sapiens.*$", "", raw).strip()
        name_to_id[name.lower()] = pid
    return name_to_id

name_to_id = build_kegg_name_to_id("hsa")

In [22]:
def get_kegg_level2(hsa_id: str) -> str | None:
    """
    Return the KEGG Level 2 category for a pathway like 'hsa03040'.
    Example: get_kegg_level2("hsa03040") -> 'Transcription'
    """
    url = f"http://rest.kegg.jp/get/{hsa_id}"
    try:
        text = requests.get(url, timeout=10).text
    except Exception:
        return None

    for line in text.splitlines():
        if line.startswith("CLASS"):
            # CLASS line looks like: CLASS       Genetic Information Processing; Transcription
            parts = [p.strip() for p in line.split(";", maxsplit=2)]
            if len(parts) >= 2:
                return [parts[1]]
            elif len(parts) == 1:
                return [parts[0].replace("CLASS", "").strip()]
    return []

### Reactome

In [23]:
def build_reactome_level_map(level=1, species="9606"):
    """
    Returns { 'R-HSA-xxxxx': ['CategoryNameAtLevel', ...], ... } for the given species.

    Parameters
    ----------
    level : int, default=1
        1-based depth in the Reactome pathway hierarchy:
          - level=1 → top-level Reactome categories (original behavior)
          - level=2 → second-level ancestors, etc.
        If a node is shallower than `level`, the deepest available ancestor
        is used as a fallback.
    species : str, default="9606"
        Taxonomy ID ("9606") or species name ("Homo sapiens").
    """
    if level < 1:
        raise ValueError("level must be >= 1 (1-based depth)")

    # ensure spaces are encoded if a name is used
    species_path = species.replace(" ", "+")
    url = f"https://reactome.org/ContentService/data/eventsHierarchy/{species_path}"
    r = requests.get(url, headers={"Accept": "application/json"}, timeout=60)
    r.raise_for_status()
    trees = r.json()  # list of trees, one per TopLevelPathway

    mapping = {}

    def walk(node, ancestors):
        """
        node: current node dict
        ancestors: list of ancestor nodes from root to parent of `node`
        """
        # ancestors_chain includes current node at the end
        ancestors_chain = ancestors + [node]

        st_id = node.get("stId")
        if st_id:
            # We want the ancestor at depth `level` (1-based).
            # If the path is shorter than `level`, fall back to the deepest one.
            if len(ancestors_chain) >= level:
                cat_node = ancestors_chain[level - 1]
            else:
                cat_node = ancestors_chain[-1]

            cat_name = cat_node.get("name")
            if cat_name:
                mapping.setdefault(st_id, set()).add(cat_name)

        # Recurse into children
        for child in node.get("children", []):
            walk(child, ancestors_chain)

    # Each tree is a top-level pathway
    for top in trees:
        walk(top, [])

    # sets -> sorted lists
    return {k: sorted(v) for k, v in mapping.items()}

# Example:
reactome_level1 = build_reactome_level_map(level = 2)
  # -> ['Signal Transduction']

In [24]:
print(reactome_level1["R-HSA-9007101"])

['Membrane Trafficking']


# Run Enrichment Analysis

In [25]:
TERM_SCORE_CAP = 0.001
PERCENTAGE = 0.1

In [26]:
len(COMMUNITIES_HGNC[0])

851

### GO

In [27]:
# GO Analysis; save terms with small size and high p-value
def go_enrichment(communities,
                  term_score_cap,
                  percentage, 
                  slim_ids = slim_yeast_ids,
                  depth = 1):
    important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value","Category"])
    category_counts_and_overlap_score_list = {}
    i = 0
    num_nonzero_communities = 0
    
    for community in communities:
        # Gene Ontology enrichment
        enr_go = gp.enrichr(
            gene_list=community,
            gene_sets=['GO_Biological_Process_2023',
                    'GO_Molecular_Function_2023',
                    'GO_Cellular_Component_2023'],
            organism='Human',
            outdir=None # don't write to disk
        )
        go_df = enr_go.results
        

        # Filter by overlap percentage and adjusted p-value
        mask =  (go_df["Adjusted P-value"] < term_score_cap) & (go_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
        filtered = go_df[mask].copy()
        
        # Categorization from GO-Slim
        filtered["GO_ID"] = filtered["Term"].apply(get_goid)
        # filtered["Slim_IDs"] = filtered["GO_ID"].apply(get_go_ancestors_in_slim)
        filtered["Slim_IDs"] = filtered["GO_ID"].apply(lambda id: get_go_ancestors_at_depth(id, depth=depth, include_relations=("is_a", "part_of")))
        
        # Get empty count
        empty_count = (filtered["Slim_IDs"].apply(len) == 0).sum()
        
        # Get slim names    
        filtered["Category"] = filtered["Slim_IDs"].apply(lambda ids: [go[i].name for i in ids])
        
        # Sort
        filtered['Overlap (value)'] = filtered['Overlap'].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]))
        filtered = filtered.sort_values(['Overlap (value)'], ascending=False)
        
        # Compute overlap score for every category:
        filtered_exploded = filtered.explode('Category').reset_index(drop=True)
        category_counts_and_overlap_score = {}
        for val, group in filtered_exploded.groupby('Category'):
            overlap_list = group["Overlap"].tolist()
            numerators = [(lambda x: int(x.split("/")[0]))(e) for e in overlap_list]
            denominators = [(lambda x: int(x.split("/")[1]))(e) for e in overlap_list]
            overlap_score = sum(numerators)/sum(denominators)
            
            category_counts_and_overlap_score[val] = (len(group),overlap_score,)
        
        category_counts_and_overlap_score_list[i] = category_counts_and_overlap_score
        
        # Add results to important terms
        if not filtered.empty:
            # print size of community
            print(f"Size of community: {len(community)}")
            
            # print number of filtered terms
            print(f"Number of filtered terms: {len(filtered)}")
            print(f"Number of unmapped terms: {empty_count}")      
            print(category_counts_and_overlap_score)
            filtered.loc[:, "Community Index"] = i
            filtered.loc[:, "Community Size"] = len(community)
            important_terms = pd.concat([important_terms, filtered], ignore_index=True)
            display(HTML(filtered[["Community Index",'Term','Overlap','Adjusted P-value',"Slim_IDs","Category"]].head(10).to_html(max_cols=None)))
            num_nonzero_communities += 1

        i += 1
    print(f"{num_nonzero_communities} out of {len(communities)} communities had significant GO terms.")
    return important_terms,category_counts_and_overlap_score_list

In [28]:
go_important_terms,go_category_counts_and_overlap_score = go_enrichment(COMMUNITIES_HGNC,TERM_SCORE_CAP,PERCENTAGE,slim_ids,depth = 2)

Size of community: 851
Number of filtered terms: 3
Number of unmapped terms: 0
{'catalytic activity, acting on a protein': (3, 0.15141955835962145), 'hydrolase activity': (3, 0.15141955835962145)}


  important_terms = pd.concat([important_terms, filtered], ignore_index=True)


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
270,0,Cysteine-Type Deubiquitinase Activity (GO:0004843),16/98,0.000318,"{GO:0016787, GO:0140096}","[hydrolase activity, catalytic activity, acting on a protein]"
271,0,Cysteine-Type Endopeptidase Activity (GO:0004197),16/106,0.00045,"{GO:0016787, GO:0140096}","[hydrolase activity, catalytic activity, acting on a protein]"
272,0,Deubiquitinase Activity (GO:0101005),16/113,0.000684,"{GO:0016787, GO:0140096}","[hydrolase activity, catalytic activity, acting on a protein]"


Size of community: 883
Number of filtered terms: 15
Number of unmapped terms: 0
{'cellular component organization or biogenesis': (3, 0.16246498599439776), 'membrane': (1, 0.375), 'membrane protein complex': (1, 0.5), 'metabolic process': (7, 0.21802575107296138), 'nuclear protein-containing complex': (1, 0.2054794520547945), 'nucleic acid binding': (1, 0.10843373493975904), 'organelle': (1, 0.36363636363636365), 'ribonucleoprotein complex': (1, 0.2054794520547945)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
1977,3,COPI Vesicle Coat (GO:0030126),6/12,0.0001645669,{GO:0098796},[membrane protein complex]
1979,3,COPI-coated Vesicle Membrane (GO:0030663),6/16,0.0009797697,{GO:0016020},[membrane]
1976,3,Mitochondrial Ribosome (GO:0005761),8/22,9.044272e-05,{GO:0043226},[organelle]
1,3,Mitochondrial Translation (GO:0032543),31/98,8.709975e-16,{GO:0008152},[metabolic process]
4,3,Mitochondrial Gene Expression (GO:0140053),29/103,1.529954e-13,{GO:0008152},[metabolic process]
3,3,Peptide Biosynthetic Process (GO:0043043),36/158,9.61509e-14,{GO:0008152},[metabolic process]
0,3,Translation (GO:0006412),51/234,1.708936e-18,{GO:0008152},[metabolic process]
6,3,Cytoplasmic Translation (GO:0002181),20/93,7.281577e-07,{GO:0008152},[metabolic process]
2,3,Macromolecule Biosynthetic Process (GO:0009059),39/183,8.218507e-14,{GO:0008152},[metabolic process]
1973,3,Small-Subunit Processome (GO:0032040),15/73,3.175424e-05,"{GO:0140513, GO:1990904}","[nuclear protein-containing complex, ribonucleoprotein complex]"


Size of community: 826
Number of filtered terms: 84
Number of unmapped terms: 3
{'Sm-like protein family complex': (2, 0.2926829268292683), 'catalytic activity, acting on a protein': (2, 0.10429447852760736), 'catalytic complex': (4, 0.2605042016806723), 'cell cycle process': (2, 0.18253968253968253), 'cellular component organization or biogenesis': (18, 0.17077175697865354), 'cellular localization': (10, 0.17294388931591084), 'establishment of localization': (12, 0.16009852216748768), 'intracellular protein-containing complex': (3, 0.25555555555555554), 'macromolecule localization': (4, 0.12277730736663844), 'membrane': (6, 0.12845990063875087), 'metabolic process': (14, 0.1578558225508318), 'nuclear protein-containing complex': (7, 0.322884012539185), 'nucleic acid binding': (1, 0.1276595744680851), 'organelle': (5, 0.14314928425357873), 'organelle subcompartment': (1, 0.16182572614107885), 'process utilizing autophagic mechanism': (1, 0.16346153846153846), 'protein binding': (3, 0.1

Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
3457,4,Amphisome Membrane (GO:1904930),5/9,0.0002090631,{GO:0016020},[membrane]
52,4,Multivesicular Body-Lysosome Fusion (GO:0061763),5/9,0.0007178876,{GO:0071840},[cellular component organization or biogenesis]
51,4,Vesicle Fusion With Vacuole (GO:0051469),5/9,0.0007178876,{GO:0071840},[cellular component organization or biogenesis]
40,4,Lysosomal Membrane Organization (GO:0097212),6/11,0.0001342854,{GO:0071840},[cellular component organization or biogenesis]
3466,4,Amphisome (GO:0044753),5/11,0.0004768241,{GO:0043226},[organelle]
38,4,Late Endosome To Lysosome Transport (GO:1902774),7/16,0.0001235062,"{GO:0051234, GO:0051641}","[establishment of localization, cellular localization]"
55,4,Midbody Abscission (GO:0061952),6/15,0.0009237005,"{GO:0071840, GO:0022402}","[cellular component organization or biogenesis, cell cycle process]"
24,4,U2-type Prespliceosome Assembly (GO:1903241),9/23,1.896814e-05,{GO:0071840},[cellular component organization or biogenesis]
3447,4,U2-type Precatalytic Spliceosome (GO:0071005),18/49,1.271857e-11,"{GO:0140513, GO:1990904}","[nuclear protein-containing complex, ribonucleoprotein complex]"
3445,4,Precatalytic Spliceosome (GO:0071011),19/52,4.182128e-12,"{GO:0140513, GO:1990904}","[nuclear protein-containing complex, ribonucleoprotein complex]"


Size of community: 747
Number of filtered terms: 9
Number of unmapped terms: 0
{'cellular component organization or biogenesis': (2, 0.16463414634146342), 'cellular localization': (1, 0.20967741935483872), 'establishment of localization': (2, 0.16230366492146597), 'nuclear protein-containing complex': (1, 0.8), 'protein-containing complex binding': (1, 0.16470588235294117), 'regulation of biological process': (3, 0.2542372881355932)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
3006,5,THO Complex Part Of Transcription Export Complex (GO:0000445),4/5,0.000292,{GO:0140513},[nuclear protein-containing complex]
0,5,Positive Regulation Of rRNA Processing (GO:2000234),7/9,8e-06,{GO:0050789},[regulation of biological process]
1,5,Formation Of Cytoplasmic Translation Initiation Complex (GO:0001732),7/14,0.000237,{GO:0071840},[cellular component organization or biogenesis]
4,5,Regulation Of rRNA Processing (GO:2000232),7/15,0.000244,{GO:0050789},[regulation of biological process]
3,5,mRNA Export From Nucleus (GO:0006406),13/62,0.000237,"{GO:0051234, GO:0051641}","[establishment of localization, cellular localization]"
2,5,"Regulation Of mRNA Splicing, Via Spliceosome (GO:0048024)",16/94,0.000237,{GO:0050789},[regulation of biological process]
2501,5,Ribosome Binding (GO:0043022),14/85,0.000739,{GO:0044877},[protein-containing complex binding]
6,5,Protein Targeting (GO:0006605),18/129,0.000528,{GO:0051234},[establishment of localization]
5,5,protein-RNA Complex Assembly (GO:0022618),20/150,0.000341,{GO:0071840},[cellular component organization or biogenesis]


Size of community: 844
Number of filtered terms: 11
Number of unmapped terms: 0
{'metabolic process': (10, 0.3076923076923077), 'regulation of biological process': (1, 0.2727272727272727)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
1,6,snRNA Processing (GO:0016180),9/19,3.005605e-05,{GO:0008152},[metabolic process]
3,6,snRNA Metabolic Process (GO:0016073),9/21,4.430285e-05,{GO:0008152},[metabolic process]
5,6,snRNA 3'-End Processing (GO:0034472),9/22,4.811229e-05,{GO:0008152},[metabolic process]
9,6,Protein Neddylation (GO:0045116),8/22,0.0004299547,{GO:0008152},[metabolic process]
10,6,Histone mRNA Metabolic Process (GO:0008334),8/23,0.0005771991,{GO:0008152},[metabolic process]
2,6,mRNA 3'-End Processing (GO:0031124),12/40,4.320363e-05,{GO:0008152},[metabolic process]
8,6,RNA 3'-End Processing (GO:0031123),11/38,7.792154e-05,{GO:0008152},[metabolic process]
0,6,tRNA Modification (GO:0006400),19/67,5.193735e-08,{GO:0008152},[metabolic process]
6,6,Regulation Of DNA-templated Transcription Elongation (GO:0032784),12/44,5.98192e-05,{GO:0050789},[regulation of biological process]
4,6,tRNA Processing (GO:0008033),13/50,4.790312e-05,{GO:0008152},[metabolic process]


Size of community: 754
Number of filtered terms: 743
Number of unmapped terms: 34
{'Sm-like protein family complex': (9, 0.4694835680751174), 'anatomical structure development': (16, 0.18181818181818182), 'anatomical structure formation involved in morphogenesis': (2, 0.2413793103448276), 'anatomical structure morphogenesis': (11, 0.23641304347826086), 'carbohydrate derivative binding': (2, 0.141156462585034), 'catalytic activity, acting on a protein': (16, 0.1601713062098501), 'catalytic complex': (3, 0.41818181818181815), 'cell adhesion': (6, 0.29591836734693877), 'cell communication': (1, 0.5), 'cell cycle process': (14, 0.22081575246132207), 'cell death': (2, 0.15384615384615385), 'cell junction': (8, 0.14658210007047218), 'cell motility': (3, 0.23214285714285715), 'cellular component organization or biogenesis': (37, 0.15787860208461066), 'cellular developmental process': (10, 0.2662037037037037), 'cellular localization': (4, 0.1600877192982456), 'cellular response to stimulus': (

Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
238,7,RNA Capping (GO:0036260),6/7,3.038516e-07,{GO:0008152},[metabolic process]
237,7,7-Methylguanosine Cap Hypermethylation (GO:0036261),6/7,3.038516e-07,{GO:0008152},[metabolic process]
321,7,Cardiac Muscle Cell-Cardiac Muscle Cell Adhesion (GO:0086042),5/6,5.16534e-06,{GO:0007155},[cell adhesion]
4404,7,U7 snRNP (GO:0005683),5/6,4.413747e-06,"{GO:0140513, GO:0120114, GO:1990904}","[nuclear protein-containing complex, Sm-like protein family complex, ribonucleoprotein complex]"
3856,7,U4 snRNA Binding (GO:0030621),4/5,0.0001084645,{GO:0003676},[nucleic acid binding]
4390,7,U4 snRNP (GO:0005687),8/10,3.135409e-09,"{GO:0140513, GO:0120114, GO:1990904}","[nuclear protein-containing complex, Sm-like protein family complex, ribonucleoprotein complex]"
439,7,Bundle Of His cell-Purkinje Myocyte Adhesion Involved In Cell Communication (GO:0086073),4/5,8.390268e-05,{GO:0007155},[cell adhesion]
440,7,Negative Regulation Of Protein Kinase Activity By Regulation Of Protein Phosphorylation (GO:0044387),4/5,8.390268e-05,"{GO:0065009, GO:0050789}","[regulation of molecular function, regulation of biological process]"
4407,7,U6 snRNP (GO:0005688),5/7,1.360595e-05,"{GO:0140513, GO:0120114, GO:1990904}","[nuclear protein-containing complex, Sm-like protein family complex, ribonucleoprotein complex]"
356,7,Positive Regulation Of Cardiac Epithelial To Mesenchymal Transition (GO:0062043),5/7,1.579795e-05,{GO:0050789},[regulation of biological process]


Size of community: 513
Number of filtered terms: 24
Number of unmapped terms: 1
{'anatomical structure development': (1, 0.14285714285714285), 'cellular component organization or biogenesis': (1, 0.15151515151515152), 'cellular developmental process': (2, 0.14202898550724638), 'hydrolase activity': (2, 0.38461538461538464), 'molecular function activator activity': (3, 0.20634920634920634), 'nucleic acid binding': (4, 0.13220815752461323), 'pattern specification process': (1, 0.1864406779661017), 'peptide binding': (1, 0.2857142857142857), 'protein binding': (3, 0.20634920634920634), 'regulation of biological process': (3, 0.23626373626373626), 'signaling receptor activity': (5, 0.2538860103626943), 'signaling receptor regulator activity': (3, 0.20634920634920634)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
766,8,Prostaglandin Receptor Activity (GO:0004955),4/9,0.0004839994,{GO:0038023},[signaling receptor activity]
763,8,Lipid Phosphatase Activity (GO:0042577),5/13,0.0001395919,{GO:0016787},[hydrolase activity]
762,8,Phosphatidate Phosphatase Activity (GO:0008195),5/13,0.0001395919,{GO:0016787},[hydrolase activity]
764,8,G Protein-Coupled Photoreceptor Activity (GO:0008020),5/14,0.000195534,{GO:0038023},[signaling receptor activity]
757,8,Neuropeptide Receptor Activity (GO:0008188),12/36,9.645121e-10,{GO:0038023},[signaling receptor activity]
760,8,Neuropeptide Hormone Activity (GO:0005184),8/26,2.682652e-06,"{GO:0030545, GO:0140677, GO:0005515}","[signaling receptor regulator activity, molecular function activator activity, protein binding]"
759,8,Neuropeptide Activity (GO:0160041),8/26,2.682652e-06,"{GO:0030545, GO:0140677, GO:0005515}","[signaling receptor regulator activity, molecular function activator activity, protein binding]"
7,8,Positive Regulation Of Cytosolic Calcium Ion Concentration Involved In Phospholipase C-activating G Protein-Coupled Signaling Pathway (GO:0051482),8/27,2.171713e-05,{},[]
761,8,Neuropeptide Binding (GO:0042923),8/28,4.736852e-06,{GO:0042277},[peptide binding]
2,8,Neuropeptide Signaling Pathway (GO:0007218),19/68,1.11185e-12,{GO:0050789},[regulation of biological process]


Size of community: 220
Number of filtered terms: 5
Number of unmapped terms: 0
{'detection of stimulus': (2, 0.5035714285714286), 'signaling receptor activity': (1, 0.494475138121547), 'system process': (2, 0.5029411764705882)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
0,10,Sensory Perception Of Smell (GO:0007608),117/230,7.036509999999999e-178,{GO:0003008},[system process]
2,10,Detection Of Chemical Stimulus Involved In Sensory Perception Of Smell (GO:0050911),70/139,2.7825239999999998e-102,{GO:0051606},[detection of stimulus]
1,10,Detection Of Chemical Stimulus Involved In Sensory Perception (GO:0050907),71/141,1.2415840000000001e-103,{GO:0051606},[detection of stimulus]
8,10,Olfactory Receptor Activity (GO:0004984),179/362,4.221339e-291,{GO:0038023},[signaling receptor activity]
3,10,Sensory Perception Of Chemical Stimulus (GO:0007606),54/110,1.8204219999999998e-77,{GO:0003008},[system process]


Size of community: 134
Number of filtered terms: 10
Number of unmapped terms: 1
{'molecular function activator activity': (1, 0.14285714285714285), 'protein binding': (3, 0.17045454545454544), 'signaling receptor regulator activity': (1, 0.14285714285714285), 'transmembrane transporter activity': (6, 0.21428571428571427)}


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Slim_IDs,Category
154,12,CCR6 Chemokine Receptor Binding (GO:0031731),5/9,1.142788e-07,{GO:0005515},[protein binding]
155,12,Potassium:Proton Antiporter Activity (GO:0015386),4/12,3.384327e-05,{GO:0022857},[transmembrane transporter activity]
156,12,Sodium:Proton Antiporter Activity (GO:0015385),4/14,3.386511e-05,{GO:0022857},[transmembrane transporter activity]
157,12,Solute:Inorganic Anion Antiporter Activity (GO:0005452),4/14,3.386511e-05,{GO:0022857},[transmembrane transporter activity]
159,12,Solute:Potassium Antiporter Activity (GO:0022821),4/17,5.284791e-05,{GO:0022857},[transmembrane transporter activity]
160,12,Metal Cation:Proton Antiporter Activity (GO:0051139),4/21,0.0001020117,{GO:0022857},[transmembrane transporter activity]
158,12,Chemoattractant Activity (GO:0042056),5/35,5.120485e-05,"{GO:0030545, GO:0140677, GO:0005515}","[signaling receptor regulator activity, molecular function activator activity, protein binding]"
162,12,Inorganic Anion Transmembrane Transporter Activity (GO:0015103),4/32,0.0004924283,{},[]
163,12,Sodium Ion Transmembrane Transporter Activity (GO:0015081),4/34,0.0005656581,{GO:0022857},[transmembrane transporter activity]
161,12,CCR Chemokine Receptor Binding (GO:0048020),5/44,0.0001020117,{GO:0005515},[protein binding]


9 out of 15 communities had significant GO terms.


In [29]:
go_important_terms

Unnamed: 0,Community Index,Community Size,Term,Overlap,Adjusted P-value,Category,Gene_set,P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,GO_ID,Slim_IDs,Overlap (value)
0,0,851,Cysteine-Type Deubiquitinase Activity (GO:0004...,16/98,0.000318,"[hydrolase activity, catalytic activity, actin...",GO_Molecular_Function_2023,0.000004,0.0,0.0,4.455557,55.831114,USP17L15;USP17L18;USP17L17;USP17L19;USP17L21;U...,GO:0004843,"{GO:0016787, GO:0140096}",0.163265
1,0,851,Cysteine-Type Endopeptidase Activity (GO:0004197),16/106,0.000450,"[hydrolase activity, catalytic activity, actin...",GO_Molecular_Function_2023,0.000010,0.0,0.0,4.057804,46.622370,USP17L15;USP17L18;USP17L17;USP17L19;USP17L21;U...,GO:0004197,"{GO:0016787, GO:0140096}",0.150943
2,0,851,Deubiquitinase Activity (GO:0101005),16/113,0.000684,"[hydrolase activity, catalytic activity, actin...",GO_Molecular_Function_2023,0.000023,0.0,0.0,3.763590,40.143355,USP17L15;USP17L18;USP17L17;USP17L19;USP17L21;U...,GO:0101005,"{GO:0016787, GO:0140096}",0.141593
3,3,883,COPI Vesicle Coat (GO:0030126),6/12,0.000165,[membrane protein complex],GO_Cellular_Component_2023,0.000005,0.0,0.0,21.791334,264.505025,COPA;COPB1;TMED3;COPZ2;COPG1;TMED7,GO:0030126,{GO:0098796},0.500000
4,3,883,COPI-coated Vesicle Membrane (GO:0030663),6/16,0.000980,[membrane],GO_Cellular_Component_2023,0.000040,0.0,0.0,13.072064,132.432287,COPA;COPB1;TMED3;COPZ2;COPG1;TMED7,GO:0030663,{GO:0016020},0.375000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
899,12,134,Metal Cation:Proton Antiporter Activity (GO:00...,4/21,0.000102,[transmembrane transporter activity],GO_Molecular_Function_2023,0.000011,0.0,0.0,35.925792,411.674959,SLC9A4;SLC9A5;SLC9A8;SLC9A9,GO:0051139,{GO:0022857},0.190476
900,12,134,Chemoattractant Activity (GO:0042056),5/35,0.000051,"[signaling receptor regulator activity, molecu...",GO_Molecular_Function_2023,0.000003,0.0,0.0,25.627907,322.253082,DEFB130A;DEFB133;DEFB130B;DEFB110;DEFB109B,GO:0042056,"{GO:0030545, GO:0140677, GO:0005515}",0.142857
901,12,134,Inorganic Anion Transmembrane Transporter Acti...,4/32,0.000492,[],GO_Molecular_Function_2023,0.000060,0.0,0.0,21.800000,211.961450,SLC4A10;SLC4A2;SLC26A4;SLC26A3,GO:0015103,{},0.125000
902,12,134,Sodium Ion Transmembrane Transporter Activity ...,4/34,0.000566,[transmembrane transporter activity],GO_Molecular_Function_2023,0.000076,0.0,0.0,20.344615,192.846624,SLC9A4;SLC9A5;SLC9A8;SLC9A9,GO:0015081,{GO:0022857},0.117647


### KEGG

In [30]:
# KEGG
def kegg_enrichment(communities,
                    term_score_cap,
                    percentage):
    important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value","Category"])
    category_counts_and_overlap_score_list = {}
    i = 0
    num_nonzero_communities = 0
    for community in communities:
        enr_path = gp.enrichr(
            gene_list=community,
            gene_sets=['KEGG_2021_Human'],
            organism='Human',
            outdir=None
        )
        KEGG_df = enr_path.results

        # Filter by overlap percentage and adjusted p-value
        mask =  (KEGG_df["Adjusted P-value"] < term_score_cap) & (KEGG_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
        filtered = KEGG_df[mask].copy()
        
        # Categorization from KEGG Level 2
        filtered["KEGG_ID"] = filtered["Term"].str.replace(r"\s*-\s*Homo sapiens.*$", "", regex=True).str.lower().map(name_to_id)
        filtered["Category"] = filtered["KEGG_ID"].map(get_kegg_level2)
        
        # Sort
        filtered['Overlap (value)'] = filtered['Overlap'].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]))
        filtered = filtered.sort_values(['Overlap (value)'], ascending=False)
        
        # Compute overlap score for every category:
        filtered_exploded = filtered.explode('Category').reset_index(drop=True)
        category_counts_and_overlap_score = {}
        for val, group in filtered_exploded.groupby('Category'):
            overlap_list = group["Overlap"].tolist()
            numerators = [(lambda x: int(x.split("/")[0]))(e) for e in overlap_list]
            denominators = [(lambda x: int(x.split("/")[1]))(e) for e in overlap_list]
            overlap_score = sum(numerators)/sum(denominators)
            
            category_counts_and_overlap_score[val] = (len(group),overlap_score,)
        
        category_counts_and_overlap_score_list[i] = category_counts_and_overlap_score
        
        # Add results to important terms
        if not filtered.empty:
            # print size of community
            print(f"Size of community: {len(community)}")   
            
            # print number of filtered terms
            print(f"Number of filtered terms: {len(filtered)}")
            filtered.loc[:, "Community Index"] = i
            filtered.loc[:, "Community Size"] = len(community)
            important_terms = pd.concat([important_terms, filtered], ignore_index=True)
            
            # show results
            display(HTML(filtered[["Community Index",'Term','Overlap','Adjusted P-value',"KEGG_ID","Category"]].head(10).to_html(max_cols=None)))
            print(category_counts_and_overlap_score)
            num_nonzero_communities += 1

        i += 1
    print(f"{num_nonzero_communities} out of {len(communities)} communities had significant GO terms.")
    return important_terms,category_counts_and_overlap_score_list

In [31]:
kegg_important_terms,kegg_category_counts_and_overlap_score = kegg_enrichment(COMMUNITIES_HGNC,TERM_SCORE_CAP,PERCENTAGE)

Size of community: 883
Number of filtered terms: 1


  important_terms = pd.concat([important_terms, filtered], ignore_index=True)


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
0,3,Ribosome,34/158,1.64602e-12,hsa03010,[Translation]


{'Translation': (1, 0.21518987341772153)}
Size of community: 826
Number of filtered terms: 4


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
3,4,SNARE interactions in vesicular transport,10/33,2.544328e-05,hsa04130,"[Folding, sorting and degradation]"
1,4,Glycosaminoglycan biosynthesis,15/53,2.104377e-07,,[]
0,4,Spliceosome,33/150,3.614363e-13,hsa03040,[Transcription]
2,4,RNA degradation,16/79,7.434263e-06,hsa03018,"[Folding, sorting and degradation]"


{'Folding, sorting and degradation': (2, 0.23214285714285715), 'Transcription': (1, 0.22)}
Size of community: 747
Number of filtered terms: 3


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
2,5,N-Glycan biosynthesis,11/50,9.406295e-05,hsa00510,[Glycan biosynthesis and metabolism]
1,5,Ribosome biogenesis in eukaryotes,18/108,7.583253e-06,hsa03008,[Translation]
0,5,RNA transport,28/186,5.105571e-08,,[]


{'Glycan biosynthesis and metabolism': (1, 0.22), 'Translation': (1, 0.16666666666666666)}
Size of community: 754
Number of filtered terms: 30


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
0,7,Spliceosome,42/150,7.630515e-23,hsa03040,[Transcription]
1,7,Ubiquitin mediated proteolysis,34/140,1.894155e-16,hsa04120,"[Folding, sorting and degradation]"
12,7,Basal cell carcinoma,15/63,1.472603e-07,hsa05217,[Cancer: specific types]
18,7,Mitophagy,14/68,2.240709e-06,,[]
20,7,Adherens junction,14/71,3.566339e-06,hsa04520,[Cellular community - eukaryotes]
2,7,Hippo signaling pathway,32/163,8.927223e-13,hsa04390,[Signal transduction]
28,7,Nucleotide excision repair,9/47,0.0004186959,hsa03420,[Replication and repair]
17,7,ECM-receptor interaction,16/88,1.993798e-06,hsa04512,[Signaling molecules and interaction]
22,7,Arrhythmogenic right ventricular cardiomyopathy,14/77,9.216888e-06,hsa05412,[Cardiovascular disease]
6,7,Wnt signaling pathway,29/166,1.461896e-10,hsa04310,[Signal transduction]


{'Cancer: overview': (3, 0.12607758620689655), 'Cancer: specific types': (5, 0.16476345840130505), 'Cardiovascular disease': (1, 0.18181818181818182), 'Cell growth and death': (1, 0.10897435897435898), 'Cell motility': (1, 0.11926605504587157), 'Cellular community - eukaryotes': (3, 0.1566265060240964), 'Folding, sorting and degradation': (3, 0.1717948717948718), 'Infectious disease: viral': (1, 0.1148036253776435), 'Replication and repair': (1, 0.19148936170212766), 'Signal transduction': (7, 0.1447166921898928), 'Signaling molecules and interaction': (1, 0.18181818181818182), 'Transcription': (1, 0.28), 'Transport and catabolism': (1, 0.11904761904761904)}
Size of community: 513
Number of filtered terms: 1


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
0,8,Neuroactive ligand-receptor interaction,71/341,4.688945e-42,hsa04080,[Signaling molecules and interaction]


{'Signaling molecules and interaction': (1, 0.20821114369501467)}
Size of community: 220
Number of filtered terms: 1


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,KEGG_ID,Category
0,10,Olfactory transduction,216/440,0.0,hsa04740,[Sensory system]


{'Sensory system': (1, 0.4909090909090909)}
6 out of 15 communities had significant GO terms.


In [32]:
kegg_important_terms

Unnamed: 0,Community Index,Community Size,Term,Overlap,Adjusted P-value,Category,Gene_set,P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,KEGG_ID,Overlap (value)
0,3,883,Ribosome,34/158,1.64602e-12,[Translation],KEGG_2021_Human,1.016062e-14,0.0,0.0,6.133991,197.638758,RPL4;MRPS17;RPL3;MRPS16;MRPS11;RPL12;MRPL19;MR...,hsa03010,0.21519
1,4,826,SNARE interactions in vesicular transport,10/33,2.544328e-05,"[Folding, sorting and degradation]",KEGG_2021_Human,5.328435e-07,0.0,0.0,10.204071,147.39819,STX17;STX16;GOSR1;STX6;VTI1A;STX3;SEC22B;VAMP4...,hsa04130,0.30303
2,4,826,Glycosaminoglycan biosynthesis,15/53,2.104377e-07,[],KEGG_2021_Human,2.203537e-09,0.0,0.0,9.314037,185.658588,HS3ST3B1;CHST7;GLCE;HS6ST2;HS3ST1;NDST1;FUT8;C...,,0.283019
3,4,826,Spliceosome,33/150,3.614363e-13,[Transcription],KEGG_2021_Human,1.892337e-15,0.0,0.0,6.778123,229.784893,ISY1;SF3B4;TCERG1;SF3B5;SF3B2;DHX8;SF3B6;SRSF1...,hsa03040,0.22
4,4,826,RNA degradation,16/79,7.434263e-06,"[Folding, sorting and degradation]",KEGG_2021_Human,1.167685e-07,0.0,0.0,5.992083,95.652055,HSPA9;DDX6;PARN;ENO2;ENO3;TOB1;LSM8;EXOSC5;CNO...,hsa03018,0.202532
5,5,747,N-Glycan biosynthesis,11/50,9.406295e-05,[Glycan biosynthesis and metabolism],KEGG_2021_Human,1.808903e-06,0.0,0.0,7.363225,97.362373,GANAB;ST6GAL2;MAN2A2;MAN1A2;MAN2A1;MGAT5B;MGAT...,hsa00510,0.22
6,5,747,Ribosome biogenesis in eukaryotes,18/108,7.583253e-06,[Translation],KEGG_2021_Human,9.722119e-08,0.0,0.0,5.257339,84.88645,UTP15;POP7;POP1;RPP40;HEATR1;WDR75;NMD3;WDR43;...,hsa03008,0.166667
7,5,747,RNA transport,28/186,5.105571e-08,[],KEGG_2021_Human,3.272802e-10,0.0,0.0,4.706431,102.78942,NUP205;POP7;POP1;DDX20;NMD3;NXF1;XPO5;RPP14;ND...,,0.150538
8,7,754,Spliceosome,42/150,7.630515e-23,[Transcription],KEGG_2021_Human,3.468416e-25,0.0,0.0,10.453027,588.724222,DDX5;RBM25;SF3B3;RBM8A;EIF4A3;HNRNPU;SNU13;PRP...,hsa03040,0.28
9,7,754,Ubiquitin mediated proteolysis,34/140,1.894155e-16,"[Folding, sorting and degradation]",KEGG_2021_Human,1.721959e-18,0.0,0.0,8.52673,348.769409,UBA6;UBE2D2;UBE2D3;NEDD4L;PRPF19;RCHY1;UBE2L3;...,hsa04120,0.242857


### Reactome

In [33]:
# Reactome enrichment
def reactome_enrichment(communities,
                        term_score_cap,
                        percentage):
    important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value","Category"])
    category_counts_and_overlap_score_list= {}
    i = 0
    num_nonzero_communities = 0
    for community in communities:
        enr_path = gp.enrichr(
            gene_list=community,
            gene_sets=['Reactome_2022'],
            organism='Human',
            outdir=None
        )
        Reactome_df = enr_path.results

        # Filter by overlap percentage and adjusted p-value
        mask =  (Reactome_df["Adjusted P-value"] < term_score_cap) & (Reactome_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
        filtered = Reactome_df[mask].copy()
        
        # Categorization from Reactome Level 1
        filtered["Category"] = filtered["Term"].str.extract(r"(R-[A-Z]+-\d+)", expand=False).map(reactome_level1)
        
        # Sort
        filtered['Overlap (value)'] = filtered['Overlap'].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]))
        filtered = filtered.sort_values(['Overlap (value)'], ascending=False)
        
        # Compute overlap score for every category:
        filtered_exploded = filtered.explode('Category').reset_index(drop=True)
        category_counts_and_overlap_score = {}
        for val, group in filtered_exploded.groupby('Category'):
            overlap_list = group["Overlap"].tolist()
            numerators = [(lambda x: int(x.split("/")[0]))(e) for e in overlap_list]
            denominators = [(lambda x: int(x.split("/")[1]))(e) for e in overlap_list]
            overlap_score = sum(numerators)/sum(denominators)
            
            category_counts_and_overlap_score[val] = (len(group),overlap_score,)
        
        category_counts_and_overlap_score_list[i] = category_counts_and_overlap_score
        
        # Add results to important terms
        if not filtered.empty:
            print(f"Size of community: {len(community)}")
            print(f"Number of filtered terms: {len(filtered)}")
            filtered.loc[:, "Community Index"] = i
            filtered.loc[:, "Community Size"] = len(community)
            important_terms = pd.concat([important_terms, filtered], ignore_index=True)
            display(HTML(filtered[["Community Index",'Term','Overlap','Adjusted P-value',"Category"]].head(30).to_html(max_cols=None)))
            print(category_counts_and_overlap_score)
            num_nonzero_communities += 1
        i += 1
    print(f"{num_nonzero_communities} out of {len(communities)} communities had significant GO terms.")
    return important_terms,category_counts_and_overlap_score_list

In [34]:
reactome_important_terms,reactome_category_counts_and_overlap_score = reactome_enrichment(COMMUNITIES_HGNC,TERM_SCORE_CAP,PERCENTAGE)

Size of community: 910
Number of filtered terms: 2


  important_terms = pd.concat([important_terms, filtered], ignore_index=True)


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
1,1,Formation Of Cornified Envelope R-HSA-6809371,19/74,3.061773e-08,[Keratinization]
0,1,Keratinization R-HSA-6805567,49/208,7.911049e-20,[Keratinization]


{'Keratinization': (2, 0.24113475177304963)}
Size of community: 883
Number of filtered terms: 62


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
63,3,Metallothioneins Bind Metals R-HSA-5661231,5/11,0.0006832758,[Response to metal ions]
54,3,Response To Metal Ions R-HSA-5660526,6/14,0.0002089298,[Response to metal ions]
3,3,Mitochondrial Translation Elongation R-HSA-5389840,28/82,7.446369e-16,[Translation]
4,3,Mitochondrial Translation Initiation R-HSA-5368286,28/82,7.446369e-16,[Translation]
2,3,Mitochondrial Translation R-HSA-5368287,29/88,7.446369e-16,[Translation]
5,3,Mitochondrial Translation Termination R-HSA-5419276,27/82,7.0716e-15,[Translation]
21,3,NS1 Mediated Effects On Host Pathways R-HSA-168276,12/42,5.359201e-06,[Infectious disease]
45,3,Defective TPR May Confer Susceptibility Towards Thyroid Papillary Carcinoma (TPC) R-HSA-5619107,9/32,0.0001033035,[Disorders of transmembrane transporters]
30,3,Transport Of Mature mRNA Derived From An Intronless Transcript R-HSA-159231,11/42,3.267625e-05,[Processing of Capped Intron-Containing Pre-mRNA]
53,3,Vpr-mediated Nuclear Import Of PICs R-HSA-180910,9/35,0.000202251,[Infectious disease]


{'Cellular responses to stress': (2, 0.14741035856573706), 'Cytokine Signaling in Immune system': (2, 0.20253164556962025), 'Disorders of transmembrane transporters': (1, 0.28125), 'Infectious disease': (10, 0.215625), 'Membrane Trafficking': (5, 0.1660958904109589), 'Metabolism of RNA': (1, 0.13213213213213212), 'Metabolism of amino acids and derivatives': (2, 0.1568627450980392), 'Metabolism of non-coding RNA': (1, 0.19230769230769232), 'Nonsense-Mediated Decay (NMD)': (2, 0.16176470588235295), 'Post-translational protein modification': (4, 0.13505311077389984), 'Processing of Capped Intron-Containing Pre-mRNA': (8, 0.16228070175438597), 'RNA Polymerase II Transcription': (1, 0.1791044776119403), 'Response to metal ions': (2, 0.44), 'Signaling by Rho GTPases, Miro GTPases and RHOBTB3': (1, 0.12751677852348994), 'Translation': (16, 0.2292713567839196), 'rRNA processing': (4, 0.1850079744816587), 'tRNA processing': (2, 0.16049382716049382)}
Size of community: 826
Number of filtered ter

Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
22,4,KSRP (KHSRP) Binds And Destabilizes mRNA R-HSA-450604,7/17,9.622986e-05,[Regulation of mRNA stability by proteins that bind AU-rich elements]
35,4,"Tristetraprolin (TTP, ZFP36) Binds And Destabilizes mRNA R-HSA-450513",6/17,0.0009275528,[Regulation of mRNA stability by proteins that bind AU-rich elements]
30,4,Sema4D Induced Cell Migration And Growth-Cone Collapse R-HSA-416572,7/20,0.0002552494,[Nervous system development]
5,4,Retrograde Transport At Trans-Golgi-Network R-HSA-6811440,16/48,5.530059e-09,[Membrane Trafficking]
32,4,RHOBTB2 GTPase Cycle R-HSA-9013418,7/23,0.0006802642,"[Signaling by Rho GTPases, Miro GTPases and RHOBTB3]"
34,4,Sema4D In Semaphorin Signaling R-HSA-400685,7/24,0.0008733398,[Nervous system development]
8,4,mRNA 3-End Processing R-HSA-72187,16/58,8.874492e-08,[Processing of Capped Intron-Containing Pre-mRNA]
21,4,mRNA Splicing - Minor Pathway R-HSA-72165,12/49,1.907113e-05,[Processing of Capped Intron-Containing Pre-mRNA]
18,4,Late SARS-CoV-2 Infection Events R-HSA-9772573,14/58,3.029576e-06,[Infectious disease]
12,4,RNA Polymerase II Transcription Termination R-HSA-73856,16/67,6.095239e-07,[RNA Polymerase II Transcription]


{'Adaptive Immune System': (2, 0.10948905109489052), 'Infectious disease': (1, 0.2413793103448276), 'Membrane Trafficking': (4, 0.1707920792079208), 'Metabolism of RNA': (1, 0.10810810810810811), 'Metabolism of carbohydrates and carbohydrate derivatives': (1, 0.14166666666666666), 'Nervous system development': (2, 0.3181818181818182), 'Post-translational protein modification': (3, 0.12607944732297063), 'Processing of Capped Intron-Containing Pre-mRNA': (7, 0.21575342465753425), 'RNA Polymerase II Transcription': (1, 0.23880597014925373), 'Regulation of mRNA stability by proteins that bind AU-rich elements': (2, 0.38235294117647056), 'Signaling by Rho GTPases, Miro GTPases and RHOBTB3': (8, 0.14129443938012762)}
Size of community: 747
Number of filtered terms: 7


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
2,5,rRNA Modification In Nucleus And Cytosol R-HSA-6790901,14/60,8e-06,[rRNA processing]
5,5,mRNA 3-End Processing R-HSA-72187,13/58,2.1e-05,[Processing of Capped Intron-Containing Pre-mRNA]
4,5,RNA Polymerase II Transcription Termination R-HSA-73856,14/67,2.1e-05,[RNA Polymerase II Transcription]
10,5,Transport Of Mature mRNA Derived From An Intron-Containing Transcript R-HSA-159236,13/74,0.000221,[Processing of Capped Intron-Containing Pre-mRNA]
8,5,Transport Of Mature Transcript To Cytoplasm R-HSA-72202,14/83,0.000188,[Processing of Capped Intron-Containing Pre-mRNA]
9,5,Rab Regulation Of Trafficking R-HSA-9007101,17/122,0.000221,[Membrane Trafficking]
11,5,Intra-Golgi And Retrograde Golgi-to-ER Traffic R-HSA-6811442,20/181,0.000959,[Membrane Trafficking]


{'Membrane Trafficking': (2, 0.12211221122112212), 'Processing of Capped Intron-Containing Pre-mRNA': (3, 0.18604651162790697), 'RNA Polymerase II Transcription': (1, 0.208955223880597), 'rRNA processing': (1, 0.23333333333333334)}
Size of community: 844
Number of filtered terms: 3


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
1,6,tRNA Modification In Nucleus And Cytosol R-HSA-6782315,13/42,3.258899e-06,[tRNA processing]
0,6,tRNA Processing R-HSA-72306,24/105,5.580623e-09,[tRNA processing]
2,6,RNA Polymerase II Transcribes snRNA Genes R-HSA-6807505,15/74,8.137731e-05,[RNA Polymerase II Transcription]


{'RNA Polymerase II Transcription': (1, 0.20270270270270271), 'tRNA processing': (2, 0.25170068027210885)}
Size of community: 754
Number of filtered terms: 112


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
41,7,SLBP Independent Processing Of Histone Pre-mRNAs R-HSA-111367,7/10,3.080156e-07,[Processing of Capped Intronless Pre-mRNA]
113,7,Fibronectin Matrix Formation R-HSA-1566977,4/6,0.0002814269,[Fibronectin matrix formation]
45,7,SLBP Dependent Processing Of Replication-Dependent Histone Pre-mRNAs R-HSA-77588,7/11,7.504515e-07,[Processing of Capped Intronless Pre-mRNA]
88,7,Cross-presentation Of Particulate Exogenous Antigens (Phagosomes) R-HSA-1236973,5/8,4.878075e-05,[Adaptive Immune System]
125,7,Signaling By RNF43 Mutants R-HSA-5340588,4/7,0.0005763452,[Diseases of signal transduction by growth factor receptors and second messengers]
40,7,Signaling By FGFR2 IIIa TM R-HSA-8851708,9/19,2.669407e-07,[Diseases of signal transduction by growth factor receptors and second messengers]
18,7,Synthesis Of Active Ubiquitin: Roles Of E1 And E2 Enzymes R-HSA-8866652,14/30,5.154584e-11,[Post-translational protein modification]
129,7,IRF3-mediated Induction Of Type I IFN R-HSA-3270619,5/13,0.0006554585,[Innate Immune System]
106,7,STING Mediated Induction Of Host Immune Responses R-HSA-1834941,6/16,0.0001727111,[Innate Immune System]
73,7,RHOBTB1 GTPase Cycle R-HSA-9013422,8/23,1.780972e-05,"[Signaling by Rho GTPases, Miro GTPases and RHOBTB3]"


{'Adaptive Immune System': (3, 0.12554112554112554), 'Cell surface interactions at the vascular wall': (1, 0.13432835820895522), 'Chromosome Maintenance': (1, 0.1271186440677966), 'Cytokine Signaling in Immune system': (7, 0.14759036144578314), 'DNA Damage Bypass': (1, 0.22916666666666666), 'DNA Repair': (1, 0.11290322580645161), 'Deadenylation-dependent mRNA decay': (1, 0.17857142857142858), 'Diseases of signal transduction by growth factor receptors and second messengers': (7, 0.17447495961227788), 'ECM proteoglycans': (1, 0.18181818181818182), 'Elastic fibre formation': (1, 0.2564102564102564), 'Extracellular matrix organization': (1, 0.12371134020618557), 'Fibronectin matrix formation': (1, 0.6666666666666666), 'Infectious disease': (15, 0.15186915887850466), 'Innate Immune System': (6, 0.14644970414201183), 'Integrin cell surface interactions': (1, 0.22727272727272727), 'Laminin interactions': (1, 0.3181818181818182), 'MAPK family signaling cascades': (1, 0.10108303249097472), 'Me

Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
8,8,Lysosphingolipid And LPA Receptors R-HSA-419408,10/14,2.888906e-12,[Signaling by GPCR]
16,8,Relaxin Receptors R-HSA-444821,5/8,8.479987e-06,[Signaling by GPCR]
13,8,P2Y Receptors R-HSA-417957,7/12,8.94413e-08,[Signaling by GPCR]
10,8,Nucleotide-like (Purinergic) Receptors R-HSA-418038,9/16,1.001158e-09,[Signaling by GPCR]
17,8,Prostanoid Ligand Receptors R-HSA-391908,5/9,1.76393e-05,[Signaling by GPCR]
19,8,Opsins R-HSA-419771,4/9,0.0006130659,[Signaling by GPCR]
18,8,Eicosanoid Ligand-Binding Receptors R-HSA-391903,5/15,0.000350471,[Signaling by GPCR]
0,8,Class A/1 (Rhodopsin-like Receptors) R-HSA-373076,91/327,3.2101570000000004e-66,[Signaling by GPCR]
4,8,Peptide Ligand-Binding Receptors R-HSA-375276,53/196,1.848471e-37,[Signaling by GPCR]
5,8,G Alpha (Q) Signaling Events R-HSA-416476,46/212,7.309375e-28,[Signaling by GPCR]


{'Infectious disease': (2, 0.14864864864864866), 'Signaling by GPCR': (16, 0.2032206119162641)}
Size of community: 220
Number of filtered terms: 3


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
1,10,Expression And Translocation Of Olfactory Receptors R-HSA-9752946,208/393,0.0,[Olfactory Signaling Pathway]
0,10,Olfactory Signaling Pathway R-HSA-381753,208/401,0.0,[Olfactory Signaling Pathway]
2,10,Sensory Perception R-HSA-9709957,208/616,8.405358e-313,[Sensory Perception]


{'Olfactory Signaling Pathway': (2, 0.5239294710327456), 'Sensory Perception': (1, 0.33766233766233766)}
Size of community: 134
Number of filtered terms: 7


Unnamed: 0,Community Index,Term,Overlap,Adjusted P-value,Category
0,12,Beta Defensins R-HSA-1461957,20/35,1.1479820000000001e-33,[Innate Immune System]
5,12,Sodium/Proton Exchangers R-HSA-425986,4/8,9.622184e-07,[SLC-mediated transmembrane transport]
6,12,Transport Of Fatty Acids R-HSA-804914,4/8,9.622184e-07,[SLC-mediated transmembrane transport]
1,12,Defensins R-HSA-1461973,20/43,1.6257750000000002e-31,[Innate Immune System]
7,12,Bicarbonate Transporters R-HSA-425381,4/10,2.499678e-06,[SLC-mediated transmembrane transport]
2,12,Antimicrobial Peptides R-HSA-6803157,20/89,3.484511e-24,[Innate Immune System]
3,12,Transport Of Inorganic Cations/Anions And Amino Acids/Oligopeptides R-HSA-425393,11/104,1.171183e-09,


{'Innate Immune System': (3, 0.3592814371257485), 'SLC-mediated transmembrane transport': (3, 0.46153846153846156)}
9 out of 15 communities had significant GO terms.


In [35]:
reactome_important_terms

Unnamed: 0,Community Index,Community Size,Term,Overlap,Adjusted P-value,Category,Gene_set,P-value,Old P-value,Old Adjusted P-value,Odds Ratio,Combined Score,Genes,Overlap (value)
0,1,910,Formation Of Cornified Envelope R-HSA-6809371,19/74,3.061773e-08,[Keratinization],Reactome_2022,5.278919e-10,0.0,0.0,7.380165,157.656047,SPRR2F;SPINK6;LCE1E;LCE1C;LCE2D;DSPP;LCE2B;LCE...,0.256757
1,1,910,Keratinization R-HSA-6805567,49/208,7.911049e-20,[Keratinization],Reactome_2022,6.819870e-22,0.0,0.0,6.775937,330.239056,KRTAP24-1;KRTAP26-1;LCE1B;LIPJ;LCE5A;LIPN;KRTA...,0.235577
2,3,883,Metallothioneins Bind Metals R-HSA-5661231,5/11,6.832758e-04,[Response to metal ions],Reactome_2022,6.133191e-05,0.0,0.0,18.138762,175.931670,MT1A;MT1F;MT1H;MT1X;MT1E,0.454545
3,3,883,Response To Metal Ions R-HSA-5660526,6/14,2.089298e-04,[Response to metal ions],Reactome_2022,1.611660e-05,0.0,0.0,16.341790,180.342448,MT1A;CSRP1;MT1F;MT1X;MT1H;MT1E,0.428571
4,3,883,Mitochondrial Translation Elongation R-HSA-538...,28/82,7.446369e-16,[Translation],Reactome_2022,5.221858e-18,0.0,0.0,11.560840,460.048365,MRPS17;MRPS16;MRPS11;MRPS33;MRPL19;MRPS31;MRPL...,0.341463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,12,134,Transport Of Fatty Acids R-HSA-804914,4/8,9.622184e-07,[SLC-mediated transmembrane transport],Reactome_2022,1.320692e-07,0.0,0.0,152.784615,2420.099125,LCN9;LCN15;LCN12;LCN1,0.500000
241,12,134,Defensins R-HSA-1461973,20/43,1.625775e-31,[Innate Immune System],Reactome_2022,6.375587e-33,0.0,0.0,151.357742,11220.578051,DEFB105A;DEFB119;DEFB129;DEFB131A;DEFB116;DEFB...,0.465116
242,12,134,Bicarbonate Transporters R-HSA-425381,4/10,2.499678e-06,[SLC-mediated transmembrane transport],Reactome_2022,3.921063e-07,0.0,0.0,101.846154,1502.407250,SLC4A9;SLC4A10;SLC4A2;SLC4A5,0.400000
243,12,134,Antimicrobial Peptides R-HSA-6803157,20/89,3.484511e-24,[Innate Immune System],Reactome_2022,2.049712e-25,0.0,0.0,50.335622,2861.425458,DEFB105A;DEFB119;DEFB129;DEFB131A;DEFB116;DEFB...,0.224719


### Disease Data Sets

In [36]:
# disease_term_score_cap = 0.001
# disease_percentage = 0.1
# important_diseases = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value"])

In [37]:
# # Disease-gene enrichment libraries
# disease_sets = [
#     'DisGeNET_2020', # curated gene–disease associations
#     'GWAS_Catalog_2023', # genome-wide association hits
#     'OMIM_Disease', # Mendelian disorders
#     'Jensen_DISEASES' # text-mined associations
# ]

# # # Disease-gene enrichment Analysis; save terms with small size and high p-value
# i = 0
# for community in communities_HGNC:
#     # Gene Ontology enrichment
#     enr_disease = gp.enrichr(
#         gene_list=community,
#         gene_sets=disease_sets,
#         organism='Human',
#         outdir=None # don't write to disk
#     )
#     enr_disease_df = enr_disease.results.sort_values('Adjusted P-value')
#     print(f"Size of community: {len(community)}")

#     mask =  (enr_disease_df["Adjusted P-value"] < disease_term_score_cap) & (enr_disease_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > disease_percentage))
        
#     filtered = enr_disease_df[mask].copy()
#     if not filtered.empty:
#         filtered.loc[:, "Community Index"] = i
#         filtered.loc[:, "Community Size"] = len(community)
#         important_diseases = pd.concat([important_diseases, filtered], ignore_index=True)

#     display(HTML(filtered[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
#     i += 1

# Important Terms Analysis

### Constructing Important Terms df

In [None]:
def comm_similarity_with_term(x,y):
    return 1-(abs(x-y)/max(x,y))

In [None]:
important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value","Category"])

In [None]:
c = [go_important_terms,kegg_important_terms,reactome_important_terms]
important_terms = pd.concat(c, ignore_index=True)

In [None]:
# important_terms = important_terms.sort_values(by="Overlap (value)",ascending=False)
important_terms = important_terms.sort_values(by="Community Index")
important_terms

In [None]:
# Community id to size dict
com_id_to_size = {i : len(COMMUNITIES_HGNC[i]) for i in range(len(COMMUNITIES_HGNC))}

In [None]:
unique_com_id_to_size = important_terms.drop_duplicates(subset="Community Index", keep="first")

In [None]:
comm_size_dict = dict(zip(unique_com_id_to_size["Community Index"], unique_com_id_to_size["Community Size"]))

In [None]:
important_terms.to_csv(f"../output/{DISEASE}/important_terms_{DISEASE}.csv", index=False)

### Graph Building

In [None]:
# df must have: "Community Index", "Term", "Overlap (value)"
work = important_terms.loc[:, ["Community Index", "Term", "Overlap (value)","Category"]].copy()
work["Overlap (value)"] = work["Overlap (value)"].astype(float)

# Ensure (community, term) uniqueness
dupes = work.duplicated(subset=["Community Index", "Term"], keep=False)
if dupes.any():
    raise ValueError("Duplicated (Community Index, Term) rows found; ensure uniqueness first.")

# --- Build edge weights AND collect contributing terms per pair ---
edge_weights = {}              # (u, v) -> float
edge_counts  = {}              # (u, v) -> int
edge_terms   = {}              # (u, v) -> list[(term, contrib)]

for term, sub in work.groupby("Term", sort=False):
    comms  = sub["Community Index"].to_numpy()
    scores = sub["Overlap (value)"].to_numpy()
    if len(comms) < 2:
        continue
    for i, j in combinations(range(len(comms)), 2):
        u, v = comms[i], comms[j]
        if u > v: u, v = v, u  # canonical ordering
        contrib = comm_similarity_with_term(scores[i], scores[j])

        edge_weights[(u, v)] = edge_weights.get((u, v), 0.0) + contrib
        edge_counts[(u, v)]  = edge_counts.get((u, v), 0)    + 1
        edge_terms.setdefault((u, v), []).append((term, contrib))

# Sort contributing terms by contribution desc for each edge
for key in edge_terms:
    edge_terms[key].sort(key=lambda t: t[1], reverse=True)

# --- Build edge list DataFrame (optional, useful to inspect) ---
edge_df = pd.DataFrame(
    [(u, v, edge_weights[(u, v)], edge_counts[(u, v)], edge_terms.get((u, v), []))
     for (u, v) in edge_weights.keys()],
    columns=["u", "v", "weight", "shared_terms", "terms_contrib"]
).sort_values(["weight", "shared_terms"], ascending=[False, False]).reset_index(drop=True)

# --- Build NetworkX graph with attributes ---
G = nx.Graph()
G.add_nodes_from(pd.unique(work["Community Index"]))
for _, r in edge_df.iterrows():
    G.add_edge(
        int(r.u), int(r.v),
        weight=float(r.weight),
        shared_terms=int(r.shared_terms),
        terms_contrib=r.terms_contrib  # list of (term, contrib) sorted desc
    )

In [None]:
work

### Table

In [None]:
#--------------Table------------------
term_contribs = []

for term, sub in work.groupby("Term", sort=False):
    comms  = sub["Community Index"].to_numpy()
    scores = sub["Overlap (value)"].to_numpy()
    if len(comms) < 2:
        continue
    for i, j in combinations(range(len(comms)), 2):
        u, v = comms[i], comms[j]
        if u > v:
            u, v = v, u
        contrib = comm_similarity_with_term(scores[i], scores[j])
        cat = sub["Category"].iloc[0] if "Category" in sub.columns else None
        term_contribs.append((u, v, term, contrib, cat))

# 2) Build DataFrame
term_df = pd.DataFrame(term_contribs, columns=["u", "v", "Term", "Contribution","Category"])
# 3) Sort and aggregate terms per edge (keep per-term order)
agg_blocks = []
for (u, v), sub in term_df.groupby(["u", "v"]):
    sub_sorted = sub.sort_values("Contribution", ascending=False)
    
    # Create category count dictionary
    category_counts = Counter(
        c
        for cats in sub_sorted["Category"].dropna()
        for c in cats
    )
    category_counts_dict = dict(category_counts)

    # sub_sorted = sub.sort_values(sub_sorted["Category"].apply(tuple), ascending=False)
    block = "\n".join(
        [f"  - {t} {cat} ({c:.3f})"
        for t, c, cat in zip(sub_sorted["Term"], sub_sorted["Contribution"], sub_sorted["Category"])]
    )
    total = sub_sorted["Contribution"].sum()
    agg_blocks.append({
        "u": u,
        "v": v,
        "Total Weight": total,
        "Terms (by contribution)": block,
        "Category Count": category_counts_dict
    })

# 4) Create final block table
block_df = pd.DataFrame(agg_blocks).sort_values("Total Weight", ascending=False).reset_index(drop=True)
# 5) Display nicely
for _, row in block_df.iterrows():
    print(f"Community pair ({row.u}, {row.v}) — Total Weight = {row['Total Weight']:.3f}")
    print(row["Terms (by contribution)"])
    
    print()
    print("Category Count:")
    for key, value in sorted(row["Category Count"].items(), key=lambda x: x[1], reverse=True):
        print(f"{key}: {value}")

    print("-" * 60)

In [None]:
# block_df.to_excel("output.xlsx", index=False)

### Category Counts

In [None]:
def print_category_count_by_comm(category_count_by_comm):
    for comm_id, cat_dict in category_count_by_comm.items():
        print(f"\n🧩 Community {comm_id}")
        print("-" * (14 + len(str(comm_id))))

        if not cat_dict:
            print("  (no categories)")
            continue

        # Sort categories by descending count
        for cat, count in sorted(cat_dict.items(), key=lambda x: x[1], reverse=True):
            print(f"  • {cat:<50} {count}")

In [None]:
category_count_by_comm = {}
for i in range(num_selected_comm):
    comm_cates = go_category_counts_and_overlap_score[i] | kegg_category_counts_and_overlap_score[i] | reactome_category_counts_and_overlap_score[i]
    category_count_by_comm[i] = dict(sorted(comm_cates.items(), key=lambda x: x[1],reverse=True))

In [None]:
print_category_count_by_comm(category_count_by_comm)

### Visualization

# Robustness Analysis

In [None]:
def run_enrichment_func(community,term_score_cap,percentage):
    # GO df
    enr_go = gp.enrichr(
        gene_list=community,
        gene_sets=['GO_Biological_Process_2023',
                'GO_Molecular_Function_2023',
                'GO_Cellular_Component_2023'],
        organism='Human',
        outdir=None # don't write to disk
    )
    GO_df = enr_go.results
    mask =  (GO_df["Adjusted P-value"] < term_score_cap) & (GO_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    GO_df = GO_df[mask].copy()   
    
    # KEGG df
    enr_kegg = gp.enrichr(
        gene_list=community,
        gene_sets=['KEGG_2021_Human'],
        organism='Human',
        outdir=None
    )
    KEGG_df = enr_kegg.results
    mask =  (KEGG_df["Adjusted P-value"] < term_score_cap) & (KEGG_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    KEGG_df = KEGG_df[mask].copy() 
       
    # Reactome df
    enr_reactome = gp.enrichr(
        gene_list=community,
        gene_sets=['Reactome_2022'],
        organism='Human',
        outdir=None
    )
    Reactome_df = enr_reactome.results  
    mask =  (Reactome_df["Adjusted P-value"] < term_score_cap) & (Reactome_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    Reactome_df = Reactome_df[mask].copy()
    
    
    all_df = [GO_df,KEGG_df,Reactome_df]
    # build result df by concatenating
    result = pd.concat(all_df, ignore_index=True)
    return result

In [None]:
from json import JSONDecodeError

# ---------------- 1) Safe wrapper that calls YOUR enrichr function ----------------
_ENR_CACHE = {}  # key: tuple(sorted(genes)) -> DataFrame (copy)

def run_enrichment_safe(run_enrichment_func, community, retries=5, base_sleep=0.8):
    """
    Calls user's run_enrichment_func(community) with retries + memoization.
    Returns a DataFrame (possibly empty). Never raises JSONDecodeError outward.
    """
    # Ensure we always pass a list of gene symbols (never a bare string)
    genes = np.atleast_1d(np.array(community, dtype=object)).tolist()
    if len(genes) == 0:
        return pd.DataFrame()

    key = tuple(sorted(genes))
    if key in _ENR_CACHE:
        return _ENR_CACHE[key].copy()

    for a in range(retries):
        try:
            df = run_enrichment_func(genes,TERM_SCORE_CAP,PERCENTAGE)
            if df is None:
                # treat as transient failure to trigger retry
                raise RuntimeError("run_enrichment_func returned None")
            _ENR_CACHE[key] = df.copy()
            return df
        except (JSONDecodeError, OSError, RuntimeError, ValueError) as e:
            # Transient errors from HTTP/JSON/file handling inside gseapy
            if a == retries - 1:
                # Give up: return empty so pipeline continues
                return pd.DataFrame()
            time.sleep(base_sleep * (2 ** a) + np.random.rand() * 0.3)

    return pd.DataFrame()

# ---------------- 2) Minimal bootstrap to record robust terms ----------------
def get_robust_terms(communities_HGNC, run_enrichment_func,
                     R=50, leaveout=0.10, recurrence_cutoff=0.70, seed=42):
    """
    Uses YOUR run_enrichment_func(community)->DataFrame (already filtered to significant terms).
    Returns DataFrame with columns: community_id, term, recurrence (and Gene_set if available).
    """
    rng = np.random.default_rng(seed)
    rows = []

    for cid, community in enumerate(communities_HGNC):
        n = len(community)
        if n == 0:
            continue
        drop_k = max(1, int(np.floor(leaveout * n)))
        counts = Counter()

        for _ in range(R):
            # Jackknife subset (ensure not empty)
            keep = np.ones(n, dtype=bool)
            keep[rng.choice(n, size=min(drop_k, n), replace=False)] = False
            sub = np.atleast_1d(np.array(community, dtype=object)[keep]).tolist()
            if len(sub) == 0:
                continue

            df = run_enrichment_safe(run_enrichment_func, sub)
            if df is None or df.empty:
                continue

            # Your function already returns significant terms; just count them.
            # If it includes multiple libraries, preserve Gene_set to disambiguate names.
            if 'Term' not in df.columns:
                continue  # be defensive

            if 'Gene_set' in df.columns:
                terms = (df[['Term', 'Gene_set']]
                         .dropna()
                         .drop_duplicates()
                         .apply(lambda r: f"{r['Term']}|{r['Gene_set']}", axis=1)
                         .tolist())
            else:
                terms = df['Term'].dropna().drop_duplicates().tolist()

            counts.update(terms)

            # tiny pause helps with API rate limits if your func calls Enrichr internally
            time.sleep(0.03)

        # Keep only robust terms
        for t, c in counts.items():
            freq = c / max(R, 1)
            if freq >= recurrence_cutoff:
                if '|' in t:
                    term, gene_set = t.split('|', 1)
                    rows.append({'Community Index': cid, 'Term': term, 'recurrence': freq, 'Gene_set': gene_set})
                else:
                    rows.append({'Community Index': cid, 'Term': t, 'recurrence': freq})

    return (pd.DataFrame(rows)
              .sort_values(['Community Index', 'recurrence'], ascending=[True, False])
              .reset_index(drop=True))

In [None]:
twr3 = get_robust_terms([COMMUNITIES_HGNC[1]], run_enrichment_func,
                                R=25, leaveout=0.1, recurrence_cutoff=0)

In [None]:
twr3

In [None]:
terms_with_recurrence = get_robust_terms(COMMUNITIES_HGNC, run_enrichment_func,
                                R=10, leaveout=0.1, recurrence_cutoff=0)

In [None]:
terms_with_recurrence

In [None]:
# rename important terms to match terms_with_recurrence
important_terms = important_terms.rename(columns={'index': 'community_id'})
important_terms = important_terms.rename(columns={'Term': 'term'})

In [None]:
terms_with_rec_merged = important_terms.merge(
    terms_with_recurrence[['community_id', 'term', 'Gene_set', 'recurrence']],
    on=['community_id', 'term', 'Gene_set'],
    how='left'
)

terms_with_rec_merged['recurrence'] = terms_with_rec_merged['recurrence'].fillna(0.0)

terms_with_rec_merged = terms_with_rec_merged.sort_values(
    ['community_id', 'recurrence'],
    ascending=[True, False]
).reset_index(drop=True)

In [None]:
terms_with_rec_merged

In [None]:
community_summary = (
    terms_with_rec_merged
    .groupby("community_id")["recurrence"]
    .agg(mean_recurrence="mean", term_count="count")
    .reset_index()
)

print(community_summary)

In [None]:
display(HTML(terms_with_recurrence.to_html(max_cols=None)))

# Checks!

In [None]:
for c in communities:
    print(len(c))

In [None]:
DGIDB_genes_ncbi = list(DGIDB_gene_to_index.keys())

In [None]:
all_comms_ncbi = index_to_ncbi(communities,index_to_gene_distinct)

In [None]:
print(all_comms_ncbi)

In [None]:
def DGIDB_count(c):
    return len(set(c) & set(DGIDB_genes_ncbi))

In [None]:
for c in all_comms_ncbi:
    print(len(c),DGIDB_count(c))

In [None]:
def tbd(id):
    print(len(communities[id]))
    c8_ncbi = index_to_ncbi([communities[id]])[0]
    print(len(c8_ncbi))
    print(DGIDB_count(c8_ncbi))

In [None]:
def tbd_selected(id):
    print(len(communities_selected[id]))
    c8_ncbi = index_to_ncbi([communities_selected[id]])[0]
    print(len(c8_ncbi))
    print(DGIDB_count(c8_ncbi))

In [None]:
tbd_selected(1)