# Import

In [20]:
import numpy as np
import json
from scipy.sparse import load_npz,save_npz,diags,csr_matrix
import scipy.sparse as sp
import pandas as pd
import os
import requests
from io import BytesIO
from tqdm import tqdm
from scipy.sparse.linalg import eigsh
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib.backends.backend_pdf import PdfPages
from pypdf import PdfReader, PdfWriter
from tempfile import NamedTemporaryFile
import networkx as nx
import pickle
import gseapy as gp
import mygene
from IPython.display import display, HTML
import re

In [21]:
pd.set_option('display.width', None)      # No line-wrapping
pd.set_option('display.max_columns', None)  # Show all columns

# Prep

In [22]:
DISEASE = "BIPOLAR"
DISEASE_FOLDER = f"../output/{DISEASE}/"
RESULT_FOLDER = DISEASE_FOLDER + "leiden_results/"
DGIDB_DIRECTORY = f"../../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
MSIGDB_DIRECTORY = "../../Gen_Hypergraph/output/MSigDB_Full/"

with open(DISEASE_FOLDER + "gene_to_index_distinct.json", "r") as file:
    gene_to_index_distinct = json.load(file)

In [23]:
index_to_gene_distinct = {v: k for k, v in gene_to_index_distinct.items()}

In [24]:
def communities_cutoff(communities, cutoff = 100):
    result = []
    for community in communities:
        if len(community) >= cutoff:
            result.append(community)

    return result, len(result)

def community_central_genes_by_num(G, community_nodes, weight="weight", top_n=20):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    return [u for u in ranked[:top_n]]

def community_central_genes_by_score(G, community_nodes, weight="weight",score_cap = 1):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    return [u for u in ranked if Z[u] >= score_cap]

def community_central_genes_by_pct(G, community_nodes, weight="weight",pct = 0.3):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    top = int(len(ranked)*pct)
    return [u for u in ranked[:top]]

In [25]:
# # Combine indices from both layers, first DGIDB and then MSIGDB
# dgidb_rev = {index : gene for gene,index in dgidb.items()}
# msigdb_rev = {index : gene for gene,index in msigdb.items()}

# a_max = max(dgidb_rev.keys())
# b_min = min(msigdb_rev.keys())
# shift = (a_max + 1) - b_min   # ensures no overlap
# index_to_gene = {**dgidb_rev, **{k + shift: v for k, v in msigdb_rev.items()}}

# print(dgidb_rev)
# print(msigdb_rev)
# print(index_to_gene)

In [26]:
# Loading result graph and communities
with open(f"{RESULT_FOLDER}/result_communities.pkl", "rb") as f:
    communities = pickle.load(f)
with open(f"{RESULT_FOLDER}/result_graph.pkl", "rb") as f:
    graph = pickle.load(f)

In [27]:
# Checking communities
communities_greater_100, num_greater_100 = communities_cutoff(communities,100)
print(communities_greater_100, '\n', num_greater_100)
print(len(communities))

[[11, 12, 21, 39, 43, 47, 50, 52, 54, 55, 58, 59, 62, 67, 68, 82, 83, 94, 101, 102, 112, 113, 114, 119, 120, 122, 129, 132, 138, 142, 144, 145, 146, 148, 153, 154, 159, 160, 161, 164, 188, 195, 202, 203, 204, 205, 206, 207, 208, 212, 213, 217, 218, 220, 221, 222, 224, 234, 235, 237, 238, 239, 240, 241, 250, 255, 258, 259, 263, 265, 267, 271, 272, 274, 278, 279, 281, 284, 287, 288, 289, 291, 292, 295, 296, 297, 299, 300, 303, 307, 308, 320, 328, 337, 349, 389, 390, 398, 399, 400, 401, 402, 403, 404, 405, 407, 409, 411, 412, 413, 414, 415, 416, 421, 424, 426, 427, 440, 445, 447, 448, 449, 451, 452, 458, 459, 460, 462, 463, 464, 465, 467, 468, 469, 470, 472, 475, 477, 478, 479, 482, 484, 486, 487, 491, 494, 495, 498, 499, 501, 505, 518, 522, 530, 535, 538, 539, 540, 541, 562, 575, 579, 580, 581, 583, 585, 590, 594, 595, 596, 602, 603, 605, 606, 608, 610, 611, 614, 618, 619, 620, 621, 622, 625, 626, 627, 629, 634, 641, 642, 643, 645, 647, 648, 649, 652, 655, 659, 661, 664, 669, 674, 675, 6

In [28]:
# Update communities to include only the important genes, determined by a community-size_cap and z-score_cap
size_cap = 50
pct = 0.3
communities_selected = []
for community in communities:
    if (len(community) >= size_cap):
        important_nodes = community_central_genes_by_pct(graph,community,pct = pct)
        communities_selected.append(important_nodes)

In [29]:
print(communities_selected)
print(len(communities_selected))

[[8619, 8888, 12715, 12533, 4207, 8323, 4037, 5853, 8623, 6884, 8954, 8128, 10105, 12866, 7037, 9297, 10481, 6607, 5382, 14651, 8864, 4706, 8552, 5133, 7353, 540, 10632, 7715, 8248, 11116, 10611, 10229, 7818, 9652, 16522, 12157, 4624, 13742, 10946, 10987, 8577, 8580, 10749, 8993, 6697, 9759, 9894, 2299, 8937, 12188, 5859, 5136, 13543, 15550, 10847, 10175, 5663, 8378, 7077, 12345, 14087, 8398, 5064, 7020, 12168, 6490, 7513, 10650, 10468, 7981, 12360, 6486, 6700, 10545, 15165, 2669, 7537, 11785, 6710, 11844, 5720, 6659, 5854, 10531, 14327, 6985, 14626, 6537, 6669, 1910, 6437, 12187, 6973, 6057, 13238, 6018, 4441, 15996, 15825, 11673, 539, 8101, 11825, 3140, 7626, 11212, 3758, 8320, 6580, 6695, 14481, 6655, 6487, 2218, 6595, 15932, 11842, 8187, 2916, 6237, 10062, 10968, 6692, 12420, 6756, 5089, 6252, 8841, 13488, 5758, 7433, 11735, 7359, 6577, 4447, 12882, 11284, 16738, 7607, 9661, 7675, 17203, 13518, 13180, 8811, 7578, 15335, 7716, 9924, 6277, 11130, 8312, 7295, 10027, 10128, 9686, 3131,

In [30]:
# Convert index to ncbi
communities_ncbi = [list(map(index_to_gene_distinct.get, c)) for c in communities_selected]
print(communities_ncbi)

[['23673', '26065', '79016', '64771', '6103', '23271', '5862', '8674', '23682', '9993', '26207', '23034', '51322', '79567', '10181', '27252', '54453', '9679', '7756', '91966', '26036', '6738', '23580', '7325', '10565', '754', '54765', '11030', '23185', '55357', '54726', '51542', '11158', '29896', '157922', '58480', '6642', '83892', '55161', '55207', '23608', '23613', '54918', '26268', '9779', '30851', '51068', '3300', '26156', '58517', '8682', '7328', '81671', '128338', '55031', '51444', '8434', '23347', '10228', '64062', '84668', '23369', '7205', '10160', '58493', '9532', '10776', '54800', '54431', '22836', '64089', '9528', '9782', '54542', '117854', '3842', '10807', '57132', '9792', '57222', '8507', '9736', '8675', '54520', '85377', '10123', '91746', '9590', '9747', '2803', '9467', '58516', '10106', '8934', '80114', '8882', '6426', '143686', '138151', '56938', '753', '22998', '57187', '4651', '10923', '55616', '5495', '23268', '9648', '9777', '90355', '9732', '9529', '3184', '9666', 

In [31]:
# NCBI to HGNC symbol
communities_HGNC = []
missed = []


for community in communities_ncbi:
    mg = mygene.MyGeneInfo()
    entrez_ids = [str(e) for e in community]

    results = mg.querymany(
        entrez_ids,
        scopes="entrezgene",
        fields="symbol",
        species="human"
    )

    # Build a mapping: input ID -> symbol (or None)
    id_to_symbol = {}
    for r in results:
        q = str(r.get("query"))
        id_to_symbol[q] = r.get("symbol") if not r.get("notfound") else None

    # Preserve original order
    symbols = [id_to_symbol.get(str(e), None) for e in entrez_ids]
    communities_HGNC.append(symbols)


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequen

In [32]:
print(communities_HGNC)

[['STX12', 'LSM14A', 'DDA1', 'ILRUN', 'RPGR', 'CAMSAP2', 'RAB2A', 'VAMP4', 'RAB38', 'DGCR2', 'PITPNC1', 'SAMD4A', 'WAC', 'RIPOR1', 'RBM5', 'KLHL20', 'RIN2', 'FAM53B', 'ZNF207', 'EOLA1', 'ZNF451', 'RO60', 'CDC42EP4', 'UBE2E2', 'ARFGEF1', 'PTTG1IP', 'TRIM44', 'RBPMS', 'LARP4B', 'TBC1D2', 'OTUD4', 'VPS54', 'RABL2B', 'TRA2A', 'CAMSAP1', 'RHOU', 'SNX1', 'KCTD10', 'TMEM33', 'ARL8B', 'MKRN1', 'ZMYND8', 'CMTM6', 'FBXO9', 'TBC1D5', 'TAX1BP3', 'NMD3', 'DNAJB2', 'RSL1D1', 'RBM25', 'PEA15', 'UBE2H', 'VMP1', 'DRAM2', 'USP47', 'RNF138', 'RECK', 'SMCHD1', 'STX6', 'RBM26', 'HYCC1', 'PUM2', 'TRIP6', 'FARP1', 'INIP', 'BAG2', 'ARPP19', 'KLHL24', 'DNAJC10', 'RHOBTB3', 'SNX16', 'TMEM59', 'MATR3', 'RC3H2', 'TRIM6', 'TNPO1', 'ENTR1', 'CHMP1B', 'SERTAD2', 'ERGIC1', 'ENC1', 'USP34', 'STX16', 'CCDC93', 'MICALL1', 'ARL4C', 'YTHDC1', 'AKAP12', 'TCAF1', 'GOLGA4', 'SH3BP5', 'SINHCAF', 'CTDSP2', 'RAB29', 'BICC1', 'ZPR1', 'SRSF1', 'SESN3', 'NACC2', 'BMAL2', 'LDLRAD4', 'LIMCH1', 'THOC2', 'MYO10', 'SUB1', 'ASAP3', 'PPM

# Categoization Prep

### GO-slim

In [35]:
# pip install goatools
from goatools.obo_parser import GODag

DATA_DIRECTORY = "../../data"
GO_OBO = f"{DATA_DIRECTORY}/GO/go-basic.obo"            # put the file in your working dir (or give full path)
GOSLIM_OBO = f"{DATA_DIRECTORY}/GO/goslim_generic.obo"  # swap to another slim if you prefer

go = GODag(GO_OBO)
slim = GODag(GOSLIM_OBO)
SLIM_IDS = set(slim.keys())

../../data/GO/go-basic.obo: fmt(1.2) rel(2025-10-10) 42,666 Terms
../../data/GO/goslim_generic.obo: fmt(1.2) rel(go/2025-10-10/subsets/goslim_generic.owl) 205 Terms


In [36]:
GO_RE = re.compile(r"(GO:\d{7})")

def get_goid(term: str):
    if isinstance(term, str):
        m = GO_RE.search(term)
        if m:
            return m.group(1)
    return None

def map_to_goslim(go_id):
    """Return all GO-Slim IDs that are ancestors (or self) of the GO term."""
    if not go_id or go_id not in go:
        return []
    node = go[go_id]
    parents = node.get_all_parents()
    if parents and not isinstance(next(iter(parents)), str):
        anc_ids = {node.id} | {t.id for t in parents}
    else:
        anc_ids = {node.id} | set(parents)
    return sorted(anc_ids & SLIM_IDS)

### KEGG

In [37]:
def build_kegg_name_to_id(species="hsa"):
    """Map KEGG pathway name -> 'hsaXXXXX' (species-specific)."""
    lines = requests.get(f"https://rest.kegg.jp/list/pathway/{species}").text.strip().splitlines()
    name_to_id = {}
    for ln in lines:
        pid, raw = ln.split("\t")
        pid = pid.replace("path:", "")  # e.g. hsa03010
        # strip " - Homo sapiens (human)" suffix
        name = re.sub(r"\s*-\s*Homo sapiens.*$", "", raw).strip()
        name_to_id[name.lower()] = pid
    return name_to_id

name_to_id = build_kegg_name_to_id("hsa")

In [67]:
def get_kegg_level2(hsa_id: str) -> str | None:
    """
    Return the KEGG Level 2 category for a pathway like 'hsa03040'.
    Example: get_kegg_level2("hsa03040") -> 'Transcription'
    """
    url = f"http://rest.kegg.jp/get/{hsa_id}"
    try:
        text = requests.get(url, timeout=10).text
    except Exception:
        return None

    for line in text.splitlines():
        if line.startswith("CLASS"):
            # CLASS line looks like: CLASS       Genetic Information Processing; Transcription
            parts = [p.strip() for p in line.split(";", maxsplit=2)]
            if len(parts) >= 2:
                return parts[1]
            elif len(parts) == 1:
                return parts[0].replace("CLASS", "").strip()
    return None

# Run Enrichment Analysis

### GO

In [6]:
# term_size_cap = 20
term_score_cap = 0.001
percentage = 0.1
important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value"])

In [None]:
# GO Analysis; save terms with small size and high p-value
i = 0
for community in communities_HGNC:
    # Gene Ontology enrichment
    enr_go = gp.enrichr(
        gene_list=community,
        gene_sets=['GO_Biological_Process_2023',
                'GO_Molecular_Function_2023',
                'GO_Cellular_Component_2023'],
        organism='Human',
        outdir=None # don't write to disk
    )
    go_df = enr_go.results
    go_df = go_df.sort_values('Adjusted P-value')
    print(f"Size of community: {len(community)}")

    # Filter by overlap percentage and adjusted p-value
    mask =  (go_df["Adjusted P-value"] < term_score_cap) & (go_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    filtered = go_df[mask].copy()
    
    # Categorization from GO-Slim
    filtered["GO_ID"] = filtered["Term"].apply(get_goid)
    filtered["Slim_IDs"] = filtered["GO_ID"].apply(map_to_goslim)
    filtered["Slim_Names"] = filtered["Slim_IDs"].apply(lambda ids: [go[i].name for i in ids])
    
    # # How many slim terms loaded?
    # print("len(SLIM_IDS) =", len(SLIM_IDS))  # should be > 0 (typically dozens+)

    # # Do you actually have GO IDs in your DF?
    # print("GO_ID non-null rows =", filtered["GO_ID"].notna().sum(), " / ", len(filtered))

    # # Try one known GO ID end-to-end:
    # test_id = "GO:0008150"  # 'biological_process' (should definitely be in DAG)
    # print("test_id in go?", test_id in go)
    # print("map_to_goslim(test_id) ->", map_to_goslim(test_id))   
    
    # print number of filtered terms
    print(f"Number of filtered terms: {len(filtered)}")
    
    # Add results to important terms
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_terms = pd.concat([important_terms, filtered], ignore_index=True)

    display(HTML(filtered[['Term','Overlap','Adjusted P-value','GO_ID',"Slim_IDs","Slim_Names"]].to_html(max_cols=None)))
    i += 1



Size of community: 1265
Number of filtered terms: 516


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
4081,Intracellular Membrane-Bounded Organelle (GO:0043231),624/5175,4.7773e-74,GO:0043231,[GO:0043226],[organelle]
4082,Nucleus (GO:0005634),561/4487,1.0425000000000001e-69,GO:0005634,"[GO:0005634, GO:0043226]","[nucleus, organelle]"
0,"mRNA Splicing, Via Spliceosome (GO:0000398)",86/211,4.374253e-44,GO:0000398,[GO:0016071],[mRNA metabolic process]
1,mRNA Processing (GO:0006397),86/214,8.603822e-44,GO:0006397,[GO:0016071],[mRNA metabolic process]
2,"RNA Splicing, Via Transesterification Reactions With Bulged Adenosine As Nucleophile (GO:0000377)",77/180,1.643948e-41,GO:0000377,[],[]
3,Mitotic Sister Chromatid Segregation (GO:0000070),60/111,1.6109240000000002e-39,GO:0000070,[GO:0007059],[chromosome segregation]
3543,RNA Binding (GO:0003723),222/1411,3.909492e-36,GO:0003723,[GO:0003723],[RNA binding]
4083,U2-type Spliceosomal Complex (GO:0005684),51/90,1.205366e-35,GO:0005684,[],[]
4084,U2-type Precatalytic Spliceosome (GO:0071005),37/49,7.525361000000001e-33,GO:0071005,[],[]
4085,Precatalytic Spliceosome (GO:0071011),38/52,7.525361000000001e-33,GO:0071011,[],[]


Size of community: 1181
Number of filtered terms: 27


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
2796,UDP-xylosyltransferase Activity (GO:0035252),7/8,5e-06,GO:0035252,"[GO:0003824, GO:0016740]","[catalytic activity, transferase activity]"
0,Endoplasmic Reticulum Tubular Network Organization (GO:0071786),10/16,8e-06,GO:0071786,[],[]
1,Intracellular Protein Transport (GO:0006886),47/325,1.6e-05,GO:0006886,[GO:0006886],[intracellular protein transport]
3,Endoplasmic Reticulum Organization (GO:0007029),16/54,3.5e-05,GO:0007029,[],[]
2,Protein Localization (GO:0008104),48/351,3.5e-05,GO:0008104,[],[]
4,Protein O-linked Mannosylation (GO:0035269),9/16,3.7e-05,GO:0035269,"[GO:0009101, GO:1901135]","[glycoprotein biosynthetic process, carbohydrate derivative metabolic process]"
5,Protein Mannosylation (GO:0035268),10/21,4.3e-05,GO:0035268,[],[]
6,tRNA Processing (GO:0008033),15/50,4.3e-05,GO:0008033,[GO:0006399],[tRNA metabolic process]
7,Protein Targeting (GO:0006605),25/129,4.3e-05,GO:0006605,[],[]
2797,Xylosyltransferase Activity (GO:0042285),6/7,4.8e-05,GO:0042285,"[GO:0003824, GO:0016740]","[catalytic activity, transferase activity]"


Size of community: 720
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 691
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 677
Number of filtered terms: 5


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
1117,Semaphorin Receptor Binding (GO:0030215),7/22,0.000637,GO:0030215,[],[]
1116,Muscle Alpha-Actinin Binding (GO:0051371),6/14,0.000637,GO:0051371,[GO:0008092],[cytoskeletal protein binding]
1118,Chemorepellent Activity (GO:0045499),7/24,0.000813,GO:0045499,"[GO:0048018, GO:0098772]","[receptor ligand activity, molecular function regulator activity]"
0,Semaphorin-Plexin Signaling Pathway (GO:0071526),9/34,0.000973,GO:0071526,[],[]
1,Negative Regulation Of Axon Extension Involved In Axon Guidance (GO:0048843),7/19,0.000973,GO:0048843,[],[]


Size of community: 553
Number of filtered terms: 32


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
0,Mitochondrial Translation (GO:0032543),65/98,4.654217000000001e-75,GO:0032543,[],[]
1,Mitochondrial Gene Expression (GO:0140053),64/103,2.381156e-71,GO:0140053,[GO:0140053],[mitochondrial gene expression]
709,RNA Binding (GO:0003723),156/1411,1.757739e-51,GO:0003723,[GO:0003723],[RNA binding]
2,Translation (GO:0006412),66/234,1.767872e-45,GO:0006412,[],[]
923,Organelle Inner Membrane (GO:0019866),71/398,3.531926e-35,GO:0019866,[],[]
922,Mitochondrial Inner Membrane (GO:0005743),69/370,3.531926e-35,GO:0005743,[],[]
924,Mitochondrial Membrane (GO:0031966),74/540,4.8875190000000003e-29,GO:0031966,[],[]
925,Mitochondrial Ribosome (GO:0005761),17/22,2.096145e-21,GO:0005761,"[GO:0005840, GO:0043226]","[ribosome, organelle]"
3,Gene Expression (GO:0010467),44/296,1.0339370000000001e-17,GO:0010467,[],[]
4,Peptide Biosynthetic Process (GO:0043043),31/158,8.694116e-16,GO:0043043,[],[]


Size of community: 420
Number of filtered terms: 7


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
704,Double-Stranded DNA Binding (GO:0003690),73/650,2.5313609999999998e-30,GO:0003690,[GO:0003677],[DNA binding]
705,Sequence-Specific DNA Binding (GO:0043565),74/717,1.179953e-28,GO:0043565,[GO:0003677],[DNA binding]
706,Sequence-Specific Double-Stranded DNA Binding (GO:1990837),73/715,4.137790000000001e-28,GO:1990837,[GO:0003677],[DNA binding]
2,Neuron Differentiation (GO:0030182),21/173,2.460907e-08,GO:0030182,[GO:0030154],[cell differentiation]
711,E-box Binding (GO:0070888),11/51,1.522589e-07,GO:0070888,[GO:0003677],[DNA binding]
4,Axon Development (GO:0061564),12/99,0.0001574986,GO:0061564,[],[]
5,Sensory Organ Development (GO:0007423),10/70,0.0002267056,GO:0007423,[GO:0048856],[anatomical structure development]


Size of community: 250
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 141
Number of filtered terms: 2


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
0,Intermediate Filament Organization (GO:0045109),10/68,3.69631e-09,GO:0045109,[GO:0007010],[cytoskeleton organization]
82,Lipoprotein Lipase Activity (GO:0004465),3/6,0.0001621026,GO:0004465,"[GO:0003824, GO:0016787]","[catalytic activity, hydrolase activity]"


Size of community: 113
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 110
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 76
Number of filtered terms: 5


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
7,Olfactory Receptor Activity (GO:0004984),64/362,2.315239e-101,GO:0004984,[GO:0060089],[molecular transducer activity]
0,Sensory Perception Of Smell (GO:0007608),36/230,2.5900079999999998e-49,GO:0007608,[GO:0050877],[nervous system process]
1,Detection Of Chemical Stimulus Involved In Sensory Perception Of Smell (GO:0050911),19/139,2.3644089999999998e-24,GO:0050911,[],[]
2,Detection Of Chemical Stimulus Involved In Sensory Perception (GO:0050907),19/141,2.3644089999999998e-24,GO:0050907,[],[]
3,Sensory Perception Of Chemical Stimulus (GO:0007606),17/110,5.431795000000001e-23,GO:0007606,[GO:0050877],[nervous system process]


Size of community: 72
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


Size of community: 61
Number of filtered terms: 5


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names
4,Olfactory Receptor Activity (GO:0004984),53/362,2.2667099999999998e-85,GO:0004984,[GO:0060089],[molecular transducer activity]
0,Sensory Perception Of Smell (GO:0007608),30/230,6.415215e-42,GO:0007608,[GO:0050877],[nervous system process]
1,Sensory Perception Of Chemical Stimulus (GO:0007606),16/110,7.437819000000001e-23,GO:0007606,[GO:0050877],[nervous system process]
2,Detection Of Chemical Stimulus Involved In Sensory Perception Of Smell (GO:0050911),15/139,1.316926e-19,GO:0050911,[],[]
3,Detection Of Chemical Stimulus Involved In Sensory Perception (GO:0050907),15/141,1.316926e-19,GO:0050907,[],[]


Size of community: 51
Number of filtered terms: 0


Unnamed: 0,Term,Overlap,Adjusted P-value,GO_ID,Slim_IDs,Slim_Names


### KEGG

In [38]:
term_score_cap = 0.001
percentage = 0.1

In [69]:
# KEGG
i = 0
for community in communities_HGNC:
    enr_path = gp.enrichr(
        gene_list=community,
        gene_sets=['KEGG_2021_Human'],
        organism='Human',
        outdir=None
    )
    KEGG_df = enr_path.results
    KEGG_df = KEGG_df.sort_values('Adjusted P-value')

    # Filter by overlap percentage and adjusted p-value
    mask =  (KEGG_df["Adjusted P-value"] < term_score_cap) & (KEGG_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    filtered = KEGG_df[mask].copy()
    
    # Categorization from KEGG Level 2
    filtered["KEGG_ID"] = filtered["Term"].str.replace(r"\s*-\s*Homo sapiens.*$", "", regex=True).str.lower().map(name_to_id)
    filtered["Level_2_Category"] = filtered["KEGG_ID"].map(get_kegg_level2)
    # print number of filtered terms
    print(f"Number of filtered terms: {len(filtered)}")
    
    # Add results to important terms
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_terms = pd.concat([important_terms, filtered], ignore_index=True)

    print(f"Size of community: {len(community)}")
    display(HTML(filtered[['Term','Overlap','Adjusted P-value',"KEGG_ID","Level_2_Category"]].head(10).to_html(max_cols=None)))
    i += 1

Number of filtered terms: 5
Size of community: 1265


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category
0,Spliceosome,67/150,3.6958549999999997e-38,hsa03040,Transcription
1,Endocytosis,46/252,5.451858e-09,hsa04144,Transport and catabolism
2,Ubiquitin mediated proteolysis,32/140,8.26866e-09,hsa04120,"Folding, sorting and degradation"
3,RNA degradation,21/79,5.166295e-07,hsa03018,"Folding, sorting and degradation"
4,SNARE interactions in vesicular transport,11/33,0.0001112171,hsa04130,"Folding, sorting and degradation"


Number of filtered terms: 0
Size of community: 1181


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 0
Size of community: 720


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 1
Size of community: 691


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category
0,Herpes simplex virus 1 infection,50/498,4.16743e-10,hsa05168,Infectious disease: viral


Number of filtered terms: 0
Size of community: 677


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 2
Size of community: 553


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category
0,Ribosome,38/158,2.084311e-23,hsa03010,Translation
1,Ribosome biogenesis in eukaryotes,16/108,1.120277e-06,hsa03008,Translation


Number of filtered terms: 0
Size of community: 420


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 0
Size of community: 250


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 0
Size of community: 141


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 0
Size of community: 113


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 0
Size of community: 110


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 1
Size of community: 76


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category
0,Olfactory transduction,74/440,9.826462e-123,hsa04740,Sensory system


Number of filtered terms: 0
Size of community: 72


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


Number of filtered terms: 1
Size of community: 61


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category
0,Olfactory transduction,61/440,1.048704e-103,hsa04740,Sensory system


Number of filtered terms: 0
Size of community: 51


Unnamed: 0,Term,Overlap,Adjusted P-value,KEGG_ID,Level_2_Category


### Reactome

In [None]:
# Reactome enrichment
i = 0
for community in communities_HGNC:
    enr_path = gp.enrichr(
        gene_list=community,
        gene_sets=['Reactome_2022'],
        organism='Human',
        outdir=None
    )
    enr_path_df = enr_path.results
    enr_path_df = enr_path_df.sort_values('Adjusted P-value')

    mask =  (go_df["Adjusted P-value"] < term_score_cap) & (go_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
        
    filtered = enr_path_df[mask].copy()
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_terms = pd.concat([important_terms, filtered], ignore_index=True)

    print(f"Size of community: {len(community)}")
    display(HTML(filtered[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
    i += 1

### Disease Data Sets

In [None]:
disease_size_cap = 200
disease_score_cap = 0.0001
important_diseases = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value"])

In [None]:
# Disease-gene enrichment libraries
disease_sets = [
    'DisGeNET_2020', # curated gene–disease associations
    'GWAS_Catalog_2023', # genome-wide association hits
    'OMIM_Disease', # Mendelian disorders
    'Jensen_DISEASES' # text-mined associations
]

# # Disease-gene enrichment Analysis; save terms with small size and high p-value
i = 0
for community in communities_HGNC:
    # Gene Ontology enrichment
    enr_disease = gp.enrichr(
        gene_list=community,
        gene_sets=disease_sets,
        organism='Human',
        outdir=None # don't write to disk
    )
    enr_disease_df = enr_disease.results.sort_values('Adjusted P-value')
    print(f"Size of community: {len(community)}")

    mask = ((enr_disease_df["Overlap"].apply(lambda x: int(x.split("/")[1]) < disease_size_cap)) 
    & (enr_disease_df["Adjusted P-value"] < disease_score_cap))
        
    filtered = enr_disease_df[mask].copy()
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_diseases = pd.concat([important_diseases, filtered], ignore_index=True)

    display(HTML(enr_disease_df[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
    i += 1

# Important Terms Analysis

In [None]:
important_terms = important_terms.sort_values(by="Overlap")
display(HTML(important_terms[["Community Index","Community Size",'Term','Overlap','Adjusted P-value',"P-value"]].head(50).to_html(max_cols=None)))

# Important Diseases Analysis

In [None]:
important_diseases = important_diseases.sort_values(by="Community Index")
display(HTML(important_diseases[["Community Index","Community Size",'Term','Overlap','Adjusted P-value']].head(50).to_html(max_cols=None)))