In [2]:
import numpy as np
import json
from scipy.sparse import load_npz,save_npz,diags,csr_matrix
import scipy.sparse as sp
import pandas as pd
import os
from io import BytesIO
from tqdm import tqdm
from scipy.sparse.linalg import eigsh
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from pathlib import Path
from matplotlib.backends.backend_pdf import PdfPages
from pypdf import PdfReader, PdfWriter
from tempfile import NamedTemporaryFile
import networkx as nx
import pickle
import gseapy as gp
import mygene
from IPython.display import display, HTML

In [3]:
pd.set_option('display.width', None)      # No line-wrapping
pd.set_option('display.max_columns', None)  # Show all columns

In [4]:
DISEASE = "BIPOLAR"
RESULT_FOLDER = f"../output/{DISEASE}/leiden_results"
DGIDB_DIRECTORY = f"../../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
MSIGDB_DIRECTORY = "../../Gen_Hypergraph/output/MSigDB_Full/"

with open(DGIDB_DIRECTORY + "gene_to_index.json", "r") as file:
    dgidb = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    msigdb = json.load(file)

In [5]:
def communities_cutoff(communities, cutoff = 100):
    result = []
    for community in communities:
        if len(community) >= cutoff:
            result.append(community)

    return result, len(result)

def community_central_genes_by_num(G, community_nodes, weight="weight", top_n=20):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    return [u for u in ranked[:top_n]]

def community_central_genes_by_score(G, community_nodes, weight="weight",score_cap = 1):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    return [u for u in ranked if Z[u] >= score_cap]

def community_central_genes_by_pct(G, community_nodes, weight="weight",pct = 0.3):
    C = set(community_nodes)
    H = G.subgraph(C).copy()                       # induced subgraph
    # within-community (weighted) degree
    k = {u: H.degree(u, weight=weight) for u in H}
    ks = np.array(list(k.values()), dtype=float)
    mu, sigma = ks.mean(), ks.std() if ks.std() > 0 else 1.0
    Z = {u: (k[u] - mu)/sigma for u in H}          # within-module degree z-score

    # rank by z
    ranked = sorted(H.nodes(), key=lambda u: (Z[u]), reverse=True)
    top = int(len(ranked)*pct)
    return [u for u in ranked[:top]]

In [6]:
# Combine indices from both layers, first DGIDB and then MSIGDB
dgidb_rev = {index : gene for gene,index in dgidb.items()}
msigdb_rev = {index : gene for gene,index in msigdb.items()}

a_max = max(dgidb_rev.keys())
b_min = min(msigdb_rev.keys())
shift = (a_max + 1) - b_min   # ensures no overlap
index_to_gene = {**dgidb_rev, **{k + shift: v for k, v in msigdb_rev.items()}}

print(dgidb_rev)
print(msigdb_rev)
print(index_to_gene)

{0: '1565', 1: '5468', 2: '79915', 3: '5999', 4: '5594', 5: '185', 6: '3791', 7: '142', 8: '318', 9: '1838', 10: '2159', 11: '6263', 12: '3815', 13: '6351', 14: '1437', 15: '29994', 16: '3725', 17: '10919', 18: '283106', 19: '153', 20: '6336', 21: '358', 22: '2261', 23: '155', 24: '5340', 25: '3752', 26: '6158', 27: '2099', 28: '51426', 29: '9290', 30: '3091', 31: '1813', 32: '303', 33: '3276', 34: '2260', 35: '9734', 36: '2908', 37: '10966', 38: '776', 39: '9971', 40: '4088', 41: '1436', 42: '817', 43: '3570', 44: '6198', 45: '147746', 46: '7468', 47: '1559', 48: '5742', 49: '360', 50: '552', 51: '3156', 52: '9682', 53: '29126', 54: '2651', 55: '10298', 56: '2064', 57: '427', 58: '2906', 59: '1906', 60: '354', 61: '3952', 62: '2648', 63: '5893', 64: '5467', 65: '1395', 66: '2568', 67: '926', 68: '5605', 69: '2357', 70: '1080', 71: '2185', 72: '256815', 73: '5604', 74: '2740', 75: '146850', 76: '5916', 77: '695', 78: '3480', 79: '4193', 80: '7015', 81: '1956', 82: '238', 83: '2237', 84

In [7]:
# Sanity Check; Results should be equal
print(index_to_gene[12678+len(dgidb_rev)],
msigdb_rev[12678])
print(index_to_gene[26754])

65996 65996
129810495


In [8]:
# Loading result graph and communities
with open(f"{RESULT_FOLDER}/result_communities.pkl", "rb") as f:
    communities = pickle.load(f)
with open(f"{RESULT_FOLDER}/result_graph.pkl", "rb") as f:
    graph = pickle.load(f)

In [9]:
# Checking communities
communities_greater_100, num_greater_100 = communities_cutoff(communities,100)
print(communities_greater_100, '\n', num_greater_100)
print(len(communities))

[[1, 8, 10, 13, 16, 18, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 38, 39, 43, 45, 49, 57, 77, 87, 88, 89, 90, 91, 92, 100, 113, 115, 116, 117, 119, 125, 127, 130, 131, 137, 139, 141, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 161, 162, 163, 164, 167, 169, 170, 171, 172, 174, 176, 177, 185, 186, 196, 204, 209, 210, 220, 222, 224, 225, 226, 235, 237, 243, 245, 246, 247, 248, 253, 254, 255, 260, 266, 271, 272, 274, 285, 295, 296, 302, 304, 305, 320, 325, 326, 327, 335, 337, 344, 346, 348, 350, 353, 356, 358, 363, 375, 376, 378, 383, 384, 385, 387, 392, 393, 402, 405, 408, 417, 419, 420, 432, 437, 440, 446, 455, 456, 457, 459, 470, 489, 494, 506, 515, 518, 519, 535, 536, 545, 547, 549, 556, 560, 561, 563, 565, 566, 567, 569, 571, 585, 586, 587, 589, 590, 594, 595, 600, 602, 610, 615, 616, 621, 626, 631, 633, 634, 635, 637, 647, 653, 654, 658, 692, 695, 707, 711, 718, 721, 724, 748, 766, 769, 780, 789, 793, 795, 798, 802, 803, 808, 812, 820, 821, 822, 823, 828, 832, 833

In [10]:
# Update communities to include only the important genes, determined by a community-size_cap and z-score_cap
size_cap = 50
pct = 0.3
communities_selected = []
for community in communities:
    if (len(community) >= size_cap):
        important_nodes = community_central_genes_by_pct(graph,community,pct = pct)
        communities_selected.append(important_nodes)

In [11]:
print(communities_selected)
print(len(communities_selected))

[[7473, 6261, 5260, 12411, 4420, 9023, 602, 1925, 4012, 7756, 7609, 13327, 5665, 3265, 12394, 7552, 8546, 11552, 4781, 3979, 7727, 6847, 12724, 3990, 11858, 4353, 13301, 6814, 5534, 18805, 8659, 7627, 13686, 3753, 8879, 4884, 3432, 10180, 6660, 5333, 5027, 3649, 19516, 13137, 4188, 5509, 3666, 10226, 9597, 12820, 9354, 243, 7219, 10447, 1307, 10804, 2235, 7620, 7249, 2314, 6246, 6981, 12407, 12725, 405, 2874, 5687, 8153, 11047, 13573, 8693, 18139, 5893, 12593, 3771, 6686, 8878, 6324, 6836, 6882, 3148, 6419, 7296, 8880, 12963, 11086, 14841, 10713, 6003, 2635, 12396, 11040, 8414, 5361, 8105, 4310, 6253, 12853, 8617, 8632, 5540, 10698, 10134, 8711, 12304, 12376, 12505, 6698, 3561, 14912, 5073, 6772, 16587, 8412, 6980, 7813, 17199, 4187, 6571, 14932, 9014, 11362, 18708, 7532, 11373, 12306, 6576, 13483, 13609, 16006, 10282, 14386, 7341, 16383, 11237, 10594, 11250, 11794, 3310, 3818, 9705, 13266, 12904, 5187, 16128, 11011, 14533, 16597, 9916, 11411, 3992, 6873, 8922, 6620, 6538, 795, 6762, 1

In [12]:
# Convert index to ncbi
communities_ncbi = [list(map(index_to_gene.get, c)) for c in communities_selected]
print(communities_ncbi)

[['3887', '2098', '677', '10938', '8858', '6167', '9340', '1215', '4926', '4294', '4085', '23581', '1234', '6203', '10916', '4007', '5515', '9871', '14', '7184', '4242', '2998', '11342', '4325', '10235', '5229', '23544', '2959', '1028', '84539', '5648', '4107', '26118', '84870', '5934', '154', '6514', '7789', '2777', '777', '344', '6281', '93058', '23326', '1769', '1001', '3113', '7932', '6892', '22913', '6595', '2328', '3565', '8446', '28', '8896', '6716', '4099', '3605', '1768', '2066', '3170', '10933', '11343', '5747', '2239', '1272', '4993', '9253', '25940', '5693', '80349', '1551', '11159', '7433', '2805', '5933', '2182', '2984', '3040', '2593', '2306', '3667', '5935', '23113', '9311', '51283', '8789', '1730', '6810', '10919', '9244', '5330', '816', '4914', '22916', '2073', '22971', '5595', '5610', '1036', '8771', '7712', '5713', '10799', '10897', '11051', '2822', '130', '51374', '399', '2906', '57171', '5328', '3169', '4437', '64241', '4281', '2645', '51411', '6156', '9656', '842

In [13]:
# NCBI to HGNC symbol
communities_HGNC = []
missed = []


for community in communities_ncbi:
    mg = mygene.MyGeneInfo()
    entrez_ids = [str(e) for e in community]

    results = mg.querymany(
        entrez_ids,
        scopes="entrezgene",
        fields="symbol",
        species="human"
    )

    # Build a mapping: input ID -> symbol (or None)
    id_to_symbol = {}
    for r in results:
        q = str(r.get("query"))
        id_to_symbol[q] = r.get("symbol") if not r.get("notfound") else None

    # Preserve original order
    symbols = [id_to_symbol.get(str(e), None) for e in entrez_ids]
    communities_HGNC.append(symbols)


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
22 input query terms found dup hits:	[('2998', 2), ('2959', 2), ('3113', 2), ('5933', 2), ('2593', 2), ('1805', 2), ('2581', 2), ('3690',
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
12 input query terms found dup hits:	[('9947', 2), ('8638', 2), ('2035', 2), ('51311', 2), ('1889', 2), ('2204', 2), ('1369', 2), ('5395'
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
13 input query terms found dup hits:	[('1376', 2), ('3213', 2), ('8503', 2), ('23461', 2), ('2580', 2), ('597', 2), 

In [14]:
print(communities_HGNC)

[['KRT81', 'ESD', 'ZFP36L1', 'EHD1', 'PROZ', 'RPL37', 'GLP2R', 'CMA1', 'NUMA1', 'MAP3K10', 'MAD2L1', 'CASP14', 'CCR5', 'RPS9', 'MAGED2', 'PRICKLE3', 'PPP2CA', 'SEC24D', 'AAMP', 'HSP90B1', 'MFNG', 'GYS2', 'RNF13', 'MMP16', 'RASGRP2', 'PGGT1B', 'SEZ6L', 'GTF2B', 'CDKN1C', 'MCHR2', 'MASP1', 'MAGEA8', 'WSB1', 'RSPO3', 'RBL2', 'ADRB2', 'SLC2A2', 'ZXDA', 'GNAQP1', 'CACNA1E', 'APOC2', 'S100A10', 'COQ10A', 'USP22', 'DNAH8', 'CDH3', 'HLA-DPA1', 'OR2H2', 'TAPBP', 'RALY', 'SMARCA2', 'FMO3', 'IL4', 'DUSP11', 'ABO', 'BUD31', 'SRD5A2', 'MAG', 'IL17A', 'DNAH6', 'ERBB4', 'FOXA2', 'MORF4L1', 'MGLL', 'PTK2', 'GPC4', 'CNTN1', 'OR2C1', 'NUMBL', 'FAM98A', 'PSMB5', 'SKIC8', 'CYP3A7', 'RABL2A', 'VIPR1', 'GOT1', 'RBL1', 'ACSL4', 'GUCY2C', 'HBA2', 'GAMT', 'FOXD2', 'IRS1', 'RBM3', 'CUL9', 'ASIC3', 'BFAR', 'FBP2', 'DIAPH2', 'STX4', 'EHMT2', 'CRLF1', 'PLCB2', 'CAMK2B', 'NTRK1', 'NCBP2', 'ERCC5', 'RPL41P1', 'MAPK3', 'EIF2AK2', 'CDO1', 'TNFRSF6B', 'ZNF157', 'PSMD7', 'RPP40', 'YIF1A', 'NUDT21', 'GPLD1', 'ADH6', 'ATR

In [15]:
# term_size_cap = 20
term_score_cap = 0.001
percentage = 0.3
important_terms = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value"])

In [16]:
# GO Analysis; save terms with small size and high p-value
i = 0
for community in communities_HGNC:
    # Gene Ontology enrichment
    enr_go = gp.enrichr(
        gene_list=community,
        gene_sets=['GO_Biological_Process_2023',
                'GO_Molecular_Function_2023',
                'GO_Cellular_Component_2023'],
        organism='Human',
        outdir=None # don't write to disk
    )
    go_df = enr_go.results
    go_df = go_df.sort_values('Adjusted P-value')
    print(f"Size of community: {len(community)}")

    mask =  (go_df["Adjusted P-value"] < term_score_cap) & (go_df["Overlap"].apply(lambda x: int(x.split("/")[0])/int(x.split("/")[1]) > percentage))
    filtered = go_df[mask].copy()
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_terms = pd.concat([important_terms, filtered], ignore_index=True)

    display(HTML(go_df[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
    i += 1



Size of community: 1193


  important_terms = pd.concat([important_terms, filtered], ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Regulation Of Cell Population Proliferation (GO:0042127),83/766,0.000154
3873,Zinc Ion Binding (GO:0008270),45/341,0.00022
2,Positive Regulation Of Cellular Process (GO:0048522),66/594,0.0005
3,Receptor Signaling Pathway Via JAK-STAT (GO:0007259),12/36,0.0005
1,Positive Regulation Of Cell Population Proliferation (GO:0008284),57/483,0.0005
3874,Transition Metal Ion Binding (GO:0046914),53/456,0.000578
5,Positive Regulation Of Cell Motility (GO:2000147),32/221,0.001289
4,Positive Regulation Of Protein Phosphorylation (GO:0001934),46/377,0.001289
6,Positive Regulation Of Cell Migration (GO:0030335),36/272,0.002218
7,Receptor Signaling Pathway Via STAT (GO:0097696),10/31,0.002899


Size of community: 957


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Positive Regulation Of Cytokine Production (GO:0001819),40/320,7.4e-05
3635,DNA Binding (GO:0003677),73/846,0.000351
1,Regulation Of DNA-templated Transcription (GO:0006355),135/1922,0.002708
4381,Intracellular Membrane-Bounded Organelle (GO:0043231),301/5175,0.00332
4382,Ficolin-1-Rich Granule (GO:0101002),23/184,0.00332
2,Positive Regulation Of DNA-templated Transcription (GO:0045893),94/1243,0.00403
3,Positive Regulation Of Macromolecule Biosynthetic Process (GO:0010557),19/122,0.00403
4383,Nucleus (GO:0005634),263/4487,0.004408
4384,Intracellular Organelle Lumen (GO:0070013),66/856,0.005145
6,Protein Phosphorylation (GO:0006468),46/500,0.006768


Size of community: 755


Unnamed: 0,Term,Overlap,Adjusted P-value
2468,"RNA Exonuclease Activity, Producing 5'-Phosphomonoesters (GO:0016896)",2/30,0.825893
2458,"Hydrolase Activity, Acting On Carbon-Nitrogen (But Not Peptide) Bonds, In Linear Amidines (GO:0016813)",1/9,0.825893
2459,Inhibitory MHC Class I Receptor Activity (GO:0032396),1/9,0.825893
2460,Nucleoside Triphosphate Diphosphatase Activity (GO:0047429),1/9,0.825893
2461,"Oxidoreductase Activity, Acting On NAD(P)H, Heme Protein As Acceptor (GO:0016653)",1/9,0.825893
2462,ATPase Binding (GO:0051117),4/73,0.825893
2463,"Phosphatidylinositol-4,5-Bisphosphate Binding (GO:0005546)",4/73,0.825893
2464,Acylglycerol O-acyltransferase Activity (GO:0016411),2/29,0.825893
2465,miRNA Binding (GO:0035198),2/29,0.825893
2466,Cysteine-Type Deubiquitinase Activity (GO:0004843),5/98,0.825893


Size of community: 667


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Protein Phosphorylation (GO:0006468),43/500,2.6e-05
1,Positive Regulation Of Protein Phosphorylation (GO:0001934),35/377,4.7e-05
2,Positive Regulation Of DNA-templated Transcription (GO:0045893),77/1243,4.7e-05
3733,Neuron Projection (GO:0043005),43/557,5.7e-05
3111,Protein Serine/Threonine Kinase Activity (GO:0004674),32/342,6.5e-05
3734,Secretory Granule Lumen (GO:0034774),29/316,9.5e-05
5,Positive Regulation Of Intracellular Signal Transduction (GO:1902533),40/525,0.000364
3,Positive Regulation Of Cell Population Proliferation (GO:0008284),38/483,0.000364
4,Anterograde Trans-Synaptic Signaling (GO:0098916),22/199,0.000364
7,Positive Regulation Of MAPK Cascade (GO:0043410),28/310,0.000462


Size of community: 657


Unnamed: 0,Term,Overlap,Adjusted P-value
2954,Double-Stranded DNA Binding (GO:0003690),44/650,0.002664
0,Potassium Ion Transmembrane Transport (GO:0071805),17/137,0.007197
2955,Delayed Rectifier Potassium Channel Activity (GO:0005251),7/28,0.007538
2956,Voltage-Gated Potassium Channel Activity (GO:0005249),11/80,0.008588
2957,Protein Serine/Threonine Kinase Activity (GO:0004674),26/342,0.008588
2958,Activin Receptor Activity (GO:0017002),4/8,0.008588
1,Potassium Ion Import Across Plasma Membrane (GO:1990573),9/42,0.010011
2,Negative Regulation Of Cellular Biosynthetic Process (GO:0031327),16/137,0.010366
3,Positive Regulation Of Interleukin-5 Production (GO:0032754),4/6,0.011837
4,Positive Regulation Of Cell Differentiation (GO:0045597),24/283,0.012397


Size of community: 625


Unnamed: 0,Term,Overlap,Adjusted P-value
3046,Protein Homodimerization Activity (GO:0042803),45/662,0.000422
3047,Receptor Ligand Activity (GO:0048018),26/319,0.002163
3598,Secretory Granule Lumen (GO:0034774),25/316,0.005133
3048,G Protein-Coupled Receptor Activity (GO:0004930),21/250,0.005248
3049,Purine Ribonucleoside Triphosphate Binding (GO:0035639),32/476,0.005248
0,Regulation Of Nitric Oxide Biosynthetic Process (GO:0045428),9/39,0.005658
1,Positive Regulation Of Macromolecule Metabolic Process (GO:0010604),29/364,0.005658
3599,Focal Adhesion (GO:0005925),27/387,0.007603
3600,Polymeric Cytoskeletal Fiber (GO:0099513),21/265,0.007603
3601,Cell-Substrate Junction (GO:0030055),27/395,0.007618


Size of community: 525


Unnamed: 0,Term,Overlap,Adjusted P-value
2608,Membrane Attack Complex (GO:0005579),3/6,0.0851
2183,Carbonate Dehydratase Activity (GO:0004089),4/13,0.118115
2184,Magnesium Ion Binding (GO:0000287),11/155,0.401701
2195,Phosphatidylinositol Phosphate 4-Phosphatase Activity (GO:0034596),2/7,0.401701
2194,Serine-Type Peptidase Activity (GO:0008236),9/142,0.401701
2193,Metal Ion Binding (GO:0046872),23/521,0.401701
2192,Dopamine Receptor Binding (GO:0050780),2/6,0.401701
2196,"Phosphatidylinositol-4,5-Bisphosphate 5-Phosphatase Activity (GO:0004439)",2/7,0.401701
2190,1-Phosphatidylinositol-4-Phosphate 3-Kinase Activity (GO:0035005),2/6,0.401701
2189,Ubiquitin-Like Protein Peptidase Activity (GO:0019783),4/32,0.401701


Size of community: 368


Unnamed: 0,Term,Overlap,Adjusted P-value
2447,Muscle Myosin Complex (GO:0005859),5/14,0.000749
0,Muscle Filament Sliding (GO:0030049),5/10,0.000978
1,Actin-Myosin Filament Sliding (GO:0033275),5/12,0.001491
2448,Myosin Filament (GO:0032982),4/15,0.013698
2449,Intracellular Organelle Lumen (GO:0070013),31/856,0.019127
2450,Dendrite (GO:0030425),13/270,0.064837
2451,Vesicle Membrane (GO:0012506),6/69,0.064837
2452,Myofibril (GO:0030016),4/29,0.064837
2040,Purine Ribonucleoside Triphosphate Binding (GO:0035639),21/476,0.087943
2041,Solute:Proton Symporter Activity (GO:0015295),4/20,0.088035


Size of community: 270


Unnamed: 0,Term,Overlap,Adjusted P-value
981,Calcium Ion Transmembrane Transporter Activity (GO:0015085),5/68,0.214606
982,Sphingosine N-acyltransferase Activity (GO:0050291),2/6,0.214606
983,fatty-acyl-CoA Synthase Activity (GO:0004321),2/6,0.214606
984,Deoxycytidine Deaminase Activity (GO:0047844),2/7,0.223332
985,cAMP Response Element Binding Protein Binding (GO:0008140),2/8,0.236105
986,Calcium:Sodium Antiporter Activity (GO:0005432),2/9,0.250724
987,C-acyltransferase Activity (GO:0016408),2/11,0.28222
988,Cytidine Deaminase Activity (GO:0004126),2/11,0.28222
989,N-acyltransferase Activity (GO:0016410),3/37,0.366911
1029,UDP-xylosyltransferase Activity (GO:0035252),1/8,0.479121


Size of community: 148


Unnamed: 0,Term,Overlap,Adjusted P-value
731,Protein Tyrosine/Serine/Threonine Phosphatase Activity (GO:0008138),2/23,0.346322
761,Estradiol 17-Beta-Dehydrogenase [NAD(P)] Activity (GO:0004303),1/9,0.346322
760,Cyclin-Dependent Protein Serine/Threonine Kinase Inhibitor Activity (GO:0004861),1/9,0.346322
759,Histone H3K9 Methyltransferase Activity (GO:0046974),1/9,0.346322
758,Amino Acid:Monoatomic Cation Symporter Activity (GO:0005416),1/9,0.346322
757,ADP Transmembrane Transporter Activity (GO:0015217),1/9,0.346322
756,Glycine Transmembrane Transporter Activity (GO:0015187),1/9,0.346322
755,Amino Acid Transmembrane Transporter Activity (GO:0015171),2/56,0.346322
754,Antiporter Activity (GO:0015297),2/53,0.346322
753,Lysophospholipid Acyltransferase Activity (GO:0071617),1/8,0.346322


Size of community: 75


Unnamed: 0,Term,Overlap,Adjusted P-value
534,Olfactory Receptor Binding (GO:0031849),1/5,0.250617
535,CoA Carboxylase Activity (GO:0016421),1/5,0.250617
555,Transcription Regulatory Region Nucleic Acid Binding (GO:0001067),3/224,0.250617
553,Lipid Phosphatase Activity (GO:0042577),1/13,0.250617
552,"Oxidoreductase Activity, Acting On The CH-NH Group Of Donors, NAD Or NADP As Acceptor (GO:0016646)",1/13,0.250617
551,Solute:Monoatomic Cation Symporter Activity (GO:0015294),1/12,0.250617
550,Protein Kinase A Catalytic Subunit Binding (GO:0034236),1/12,0.250617
549,Intracellular Ligand-Gated Monoatomic Ion Channel Activity (GO:0005217),1/12,0.250617
548,Toll-like Receptor Binding (GO:0035325),1/11,0.250617
547,Monoatomic Cation:Bicarbonate Symporter Activity (GO:0140410),1/11,0.250617


Size of community: 73


Unnamed: 0,Term,Overlap,Adjusted P-value
323,Monocarboxylate:Sodium Symporter Activity (GO:0140161),2/12,0.078769
324,Bile Acid:Sodium Symporter Activity (GO:0008508),1/5,0.246345
355,Poly-Pyrimidine Tract Binding (GO:0008187),1/25,0.246345
354,poly(A) Binding (GO:0008143),1/25,0.246345
353,Lysine N-methyltransferase Activity (GO:0016278),1/25,0.246345
352,Mannosyltransferase Activity (GO:0000030),1/23,0.246345
351,Protein Tyrosine/Serine/Threonine Phosphatase Activity (GO:0008138),1/23,0.246345
349,poly(U) RNA Binding (GO:0008266),1/22,0.246345
348,Single-Stranded DNA Helicase Activity (GO:0017116),1/22,0.246345
347,Cyclic Nucleotide Binding (GO:0030551),1/21,0.246345


Size of community: 66


Unnamed: 0,Term,Overlap,Adjusted P-value
211,Sodium:Proton Antiporter Activity (GO:0015385),2/14,0.074204
212,Metal Cation:Proton Antiporter Activity (GO:0051139),2/21,0.084354
289,Nuclear Outer Membrane (GO:0005640),2/19,0.086669
0,N-terminal Protein Amino Acid Acetylation (GO:0006474),2/15,0.099362
1,Cell Surface Toll-Like Receptor Signaling Pathway (GO:0140895),2/16,0.099362
2,Toll-Like Receptor 4 Signaling Pathway (GO:0034142),2/17,0.099362
213,Sodium Ion Transmembrane Transporter Activity (GO:0015081),2/34,0.135727
214,Polyubiquitin Modification-Dependent Protein Binding (GO:0031593),2/53,0.135727
215,Protein Phosphatase 2B Binding (GO:0030346),1/5,0.135727
216,Pyruvate Transmembrane Transporter Activity (GO:0050833),1/5,0.135727


Size of community: 57


Unnamed: 0,Term,Overlap,Adjusted P-value
353,Cysteine-Type Endopeptidase Activity (GO:0004197),3/106,0.155688
355,Cysteine-Type Peptidase Activity (GO:0008234),3/144,0.155688
354,Calcium-Dependent Phospholipid Binding (GO:0005544),2/47,0.155688
356,Chitinase Activity (GO:0004568),1/5,0.183892
357,Olfactory Receptor Activity (GO:0004984),4/362,0.183892
358,Nucleotide Transmembrane Transporter Activity (GO:0015215),1/7,0.183892
359,GTPase Activator Activity (GO:0005096),3/211,0.183892
360,Store-Operated Calcium Channel Activity (GO:0015279),1/9,0.183892
52,Ubiquinone Biosynthetic Process (GO:0006744),1/13,0.21459
51,Activation Of GTPase Activity (GO:0090630),2/102,0.21459


Size of community: 46


Unnamed: 0,Term,Overlap,Adjusted P-value
370,Ubiquitin Protein Ligase Activity (GO:0061630),4/311,0.075878
385,Growth Factor Receptor Binding (GO:0070851),2/96,0.075878
384,Interleukin-1 Receptor Binding (GO:0005149),1/9,0.075878
383,siRNA Binding (GO:0035197),1/9,0.075878
382,L-lysine Transmembrane Transporter Activity (GO:0015189),1/8,0.075878
381,Interleukin-17 Receptor Activity (GO:0030368),1/7,0.075878
380,Unmethylated CpG Binding (GO:0045322),1/7,0.075878
379,Histone Ubiquitin Ligase Activity (GO:0140852),1/7,0.075878
377,Ubiquitin-Protein Transferase Activity (GO:0004842),4/412,0.075878
376,"Oxidoreductase Activity, Acting On A Sulfur Group Of Donors, Disulfide As Acceptor (GO:0016671)",1/6,0.075878


Size of community: 36


Unnamed: 0,Term,Overlap,Adjusted P-value
169,Diphosphotransferase Activity (GO:0016778),1/5,0.111222
171,BMP Receptor Binding (GO:0070700),1/10,0.111222
172,Transmembrane Receptor Protein Serine/Threonine Kinase Binding (GO:0070696),1/10,0.111222
173,RNA Polymerase I Activity (GO:0001054),1/11,0.111222
174,Sequence-Specific mRNA Binding (GO:1990825),1/11,0.111222
170,pre-mRNA Intronic Binding (GO:0097157),1/8,0.111222
175,Acylglycerol Lipase Activity (GO:0047372),1/14,0.121016
178,mRNA 3'-UTR AU-rich Region Binding (GO:0035925),1/23,0.138081
176,Triglyceride Lipase Activity (GO:0004806),1/21,0.138081
177,Sequence-Specific DNA Binding (GO:0043565),4/717,0.138081


Size of community: 26


Unnamed: 0,Term,Overlap,Adjusted P-value
295,Axon (GO:0030424),3/205,0.030724
294,Synaptic Vesicle Membrane (GO:0030672),2/54,0.030724
293,Exocytic Vesicle Membrane (GO:0099501),2/53,0.030724
292,Phagocytic Vesicle Membrane (GO:0030670),2/45,0.030724
0,Negative Regulation Of Viral-Induced Cytoplasmic Pattern Recognition Receptor Signaling Pathway (GO:0039532),2/16,0.046665
296,Neuron Projection (GO:0043005),4/557,0.057948
297,Phagocytic Vesicle (GO:0045335),2/100,0.065711
298,Spectrin-Associated Cytoskeleton (GO:0014731),1/7,0.068641
8,Axon Development (GO:0061564),2/99,0.091701
30,Regulation Of Axon Regeneration (GO:0048679),1/10,0.091701


Size of community: 15


Unnamed: 0,Term,Overlap,Adjusted P-value
175,4-galactosyl-N-acetylglucosaminide 3-alpha-L-fucosyltransferase Activity (GO:0017083),1/5,0.028744
176,Serine-Type Endopeptidase Activity (GO:0004252),2/125,0.028744
185,Metalloendopeptidase Inhibitor Activity (GO:0008191),1/13,0.028744
184,GDP-dissociation Inhibitor Activity (GO:0005092),1/12,0.028744
183,P-type Ion Transporter Activity (GO:0015662),1/11,0.028744
182,Fucosyltransferase Activity (GO:0008417),1/11,0.028744
186,Solute:Inorganic Anion Antiporter Activity (GO:0005452),1/14,0.028744
180,P-type Calcium Transporter Activity (GO:0005388),1/8,0.028744
179,Alpha-(1->3)-Fucosyltransferase Activity (GO:0046920),1/7,0.028744
178,Natural Killer Cell Lectin-Like Receptor Binding (GO:0046703),1/7,0.028744


In [None]:
# KEGG and Reactome enrichment
i = 0
for community in communities_HGNC:
    enr_path = gp.enrichr(
        gene_list=community,
        gene_sets=['KEGG_2021_Human','Reactome_2022'],
        organism='Human',
        outdir=None
    )
    enr_path_df = enr_path.results
    enr_path_df = enr_path_df.sort_values('Adjusted P-value')

    mask = (enr_path_df["Overlap"].apply(lambda x: int(x.split("/")[1]) <= term_size_cap)) & (enr_path_df["Adjusted P-value"] <= term_score_cap)
        
    filtered = enr_path_df[mask].copy()
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_terms = pd.concat([important_terms, filtered], ignore_index=True)

    print(f"Size of community: {len(community)}")
    display(HTML(enr_path_df[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
    i += 1

Size of community: 1334


Unnamed: 0,Term,Overlap,Adjusted P-value
204,Protein Methylation R-HSA-8876725,7/17,0.043365
205,Detoxification Of Reactive Oxygen Species R-HSA-3299685,8/34,0.440789
206,Serine Biosynthesis R-HSA-977347,4/9,0.440789
207,Miscellaneous Transport And Binding Events R-HSA-5223345,6/23,0.573481
208,eNOS Activation R-HSA-203615,4/11,0.592016
209,RUNX3 Regulates Immune Response And Cell Migration R-HSA-8949275,3/6,0.592016
0,Peroxisome,13/82,0.595353
211,TP53 Regulates Transcription Of Cell Death Genes R-HSA-5633008,8/44,0.678663
212,TP53 Regulates Transcription Of Genes Involved In Cytochrome C Release R-HSA-6803204,5/20,0.678663
210,Smooth Muscle Contraction R-HSA-445355,8/43,0.678663


Size of community: 1188


Unnamed: 0,Term,Overlap,Adjusted P-value
211,Signal Transduction R-HSA-162582,282/2465,1.932896e-26
212,Immune System R-HSA-168256,230/1943,6.003656000000001e-23
213,"Signaling By Rho GTPases, Miro GTPases And RHOBTB3 R-HSA-9716542",109/660,2.752168e-20
214,RHO GTPase Cycle R-HSA-9012999,85/441,4.9336199999999995e-20
215,Signaling By Rho GTPases R-HSA-194315,106/644,8.748492e-20
0,MAPK signaling pathway,63/294,5.3327000000000006e-17
216,Signaling By Receptor Tyrosine Kinases R-HSA-9006934,84/496,2.976105e-16
217,Adaptive Immune System R-HSA-1280218,99/733,1.392912e-12
218,Class I MHC Mediated Antigen Processing And Presentation R-HSA-983169,64/378,2.954294e-12
219,Antigen Processing: Ubiquitination And Proteasome Degradation R-HSA-983168,56/307,4.531411e-12


Size of community: 985


Unnamed: 0,Term,Overlap,Adjusted P-value
61,B-WICH Complex Positively Regulates rRNA Expression R-HSA-5250924,1/59,0.999996
89,Signaling By NOTCH R-HSA-157118,1/203,0.999996
88,Interferon Signaling R-HSA-913531,1/200,0.999996
87,Intra-Golgi And Retrograde Golgi-to-ER Traffic R-HSA-6811442,1/181,0.999996
86,SUMOylation R-HSA-2990846,1/174,0.999996
85,SUMO E3 Ligases SUMOylate Target Proteins R-HSA-3108232,1/168,0.999996
84,S Phase R-HSA-69242,1/161,0.999996
83,Mitotic G1 Phase And G1/S Transition R-HSA-453279,1/147,0.999996
82,Cell Surface Interactions At Vascular Wall R-HSA-202733,1/134,0.999996
81,G1/S Transition R-HSA-69206,1/129,0.999996


Size of community: 892


Unnamed: 0,Term,Overlap,Adjusted P-value
116,Metabolism Of RNA R-HSA-8953854,87/666,6.097938000000001e-17
117,rRNA Modification In Nucleus And Cytosol R-HSA-6790901,23/60,8.268696e-14
119,Major Pathway Of rRNA Processing In Nucleolus And Cytosol R-HSA-6791226,34/179,8.191116e-11
118,rRNA Processing In Nucleus And Cytosol R-HSA-8868773,35/189,8.191116e-11
120,rRNA Processing R-HSA-72312,35/199,3.095749e-10
0,Protein export,12/23,5.734975e-09
121,mRNA Splicing R-HSA-72172,30/189,1.212048e-07
122,Processing Of Capped Intron-Containing Pre-mRNA R-HSA-72203,34/242,2.259206e-07
123,G0 And Early G1 R-HSA-1538133,9/27,9.921838e-05
124,mRNA Splicing - Major Pathway R-HSA-72163,24/181,0.0001046144


Size of community: 779


Unnamed: 0,Term,Overlap,Adjusted P-value
0,D-Glutamine and D-glutamate metabolism,1/5,0.999996
50,Breast cancer,1/147,0.999996
49,Apoptosis,1/142,0.999996
48,Apelin signaling pathway,1/137,0.999996
47,Natural killer cell mediated cytotoxicity,1/131,0.999996
46,Purine metabolism,1/129,0.999996
45,Cell cycle,1/124,0.999996
51,Gastric cancer,1/149,0.999996
44,Influenza A,2/172,0.999996
42,Leukocyte transendothelial migration,1/114,0.999996


Size of community: 652


Unnamed: 0,Term,Overlap,Adjusted P-value
194,Cytokine Signaling In Immune System R-HSA-1280215,87/702,1.332682e-24
195,Extracellular Matrix Organization R-HSA-1474244,53/291,2.9000380000000003e-22
196,Signaling By Receptor Tyrosine Kinases R-HSA-9006934,66/496,2.780136e-20
197,Signaling By Interleukins R-HSA-449147,59/453,1.2391250000000001e-17
0,Focal adhesion,38/201,2.230156e-16
198,Immune System R-HSA-168256,136/1943,4.364797e-16
1,JAK-STAT signaling pathway,33/162,2.074922e-15
2,Cytokine-cytokine receptor interaction,42/295,5.111131e-14
3,T cell receptor signaling pathway,25/104,1.377719e-13
199,Collagen Formation R-HSA-1474290,23/90,9.721555e-13


Size of community: 555


Unnamed: 0,Term,Overlap,Adjusted P-value
123,Class A/1 (Rhodopsin-like Receptors) R-HSA-373076,90/327,9.027175e-62
124,GPCR Ligand Binding R-HSA-500792,100/458,8.175338e-59
125,Signaling By GPCR R-HSA-372790,116/689,3.773363e-56
0,Neuroactive ligand-receptor interaction,85/341,1.292849e-54
126,GPCR Downstream Signaling R-HSA-388396,101/619,3.562449e-47
127,Peptide Ligand-Binding Receptors R-HSA-375276,51/196,2.4648820000000002e-33
128,G Alpha (Q) Signaling Events R-HSA-416476,43/212,2.708344e-23
129,G Alpha (I) Signaling Events R-HSA-418594,41/312,4.813094e-15
130,Neuronal System R-HSA-112316,44/386,6.496674e-14
131,ADORA2B Mediated Anti-Inflammatory Cytokine Production R-HSA-9660821,26/131,7.78381e-14


Size of community: 512


Unnamed: 0,Term,Overlap,Adjusted P-value
174,Nuclear Receptor Transcription Pathway R-HSA-383280,10/53,0.000376
2,Renin-angiotensin system,5/23,0.014361
0,Cell adhesion molecules,13/148,0.014361
1,Cytokine-cytokine receptor interaction,19/295,0.014361
175,Adherens Junctions Interactions R-HSA-418990,6/29,0.018403
176,Cell-cell Junction Organization R-HSA-421270,8/61,0.02389
177,Hemostasis R-HSA-109582,30/576,0.02389
178,Regulation Of IGF Transport And Uptake By IGFBPs R-HSA-381426,11/123,0.030359
179,Insulin Receptor Recycling R-HSA-77387,5/26,0.035438
180,Cell Surface Interactions At Vascular Wall R-HSA-202733,11/134,0.045002


Size of community: 315


Unnamed: 0,Term,Overlap,Adjusted P-value
92,Metabolism R-HSA-1430728,100/2049,9.923158e-24
93,Biological Oxidations R-HSA-211859,33/218,5.852059e-21
94,Citric Acid (TCA) Cycle And Respiratory Electron Transport R-HSA-1428517,29/163,1.622095e-20
95,Phase I - Functionalization Of Compounds R-HSA-211945,23/104,1.643182e-18
96,Fatty Acid Metabolism R-HSA-8978868,22/173,1.841523e-12
97,Pyruvate Metabolism And Citric Acid (TCA) Cycle R-HSA-71406,13/54,5.813726e-11
98,Cytochrome P450 - Arranged By Substrate Type R-HSA-211897,13/65,5.851105e-10
99,"Respiratory Electron Transport, ATP Synthesis By Chemiosmotic Coupling, Heat Production By Uncoupling Proteins R-HSA-163200",16/112,5.851105e-10
100,Pyruvate Metabolism R-HSA-70268,10/31,5.869726e-10
101,Metabolism Of Lipids R-HSA-556833,37/732,8.343551e-09


Size of community: 251


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,MicroRNAs in cancer,4/310,0.913647
1,Cushing syndrome,1/155,0.913647
2,Kaposi sarcoma-associated herpesvirus infection,1/193,0.913647


Size of community: 137


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Steroid hormone biosynthesis,1/61,0.814966
1,Metabolism of xenobiotics by cytochrome P450,1/76,0.814966
2,Chemical carcinogenesis,1/239,0.96877
3,Herpes simplex virus 1 infection,1/498,0.96877
4,Carboxyterminal Post-Translational Modifications Of Tubulin R-HSA-8955332,1/41,0.999995
5,Generic Transcription Pathway R-HSA-212436,1/1190,0.999995
6,RNA Polymerase II Transcription R-HSA-73857,1/1312,0.999995
7,Post-translational Protein Modification R-HSA-597592,1/1383,0.999995
8,Gene Expression (Transcription) R-HSA-74160,1/1449,0.999995
9,Metabolism Of Proteins R-HSA-392499,1/1890,0.999995


Size of community: 74


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Olfactory transduction,74/440,3.5686550000000003e-126
1,Expression And Translocation Of Olfactory Receptors R-HSA-9752946,71/393,1.615209e-119
2,Olfactory Signaling Pathway R-HSA-381753,71/401,3.895043e-119
3,Sensory Perception R-HSA-9709957,71/616,4.812431e-105


Size of community: 68


Unnamed: 0,Term,Overlap,Adjusted P-value


Size of community: 61


Unnamed: 0,Term,Overlap,Adjusted P-value
2,Keratinization R-HSA-6805567,56/208,1.533436e-107
3,Developmental Biology R-HSA-1266738,56/1073,4.094386e-65
4,Formation Of Cornified Envelope R-HSA-6809371,12/74,1.2979970000000001e-17
0,Staphylococcus aureus infection,7/95,3.08537e-08
1,Estrogen signaling pathway,7/137,1.944051e-07
5,Alpha-oxidation Of Phytanate R-HSA-389599,1/6,0.04540785
6,Peroxisomal Lipid Metabolism R-HSA-390918,1/29,0.1696629
7,Peroxisomal Protein Import R-HSA-9033241,1/63,0.2921762
8,Protein Localization R-HSA-9609507,1/164,0.5147912
9,Fatty Acid Metabolism R-HSA-8978868,1/173,0.5147912


Size of community: 46


Unnamed: 0,Term,Overlap,Adjusted P-value
1,Expression And Translocation Of Olfactory Receptors R-HSA-9752946,46/393,6.26153e-80
2,Olfactory Signaling Pathway R-HSA-381753,46/401,8.375979e-80
0,Olfactory transduction,46/440,5.112601e-78
3,Sensory Perception R-HSA-9709957,46/616,5.521353e-71


Size of community: 37


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,RNA transport,1/186,0.493494
1,Amyotrophic lateral sclerosis,1/364,0.493494


Size of community: 25


Unnamed: 0,Term,Overlap,Adjusted P-value
111,Notch-HLH Transcription Pathway R-HSA-350054,10/28,2.810563e-21
112,NOTCH1 Intracellular Domain Regulates Transcription R-HSA-2122947,10/48,6.911064999999999e-19
113,Constitutive Signaling By NOTCH1 HD+PEST Domain Mutants R-HSA-2894862,10/58,3.650886e-18
114,Signaling By NOTCH1 R-HSA-1980143,10/74,3.729275e-17
0,Alcoholism,11/186,1.474819e-14
1,Neutrophil extracellular trap formation,10/189,7.167032e-13
2,Viral carcinogenesis,10/203,9.835364e-13
115,Signaling By NOTCH R-HSA-157118,10/203,9.941692e-13
116,Diseases Of Signal Transduction By Growth Factor Receptors And Second Messengers R-HSA-5663202,11/424,3.635017e-11
117,HDACs Deacetylate Histones R-HSA-3214815,5/60,2.645048e-07


In [26]:
important_terms = important_terms.sort_values(by="Overlap")
display(HTML(important_terms[["Community Index","Community Size",'Term','Overlap','Adjusted P-value',"P-value"]].head(50).to_html(max_cols=None)))

Unnamed: 0,Community Index,Community Size,Term,Overlap,Adjusted P-value,P-value
13,1,1188,Regulation Of Cell-Substrate Junction Assembly (GO:0090109),11/21,1.925791e-07,6.301672e-09
22,1,1188,Positive Regulation Of p38MAPK Cascade (GO:1900745),11/24,9.193603e-07,3.778523e-08
21,1,1188,Peptidyl-Tyrosine Dephosphorylation (GO:0035335),12/29,9.02513e-07,3.685655e-08
26,6,555,Postsynaptic Specialization Membrane (GO:0099634),12/38,3.663528e-08,2.598247e-10
18,1,1188,Negative Regulation Of Mitotic Cell Cycle (GO:0045930),13/34,7.908138e-07,3.084588e-08
23,5,652,Phosphotyrosine Residue Binding (GO:0001784),13/39,4.533111e-08,1.552435e-10
25,6,555,Neuropeptide Receptor Activity (GO:0008188),14/36,9.31231e-12,2.956289e-13
20,1,1188,Regulation Of Microtubule Polymerization (GO:0031113),14/40,8.450087e-07,3.40658e-08
14,1,1188,Negative Regulation Of I-kappaB kinase/NF-kappaB Signaling (GO:0043124),15/42,2.385459e-07,7.99316e-09
19,1,1188,Regulation Of Epidermal Growth Factor Receptor Signaling Pathway (GO:0042058),15/46,8.31889e-07,3.310134e-08


In [18]:
disease_size_cap = 200
disease_score_cap = 0.0001
important_diseases = pd.DataFrame(columns=["Community Index","Community Size","Term", "Overlap", "Adjusted P-value"])

In [19]:
# Disease-gene enrichment libraries
disease_sets = [
    'DisGeNET_2020', # curated gene–disease associations
    'GWAS_Catalog_2023', # genome-wide association hits
    'OMIM_Disease', # Mendelian disorders
    'Jensen_DISEASES' # text-mined associations
]

# # Disease-gene enrichment Analysis; save terms with small size and high p-value
i = 0
for community in communities_HGNC:
    # Gene Ontology enrichment
    enr_disease = gp.enrichr(
        gene_list=community,
        gene_sets=disease_sets,
        organism='Human',
        outdir=None # don't write to disk
    )
    enr_disease_df = enr_disease.results.sort_values('Adjusted P-value')
    print(f"Size of community: {len(community)}")

    mask = ((enr_disease_df["Overlap"].apply(lambda x: int(x.split("/")[1]) < disease_size_cap)) 
    & (enr_disease_df["Adjusted P-value"] < disease_score_cap))
        
    filtered = enr_disease_df[mask].copy()
    if not filtered.empty:
        filtered.loc[:, "Community Index"] = i
        filtered.loc[:, "Community Size"] = len(community)
        important_diseases = pd.concat([important_diseases, filtered], ignore_index=True)

    display(HTML(enr_disease_df[['Term','Overlap','Adjusted P-value']].head(10).to_html(max_cols=None)))
    i += 1



Size of community: 1334


Unnamed: 0,Term,Overlap,Adjusted P-value
3042,Peroxisomal disease,6/14,0.09275
3043,Vesicoureteral reflux,8/26,0.09275
3044,Amelogenesis imperfecta,6/18,0.245657
0,Vaginal Microbiome MetaCyc Pathway (PWY-1622|formaldehyde Assimilation I (Serine Pathway)),8/24,0.322647
3045,Hermansky-Pudlak syndrome,4/9,0.34146
3046,Aicardi syndrome,5/15,0.34146
3047,Mitral valve prolapse,5/15,0.34146
3048,Intermediate coronary syndrome,3/5,0.349828
1,2-Hydroxy-3-Methylvalerate Levels In Elite Athletes,4/6,0.397831
3049,Zellweger syndrome,4/11,0.494349




Size of community: 1188


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Height,489/6159,1.078122e-11
2657,Cancer,55/300,5.216836e-11
2658,Carcinoma,785/11318,1.734815e-09
2,Eosinophil Counts,84/780,7.052708e-05
1,Mean Corpuscular Hemoglobin,96/930,7.052708e-05
3,Vertex-wise Cortical Thickness,52/405,8.622437e-05
4,Platelet Distribution Width,53/422,0.0001071056
2659,Kidney cancer,211/2584,0.000164402
5,Mean Reticulocyte Volume,53/452,0.0007599382
6,Vertex-wise Sulcal Depth,63/576,0.0007747735




Size of community: 985


Unnamed: 0,Term,Overlap,Adjusted P-value
1282,leukemia,1/78,0.980688
1281,mental retardation,2/114,0.980688
1280,retinitis pigmentosa,1/51,0.980688
0,Lung Function In Never Smokers (Low FEV1 Vs Average FEV1),2/5,0.999996
859,Coronary Artery Disease Or Factor VIII Levels (Pleiotropy),2/71,0.999996
858,Squamous Cell Lung Carcinoma,2/71,0.999996
857,Hand Grip Strength,4/125,0.999996
856,Parental Extreme Longevity (95 Years And Older),1/40,0.999996
855,Concentration Of Small HDL Particles,1/40,0.999996
854,Bone Mineral Density (Spine),1/40,0.999996




Size of community: 892


Unnamed: 0,Term,Overlap,Adjusted P-value
1536,fanconi anemia,3/13,0.235068
1619,Donohue Syndrome,1/5,0.847572
1684,Hypertrichosis,2/24,0.847572
1685,46 XY gonadal dysgenesis,1/8,0.847572
1686,Anogenital venereal wart,1/8,0.847572
1687,Oral squamous cell carcinoma,1/8,0.847572
1689,Brain glioma,1/8,0.847572
1690,Central nervous system disease,1/8,0.847572
1691,Cervical dystonia,1/8,0.847572
1692,Chromosome 22q11.2 microduplication syndrome,1/8,0.847572




Size of community: 779


Unnamed: 0,Term,Overlap,Adjusted P-value
1736,Proctitis,2/5,0.855989
1853,Ebola hemorrhagic fever,1/9,0.855989
1854,Specific developmental disorder,1/9,0.855989
1855,Mood disorder,1/9,0.855989
1856,Subependymal giant cell astrocytoma,1/9,0.855989
1857,Bradyopsia,1/9,0.855989
1858,"blepharophimosis, ptosis, and epicanthus inversus syndrome",1/9,0.855989
1859,Corneal ectasia,1/9,0.855989
1860,Crimean-Congo hemorrhagic fever,1/9,0.855989
1852,Cerebral palsy,2/28,0.855989




Size of community: 652


  important_diseases = pd.concat([important_diseases, filtered], ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Vertex-wise Sulcal Depth,53/576,2.425726e-08
1,Height,275/6159,3.320534e-07
2,Vertex-wise Cortical Surface Area,39/427,5.788528e-06
3,Blood Protein Levels,60/890,3.472324e-05
4,Vertical Cup-Disc Ratio,23/193,3.472324e-05
5,Cortical Surface Area,41/506,3.472324e-05
6,Lung Function (FEV1/FVC),45/589,3.873304e-05
7,Vertical Cup-Disc Ratio (Adjusted For Vertical Disc Diameter),12/55,4.526814e-05
10,Vertical Cup-Disc Ratio (Multi-Trait Analysis),12/58,6.119801e-05
9,Respiratory Diseases,14/80,6.119801e-05




Size of community: 555


Unnamed: 0,Term,Overlap,Adjusted P-value
1448,Syndactyly,8/53,0.049135
1449,Meningocele,3/5,0.050761
1450,Gitelman syndrome,3/6,0.066284
1451,Impetigo,3/7,0.085204
1452,Hypogonadotropism,3/8,0.106817
0,"1,5-Anhydroglucitol (1,5-AG) Levels In Elite Athletes",5/17,0.108198
1453,Pituitary adenoma,5/31,0.116068
1454,Blepharophimosis,4/19,0.116068
1455,obsessive-compulsive disorder,5/33,0.121992
1456,Lens subluxation,3/10,0.121992




Size of community: 512


Unnamed: 0,Term,Overlap,Adjusted P-value
2180,Ewing sarcoma,4/6,0.003979
2181,POEMS syndrome,3/5,0.052244
2182,Esophageal atresia,5/23,0.05373
2183,Chickenpox,3/7,0.087982
2184,Polyneuropathy,5/29,0.100253
2185,Tonsillitis,4/18,0.105918
0,Blood Protein Levels,43/890,0.112421
1,"COVID-19 (Severe Vs Tested, Not Severe)",7/44,0.125288
2186,Diabetes insipidus,3/10,0.154645
2188,Keratopathy,3/11,0.154645




Size of community: 315


Unnamed: 0,Term,Overlap,Adjusted P-value
954,Cholestasis,8/67,0.001906
946,myocardial infarction,2/16,0.205226
10,Smoking Behaviour (Cigarettes Smoked Per Day),4/41,0.33232
9,Endothelial Monocyte-Activating Polypeptide 2 Levels,2/6,0.33232
8,PH And SEC7 Domain-Containing Protein 2 Levels,2/6,0.33232
7,Lipoprotein (A) Levels,5/63,0.33232
0,Triglycerides To Total Lipids Ratio In Chylomicrons And Extremely Large VLDL,3/13,0.33232
5,X-16935 Levels,2/5,0.33232
4,Glycochenodeoxycholate 3-Sulfate Levels,2/5,0.33232
3,PH And SEC7 Domain-Containing Protein 2 Levels (PSD2.9118.7.3),2/5,0.33232




Size of community: 251


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
295,Congenital aphakia,1/5,0.430264
311,Chromophobe adenocarcinoma,1/12,0.430264
310,Motion sickness,1/12,0.430264
309,Hepatoblastoma,1/11,0.430264
308,Intestinal obstruction,1/11,0.430264
307,Cornea plana,1/11,0.430264
305,Eclampsia,1/8,0.430264
304,Multiple personality disorder,1/7,0.430264
306,Hydrocele,1/9,0.430264
302,Otitis externa,1/6,0.430264




Size of community: 137


Unnamed: 0,Term,Overlap,Adjusted P-value
419,leber amaurosis,3/13,0.000257
420,retinitis pigmentosa,3/51,0.007726
422,Leber congenital amaurosis,3/24,0.038373
423,Fundus dystrophy,4/78,0.056913
424,Retinitis pigmentosa,4/83,0.056913
421,cone-rod dystrophy,1/15,0.097999
425,Cone dystrophy,3/54,0.098776
426,Ciliopathy,3/58,0.098776
427,Cystic kidney disease,2/24,0.130075
437,Cryptorchidism,2/58,0.251405




Size of community: 74


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
36,Hemolytic anemia,5/72,8.4e-05
37,Carcinoma,56/11318,0.003007
38,Thalassemia,2/45,0.048259
39,Lung cancer,5/400,0.048902
40,Polycythemia,1/16,0.138254
7,Pyridoxate Levels In Elite Athletes,1/10,0.16379
0,Odorant Perception,1/5,0.16379
5,Anxiety Disorder,1/8,0.16379
4,Loneliness (MTAG),1/8,0.16379
3,Diabetic Retinopathy (Moderate NPDR And PDR),1/7,0.16379




Size of community: 68


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Hip Circumference Variance,1/9,0.505423
1,Waist-to-hip Ratio Adjusted For BMI (Age >50),1/9,0.505423
2,Schizophrenia Vs Tourette's Syndrome And Other Tic Disorders (Ordinary Least Squares (OLS)),1/13,0.505423
3,Caudate Nucleus Volume,1/19,0.527684
4,Type 1 Diabetes (Age At Diagnosis),1/23,0.527684
5,Cognitive Decline Rate In Late Mild Cognitive Impairment,1/37,0.691237
6,Waist-to-hip Ratio Adjusted For BMI X Sex X Age Interaction (4Df Test),1/49,0.769359
7,Rosacea Symptom Severity,1/62,0.833945
8,Alzheimer's Disease Or Family History Of Alzheimer's Disease,1/79,0.845277
9,Photoreceptor Cell Layer Thickness Phenotypes (MTAG),1/81,0.845277




Size of community: 61


Unnamed: 0,Term,Overlap,Adjusted P-value
41,Ichthyosis vulgaris,2/14,0.006642
40,Monilethrix,2/14,0.006642
39,Eosinophilic esophagitis,2/12,0.006642
38,Seborrheic dermatitis,2/11,0.006642
37,Epidermolytic hyperkeratosis,2/10,0.006642
36,Eczema herpeticum,2/7,0.006642
42,Lichen planus,2/17,0.008458
43,Keratosis,2/27,0.01873
0,"Tinnitus (Chronic, Bothersome)",2/14,0.028464
35,ichthyosis,1/17,0.050623




Size of community: 46


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
26,Carcinoma,44/11318,1.848486e-08
0,Plasma Protease C1 Inhibitor Levels,2/9,0.004794374
27,Oropharynx cancer,1/6,0.04259636
28,Cervical dystonia,1/8,0.04259636
2,N-acetyl-aspartyl-glutamate (NAAG) Levels In Elite Athletes,1/6,0.1186613
3,Vaginal Microbiome MetaCyc Pathway (GLUCOSE1PMETAB-PWY|glucose And Glucose-1-Phosphate Degradation),1/8,0.1186613
1,N-acetylglycine Levels In Elite Athletes,1/5,0.1186613
4,Hypertrophic Cardiomyopathy (Sarcomere Positive),1/14,0.1517616
5,Smoking Status (Ever Vs Never Smokers),2/128,0.1517616
6,Tea Intake (UKB Data Field 1488),1/25,0.2078999




Size of community: 37


  self.results = pd.concat(self.results, ignore_index=True)


Unnamed: 0,Term,Overlap,Adjusted P-value
0,Lymphoma,1/5,0.153869
1,Asthma And Major Depressive Disorder,1/5,0.153869
2,F-savour/caloric Food Liking (Derived Food-Liking Factor),1/7,0.153869
3,F-highly Palatable Foods Liking (Derived Food-Liking Factor),1/10,0.153869
4,F-cake/biscuits Liking (Derived Food-Liking Factor),1/10,0.153869
5,Angioedema In Response To Angiotensin-Converting Enzyme Inhibitor And/Or Angiotensin Receptor Blocker,1/10,0.153869
6,Suicidal Thoughts And Behaviors,1/11,0.153869
7,Gait Speed In Old Age,1/12,0.153869
8,Caffeine Consumption From Tea,1/18,0.204058
9,Gut Microbiome Composition (Summer),1/22,0.223661




Size of community: 25


Unnamed: 0,Term,Overlap,Adjusted P-value
402,Succinic semialdehyde dehydrogenase deficiency,2/16,0.016315
403,Endometrial cancer,4/292,0.016315
404,Large intestine cancer,3/122,0.016315
405,DOID:9917,2/30,0.016925
392,ehlers-danlos,1/11,0.030989
394,long qt syndrome,1/12,0.030989
395,melanoma,1/13,0.030989
396,cone-rod dystrophy,1/15,0.030989
397,hypogonadism,1/15,0.030989
393,osteoporosis,1/11,0.030989


In [20]:
important_diseases = important_diseases.sort_values(by="Community Index")
display(HTML(important_diseases[["Community Index","Community Size",'Term','Overlap','Adjusted P-value']].head(50).to_html(max_cols=None)))

Unnamed: 0,Community Index,Community Size,Term,Overlap,Adjusted P-value
0,5,652,Vertical Cup-Disc Ratio,23/193,3.5e-05
1,5,652,Vertical Cup-Disc Ratio (Adjusted For Vertical Disc Diameter),12/55,4.5e-05
2,5,652,Vertical Cup-Disc Ratio (Multi-Trait Analysis),12/58,6.1e-05
3,5,652,Respiratory Diseases,14/80,6.1e-05
4,5,652,Glaucoma (Primary Open-Angle),15/92,6.1e-05
5,11,74,Hemolytic anemia,5/72,8.4e-05
