In [None]:
import json
from scipy.sparse import load_npz,save_npz,diags,csr_matrix,issparse
from matplotlib.patches import Wedge, Patch
import numpy as np
import networkx as nx
import pickle
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Disease label; it is interpolated into the input/output paths below.
DISEASE = "BIPOLAR"
DISEASE_FOLDER = f"../output/{DISEASE}/"
RESULT_FOLDER = DISEASE_FOLDER + "leiden_results"
DGIDB_DIRECTORY = f"../../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
MSIGDB_DIRECTORY = "../../Gen_Hypergraph/output/MSigDB_Full/"
# Base names (without the ".pkl" extension) of the pickles read from RESULT_FOLDER.
RESULT_COMMUNITIES = "result_communities_agg"
RESULT_GRAPH = "result_graph_agg"

# Gene -> index mapping stored as JSON in the disease folder.
# presumably these indices address rows/columns of the similarity matrix — TODO confirm
with open(DISEASE_FOLDER + "gene_to_index_distinct.json", "r") as file:
    gene_to_index_distinct = json.load(file)
    
# The DGIdb mapping is optional: fall back to an empty dict when the file is absent.
try:
    with open(DGIDB_DIRECTORY + f"gene_to_index_{DISEASE}.json", "r") as file:
        DGIDB_gene_to_index = json.load(file)
except FileNotFoundError:
    DGIDB_gene_to_index = {}
    print("File not found. Setting DGIDB_gene_to_index to be {}.")
    
    
# NOTE: DISEASE_FOLDER already ends with "/", so this path contains a double slash.
sim_mat = load_npz(f"{DISEASE_FOLDER}/agg_sim_mat.npz")

# Load communities and graph

In [None]:
# Load the result communities and the result graph from RESULT_FOLDER.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open(f"{RESULT_FOLDER}/{RESULT_COMMUNITIES}.pkl", "rb") as f:
    communities = pickle.load(f)
with open(f"{RESULT_FOLDER}/{RESULT_GRAPH}.pkl", "rb") as f:
    graph = pickle.load(f)

# Plot z-score and degree distributions

In [None]:
# Z-score distribution
def compute_within_community_zscores(G, communities):
    """
    Compute within-community degree z-scores for each node.

    The z-score is computed inside this function rather than through a helper
    defined in a later cell, so the function works on a fresh kernel when the
    notebook is executed top to bottom.

    Parameters
    ----------
    G : networkx.Graph
        Graph whose nodes match those in `communities`. Only
        ``G.edges(data=True)`` is read. Uses edge weight from the 'weight'
        attribute if present; otherwise 1.0.
    communities : list of list
        communities[c] is a list of node IDs in community c. If a node is
        listed in more than one community, the last listing wins.

    Returns
    -------
    pandas.DataFrame
        Columns: ['node', 'community', 'k_in', 'zscore']. One row per node
        listed in `communities`; `k_in` is the summed weight of edges whose
        two endpoints share the node's community (0.0 if there are none).
        `zscore` standardizes `k_in` within each community using the
        population standard deviation; communities with no variation get 0.
    """
    # 1. Node -> community map
    node_to_comm = {
        node: cid
        for cid, comm in enumerate(communities)
        for node in comm
    }

    # 2. Initialize within-community degree for every node
    k_in = dict.fromkeys(node_to_comm, 0.0)

    # 3. Iterate over edges once, add weight if endpoints are in the same community
    for u, v, data in G.edges(data=True):
        if u not in node_to_comm or v not in node_to_comm:
            continue  # edge touches a node that belongs to no community

        if node_to_comm[u] == node_to_comm[v]:
            w = data.get("weight", 1.0)
            k_in[u] += w
            k_in[v] += w

    # 4. Build DataFrame
    rows = [
        (node, node_to_comm[node], k_val)
        for node, k_val in k_in.items()
    ]
    df = pd.DataFrame(rows, columns=["node", "community", "k_in"])

    # Nothing to standardize: return the empty frame with the documented columns.
    if df.empty:
        df["zscore"] = pd.Series(dtype=float)
        return df

    def _population_zscore(values):
        # Standardize one community's k_in values; all zeros when there is no spread.
        arr = np.asarray(values, dtype=float)
        if arr.size == 0:
            return arr
        std = arr.std(ddof=0)
        if std == 0 or np.isnan(std):
            return np.zeros_like(arr)
        return (arr - arr.mean()) / std

    # 5. Compute z-scores per community
    df["zscore"] = df.groupby("community")["k_in"].transform(_population_zscore)

    return df

def plot_zscore_hist_all(df_z, bins=20, per_row=4, max_communities=None):
    """
    Plot one z-score histogram per community on a grid of subplots.

    Parameters
    ----------
    df_z : pandas.DataFrame
        Must contain the columns 'community' and 'zscore'.
    bins : int, optional
        Number of histogram bins per panel. Default is 20.
    per_row : int, optional
        Number of panels per grid row. Default is 4.
    max_communities : int or None, optional
        If given, only the first `max_communities` community ids (in sorted
        order) are plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When there is nothing to
        plot, a message is printed and no figure is created.
    """
    community_ids = sorted(df_z["community"].unique())
    if max_communities is not None:
        community_ids = community_ids[:max_communities]

    # A zero-row grid is rejected by plt.subplots, so bail out early.
    if not community_ids:
        print("No communities to plot.")
        return

    ncols = per_row
    nrows = int(np.ceil(len(community_ids) / ncols))

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(nrows, ncols, figsize=(4*ncols, 3*nrows), squeeze=False)
    axes = axes.ravel()

    for ax, cid in zip(axes, community_ids):
        zvals = df_z.loc[df_z["community"] == cid, "zscore"]
        ax.hist(zvals, bins=bins)
        ax.set_title(f"Community {cid}")
        ax.set_xlabel("z-score")
        ax.set_ylabel("Frequency")

    # Hide the unused panels in the last row.
    for ax in axes[len(community_ids):]:
        ax.axis("off")

    plt.tight_layout()
    plt.show()

In [None]:
# Per-node within-community z-scores, then one histogram per community
# (limited to 20 communities, 4 panels per row).
communities_df = compute_within_community_zscores(graph, communities)
plot_zscore_hist_all(communities_df, bins=20, per_row=4,max_communities=20)

In [None]:
# Degree distribution
def compute_within_community_degrees(G, communities):
    """
    Sum, for every node listed in `communities`, the weight of its edges
    whose other endpoint lies in the same community.

    Edge weight is read from the 'weight' attribute and defaults to 1.0.
    Returns a DataFrame with columns ['node', 'community', 'k_in'].
    """
    # Node -> community id; a node listed more than once keeps its last community.
    membership = {}
    for cid, members in enumerate(communities):
        for node in members:
            membership[node] = cid

    strength = dict.fromkeys(membership, 0.0)

    for u, v, attrs in G.edges(data=True):
        same_community = (
            u in membership
            and v in membership
            and membership[u] == membership[v]
        )
        if same_community:
            edge_weight = attrs.get("weight", 1.0)
            strength[u] += edge_weight
            strength[v] += edge_weight

    records = [(node, membership[node], total) for node, total in strength.items()]
    return pd.DataFrame(records, columns=["node", "community", "k_in"])


def plot_topk_community_degree_distributions(G, communities, top_k=20, normalize=True, bins=20):
    """
    Plot within-community degree distributions for the top_k largest communities.

    Parameters
    ----------
    G : networkx.Graph
        Input graph.
    communities : list[list]
        List of node lists (each representing a community).
    top_k : int, optional
        Number of largest communities (by size) to plot. Default is 20.
        Values <= 0 plot nothing.
    normalize : bool, optional
        If True, y-axis shows probability instead of count. Default is True.
    bins : int, optional
        Number of bins in each histogram. Default is 20.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. When there is nothing to
        plot, a message is printed and no figure is created.
    """
    df = compute_within_community_degrees(G, communities)

    # Select top_k largest communities by node count
    comm_sizes = df["community"].value_counts().sort_values(ascending=False)
    top_comms = comm_sizes.index[:max(top_k, 0)]

    # A zero-row grid is rejected by plt.subplots, so bail out early.
    n_panels = len(top_comms)
    if n_panels == 0:
        print("No communities to plot.")
        return

    # Size the grid by the number of communities actually plotted, which may
    # be smaller than top_k.
    ncols = 5
    nrows = int(np.ceil(n_panels / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=(4*ncols, 3*nrows), squeeze=False)
    axes = axes.ravel()

    for ax, cid in zip(axes, top_comms):
        vals = df.loc[df["community"] == cid, "k_in"].values

        # Equal per-node weights make the bar heights sum to 1.
        weights = np.ones_like(vals) / len(vals) if normalize else None
        ax.hist(vals, bins=bins, weights=weights, color="steelblue", alpha=0.7)
        ax.set_title(f"Community {cid} (n={len(vals)})", fontsize=10)
        ax.set_xlabel("$k_{in}$", fontsize=9)
        ax.set_ylabel("Prob." if normalize else "Count", fontsize=9)
        ax.tick_params(axis="both", labelsize=8)

    # Turn off unused axes
    for ax in axes[n_panels:]:
        ax.axis("off")

    fig.suptitle(f"Within-community degree distributions (Top {top_k})", fontsize=14)
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()

In [None]:
# Within-community degree histograms for the 20 largest communities, normalized to probabilities.
plot_topk_community_degree_distributions(graph, communities, top_k=20, normalize=True)

# Select Communities

In [None]:
def zscore(values):
    """
    Standardize `values` to zero mean and unit population standard deviation.

    The input is converted to a float array. An empty input is returned as an
    empty array; when the standard deviation is zero or NaN, an array of
    zeros of the same shape is returned instead of dividing.
    """
    data = np.asarray(values, dtype=float)
    if data.size == 0:
        return data

    spread = data.std(ddof=0)
    if np.isnan(spread) or spread == 0:
        # No usable variation: every z-score is defined as 0.
        return np.zeros_like(data)

    centered = data - data.mean()
    return centered / spread

In [None]:
def community_central_genes_by_score(G, community_nodes, weight="weight",score_cap = 1):
    """
    Return the nodes of one community whose within-community degree z-score
    is at least `score_cap`, ordered from highest to lowest z-score.

    The degree of each node is taken on the subgraph of `G` induced by
    `community_nodes`, weighted by the `weight` edge attribute, and the
    z-scores are computed across that subgraph's nodes with `zscore`.
    """
    # Induced subgraph restricted to the community's members.
    induced = G.subgraph(set(community_nodes)).copy()

    # Within-community (weighted) degree, in the subgraph's node iteration order.
    members = list(induced)
    strengths = np.array(
        [induced.degree(node, weight=weight) for node in members],
        dtype=float,
    )
    z_by_node = dict(zip(members, zscore(strengths)))

    # Highest z-score first, then keep only nodes at or above the cut-off.
    ordered = sorted(induced.nodes(), key=z_by_node.__getitem__, reverse=True)
    return [node for node in ordered if z_by_node[node] >= score_cap]

In [None]:
# Keep only communities with at least SIZE_CAP nodes, and within each of them
# only the nodes whose within-community degree z-score is >= SCORE_CAP.
SIZE_CAP = 30
SCORE_CAP = 0

communities_selected = []
for community in communities:
    if (len(community) >= SIZE_CAP):
        important_nodes = community_central_genes_by_score(graph,community,score_cap = SCORE_CAP)
        # a = nodes kept, b = community size, a/b = fraction kept.
        a = len(important_nodes)
        b = len(community)
        print(a,b,a/b)
        communities_selected.append(important_nodes)
        
# NOTE(review): the pickle is written here, before the later cell that removes
# empty communities, so any empty lists produced by the filter are saved too.
with open(f"{RESULT_FOLDER}/result_communities_selected.pkl", "wb") as f:
    pickle.dump(communities_selected, f)

In [None]:
# Inspect the selected communities and how many there are.
print(communities_selected)
print(len(communities_selected))

In [None]:
# Filter out the communities left empty by the first filter.
# The list is rebuilt in one pass instead of deleting entries while indexing
# over the original length: that approach skips the element following each
# deletion and runs past the end of the shortened list (IndexError).
# Slice assignment keeps the same list object, so other references to it
# see the filtered contents as well.
communities_selected[:] = [
    selected for selected in communities_selected if len(selected) > 0
]

In [None]:
# Inspect the selected communities again after the empty-community filter.
print(communities_selected)
print(len(communities_selected))