In [None]:
from scipy.sparse import load_npz,save_npz,diags,csr_matrix,issparse
import json
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Choose the disease to analyze.
# This identifier is interpolated into the input/output directory paths, the
# DGIDB gene-to-index file name, and the saved figure name used by later cells.
DISEASE = "LEUKEMIA" 

In [None]:
# Load every matrix and mapping used by the statistics below.
# All locations are relative to the notebook and depend on DISEASE.
OUTPUT_DIRECTORY = f"../output/{DISEASE}/"
DGIDB_DIRECTORY = f"../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
MSIGDB_DIRECTORY = "../Gen_Hypergraph/output/MSigDB_Full/"
DISEASE_FOLDER = f"../Methods/output/{DISEASE}/"

## DGIDB layer
DGIDB_binary_matrix = load_npz(f"{DGIDB_DIRECTORY}hypergraph_incidence_matrix_binary.npz")
DGIDB_weighted_matrix = load_npz(f"{DGIDB_DIRECTORY}hypergraph_incidence_matrix_weighted.npz")
# NOTE(review): this layer reads "diag_gene_weight_matrix.npz" while the MSigDB
# layer reads "gene_weight_diag_matrix.npz" -- confirm both file names are intended.
DGIDB_gene_weight_diag_matrix = load_npz(f"{DGIDB_DIRECTORY}diag_gene_weight_matrix.npz")
DGIDB_diag_node_degree_matrix = load_npz(f"{DGIDB_DIRECTORY}diag_node_degree_matrix.npz")
DGIDB_inverse_diag_edge_degree_matrix = load_npz(f"{DGIDB_DIRECTORY}inverse_diag_edge_degree_matrix.npz")

## MSIGDB layer
MSIGDB_binary_matrix = load_npz(f"{MSIGDB_DIRECTORY}hypergraph_incidence_matrix_binary.npz")
MSIGDB_weighted_matrix = load_npz(f"{MSIGDB_DIRECTORY}hypergraph_incidence_matrix_weighted.npz")
MSIGDB_gene_weight_diag_matrix = load_npz(f"{MSIGDB_DIRECTORY}gene_weight_diag_matrix.npz")
MSIGDB_diag_node_degree_matrix = load_npz(f"{MSIGDB_DIRECTORY}diag_node_degree_matrix.npz")
MSIGDB_inverse_diag_edge_degree_matrix = load_npz(f"{MSIGDB_DIRECTORY}inverse_diag_edge_degree_matrix.npz")

# Gene -> index mapping stored in the disease folder as JSON.
with open(f"{DISEASE_FOLDER}gene_to_index_distinct.json", "r") as file:
    gene_to_index_distinct = json.load(file)

In [None]:
# Number of genes: size of the gene_to_index_distinct mapping
print(f"Number of genes: {len(gene_to_index_distinct)}")

In [None]:
# Number of DGIDB genes: row count of the DGIDB binary incidence matrix
print(f"Number of DGIDB genes: {DGIDB_binary_matrix.shape[0]}")

In [None]:
# Number of MSigDB genes: row count of the MSigDB binary incidence matrix
print(f"Number of MSigDB genes: {MSIGDB_binary_matrix.shape[0]}")

In [None]:
# Percent of shared genes between DGIDB and MSigDB

# Load each layer's gene -> index mapping from its JSON file.
with open(DGIDB_DIRECTORY + f"gene_to_index_{DISEASE}.json", "r") as file:
    DGIDB_gene_to_index = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    MSIGDB_gene_to_index = json.load(file)

# Inverse lookup (index -> gene) for the DGIDB layer.
DGIDB_index_to_gene = {index: gene for gene, index in DGIDB_gene_to_index.items()}

# Jump probability for matching genes
w = 1

# Per-layer gene counts; the two layers may differ in size, which is why the
# inter-layer matrix below is rectangular (MSigDB rows x DGIDB columns).
num_genes_dgidb = len(DGIDB_gene_to_index)
num_genes_msigdb = len(MSIGDB_gene_to_index)

# Genes whose symbol appears in both gene-to-index mappings.
shared_genes = DGIDB_gene_to_index.keys() & MSIGDB_gene_to_index.keys()
num_shared_genes = len(shared_genes)

# Inter-layer matrix: entry (msigdb_index, dgidb_index) is w for every shared gene.
# NOTE(review): this dense array and DGIDB_index_to_gene are not used by any
# other cell visible in this notebook; the statistic below only needs the
# count. They are kept so that any downstream use keeps working -- consider
# removing them (or using a sparse matrix) if nothing else depends on them.
interlayer_transition_matrix = np.zeros((num_genes_msigdb, num_genes_dgidb))
for gene in shared_genes:
    idx_msigdb = MSIGDB_gene_to_index[gene]
    idx_dgidb = DGIDB_gene_to_index[gene]
    interlayer_transition_matrix[idx_msigdb, idx_dgidb] = w  # Set jump probability

# Share of all distinct genes that occur in both layers. An empty
# gene_to_index_distinct mapping yields 0 instead of a ZeroDivisionError.
num_distinct_genes = len(gene_to_index_distinct)
shared_fraction = num_shared_genes / num_distinct_genes if num_distinct_genes else 0.0

print(f"{shared_fraction:.2%}", "of genes are shared between the two layers")

In [None]:
# Number of DGIDB edges: column count of the DGIDB binary incidence matrix
print(f"Number of DGIDB edges: {DGIDB_binary_matrix.shape[1]}")

In [None]:
# Number of MSigDB edges: column count of the MSigDB binary incidence matrix
print(f"Number of MSigDB edges: {MSIGDB_binary_matrix.shape[1]}")

In [None]:
def plot_two_hypergraph_distributions(H1, H2, savepath=None, title=DISEASE,
                                      labels=("DGIDB", "MSigDB")):
    """
    Plot hyperedge-size and node-degree distributions for TWO incidence matrices.

    Each matrix is treated as nodes (rows) x hyperedges (columns); any non-zero
    entry counts as membership. The figure is a 2x2 grid:

        Row 1: H1 hyperedge size (left), H1 node degree (right)
        Row 2: H2 hyperedge size (left), H2 node degree (right)

    The y-axis of every histogram is the percentage of hyperedges (or nodes)
    falling in each integer bin.

    Parameters
    ----------
    H1, H2 : scipy sparse matrix or array-like
        Incidence matrices for the first and second row of panels.
    savepath : str or path-like, optional
        If truthy, the figure is written there with ``dpi=300`` and a tight
        bounding box.
    title : str
        Figure-level title. The default is the value ``DISEASE`` had when this
        function was defined, so re-run this cell after changing ``DISEASE``
        or pass ``title`` explicitly.
    labels : sequence of two str
        Names used in the panel titles for ``H1`` and ``H2`` respectively.

    Returns
    -------
    (fig, axes)
        The matplotlib figure and its 2x2 array of axes.

    Raises
    ------
    ValueError
        If ``labels`` does not contain exactly two entries.
    """
    if len(labels) != 2:
        raise ValueError("labels must contain exactly two entries (one per matrix)")

    def compute_sizes_degrees(H):
        """Return (hyperedge sizes, node degrees) as 1-D arrays of counts."""
        if issparse(H):
            membership = H.astype(bool)
            # Sparse sums come back as 1xN / Nx1 matrices; flatten them.
            sizes = np.asarray(membership.sum(axis=0)).ravel()
            degrees = np.asarray(membership.sum(axis=1)).ravel()
        else:
            membership = np.asarray(H) != 0
            sizes = membership.sum(axis=0)
            degrees = membership.sum(axis=1)
        return sizes, degrees

    def make_bins(counts):
        """Unit-width bin edges centred on the integers present in ``counts``."""
        if len(counts) == 0:
            return np.arange(0.5, 2.5, 1)
        # Start at 0.5 as before, but open a bin for 0 when isolated nodes or
        # empty hyperedges exist. Otherwise those entries would be dropped
        # from the bars while still counted in the percentage denominator,
        # and an all-zero input would yield a single edge (no bin at all).
        lower_edge = 0.5 if counts.min() >= 1 else -0.5
        return np.arange(lower_edge, int(counts.max()) + 1.5, 1)

    def percent_weights(counts):
        """Per-sample weights that turn histogram counts into percentages."""
        return np.ones_like(counts, dtype=float) / max(len(counts), 1) * 100

    # ------ Create a 2x2 grid ------
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # spacing between rows and columns
    fig.subplots_adjust(
        top=0.88,     # leave room for suptitle
        hspace=0.35,  # vertical spacing between rows
        wspace=0.25   # horizontal spacing between columns
    )

    fig.suptitle(title, fontsize=16, y=0.98)

    # The earlier "Layer 1"/"Layer 2" set_title calls were removed: the
    # per-panel set_title below replaced them, so they never showed up.
    for row, (H, label) in enumerate(zip((H1, H2), labels)):
        sizes, degrees = compute_sizes_degrees(H)
        panels = (
            (sizes, "Hyperedge Size", "Hyperedge Size (# of Nodes)"),
            (degrees, "Node Degree", "Node Degree (# of Hyperedges)"),
        )
        for col, (counts, quantity, xlabel) in enumerate(panels):
            ax = axes[row][col]
            ax.hist(
                counts,
                bins=make_bins(counts),
                weights=percent_weights(counts),
                edgecolor="black",
            )
            ax.set_title(f"{label} {quantity} Distribution")
            ax.set_xlabel(xlabel)
            ax.set_ylabel("Percentage (%)")

    if savepath:
        fig.savefig(savepath, dpi=300, bbox_inches='tight')

    return fig, axes


In [None]:
# Plot both layers for the selected disease. The title is taken from DISEASE so
# it always matches the loaded data and the saved file name (it was previously
# hard-coded to "Schizophrenia").
# The save path has no extension, so matplotlib picks the output format itself.
plot_two_hypergraph_distributions(
    DGIDB_binary_matrix,
    MSIGDB_binary_matrix,
    title=DISEASE,
    savepath=f"../Graphs/hypergraph_stats_{DISEASE}",
)
plt.show()