In [1]:
import pandas as pd
import ast
import os
from scipy.sparse import dok_matrix, save_npz, diags
import json
import numpy as np
OUTPUT_FOLDER = "./output/MSigDB_FULL/"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
MSIGDB = pd.read_csv("../Data/MSigDB/PathwayToGene.tsv", sep="\t")
HUMANNET = pd.read_csv("../Data/HumanNet/HumanNet-GSP.tsv", sep="\t")

In [2]:
# Convert stringified lists to actual lists
MSIGDB["ncbi_gene_ids"] = MSIGDB["ncbi_gene_ids"].apply(ast.literal_eval)
MSIGDB["gene_names"] = MSIGDB["gene_names"].apply(ast.literal_eval)
MSIGDB["ncbi_gene_ids"] = MSIGDB["ncbi_gene_ids"].apply(lambda genes: [str(g) for g in genes])

In [None]:
# Checking certain NCBI ids Ex: '1','2','3'
i = 0
for gene_list in MSIGDB["ncbi_gene_ids"]:
    if '3' in gene_list:
        print(MSIGDB.iloc[i])
    i += 1

pathway                CARRILLOREIXACH_HEPATOBLASTOMA_VS_NORMAL_DN
gene_names       [A1BG, A2MP1, AADAT, ABAT, ABCA13, ABCA6, ABCA...
ncbi_gene_ids    [1, 3, 51166, 18, 154664, 23460, 10350, 8647, ...
Name: 1503, dtype: object
pathway                      SAKAI_TUMOR_INFILTRATING_MONOCYTES_DN
gene_names       [A2MP1, AK6, ARIH1, CCNC, CCNG2, CCT4, CCT8, C...
ncbi_gene_ids    [3, 102157402, 25820, 892, 901, 10575, 10694, ...
Name: 4704, dtype: object


In [3]:

# Get unique genes
all_genes = sorted(
    set(gene for gene_list in MSIGDB["ncbi_gene_ids"] for gene in gene_list),
    key=int
)
# Get unique pathways
all_pathways = MSIGDB["pathway"].tolist()

In [4]:
#checkpoint to preview variables
print(all_genes)
print(all_pathways)

['1', '2', '3', '9', '10', '12', '13', '14', '15', '16', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '43', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '58', '59', '60', '69', '70', '71', '72', '81', '83', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '97', '98', '100', '101', '102', '103', '104', '105', '107', '108', '109', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '123', '124', '125', '126', '127', '128', '130', '131', '132', '133', '134', '135', '136', '137', '140', '141', '142', '143', '146', '147', '148', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '172', '173', '174', '175', '176', '177', '178', '181', '182', '183', '185', '186', '187', '189', '190', '191', '196', '197', '199', '202', '203', '204', '205', '207', '208', '210', '211', '212', '213'

In [5]:

# Create mappings
gene_to_index = {gene: i for i, gene in enumerate(all_genes)}
pathway_to_index = {pathway: j for j, pathway in enumerate(all_pathways)}
# Define file paths
gene_to_index_path = OUTPUT_FOLDER + "gene_to_index.json"
pathway_to_index_path = OUTPUT_FOLDER + "pathway_to_index.json"

# Save gene_to_index mapping
with open(gene_to_index_path, 'w') as gene_file:
    json.dump(gene_to_index, gene_file, indent=4)

# Save drug_to_index mapping
with open(pathway_to_index_path, 'w') as pathway_file:
    json.dump(pathway_to_index, pathway_file, indent=4)
print(f"Mappings saved to {gene_to_index_path} and {pathway_to_index_path}.")


Mappings saved to ./output/MSigDB_FULL/gene_to_index.json and ./output/MSigDB_FULL/pathway_to_index.json.


In [6]:
# Calculate gene degrees in HumanNet
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
gene_to_degree = {gene: 0 for gene in genes_in_humannet}

for _, row in HUMANNET.iterrows():
    gene_to_degree[row["Gene1"]] += 1
    gene_to_degree[row["Gene2"]] += 1

In [7]:
first_key, first_value = next(iter(gene_to_degree.items()))

print(f"Key: {first_key!r} ({type(first_key).__name__}), Value: {first_value!r} ({type(first_value).__name__})")

first_key, first_value = next(iter(gene_to_index.items()))

print(f"Key: {first_key!r} ({type(first_key).__name__}), Value: {first_value!r} ({type(first_value).__name__})")

Key: 54936 (int64), Value: 48 (int)
Key: '1' (str), Value: 0 (int)


In [8]:
# Construct gene weight diagonal matrix with 0.01 for genes not in HumanNet
gene_weights = np.zeros(len(all_genes))
index_to_gene = {i: int(gene) for gene, i in gene_to_index.items()}
for index in range(len(all_genes)):
    gene = index_to_gene[index]
    if gene in gene_to_degree:
        gene_weights[index] = gene_to_degree[gene]
    else:
        gene_weights[index] = 0.01  # Assign a small weight to genes not in HumanNet
gene_weight_diag_matrix = diags(gene_weights)
save_npz(OUTPUT_FOLDER + "gene_weight_diag_matrix.npz", gene_weight_diag_matrix)
print(f"Gene weight diagonal matrix saved as {OUTPUT_FOLDER}gene_weight_diag_matrix.npz'.")

Gene weight diagonal matrix saved as ./output/MSigDB_FULL/gene_weight_diag_matrix.npz'.


In [9]:
print(np.sum(gene_weights != 0.01), "out of", len(gene_weights), "genes have non-default weights (aka are found in HumanNet).")

8743 out of 21981 genes have non-default weights (aka are found in HumanNet).


In [10]:
# Initialize DOK matrix
binary_incidence_matrix = dok_matrix((len(all_genes), len(all_pathways)), dtype=np.int8)
weighted_incidence_matrix = dok_matrix((len(all_genes), len(all_pathways)), dtype=float)

# Initialize degree diagonal matrix
hypernode_degree = np.zeros(len(all_genes))
hyperedge_degree = np.zeros(len(all_pathways))
hyperedge_degree_weightless = np.zeros(len(all_pathways))

# Initialize MSIGDB ncbi_to_gene
ncbi_to_gene = {}

#outsider = set()
# Populate the matrix
for _, row in MSIGDB.iterrows():
    j = pathway_to_index[row["pathway"]] # Column index for pathway
    index_in_pathway_list = 0
    for gene in row["ncbi_gene_ids"]:
        ncbi_to_gene[gene] = row["gene_names"][index_in_pathway_list]  # Map ncbi_gene_id to gene_name
        i = gene_to_index[gene]  # Row index for gene
        gene_degree = gene_to_degree.get(int(gene), 0.01) # Get current gene degree, default to 0.01 if not found
        # if (gene_degree == 0.01):
        #     outsider.add(gene)
        hypernode_degree[i] += 1  # Increment degree of the hypernode
        hyperedge_degree[j] += gene_degree  # Increment degree of the hyperedge
        hyperedge_degree_weightless[j] += 1

        binary_incidence_matrix[i, j] = 1  # Binary presence
        weighted_incidence_matrix[i, j] = gene_degree

        index_in_pathway_list += 1

# print(len(outsider), "genes in MSigDB are not found in HumanNet.")

# Save ncbi_to_gene mapping
ncbi_to_gene_path = OUTPUT_FOLDER + "ncbi_to_gene.json"
with open(ncbi_to_gene_path, 'w') as ncbi_file:
    json.dump(ncbi_to_gene, ncbi_file, indent=4)
print(f"NCBI to gene mapping saved to {ncbi_to_gene_path}")

# Build inverse diagonal degree matrix
diag_node_degree_matrix = diags(hypernode_degree)
inverse_hypernode_degree = np.reciprocal(hypernode_degree, where=hypernode_degree!=0)
inverse_diag_node_degree_matrix = diags(inverse_hypernode_degree)

inverse_hyperedge_degree = np.reciprocal(hyperedge_degree, where=hyperedge_degree!=0)
inverse_diag_edge_degree_matrix = diags(inverse_hyperedge_degree)

inverse_hyperedge_degrees_weightless = np.reciprocal(hyperedge_degree_weightless, where=hyperedge_degree_weightless!=0)
inverse_diag_edge_degree_weightless_matrix = diags(inverse_hyperedge_degrees_weightless)

# Convert the DOK matrix to CSR format
csr_matrix = weighted_incidence_matrix.tocsr()
binary_csr_matrix = binary_incidence_matrix.tocsr()

# Save the matrix as .npz file
save_npz(OUTPUT_FOLDER + "hypergraph_incidence_matrix_weighted.npz", csr_matrix)
save_npz(OUTPUT_FOLDER + "hypergraph_incidence_matrix_binary.npz", binary_csr_matrix)
save_npz(OUTPUT_FOLDER + "diag_node_degree_matrix.npz", diag_node_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_node_degree_matrix.npz", inverse_diag_node_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_edge_degree_matrix.npz", inverse_diag_edge_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_edge_degree_weightless_matrix.npz", inverse_diag_edge_degree_weightless_matrix)

# Print confirmation
print(f"Weighted incidence matrix saved as {OUTPUT_FOLDER}/hypergraph_incidence_matrix_weighted.npz'.")
print(f"Binary incidence matrix saved as {OUTPUT_FOLDER}/hypergraph_incidence_matrix_binary.npz'.")
print(f"Diagonal node degree matrix saved as {OUTPUT_FOLDER}diag_node_degree_matrix.npz'.")
print(f"Inverse diagonal node degree matrix saved as {OUTPUT_FOLDER}inverse_diag_node_degree_matrix.npz'.")
print(f"Inverse diagonal edge degree matrix saved as {OUTPUT_FOLDER}inverse_diag_edge_degree_matrix.npz'.")
print(f"Inverse diagonal edge degree weightless matrix saved as {OUTPUT_FOLDER}inverse_diag_edge_degree_weightless_matrix.npz'.")

NCBI to gene mapping saved to ./output/MSigDB_FULL/ncbi_to_gene.json
Weighted incidence matrix saved as ./output/MSigDB_FULL//hypergraph_incidence_matrix_weighted.npz'.
Binary incidence matrix saved as ./output/MSigDB_FULL//hypergraph_incidence_matrix_binary.npz'.
Diagonal node degree matrix saved as ./output/MSigDB_FULL/diag_node_degree_matrix.npz'.
Inverse diagonal node degree matrix saved as ./output/MSigDB_FULL/inverse_diag_node_degree_matrix.npz'.
Inverse diagonal edge degree matrix saved as ./output/MSigDB_FULL/inverse_diag_edge_degree_matrix.npz'.
Inverse diagonal edge degree weightless matrix saved as ./output/MSigDB_FULL/inverse_diag_edge_degree_weightless_matrix.npz'.
