In [None]:
# Directory that the artifacts produced by this notebook are written to.
# File names are appended to it by plain string concatenation, so keep the trailing "/".
OUTPUT_FOLDER = "./output/DGIDB_BIPOLAR/"
# Label embedded in the file names of the saved gene/drug index JSON mappings.
DISEASE = "BIPOLAR"
# SNOMED ids used to select the disease's drugs from the DDDB table.
# When no DDDB row matches them (e.g. the list is left empty), all DGIDB drugs are used.
SNOMED_DISEASE_CODES = [13746004] # SNOMED id(s) of the disease of interest


In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, save_npz, diags
import os
import json
def _read_tsv(path):
    """Read a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep="\t")


# Source tables, all tab-separated.
DGIDB = _read_tsv("../Data/DGIDB/DrugToGene.tsv")
HUMANNET = _read_tsv("../Data/HumanNet/HumanNet-GSP.tsv")
DDDB = _read_tsv("../Data/DDDB/DrugToDisease_DGIDB_naming.tsv")

# Create the output directory; an already existing one is accepted.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [None]:
#Any data preview happens here
# DGIDB.head(15)
# HUMANNET.head(10) 

In [None]:
# Distinct, non-missing drug labels of the DDDB rows whose SNOMED code is one of the selected codes.
disease_mask = DDDB['SNOMED'].isin(SNOMED_DISEASE_CODES)
disease_drug_labels = DDDB.loc[disease_mask, 'ndfrt_preferred_label']
specific_disease_drugs = disease_drug_labels.dropna().unique().tolist()
print(specific_disease_drugs)

In [None]:
# Keep only the DGIDB interactions of the selected drugs;
# when no drug was selected, work on a copy of the whole table.
if not specific_disease_drugs:
    relevant_rows = DGIDB.copy()
else:
    is_disease_drug = DGIDB['drug_name'].isin(specific_disease_drugs)
    relevant_rows = DGIDB[is_disease_drug].copy()

In [None]:
# Display the drug-gene interaction rows kept by the filter above.
relevant_rows

In [None]:
# Gene ids are handled as strings from here on (they are also the JSON keys).
relevant_rows['ncbi_gene_id'] = relevant_rows['ncbi_gene_id'].astype(str)

# Number the distinct genes (vertices) and drugs (hyperedges) consecutively from 0.
genes = relevant_rows['ncbi_gene_id'].unique()
drugs = relevant_rows['drug_name'].unique()
gene_to_index = dict(zip(genes, range(len(genes))))
drug_to_index = dict(zip(drugs, range(len(drugs))))

# Write both lookups as JSON files into the output folder.
gene_to_index_path = OUTPUT_FOLDER + f"gene_to_index_{DISEASE}.json"
drug_to_index_path = OUTPUT_FOLDER + f"drug_to_index_{DISEASE}.json"
for mapping_path, mapping in (
    (gene_to_index_path, gene_to_index),
    (drug_to_index_path, drug_to_index),
):
    with open(mapping_path, 'w') as mapping_file:
        json.dump(mapping, mapping_file, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")

In [None]:
# Number of distinct genes that received an index.
len(gene_to_index)

In [None]:
# Show both index lookups, followed by the number of drugs and genes they cover.
for index_lookup in (drug_to_index, gene_to_index):
    print(index_lookup)
print(f"Number of relevant drugs: {len(drugs)}")
print(f"Number of relevant genes: {len(genes)}")

In [None]:
# Calculate gene degrees in HumanNet.
# Every row of HUMANNET is one edge, so a gene's degree is the number of times it
# shows up in either endpoint column. Counting the stacked columns with
# value_counts() replaces the former row-by-row iterrows() loop, which is very slow
# on a large edge list, and yields the same counts (a self-loop row still adds 2).
# Missing endpoint values are simply not counted.
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
humannet_endpoints = pd.concat([HUMANNET["Gene1"], HUMANNET["Gene2"]], ignore_index=True)

# Keys are stringified so they can be matched against the string gene ids of DGIDB.
gene_to_degree = {
    str(gene): int(count)
    for gene, count in humannet_endpoints.value_counts().items()
}

In [None]:
# Spot check: HumanNet degree of gene 1544. .get() keeps a top-to-bottom run going
# (displaying nothing) when that gene is absent, instead of stopping on a KeyError.
gene_to_degree.get('1544')

In [None]:
# Construct the diagonal gene weight matrix: a gene's weight is its HumanNet degree,
# or a small default weight when the gene does not occur in HumanNet.
DEFAULT_GENE_WEIGHT = 0.01
index_to_gene = {i: gene for gene, i in gene_to_index.items()}
genes_by_index = [index_to_gene[index] for index in range(len(genes))]
gene_weights = np.array(
    [gene_to_degree.get(gene, DEFAULT_GENE_WEIGHT) for gene in genes_by_index],
    dtype=np.float64,
)

# One summary line instead of one "FOUND" line per gene.
num_genes_found = sum(gene in gene_to_degree for gene in genes_by_index)
print(f"{num_genes_found} of {len(genes_by_index)} genes found in HumanNet.")

diag_gene_weight_matrix = diags(gene_weights, dtype=np.float32)
diag_gene_weight_matrix_path = OUTPUT_FOLDER + "diag_gene_weight_matrix.npz"
save_npz(diag_gene_weight_matrix_path, diag_gene_weight_matrix)
# The message now shows the real path (it used to contain a literal " + ").
print(f"Gene weight diagonal matrix saved as {diag_gene_weight_matrix_path}")

In [None]:
# Display the per-gene weight vector (ordered by gene index).
gene_weights

In [None]:
# Attach the HumanNet degree of each row's gene; genes without a degree get 0.01.
degree_by_row = relevant_rows['ncbi_gene_id'].map(gene_to_degree)
relevant_rows['degree'] = degree_by_row.fillna(0.01)

In [None]:
# Display the interaction rows again, now including the 'degree' column.
relevant_rows

In [None]:
# Report how many interaction rows carry the fallback degree of 0.01.
num_of_irre_degree = int((relevant_rows['degree'] == 0.01).sum())
num_relevant_rows_entries = len(relevant_rows)
# An empty selection must not abort the cell with a ZeroDivisionError.
fraction_filled = num_of_irre_degree / num_relevant_rows_entries if num_relevant_rows_entries else 0.0
print(f"Number of terms with filled degree (0.01): {num_of_irre_degree}")
# Formatted as a real percentage so the value matches its label.
print(f"Percentage of terms with filled degree (0.01): {fraction_filled:.2%}")

In [None]:
# Number of distinct genes and drugs; these are the dimensions of the incidence matrices.
print(len(genes), len(drugs))

In [None]:
# Empty gene x drug incidence matrices: one for weights, one for 0/1 membership.
num_genes, num_drugs = len(genes), len(drugs)
incidence_shape = (num_genes, num_drugs)
incidence_matrix = dok_matrix(incidence_shape, dtype=np.float32)
binary_incidence_matrix = dok_matrix(incidence_shape, dtype=int)

# Zeroed degree accumulators: one slot per gene (node) or per drug (hyperedge).
hypernode_degree = np.zeros(num_genes)
hyperedge_degree, hyperedge_degree_weightless = np.zeros(num_drugs), np.zeros(num_drugs)

In [None]:
# List the column names available on the interaction rows.
relevant_rows.columns

In [None]:
# Populate the matrices and degree vectors from the relevant DGIDB rows.
# Each (gene, drug) pair is counted once; any later row repeating a pair is only
# recorded in repeated_rows as (gene id, drug name, row position).
repeated_rows = []
interaction_columns = zip(
    relevant_rows['ncbi_gene_id'], relevant_rows['drug_name'], relevant_rows['degree']
)
for i, (gene_id, drug_name, degree) in enumerate(interaction_columns):
    gene_idx = gene_to_index[gene_id]
    drug_idx = drug_to_index[drug_name]

    # Repeats are detected on the 0/1 matrix rather than on the stored weight,
    # so a pair is still recognised even if its weight happens to be 0.
    if binary_incidence_matrix[gene_idx, drug_idx] != 0:
        repeated_rows.append((gene_id, drug_name, i))
        continue

    hypernode_degree[gene_idx] += 1             # number of drugs targeting the gene
    hyperedge_degree[drug_idx] += degree        # weighted size of the drug's hyperedge
    hyperedge_degree_weightless[drug_idx] += 1  # plain gene count of the drug's hyperedge
    incidence_matrix[gene_idx, drug_idx] = degree
    binary_incidence_matrix[gene_idx, drug_idx] = 1

In [None]:
# Sanity check: every interaction row either created a new matrix entry or was
# recorded as a repeat, so those two counts must add up to the number of rows.
print(binary_incidence_matrix.shape)
print(binary_incidence_matrix.nnz)
print(len(repeated_rows))
print(len(relevant_rows), "(Should be the sum of the two numbers above)")
# Enforce the invariant instead of relying on someone comparing the printed numbers.
assert binary_incidence_matrix.nnz + len(repeated_rows) == len(relevant_rows), (
    "matrix entries and repeated rows do not add up to the number of interaction rows; "
    "re-run the notebook starting from the matrix initialisation cell"
)

In [None]:
# Length of the node degree vector (one entry per gene).
len(hypernode_degree)

In [None]:



# # Show nonzero rows sum
# row_sums = np.sum(incidence_matrix.T, axis=1)
# nonzero_row_sums = row_sums[row_sums != 0]
# print(nonzero_row_sums)
# print(hyperedge_degree[hyperedge_degree.nonzero()])

def _safe_reciprocal(values):
    """Return 1/values as a float32 array, with 0 wherever values is 0."""
    values = np.asarray(values)
    # With `where=` and no `out=`, NumPy leaves the masked-out slots of the result
    # uninitialised (arbitrary memory), so a zero-filled output array is supplied.
    return np.reciprocal(
        values,
        out=np.zeros(values.shape, dtype=np.float32),
        where=values != 0,
        dtype=np.float32,
    )


# Build the diagonal degree matrix and the inverse diagonal degree matrices
diag_node_degree_matrix = diags(hypernode_degree, dtype=np.float32)
inverse_hypernode_degrees = _safe_reciprocal(hypernode_degree)
inverse_diag_node_degree_matrix = diags(inverse_hypernode_degrees, dtype=np.float32)

inverse_hyperedge_degrees = _safe_reciprocal(hyperedge_degree)
inverse_diag_edge_degree_matrix = diags(inverse_hyperedge_degrees, dtype=np.float32)

inverse_hyperedge_degrees_weightless = _safe_reciprocal(hyperedge_degree_weightless)
inverse_diag_edge_degree_weightless_matrix = diags(inverse_hyperedge_degrees_weightless, dtype=np.float32)

# Convert the DOK matrices to CSR format
incidence_matrix = incidence_matrix.tocsr()
binary_incidence_matrix = binary_incidence_matrix.tocsr()

# Save every matrix as an .npz file and confirm it. The message is built from the
# same file name that is written, so the two cannot drift apart (the old
# "Diagonal node degree matrix" message named the inverse matrix's file).
matrices_to_save = [
    ("Weighted incidence matrix", "hypergraph_incidence_matrix_weighted.npz", incidence_matrix),
    ("Binary incidence matrix", "hypergraph_incidence_matrix_binary.npz", binary_incidence_matrix),
    ("Diagonal node degree matrix", "diag_node_degree_matrix.npz", diag_node_degree_matrix),
    ("Inverse diagonal node degree matrix", "inverse_diag_node_degree_matrix.npz", inverse_diag_node_degree_matrix),
    ("Inverse diagonal edge degree matrix", "inverse_diag_edge_degree_matrix.npz", inverse_diag_edge_degree_matrix),
    ("Inverse diagonal edge degree weightless matrix", "inverse_diag_edge_degree_weightless_matrix.npz", inverse_diag_edge_degree_weightless_matrix),
]
for description, file_name, matrix in matrices_to_save:
    save_npz(OUTPUT_FOLDER + file_name, matrix)
    print(f"{description} saved as {OUTPUT_FOLDER}{file_name}.")


In [None]:
# Print each of the matrices built above, one after the other.
for built_matrix in (
    incidence_matrix,
    binary_incidence_matrix,
    diag_node_degree_matrix,
    inverse_diag_node_degree_matrix,
    inverse_diag_edge_degree_matrix,
    inverse_diag_edge_degree_weightless_matrix,
):
    print(built_matrix)

In [None]:
# Print the weighted incidence matrix on its own.
print(incidence_matrix)

In [None]:
# Count the genes whose weight differs from the 0.01 placeholder value.
num_non_default_weights = np.sum(gene_weights != 0.01)
print(num_non_default_weights, "out of", len(gene_weights), "genes have non-default weights (aka are found in HumanNet).")

In [None]:
# Find the rows of the binary incidence matrix that contain no entry at all.
row_sums = np.abs(binary_incidence_matrix).sum(axis=1)

# Row positions whose sum is zero, and how many there are.
empty_row_mask = row_sums == 0
zero_row_indices = np.where(empty_row_mask)[0]
num_zero_rows = zero_row_indices.size

print("Zero row indices:", zero_row_indices)
print("Number of zero rows:", num_zero_rows)

In [None]:
# Report the hyperedge index of every selected drug, flagging the ones without an index.
for drug_name in specific_disease_drugs:
    if drug_name not in drug_to_index:
        print(f"Drug: {drug_name} not found in drug_to_index.")
        continue
    print(f"Drug: {drug_name}, Index: {drug_to_index[drug_name]}")


In [None]:
# # Compute gene-gene adjacency matrix by projecting via shared drugs
# adj_matrix = binary_incidence_matrix @ binary_incidence_matrix.T  # Matrix multiplication: entry (i, j) counts the drugs shared by genes i and j
# adj_matrix.setdiag(0)
# adj_matrix.eliminate_zeros()

# # --- Step 2: Extract Edgelist from Upper Triangle Only ---
# # Use sparse coo_matrix to iterate efficiently
# from scipy.sparse import triu

# adj_matrix_upper = triu(adj_matrix, k=1)  # upper triangle, no diag
# adj_coo = adj_matrix_upper.tocoo()

# # Optional: if you have gene names
# # gene_names = ['TP53', 'EGFR', 'BRCA1', ...]
# # Otherwise use indices as names

# edges = []
# for i, j, v in zip(adj_coo.row, adj_coo.col, adj_coo.data):
#     edges.append((i, j, v))  # replace i/j with gene_names[i] if available

# # Convert to DataFrame and save
# edge_df = pd.DataFrame(edges, columns=["Gene1", "Gene2", "Weight"])

# # If you have gene names, map them:
# # edge_df["Gene1"] = edge_df["Gene1"].map(lambda i: gene_names[i])
# # edge_df["Gene2"] = edge_df["Gene2"].map(lambda i: gene_names[i])

# edge_df.to_csv("gene_gene_edgelist.csv", index=False)

In [None]:
# import pandas as pd

# if 'NCBI_INFO' not in globals():
#     print("Reading gene2refseq.gz...")
#     NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
# else:
#     print("NCBI_INFO already loaded.")

In [None]:
# index_to_ncbi = {idx: gene for gene, idx in gene_to_index.items()}
# human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
# id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

# # Your existing function to get common gene name from ncbi gene id
# def get_gene_claim_name(ncbi_gene_id):
#     try:
#         ncbi_gene_id = int(ncbi_gene_id)
#         result = id_to_gene_claim.get(ncbi_gene_id, None)
#         return result if result else "Gene name not found"
#     except:
#         return "Gene name not found"

In [None]:
# # Step 1: Map index → NCBI gene ID
# edge_df['Gene1_ncbi'] = edge_df['Gene1'].map(index_to_ncbi)
# edge_df['Gene2_ncbi'] = edge_df['Gene2'].map(index_to_ncbi)

# # Step 2: Map NCBI gene ID → gene symbol
# edge_df['Gene1'] = edge_df['Gene1_ncbi'].apply(get_gene_claim_name)
# edge_df['Gene2'] = edge_df['Gene2_ncbi'].apply(get_gene_claim_name)

# # Step 3: Drop temp NCBI ID columns
# edge_df = edge_df.drop(columns=['Gene1_ncbi', 'Gene2_ncbi'])

# # Optional: Save to CSV
# edge_df.to_csv('gene_gene_edgelist_named.csv', index=False)
