In [17]:
OUTPUT_FOLDER = "./output/DGIDB_BIPOLAR/"
# Leave blank for the all drugs
DISEASE = "BIPOLAR"
SNOMED_DISEASE_CODES = [13746004] #choose the corresponding SNOMED id for the disease


In [2]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix, save_npz, diags
import os
import json
DGIDB = pd.read_csv("../Data/DGIDB/DrugToGene.tsv", sep="\t")
HUMANNET = pd.read_csv("../Data/HumanNet/HumanNet-GSP.tsv", sep="\t")
DDDB = pd.read_csv("../Data/DDDB/DrugToDisease_DGIDB_naming.tsv", sep="\t")
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [3]:
#Any data preview happens here
# DGIDB.head(15)
# HUMANNET.head(10) 

In [4]:
#extracting the drugs related to that specific disease
specific_disease_drugs = DDDB.loc[DDDB['SNOMED'].isin(SNOMED_DISEASE_CODES), 'ndfrt_preferred_label'].dropna().unique().tolist()
print(specific_disease_drugs)

['CLONAZEPAM', 'BUPROPION HYDROCHLORIDE', 'SERTRALINE HYDROCHLORIDE', 'OLANZAPINE', 'ZIPRASIDONE', 'QUETIAPINE FUMARATE', 'CHLORPROMAZINE', 'CARBAMAZEPINE', 'ALLOPURINOL', 'PERPHENAZINE', 'VALPROIC ACID', 'CLOZAPINE', 'LITHIUM', 'RISPERIDONE', 'LAMOTRIGINE', 'ARIPIPRAZOLE LAUROXIL']


In [5]:
# Filter only the relevant genes
if specific_disease_drugs:
    relevant_rows = DGIDB[DGIDB['drug_name'].isin(specific_disease_drugs)].copy()
else:
    relevant_rows = DGIDB.copy()

In [6]:
relevant_rows

Unnamed: 0,gene_claim_name,gene_concept_id,gene_name,interaction_source_db_name,interaction_source_db_version,interaction_type,interaction_score,drug_claim_name,drug_concept_id,drug_name,approved,immunotherapy,anti_neoplastic,ncbi_gene_id
74,NCBIGENE:926,hgnc:1707,CD8B,GuideToPharmacology,2024.1,inhibitor,0.083478,IUPHAR.LIGAND:7135,rxcui:203204,BUPROPION HYDROCHLORIDE,True,False,False,926
207,TNIK,hgnc:30765,TNIK,PharmGKB,4/5/24,,0.391848,risperidone,rxcui:35636,RISPERIDONE,True,False,False,23043
404,DRD2,hgnc:3023,DRD2,DTC,9/2/20,,0.014977,CLOZAPINE,rxcui:2626,CLOZAPINE,True,False,False,1813
430,FAM178B,hgnc:28036,FAM178B,PharmGKB,4/5/24,,2.500365,lithium,rxcui:6448,LITHIUM,True,False,False,51252
497,NCBIGENE:262,hgnc:457,AMD1,GuideToPharmacology,2024.1,inhibitor,0.016765,IUPHAR.LIGAND:50,rxcui:221153,QUETIAPINE FUMARATE,True,False,False,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88064,HTT,hgnc:4851,HTT,PharmGKB,4/5/24,,0.010380,risperidone,rxcui:35636,RISPERIDONE,True,False,False,3064
88065,TJP1,hgnc:11827,TJP1,PharmGKB,4/5/24,,0.391848,risperidone,rxcui:35636,RISPERIDONE,True,False,False,7082
88066,PPA2,hgnc:28883,PPA2,PharmGKB,4/5/24,,1.567393,risperidone,rxcui:35636,RISPERIDONE,True,False,False,27068
88263,NCBIGENE:3,hgnc:8,A2MP1,GuideToPharmacology,2024.1,agonist,0.017364,IUPHAR.LIGAND:50,rxcui:221153,QUETIAPINE FUMARATE,True,False,False,3


In [7]:
# Create mappings for vertices and hyperedges
relevant_rows['ncbi_gene_id'] = relevant_rows['ncbi_gene_id'].astype(str)
genes = relevant_rows['ncbi_gene_id'].unique()
drugs = relevant_rows['drug_name'].unique()
gene_to_index = {gene: i for i, gene in enumerate(genes)}
drug_to_index = {drug: i for i, drug in enumerate(drugs)}
# Define file paths
gene_to_index_path = OUTPUT_FOLDER + f"gene_to_index_{DISEASE}.json"
drug_to_index_path = OUTPUT_FOLDER + f"drug_to_index_{DISEASE}.json"

# Save gene_to_index mapping
with open(gene_to_index_path, 'w') as gene_file:
    json.dump(gene_to_index, gene_file, indent=4)

# Save drug_to_index mapping
with open(drug_to_index_path, 'w') as drug_file:
    json.dump(drug_to_index, drug_file, indent=4)

print(f"Mappings saved to {gene_to_index_path} and {drug_to_index_path}.")

Mappings saved to ./output/DGIDB_BIPOLAR/gene_to_index_BIPOLAR.json and ./output/DGIDB_BIPOLAR/drug_to_index_BIPOLAR.json.


In [8]:
print(drug_to_index)
print(gene_to_index)
print("Number of relevant drugs: " + str(len(drugs)))
print("Number of relevant genes: " + str(len(genes)))

{'BUPROPION HYDROCHLORIDE': 0, 'RISPERIDONE': 1, 'CLOZAPINE': 2, 'LITHIUM': 3, 'QUETIAPINE FUMARATE': 4, 'ALLOPURINOL': 5, 'OLANZAPINE': 6, 'PERPHENAZINE': 7, 'CHLORPROMAZINE': 8, 'SERTRALINE HYDROCHLORIDE': 9, 'LAMOTRIGINE': 10, 'VALPROIC ACID': 11, 'ZIPRASIDONE': 12, 'CARBAMAZEPINE': 13, 'ARIPIPRAZOLE LAUROXIL': 14, 'CLONAZEPAM': 15}
{'926': 0, '23043': 1, '1813': 2, '51252': 3, '262': 4, '360158': 5, '3356': 6, '1544': 7, '2913': 8, '1559': 9, '6714': 10, '10369': 11, '64816': 12, '9429': 13, '40': 14, '25970': 15, '9135': 16, '7905': 17, '54658': 18, '1543': 19, '7366': 20, '8841': 21, '55869': 22, '9734': 23, '1129': 24, '6323': 25, '11280': 26, '7498': 27, '12': 28, '147': 29, '3107': 30, '1565': 31, '1': 32, '22854': 33, '3': 34, '6331': 35, '54576': 36, '2952': 37, '3417': 38, '1385': 39, '107': 40, '119679': 41, '6326': 42, '5460': 43, '79718': 44, '54600': 45, '718': 46, '7365': 47, '4363': 48, '3127': 49, '2904': 50, '1562': 51, '2908': 52, '3066': 53, '2561': 54, '2555': 55

In [9]:
# Calculate gene degrees in HumanNet
genes_in_humannet = pd.unique(HUMANNET[['Gene1', 'Gene2']].values.ravel())
gene_to_degree = {gene: 0 for gene in genes_in_humannet}

for _, row in HUMANNET.iterrows():
    gene_to_degree[row["Gene1"]] += 1
    gene_to_degree[row["Gene2"]] += 1

gene_to_degree = {str(gene): degree for gene, degree in gene_to_degree.items()}

In [10]:
gene_to_degree['1544']

50

In [31]:
# Construct gene weight diagonal matrix with 0.01 for genes not in HumanNet
gene_weights = np.zeros(len(genes))
index_to_gene = {i: gene for gene, i in gene_to_index.items()}
for index in range(len(genes)):
    gene = index_to_gene[index]
    if gene in gene_to_degree:
        print("FOUND: " + gene)
        gene_weights[index] = gene_to_degree[gene]
    else:
        gene_weights[index] = 0.01  # Assign a small weight to genes not in HumanNet
diag_gene_weight_matrix = diags(gene_weights,dtype = np.float32)
save_npz(OUTPUT_FOLDER + "diag_gene_weight_matrix.npz", diag_gene_weight_matrix)
print(f"Gene weight diagonal matrix saved as {OUTPUT_FOLDER} + diag_gene_weight_matrix.npz")

FOUND: 23043
FOUND: 1813
FOUND: 262
FOUND: 3356
FOUND: 1544
FOUND: 1559
FOUND: 6714
FOUND: 9429
FOUND: 40
FOUND: 9135
FOUND: 54658
FOUND: 1543
FOUND: 7366
FOUND: 8841
FOUND: 55869
FOUND: 9734
FOUND: 1129
FOUND: 6323
FOUND: 7498
FOUND: 147
FOUND: 3107
FOUND: 1565
FOUND: 6331
FOUND: 54576
FOUND: 3417
FOUND: 1385
FOUND: 5460
FOUND: 79718
FOUND: 54600
FOUND: 718
FOUND: 4363
FOUND: 2904
FOUND: 1562
FOUND: 2908
FOUND: 3066
FOUND: 2561
FOUND: 2555
FOUND: 3350
FOUND: 3352
FOUND: 3362
FOUND: 150
FOUND: 367
FOUND: 7157
FOUND: 2740
FOUND: 1814
FOUND: 2
FOUND: 2668
FOUND: 4524
FOUND: 1557
FOUND: 2548
FOUND: 4908
FOUND: 5617
FOUND: 4907
FOUND: 613
FOUND: 4781
FOUND: 54659
FOUND: 51564
FOUND: 1812
FOUND: 1815
FOUND: 54578
FOUND: 2895
FOUND: 2917
FOUND: 3358
FOUND: 24
FOUND: 215
FOUND: 217
FOUND: 3123
FOUND: 1576
FOUND: 151
FOUND: 7166
FOUND: 2944
FOUND: 3105
FOUND: 54657
FOUND: 5311
FOUND: 3354
FOUND: 148
FOUND: 265
FOUND: 2646
FOUND: 3065
FOUND: 2166
FOUND: 64388
FOUND: 26
FOUND: 2562
FOUND: 3351
F

In [12]:
gene_weights

array([1.00e-02, 2.09e+02, 4.70e+01, 1.00e-02, 1.00e+01, 1.00e-02,
       8.80e+01, 5.00e+01, 1.00e-02, 4.90e+01, 4.58e+02, 1.00e-02,
       1.00e-02, 2.50e+01, 4.00e+00, 1.00e-02, 1.20e+01, 1.00e-02,
       2.90e+01, 7.80e+01, 2.70e+01, 1.36e+02, 1.05e+02, 1.00e+02,
       1.00e+00, 1.20e+01, 1.00e-02, 1.66e+02, 1.00e-02, 5.90e+01,
       2.00e+00, 5.20e+01, 1.00e-02, 1.00e-02, 1.00e-02, 1.17e+02,
       3.30e+01, 1.00e-02, 5.00e+00, 1.30e+01, 1.00e-02, 1.00e-02,
       1.00e-02, 5.30e+01, 5.50e+01, 3.70e+01, 3.20e+01, 1.00e-02,
       1.40e+01, 1.00e-02, 1.00e+01, 1.00e+01, 5.50e+01, 7.90e+01,
       7.00e+00, 2.60e+01, 1.00e-02, 1.00e-02, 7.40e+01, 1.80e+01,
       2.30e+01, 2.54e+02, 1.06e+02, 5.05e+02, 1.00e-02, 8.20e+01,
       1.52e+02, 1.00e+00, 1.00e-02, 7.80e+01, 2.20e+01, 4.30e+01,
       4.00e+01, 1.87e+02, 5.90e+01, 3.70e+01, 1.00e-02, 6.80e+01,
       4.10e+01, 1.00e-02, 2.90e+01, 6.80e+01, 1.00e-02, 7.30e+01,
       7.00e+01, 3.40e+01, 7.00e+00, 5.00e+01, 4.70e+01, 7.00e

In [13]:
# Add degrees to DGIDB with fallback to 0.01 for missing genes
relevant_rows['degree'] = relevant_rows['ncbi_gene_id'].map(gene_to_degree).fillna(0.01)

In [14]:
relevant_rows

Unnamed: 0,gene_claim_name,gene_concept_id,gene_name,interaction_source_db_name,interaction_source_db_version,interaction_type,interaction_score,drug_claim_name,drug_concept_id,drug_name,approved,immunotherapy,anti_neoplastic,ncbi_gene_id,degree
74,NCBIGENE:926,hgnc:1707,CD8B,GuideToPharmacology,2024.1,inhibitor,0.083478,IUPHAR.LIGAND:7135,rxcui:203204,BUPROPION HYDROCHLORIDE,True,False,False,926,0.01
207,TNIK,hgnc:30765,TNIK,PharmGKB,4/5/24,,0.391848,risperidone,rxcui:35636,RISPERIDONE,True,False,False,23043,209.00
404,DRD2,hgnc:3023,DRD2,DTC,9/2/20,,0.014977,CLOZAPINE,rxcui:2626,CLOZAPINE,True,False,False,1813,47.00
430,FAM178B,hgnc:28036,FAM178B,PharmGKB,4/5/24,,2.500365,lithium,rxcui:6448,LITHIUM,True,False,False,51252,0.01
497,NCBIGENE:262,hgnc:457,AMD1,GuideToPharmacology,2024.1,inhibitor,0.016765,IUPHAR.LIGAND:50,rxcui:221153,QUETIAPINE FUMARATE,True,False,False,262,10.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88064,HTT,hgnc:4851,HTT,PharmGKB,4/5/24,,0.010380,risperidone,rxcui:35636,RISPERIDONE,True,False,False,3064,155.00
88065,TJP1,hgnc:11827,TJP1,PharmGKB,4/5/24,,0.391848,risperidone,rxcui:35636,RISPERIDONE,True,False,False,7082,61.00
88066,PPA2,hgnc:28883,PPA2,PharmGKB,4/5/24,,1.567393,risperidone,rxcui:35636,RISPERIDONE,True,False,False,27068,18.00
88263,NCBIGENE:3,hgnc:8,A2MP1,GuideToPharmacology,2024.1,agonist,0.017364,IUPHAR.LIGAND:50,rxcui:221153,QUETIAPINE FUMARATE,True,False,False,3,0.01


In [15]:
num_of_irre_degree = len(relevant_rows[relevant_rows['degree'] == 0.01])
num_relevant_rows_entries = len(relevant_rows)
print(f"Number of terms with filled degree (0.01): {num_of_irre_degree}")
print(f"Percentage of terms with filled degree (0.01): {num_of_irre_degree / num_relevant_rows_entries}")

Number of terms with filled degree (0.01): 213
Percentage of terms with filled degree (0.01): 0.2642679900744417


In [16]:
print(len(genes), len(drugs))

359 16


In [18]:
# Initialize a sparse incidence matrix
incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=np.float32)
binary_incidence_matrix = dok_matrix((len(genes), len(drugs)), dtype=int)

# Initialize degree diagonal matrix
hypernode_degree = np.zeros(len(genes))
hyperedge_degree = np.zeros(len(drugs))
hyperedge_degree_weightless = np.zeros(len(drugs))

In [19]:
relevant_rows.columns

Index(['gene_claim_name', 'gene_concept_id', 'gene_name',
       'interaction_source_db_name', 'interaction_source_db_version',
       'interaction_type', 'interaction_score', 'drug_claim_name',
       'drug_concept_id', 'drug_name', 'approved', 'immunotherapy',
       'anti_neoplastic', 'ncbi_gene_id', 'degree'],
      dtype='object')

In [20]:
# Populate the matrices by processing the relevant rows in DGIDB
i = 0
repeated_rows = []
for _, row in relevant_rows.iterrows():
    gene_idx = gene_to_index[row['ncbi_gene_id']]
    drug_idx = drug_to_index[row['drug_name']]
    
    if (incidence_matrix[gene_idx, drug_idx] != 0):
        repeated_rows.append((row['ncbi_gene_id'], row['drug_name'],i))
    else:
        hypernode_degree[gene_idx] += 1
        hyperedge_degree[drug_idx] += row['degree']
        hyperedge_degree_weightless[drug_idx] += 1
        incidence_matrix[gene_idx, drug_idx] = row['degree']
        binary_incidence_matrix[gene_idx, drug_idx] = 1
    i += 1

In [21]:
# Sanity Cheeeeeeeeeeeeeeeeck
print(binary_incidence_matrix.shape)
print(binary_incidence_matrix.nnz)
print(len(repeated_rows))
print(len(relevant_rows), "(Should be the sum of the two numbers above)")

(359, 16)
632
174
806 (Should be the sum of the two numbers above)


In [22]:
len(hypernode_degree)

359

In [23]:



# # Show nonzero rows sum
# row_sums = np.sum(incidence_matrix.T, axis=1)
# nonzero_row_sums = row_sums[row_sums != 0]
# print(nonzero_row_sums)
# print(hyperedge_degree[hyperedge_degree.nonzero()])

# Build inverse diagonal degree matrix
diag_node_degree_matrix = diags(hypernode_degree,dtype = np.float32)
inverse_hypernode_degrees = np.reciprocal(hypernode_degree, where=hypernode_degree!=0,dtype = np.float32)
inverse_diag_node_degree_matrix = diags(inverse_hypernode_degrees,dtype = np.float32)

inverse_hyperedge_degrees = np.reciprocal(hyperedge_degree, where=hyperedge_degree!=0,dtype = np.float32)
inverse_diag_edge_degree_matrix = diags(inverse_hyperedge_degrees,dtype = np.float32)

inverse_hyperedge_degrees_weightless = np.reciprocal(hyperedge_degree_weightless, where=hyperedge_degree_weightless!=0,dtype = np.float32)
inverse_diag_edge_degree_weightless_matrix = diags(inverse_hyperedge_degrees_weightless,dtype = np.float32)

# Convert the DOK matrix to CSR format
incidence_matrix = incidence_matrix.tocsr()
binary_incidence_matrix = binary_incidence_matrix.tocsr()


# Save the matrix as .npz file
save_npz(OUTPUT_FOLDER + "hypergraph_incidence_matrix_weighted.npz", incidence_matrix)
save_npz(OUTPUT_FOLDER + "hypergraph_incidence_matrix_binary.npz", binary_incidence_matrix)
save_npz(OUTPUT_FOLDER + "diag_node_degree_matrix.npz", diag_node_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_node_degree_matrix.npz", inverse_diag_node_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_edge_degree_matrix.npz", inverse_diag_edge_degree_matrix)
save_npz(OUTPUT_FOLDER + "inverse_diag_edge_degree_weightless_matrix.npz", inverse_diag_edge_degree_weightless_matrix)
# Print confirmation
print(f"Weighted incidence matrix saved as {OUTPUT_FOLDER}hypergraph_incidence_matrix_weighted.npz'.")
print(f"Binary incidence matrix saved as {OUTPUT_FOLDER}hypergraph_incidence_matrix_binary.npz'.")
print(f"Diagonal node degree matrix saved as {OUTPUT_FOLDER}inverse_diag_node_degree_matrix.npz'.")
print(f"Inverse diagonal node degree matrix saved as {OUTPUT_FOLDER}inverse_diag_node_degree_matrix.npz'.")
print(f"Inverse diagonal edge degree matrix saved as {OUTPUT_FOLDER}inverse_diag_edge_degree_matrix.npz'.")
print(f"Inverse diagonal edge degree weightless matrix saved as {OUTPUT_FOLDER}inverse_diag_edge_degree_weightless_matrix.npz'.")


Weighted incidence matrix saved as ./output/DGIDB_BIPOLAR/hypergraph_incidence_matrix_weighted.npz'.
Binary incidence matrix saved as ./output/DGIDB_BIPOLAR/hypergraph_incidence_matrix_binary.npz'.
Diagonal node degree matrix saved as ./output/DGIDB_BIPOLAR/inverse_diag_node_degree_matrix.npz'.
Inverse diagonal node degree matrix saved as ./output/DGIDB_BIPOLAR/inverse_diag_node_degree_matrix.npz'.
Inverse diagonal edge degree matrix saved as ./output/DGIDB_BIPOLAR/inverse_diag_edge_degree_matrix.npz'.
Inverse diagonal edge degree weightless matrix saved as ./output/DGIDB_BIPOLAR/inverse_diag_edge_degree_weightless_matrix.npz'.


In [24]:
print(incidence_matrix)
print(binary_incidence_matrix)
print(diag_node_degree_matrix)
print(inverse_diag_node_degree_matrix)
print(inverse_diag_edge_degree_matrix)
print(inverse_diag_edge_degree_weightless_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 632 stored elements and shape (359, 16)>
  Coords	Values
  (0, 0)	0.009999999776482582
  (0, 4)	0.009999999776482582
  (1, 1)	209.0
  (2, 2)	47.0
  (2, 7)	47.0
  (2, 4)	47.0
  (2, 1)	47.0
  (2, 6)	47.0
  (2, 8)	47.0
  (2, 12)	47.0
  (2, 14)	47.0
  (2, 3)	47.0
  (3, 3)	0.009999999776482582
  (4, 4)	10.0
  (4, 2)	10.0
  (4, 12)	10.0
  (4, 14)	10.0
  (4, 7)	10.0
  (4, 1)	10.0
  (4, 8)	10.0
  (4, 6)	10.0
  (5, 5)	0.009999999776482582
  (5, 13)	0.009999999776482582
  (6, 6)	88.0
  (6, 2)	88.0
  :	:
  (335, 1)	0.009999999776482582
  (336, 13)	0.009999999776482582
  (337, 11)	32.0
  (338, 4)	0.009999999776482582
  (339, 7)	54.0
  (340, 13)	72.0
  (340, 11)	72.0
  (341, 1)	0.009999999776482582
  (342, 1)	0.009999999776482582
  (343, 1)	69.0
  (344, 1)	113.0
  (345, 11)	42.0
  (346, 11)	413.0
  (347, 13)	45.0
  (348, 4)	115.0
  (349, 4)	80.0
  (350, 1)	244.0
  (351, 0)	0.009999999776482582
  (352, 11)	138.0
  (353, 7)	249.0
  (354, 1

In [25]:
print(incidence_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 632 stored elements and shape (359, 16)>
  Coords	Values
  (0, 0)	0.009999999776482582
  (0, 4)	0.009999999776482582
  (1, 1)	209.0
  (2, 2)	47.0
  (2, 7)	47.0
  (2, 4)	47.0
  (2, 1)	47.0
  (2, 6)	47.0
  (2, 8)	47.0
  (2, 12)	47.0
  (2, 14)	47.0
  (2, 3)	47.0
  (3, 3)	0.009999999776482582
  (4, 4)	10.0
  (4, 2)	10.0
  (4, 12)	10.0
  (4, 14)	10.0
  (4, 7)	10.0
  (4, 1)	10.0
  (4, 8)	10.0
  (4, 6)	10.0
  (5, 5)	0.009999999776482582
  (5, 13)	0.009999999776482582
  (6, 6)	88.0
  (6, 2)	88.0
  :	:
  (335, 1)	0.009999999776482582
  (336, 13)	0.009999999776482582
  (337, 11)	32.0
  (338, 4)	0.009999999776482582
  (339, 7)	54.0
  (340, 13)	72.0
  (340, 11)	72.0
  (341, 1)	0.009999999776482582
  (342, 1)	0.009999999776482582
  (343, 1)	69.0
  (344, 1)	113.0
  (345, 11)	42.0
  (346, 11)	413.0
  (347, 13)	45.0
  (348, 4)	115.0
  (349, 4)	80.0
  (350, 1)	244.0
  (351, 0)	0.009999999776482582
  (352, 11)	138.0
  (353, 7)	249.0
  (354, 1

In [26]:
print(np.sum(gene_weights != 0.01), "out of", len(gene_weights), "genes have non-default weights (aka are found in HumanNet).")

246 out of 359 genes have non-default weights (aka are found in HumanNet).


In [27]:
row_sums = np.sum(np.abs(binary_incidence_matrix), axis=1)

# Indices of zero rows
zero_row_indices = np.where(row_sums == 0)[0]

# Count
num_zero_rows = len(zero_row_indices)

print("Zero row indices:", zero_row_indices)
print("Number of zero rows:", num_zero_rows)

Zero row indices: []
Number of zero rows: 0


In [29]:
for drug in specific_disease_drugs:
    if drug in drug_to_index:
        idx = drug_to_index[drug]
        print(f"Drug: {drug}, Index: {idx}")
    else:
        print(f"Drug: {drug} not found in drug_to_index.")


Drug: CLONAZEPAM, Index: 15
Drug: BUPROPION HYDROCHLORIDE, Index: 0
Drug: SERTRALINE HYDROCHLORIDE, Index: 9
Drug: OLANZAPINE, Index: 6
Drug: ZIPRASIDONE, Index: 12
Drug: QUETIAPINE FUMARATE, Index: 4
Drug: CHLORPROMAZINE, Index: 8
Drug: CARBAMAZEPINE, Index: 13
Drug: ALLOPURINOL, Index: 5
Drug: PERPHENAZINE, Index: 7
Drug: VALPROIC ACID, Index: 11
Drug: CLOZAPINE, Index: 2
Drug: LITHIUM, Index: 3
Drug: RISPERIDONE, Index: 1
Drug: LAMOTRIGINE, Index: 10
Drug: ARIPIPRAZOLE LAUROXIL, Index: 14


In [None]:
# # Compute gene-gene adjacency matrix by projecting via shared drugs
# adj_matrix = adj_matrix = binary_csr_matrix @ binary_csr_matrix.T  # Matrix multiplication: shared drugs
# adj_matrix.setdiag(0)
# adj_matrix.eliminate_zeros()

# # --- Step 2: Extract Edgelist from Upper Triangle Only ---
# # Use sparse coo_matrix to iterate efficiently
# from scipy.sparse import triu

# adj_matrix_upper = triu(adj_matrix, k=1)  # upper triangle, no diag
# adj_coo = adj_matrix_upper.tocoo()

# # Optional: if you have gene names
# # gene_names = ['TP53', 'EGFR', 'BRCA1', ...]
# # Otherwise use indices as names

# edges = []
# for i, j, v in zip(adj_coo.row, adj_coo.col, adj_coo.data):
#     edges.append((i, j, v))  # replace i/j with gene_names[i] if available

# # Convert to DataFrame and save
# edge_df = pd.DataFrame(edges, columns=["Gene1", "Gene2", "Weight"])

# # If you have gene names, map them:
# # edge_df["Gene1"] = edge_df["Gene1"].map(lambda i: gene_names[i])
# # edge_df["Gene2"] = edge_df["Gene2"].map(lambda i: gene_names[i])

# edge_df.to_csv("gene_gene_edgelist.csv", index=False)

In [None]:
# import pandas as pd

# if 'NCBI_INFO' not in globals():
#     print("Reading gene2refseq.gz...")
#     NCBI_INFO = pd.read_csv("../Data/ncbi/gene2refseq.gz", sep='\t', compression='gzip')
# else:
#     print("NCBI_INFO already loaded.")

In [None]:
# index_to_ncbi = {idx: gene for gene, idx in gene_to_index.items()}
# human_gene2refseq = NCBI_INFO[NCBI_INFO['#tax_id'] == 9606]
# id_to_gene_claim = pd.Series(human_gene2refseq.Symbol.values, index=human_gene2refseq.GeneID).to_dict()

# # Your existing function to get common gene name from ncbi gene id
# def get_gene_claim_name(ncbi_gene_id):
#     try:
#         ncbi_gene_id = int(ncbi_gene_id)
#         result = id_to_gene_claim.get(ncbi_gene_id, None)
#         return result if result else "Gene name not found"
#     except:
#         return "Gene name not found"

In [None]:
# # Step 1: Map index → NCBI gene ID
# edge_df['Gene1_ncbi'] = edge_df['Gene1'].map(index_to_ncbi)
# edge_df['Gene2_ncbi'] = edge_df['Gene2'].map(index_to_ncbi)

# # Step 2: Map NCBI gene ID → gene symbol
# edge_df['Gene1'] = edge_df['Gene1_ncbi'].apply(get_gene_claim_name)
# edge_df['Gene2'] = edge_df['Gene2_ncbi'].apply(get_gene_claim_name)

# # Step 3: Drop temp NCBI ID columns
# edge_df = edge_df.drop(columns=['Gene1_ncbi', 'Gene2_ncbi'])

# # Optional: Save to CSV
# edge_df.to_csv('gene_gene_edgelist_named.csv', index=False)
