In [1]:
import numpy as np
import json
from scipy.sparse import load_npz, diags
import pandas as pd
import os
import RWR
DISEASE = "BIPOLAR"
HYPERGRAPH_DIRECTORY = "../../Gen_Hypergraph/output/" + FOLDER
OUTPUT_FOLDER = f"../output/{DISEASE}"
DGIDB = pd.read_csv("../../Data/DGIDB/DrugToGene.tsv", sep="\t")
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

NameError: name 'FOLDER' is not defined

In [None]:
# Load all matrices
binary_matrix = load_npz(HYPERGRAPH_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
weighted_matrix = load_npz(HYPERGRAPH_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")
inverse_diag_node_degree_matrix = load_npz(HYPERGRAPH_DIRECTORY + "inverse_diag_node_degree_matrix.npz")
inverse_diag_edge_degree_matrix = load_npz(HYPERGRAPH_DIRECTORY + "inverse_diag_edge_degree_matrix.npz")

# Get dimensions
num_genes = binary_matrix.shape[0]  # Number of genes
num_drugs = binary_matrix.shape[1]  # Number of drugs

# Initialize parameters
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 1000  # Number of iterations

# Initial uniform probability vector
v0 = np.ones(num_genes) / num_genes  



In [None]:
result = RWR.random_talk_with_restart(num_iterations,
                             restart_prob, 
                             binary_matrix, 
                             weighted_matrix, 
                             inverse_diag_node_degree_matrix, 
                             inverse_diag_edge_degree_matrix,
                             v0)
# Print results
print("Top Indices by Importance:")
for index, score in result["Importance"][:10]:
    print(f"Index {index}: {score:.6f}")

print("\nDistance per Iteration:")
for dist in result["Distance"]:
    print(f"{dist:.10f}")

Random Walk Progress:   0%|          | 0/1000 [00:00<?, ?it/s]

Random Walk Progress: 100%|██████████| 1000/1000 [00:00<00:00, 3116.95it/s]

Top Indices by Importance:
Index 190: 0.039720
Index 364: 0.033889
Index 61: 0.018546
Index 518: 0.016319
Index 293: 0.015846
Index 85: 0.014851
Index 253: 0.013967
Index 1484: 0.013806
Index 396: 0.013006
Index 698: 0.010866

Distance per Iteration:
0.4389389953
0.5789744490
0.3038291662
0.0946812351
0.0332201292
0.0158157424
0.0093140036
0.0060335171
0.0040968423
0.0028586745
0.0020197490
0.0014406983
0.0010364260
0.0007534771
0.0005574894
0.0004158993
0.0003123532
0.0002359642
0.0001803823
0.0001388910
0.0001074070
0.0000833508
0.0000654593
0.0000528066
0.0000427249
0.0000346528
0.0000281492
0.0000228965
0.0000186657
0.0000152418
0.0000124637
0.0000101996
0.0000083532
0.0000068454
0.0000056141
0.0000046065
0.0000037813
0.0000031052
0.0000025510
0.0000020972
0.0000017258
0.0000014206
0.0000011696
0.0000009633
0.0000007936
0.0000006539
0.0000005389
0.0000004442
0.0000003663
0.0000003022
0.0000002493
0.0000002058
0.0000001698
0.0000001402
0.0000001157
0.0000000955
0.0000000789
0.000000




In [None]:
# Load the JSON data from the file
with open(HYPERGRAPH_DIRECTORY + 'gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}
def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = int(ncbi_gene_id)
    result = DGIDB[(DGIDB['ncbi_gene_id']) == ncbi_gene_id]
    if not result.empty:
        return result['gene_name'].values[0]
    else:
        return "Gene name not found"

In [None]:
results_df = pd.DataFrame(result["Importance"], columns=['Index', 'Score'])
results_df["ncbi_gene_id"] = results_df["Index"].apply(index_to_gene.get)
results_df["claim_name"] = results_df["ncbi_gene_id"].apply(get_gene_claim_name)
results_df.to_csv(OUTPUT_FOLDER + "single_layer_rwr_results.csv", index=False)

In [None]:
# Transpose the binary matrix to have drugs as rows and genes as columns
Binary_matrix = binary_matrix.T

# Initialize drug score vector
drug_score_vector = np.zeros(num_genes)

# Compute drug score vector by aggregating scores from neighbors
for index in range(num_genes):
    # pick rows where column index != 0
    rows = Binary_matrix.getcol(index).nonzero()[0]   # 1-D ndarray of row indices
    selected_rows = Binary_matrix[rows, :]

    # entry-wise sum (across rows)
    row_sum_vector = selected_rows.sum(axis=0)

    # Normalize every nonzero entry to one
    row_sum_vector = (row_sum_vector != 0).astype(int)
    row_sum_vector = np.array(row_sum_vector).ravel()

    # Computer the drug score of a gene by aggregating scores from its neighbors
    drug_score = np.dot(row_sum_vector, result["unsorted"])
    drug_score_vector[index] = drug_score

# Save the vectors
np.save(OUTPUT_FOLDER + 'DGIDB_vector.npy', result["unsorted"])
np.save(OUTPUT_FOLDER + 'drug_score_vector.npy', drug_score_vector)

print('DGIDB_vector.npy',"saved to", OUTPUT_FOLDER + 'DGIDB_vector.npy')
print('drug_score_vector.npy',"saved to", OUTPUT_FOLDER + 'drug_score_vector.npy')

# Check percentage of zeros in both vectors
(num_zeros,) = (result["unsorted"] == 0).sum(),
print("Zeros:", num_zeros, "out of", result["unsorted"].size, "(Irrelevant genes get probability from from the restart probability)")

(num_zeros,) = (drug_score_vector == 0).sum(),
print("Zeros:", num_zeros, "out of", drug_score_vector.size, "Irrelevant genes have no neighbors in DGIDB")

# Display the vectors
display(result["unsorted"])
display(drug_score_vector)


DGIDB_vector.npy saved to ../output/DGIDB_BIPOLAR/DGIDB_vector.npy
drug_score_vector.npy saved to ../output/DGIDB_BIPOLAR/drug_score_vector.npy
Zeros: 0 out of 4774 Coming from the restart probability
Zeros: 4415 out of 4774


array([1.07821548e-02, 9.39544483e-03, 5.11256425e-05, ...,
       5.11256425e-05, 5.11256425e-05, 5.11256425e-05])

array([0.61579378, 0.17496624, 0.        , ..., 0.        , 0.        ,
       0.        ])