In [1]:
import numpy as np
import json
from scipy.sparse import load_npz
import pandas as pd
import os
from tqdm import tqdm
import RWR

DISEASE = "BIPOLAR"
DGIDB_DIRECTORY = f"../../Gen_Hypergraph/output/DGIDB_{DISEASE}/"
DGIDB_DRUG_SCORE_PATH = f"../output/{DISEASE}/drug_score_vector.npy"
DGIDB_RESULT_VECTOR_PATH = f"../output/{DISEASE}/DGIDB_vector.npy"
MSIGDB_DIRECTORY = f"../../Gen_Hypergraph/output/MSigDB_FULL/"
OUTPUT_FOLDER = f"../output/{DISEASE}/"
with open(MSIGDB_DIRECTORY + 'ncbi_to_gene.json', 'r') as file:
    ncbi_to_gene = json.load(file)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
restart_prob = 0.2  # Restart probability (theta)
num_iterations = 30  # Number of iterations


In [2]:
import numpy as np

# Open the JSON file and load its content into a dictionary
with open(DGIDB_DIRECTORY + "gene_to_index.json", "r") as file:
    dgidb = json.load(file)
with open(MSIGDB_DIRECTORY + "gene_to_index.json", "r") as file:
    msigdb = json.load(file)
    
# Jump probability for matching genes
w = 1

# Number of genes (assuming they are both of same size or matchable)
num_genes_dgidb = len(dgidb)
num_genes_msigdb = len(msigdb)

# Initialize the inter-layer matrix with zeros
interlayer_transition_matrix = np.zeros((num_genes_msigdb,num_genes_dgidb))
i = 0
# Build the inter-layer matrix
for gene_dgidb, idx_dgidb in dgidb.items():
    # If the gene exists in both gene-to-index mappings
    if gene_dgidb in msigdb:      
        idx_msigdb = msigdb[gene_dgidb]
        interlayer_transition_matrix[idx_msigdb,idx_dgidb] = w  # Set jump probability
        i += 1
    else:
        print(f"Gene {gene_dgidb} not found in MSIGDB mapping.")
rows_with_high_sum = np.where(interlayer_transition_matrix.sum(axis=1) > 0)[0]
print(i/len(dgidb), "of DGIDB genes have a match in MSIGDB")

Gene 927 not found in MSIGDB mapping.
Gene 11 not found in MSIGDB mapping.
Gene 724 not found in MSIGDB mapping.
Gene 360158 not found in MSIGDB mapping.
Gene 469 not found in MSIGDB mapping.
Gene 100529264 not found in MSIGDB mapping.
Gene 2609 not found in MSIGDB mapping.
Gene 447 not found in MSIGDB mapping.
Gene 453 not found in MSIGDB mapping.
Gene 459 not found in MSIGDB mapping.
Gene 2616 not found in MSIGDB mapping.
Gene 442 not found in MSIGDB mapping.
Gene 600 not found in MSIGDB mapping.
Gene 507 not found in MSIGDB mapping.
Gene 62 not found in MSIGDB mapping.
Gene 68 not found in MSIGDB mapping.
Gene 2772 not found in MSIGDB mapping.
Gene 485 not found in MSIGDB mapping.
Gene 620 not found in MSIGDB mapping.
Gene 121131 not found in MSIGDB mapping.
Gene 285834 not found in MSIGDB mapping.
Gene 6962 not found in MSIGDB mapping.
Gene 1480 not found in MSIGDB mapping.
Gene 2749 not found in MSIGDB mapping.
Gene 1624 not found in MSIGDB mapping.
Gene 63 not found in MSIGDB map

In [3]:
# Load matrices
MSIGDB_weighted_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_weighted.npz")
MSIGDB_binary_matrix = load_npz(MSIGDB_DIRECTORY + "hypergraph_incidence_matrix_binary.npz")
DGIDB_drug_score = np.load(DGIDB_DRUG_SCORE_PATH)
DGIDB_vector = np.load(DGIDB_RESULT_VECTOR_PATH)
inverse_diag_node_degree_matrix = load_npz(MSIGDB_DIRECTORY + "inverse_diag_node_degree_matrix.npz")
inverse_diag_edge_degree_matrix = load_npz(MSIGDB_DIRECTORY + "inverse_diag_edge_degree_matrix.npz")

num_genes_MSIGDB = MSIGDB_binary_matrix.shape[0]  # Number of genes in MSIGDB

# Initialize probability vectors
v0 = np.ones(num_genes_MSIGDB) / (num_genes_MSIGDB) 

In [4]:

# # Run the random walk
# v_curr = v0.copy()  # Start with uniform probability
# transition_matrix = inverse_diag_node_degree_matrix @ MSIGDB_binary_matrix @ inverse_diag_edge_degree_matrix @ MSIGDB_weighted_matrix.T
# transition_matrix = transition_matrix.T
# distance_list = []
# bias_constant_vector = interlayer_transition_matrix @ DGIDB_drug_score

# for k in tqdm(range(num_iterations), desc="Random Walk Progress"):  
#     # Store previous probability vector
#     v_prev = v_curr.copy()
    
#     # Matrix multiplication for transition
#     v_curr = (1-restart_prob) * (transition_matrix @ v_prev) + restart_prob * v0 + bias_constant_vector

#     # Normalize v_curr to avoid overflow
#     v_curr /= np.sum(v_curr) if np.sum(v_curr) > 0 else 1
#     # Calculate distance
#     distance = np.sum(np.abs(v_prev - v_curr))
#     distance_list.append(distance)

# unsorted = v_curr
# # Sort importance scores in descending order
# importance_scores = np.argsort(v_curr)[::-1]
# importance_values = v_curr[importance_scores]

# # Return importance scores and distance values
# result = {"Importance": list(zip(importance_scores, importance_values)), "Distance": distance_list, "unsorted": unsorted}

In [5]:
result = RWR.random_talk_with_restart(num_iterations,
                             restart_prob, 
                             MSIGDB_binary_matrix, 
                             MSIGDB_weighted_matrix, 
                             inverse_diag_node_degree_matrix, 
                             inverse_diag_edge_degree_matrix,
                             v0,
                             interlayer_transition_matrix @ DGIDB_drug_score)
# Print results
print("Top Indices by Importance:")
for index, score in result["Importance"][:10]:
    print(f"Index {index}: {score:.6f}")

print("\nDistance per Iteration:")
print(result["Distance"])

Random Walk Progress: 100%|██████████| 30/30 [00:09<00:00,  3.29it/s]

Top Indices by Importance:
Index 8817: 0.010268
Index 6191: 0.010267
Index 1295: 0.010050
Index 2332: 0.009853
Index 3338: 0.008621
Index 440: 0.008081
Index 4369: 0.008041
Index 1631: 0.007917
Index 5010: 0.007853
Index 262: 0.007692

Distance per Iteration:
[1.9347029224233503, 0.006436098313694042, 3.294141103664346e-05, 1.8922693506892377e-07, 1.1453642169809079e-09, 7.223236813705927e-12, 4.7368179214051623e-14, 3.2510201835649836e-16, 3.0514097227040153e-18, 3.705769144237564e-20, 1.5881867761018131e-22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]





In [6]:
# Load the JSON data from the file
with open(MSIGDB_DIRECTORY + 'gene_to_index.json', 'r') as file:
    gene_to_index = json.load(file)

# Invert the dictionary to map indices back to genes
index_to_gene = {v: k for k, v in gene_to_index.items()}

def get_gene_claim_name(ncbi_gene_id):
    ncbi_gene_id = str(ncbi_gene_id)
    result = ncbi_to_gene[ncbi_gene_id]
    if result:
        return result
    else:
        return "Gene name not found"

In [7]:
results_df = pd.DataFrame(result["Importance"], columns=['Index', 'Score'])
results_df["ncbi_gene_id"] = results_df["Index"].apply(index_to_gene.get)
results_df["claim_name"] = results_df["ncbi_gene_id"].apply(get_gene_claim_name)
results_df.to_csv(OUTPUT_FOLDER + "unidirectional_multilayer_rwr_results_ORIGINAL.csv", index=False)