In [1]:
# Necessary imports
%load_ext autoreload
%autoreload 2\

import networkx as nx
import numpy as np
np.set_printoptions(precision=3)

import pandas as pd

from TCGAMultiOmics.multiomics import MultiOmicsData

from moge.network.heterogeneous_network import HeterogeneousNetwork
from moge.visualization.plot_data import matrix_heatmap

np.set_printoptions(precision=2)

# Heterogeneous Network

In [2]:
# READ Edgeslist
# network.import_edgelist_file(file="moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_node_similarity_99_50.edgelist", 
#                              directed=False)
# network.import_edgelist_file(file="moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_regulatory_interactions.edgelist", 
#                              directed=True)
# network.remove_extra_nodes()

In [3]:
# WRITE Edgelist
# nx.write_edgelist(network.get_subgraph(["MIR", "GE"]), 
#                   "moge/data/lncRNA_miRNA_mRNA/miRNA-mRNA_network_test_05_val_01_seed_0.edgelist", 
#                   data=False)

In [4]:
import pickle



# READ
with open('moge/data/mirtarbase_lnc-mi/lmn_n07_m07_l08_full.pickle', 'rb') as file:
    network = pickle.load(file)


#  Split Network to train test and write to file

In [6]:
from moge.evaluation.utils import split_train_test_edges

network_train, val_edges_dict, test_edges_dict = split_train_test_edges(network, network.node_list, 
                                                          test_frac=.15, val_frac=.0, edge_types=["u", "d", "u_n"], 
                                                          seed=0)

preprocessing...
creating true edges...
edges in MST: 33830
Num WCC:  17913
Largest WCC num nodes:  2623
Largest WCC num edges:  24398
Fraction of train edges with both endpoints in L-WCC:  0.1499165911766779
Fraction of test edges with both endpoints in L-WCC:  0.15104220601505983
Fraction of val edges with both endpoints in L-WCC:  N/A
final checks for disjointness...
creating adj_train...
Done with train-test split!
Num train edges (true, ): ( 383053 , 
Num test edges (true, ): ( 67597 , 
Num val edges (true, ): ( 0 , 
Removed 67597 test, and 0 val, type u edges
preprocessing...
creating true edges...
edges in MST: 21092
Num WCC:  13886
Largest WCC num nodes:  20937
Largest WCC num edges:  543446
Fraction of train edges with both endpoints in L-WCC:  0.9999889594665214
Fraction of test edges with both endpoints in L-WCC:  1.0
Fraction of val edges with both endpoints in L-WCC:  N/A
final checks for disjointness...
creating adj_train...
Done with train-test split!
Num train edges (tr

In [7]:
network_train.remove_extra_nodes()

In [8]:
network_train_d = nx.DiGraph(((source, target, attr) for source, target, attr in network_train.G.edges(data=True) if attr["type"] == "d"))
network_train_d.number_of_nodes()

20950

In [9]:
network_train_u = nx.Graph(((source, target, attr) for source, target, attr in network_train.G.edges(data=True) if attr["type"] == "u"))
network_train_u.number_of_nodes()

18032

In [9]:
# WRITE Edgelist
nx.write_edgelist(network_train_u.subgraph(network_train.node_list), 
                  "moge/data/lncRNA_miRNA_mRNA/lmn_network_filtered_undirected.edgelist", 
                  data=False)
nx.write_edgelist(network_train_d.subgraph(network_train.node_list), 
                  "moge/data/lncRNA_miRNA_mRNA/lmn_network_filtered_directed.edgelist", 
                  data=False)

# Siamese Network Embedding

In [15]:
from moge.network.data_generator import DataGenerator

generator = DataGenerator(network=network, negative_sampling_ratio=2.0,
                          maxlen=700, padding='post', truncating="post",
                          batch_size=1, 
                          shuffle=True, seed=0)
generator.genes_info = generator.genes_info[generator.genes_info["Transcript sequence"].notna()]

Genes info columns: ['locus_type', 'location', 'Family', 'Transcript sequence', 'Disease association', 'Transcript length']
Number of nodes without seq removed: 2521
num_words: None {'A': 1, 'T': 2, 'G': 3, 'C': 4, 'U': 5}
Ed_count: 536508 , Eu_count: 191303 , En_count: 769972
Ens_count: 1073016


In [22]:
from moge.embedding.siamese_graph_embedding import SiameseGraphEmbedding

siamese = SiameseGraphEmbedding(d=512, batch_size=4096, lr=0.05, epochs=20, negative_sampling_ratio=5.0,
                                 max_length=700, truncating="post", verbose=True)
siamese.import_embedding("moge/data/mirtarbase_lnc-mi/embeddings/node_train_512.emb", 
                         node_list=generator.node_list)

AssertionError: 

In [21]:
from moge.embedding.static_graph_embedding import ImportedGraphEmbedding
graph_emb = ImportedGraphEmbedding(d=512, method_name="siamese_switch")
graph_emb.import_embedding("moge/data/mirtarbase_lnc-mi/embeddings/node_train_512.emb", 
                           node_list=generator.node_list)
embs = graph_emb._X
embs.shape

AssertionError: 

# Import different models

In [None]:
from moge.embedding.static_graph_embedding import ImportedGraphEmbedding

line_emb_d = ImportedGraphEmbedding(d=128, method_name="LINE")
line_emb_d.import_embedding("/home/jonny/PycharmProjects/OpenNE/lmn_filtered_train_line_d.emb", 
                          node_list=network.node_list)

line_emb_u = ImportedGraphEmbedding(d=128, method_name="LINE")
line_emb_u.import_embedding("/home/jonny/PycharmProjects/OpenNE/lmn_filtered_train_line_u.emb", 
                          node_list=network.node_list)

node2vec_emb = ImportedGraphEmbedding(d=128, method_name="node2vec")
node2vec_emb.import_embedding("/home/jonny/PycharmProjects/OpenNE/vec_all_node2vec.txt", 
                          node_list=network.node_list)

node_list = list(set(line_emb_d.node_list) & set(line_emb_u.node_list))
print(len(node_list))

In [29]:
models = []
models.append(line_emb_d)
models.append(line_emb_u)
# models.append(node2vec_emb)
# models.append(siamese_emb)

# Evaluate Graph Reconstruction

In [9]:
from moge.evaluation.graph_reconstruction import evaluateStaticGraphReconstruction
# Directed edges
for model in models:
    print("\n",model.get_method_name())
    norm, avg = evaluateStaticGraphReconstruction(network, model, edge_type="d", train_embedding=False,
                                                  sample_ratio=1.0, node_list=model.node_list)

    print(norm, avg)


 siamese_graph_embedding
Sampling 386720 edges to be evaluated.
577.6855184299358 0.9045182015184343


In [10]:
# Undirected edges
for model in models:
    print("\n",model.get_method_name())
    norm, avg = evaluateStaticGraphReconstruction(network, model, edge_type="u", train_embedding=False,
                                                  sample_ratio=1.0, node_list=model.node_list)
    print(norm, avg)


 siamese_graph_embedding
Sampling 865830 edges to be evaluated.
287.0505497323289 0.13390040519234594


# Evaluate Link Prediction

In [28]:
models[0]._node_num = G.number_of_nodes()

In [29]:
from moge.evaluation.link_prediction import evaluate_top_k_link_prediction, evaluate_random_link_prediction
from moge.embedding.dual_graph_embedding import SourceTargetGraphEmbedding

# Directed edges
for model in models:
    print("\n",model.get_method_name())
    scores = evaluate_top_k_link_prediction(top_k=10000, network=network_train, node_list=network.node_list,
                                 graph_emb=model, saved_test_edges=test_edges_dict["d"], train_embedding=False,
                                 edge_type="d", test_frac=0.001, seed=0)
    print(scores)


 hope_gsvd
test_edges: 73373


IndexError: index 19855 is out of bounds for axis 0 with size 19825

In [None]:
from moge.evaluation.link_prediction import evaluate_top_k_link_prediction, evaluate_random_link_prediction

# Undirected edges
for model in models:
    print("\n",model.get_method_name())
    scores = evaluate_top_k_link_prediction(top_k=10000, network=network_train, node_list=model.node_list,
                             graph_emb=model, saved_test_edges=test_edges_dict["u"], train_embedding=False,
                             edge_type="u", test_frac=0.001, seed=0)
    print(scores)

In [17]:
# Evaluate random directed edge prediction
print(evaluate_random_link_prediction(top_k=10000, network=network, edge_type="d", node_list=siamese_emb.node_list,
                                      test_frac=0.001, seed=0))
print(evaluate_random_link_prediction(top_k=100000, network=network, edge_type="u", node_list=siamese_emb.node_list,
                                      test_frac=0.00001, seed=0))

{'precision': 0.0, 'recall': 0.0}
{'precision': 0.0, 'recall': 0.0}


# Import GEM Graph Embedding Methods

In [16]:
from gem.utils import graph_util, plot_util
from gem.evaluation import visualize_embedding as viz
from gem.evaluation import evaluate_graph_reconstruction as gr
from gem.evaluation.metrics import getPrecisionReport
from gem.evaluation.evaluate_link_prediction import evaluateStaticLinkPrediction

from time import time

from gem.embedding.gf       import GraphFactorization
from gem.embedding.hope     import HOPE
from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding
from gem.embedding.node2vec import node2vec
from gem.embedding.sdne     import SDNE

Using TensorFlow backend.


In [17]:
models = []
# You can comment out the methods you don't want to run
models.append(HOPE(d=256, beta=0.1))
# models.append(GraphFactorization(d=256, max_iter=100, eta=1*10**-4, regu=1.0))
# models.append(LaplacianEigenmaps(d=200))
# models.append(LocallyLinearEmbedding(d=256))
models.append(node2vec(d=256, max_iter=10, walk_len=5, num_walks=10, con_size=10, ret_p=1, inout_p=1))
models.append(SDNE(d=256, beta=5, alpha=1e-5, nu1=1e-6, nu2=1e-6, 
                   K=3, n_units=[50, 15,], rho=0.3, 
                   n_iter=100, xeta=0.01, n_batch=500))

# models.append(DualGraphEmbedding(d=100, reg=1.0, lr=0.05, iterations=100))
                   

In [18]:
G = nx.convert_node_labels_to_integers(network_train_d)

In [19]:
print ('Num nodes: %d, num edges: %d' % (G.number_of_nodes(), G.number_of_edges()))
for embedding in models:
    try:
        print("\n", embedding.get_method_name(), embedding.get_method_summary())
        t1 = time()
        
    #     Learn embedding - accepts a networkx graph or file with edge list
        embedding.learn_embedding(graph=G, edge_f=None, is_weighted=False, no_python=True)
        print (embedding._method_name+':\n\tTraining time: %f' % (time() - t1))

#         Y = embedding.get_embedding()

#         # Evaluate on graph reconstruction
#         MAP, prec_curv, err, err_baseline = gr.evaluateStaticGraphReconstruction(G, embedding, Y, 
#                                                                                  sample_ratio_e=0.0001, k=500000)
#         print ("Graph Reconstruction. MAP:", MAP)

        
        # Evaluate on link prediction
        MAP, prec_curv = evaluateStaticLinkPrediction(G, embedding,
                                     train_ratio=0.8,
                                     is_undirected=False)
        print ("Link Prediction. MAP:", MAP)
        print (embedding._method_name+':\n\tTraining time: %f' % (time() - t1))
        
        # Visualize
#         viz.plot_embedding2D(embedding.get_embedding(), di_graph=G, node_colors=None)
#         plt.show()
    except Exception as e:
        print("could not run", embedding, e)

Num nodes: 19825, num edges: 418147

 hope_gsvd hope_gsvd_256
SVD error (low rank): 64.040331
hope_gsvd:
	Training time: 141.285477
could not run <gem.embedding.hope.HOPE object at 0x7fec90b2b588> SubGraph Views are readonly. Mutations not allowed

 node2vec_rw node2vec_rw_256
[Errno 2] No such file or directory: 'gem/c_exe/node2vec': 'gem/c_exe/node2vec'
could not run <gem.embedding.node2vec.node2vec object at 0x7fece5ad3c88> ./node2vec not found. Please compile snap, place node2vec in the path and grant executable permission


In [22]:
hope = models[0]

# Node Visualization

In [None]:
from sklearn.manifold import TSNE

model = TSNE(n_components=2)
node_pos = model.fit_transform(models[0].get_embedding())
pos = {}
for i in range(len(G.nodes())):
    pos[i] = node_pos[i, :]
nx.draw_networkx(G, pos, width=0.1, node_size=300, arrows=False,
                             alpha=0.8, font_size=12)