In [1]:
import gc
import networkx as nx
import numpy as np
import os
import pandas as pd
import time
import scipy
import sklearn
from sklearn import cluster, linear_model
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import warnings # Silence perf warning

sys.path.append(os.path.realpath('..'))

import nodevectors
import csrgraph
from csrgraph import CSRGraph
from csrgraph import methods
import link_pred
import graph_eval

# From the related karateclub lib (on pip)
# https://github.com/benedekrozemberczki/KarateClub
from karateclub.node_embedding.neighbourhood import GraRep, NodeSketch, Walklets
# UMAP to test (on pip)
import umap

warnings.simplefilter("ignore")

def nx_node_weights(G, method, **kwargs):
    """Collect per-node scores from a networkx-style function into an array.

    ``method`` is invoked as ``method(G, **kwargs)`` and must return an object
    indexable by node. Every node in ``G.nodes`` is used directly as the
    position in the output, so nodes must be valid integer indices into an
    array of length ``len(G)``.

    Returns a float numpy array of length ``len(G)``.
    """
    weights = np.zeros(len(G))
    scores = method(G, **kwargs)
    for node in G.nodes:
        weights[node] = scores[node]
    return weights

In [2]:
#### CONFIG
#### Shared settings read by the embedding and evaluation cells below
N_COMPONENTS = 6 # resulting embedding dim (passed as n_components to the embedders below)
SEED = 42 # RNG Seed (passed as seed= to graph_eval.print_labeled_tests)
TEST_SIZE = 0.2 # used as testing_ratio for the link-prediction split and as test_size for the labeled tests

# For resampling tests
RESAMPLE_WALKS = 30 # passed as epochs= to CSRGraph.random_walk_resample
RESAMPLE_LEN = 5 # passed as walklen= to CSRGraph.random_walk_resample

In [3]:
#### GRAPHS
#### Uncomment one to choose which graph to run evaluation on

# Reset both label holders before loading. The evaluation setup cell reads
# `labels.label` first and `mlabels.mlabels` second, so a value left over from
# a previously selected graph (same kernel session) would otherwise be paired
# with the newly loaded graph.
labels = None
mlabels = None

#### Artificial random graphs
# G = nx.binomial_graph(700, 0.6)
# G, labels = graph_eval.make_cluster_graph(n_nodes=820, n_clusters=18, connections=1000, drop_pct=0.5)
# G, labels = graph_eval.make_weighed_cluster_graph(n_nodes=500, n_clusters=6, connections=1500, drop_pct=0.2, max_edge_weight=15)
#### Social graphs
# G, labels = graph_eval.make_blogcatalog(dedupe=True)
G, mlabels = graph_eval.make_blogcatalog(dedupe=False)
# G, labels = graph_eval.make_email()
# G, labels = graph_eval.get_karateclub("facebook") # twitch, github, facebook, wikipedia
# G = graph_eval.get_from_snap(url="http://snap.stanford.edu/data/facebook_combined.txt.gz", sep=' ', header=None, comment='#')
#### Biology Graphs
# G, mlabels = graph_eval.get_n2v_ppi("../data/bioNEV/node2vec_PPI")


#### Needs OutOfBounds Nodes support from CSRGraphs to work
# G = graph_eval.get_drugbank_ddi("../data/bioNEV/DrugBank_DDI")
# G, mlabels = graph_eval.get_mashup_ppi("../data/bioNEV/Mashup_PPI")

In [4]:
#### For Link Prediction: Split graph into train and test edge sets
#### (All nodes are still present in both)
G_train, testing_pos_edges = link_pred.split_train_test_graph(G, testing_ratio=TEST_SIZE)

#### Lazy way to set up evaluation
# Which label variable is usable depends on the graph loader chosen above:
# `labels` (single label per node), `mlabels` (multilabel), or neither.
# Only "variable missing / unset" (NameError, AttributeError) counts as
# "no labels of this kind"; any other failure is a real error and is raised
# instead of being reported as "No Labels".
# Start from a clean state so values from an earlier run cannot survive.
y = None
HAS_LABELS = False
try:
    y = labels.label
except (NameError, AttributeError):
    try: # Multilabels
        y = MultiLabelBinarizer().fit_transform(mlabels.mlabels)
    except (NameError, AttributeError): # No Labels
        print("No Labels")
    else:
        HAS_LABELS = True
        print(f"multilabels: {y.shape[1]}")
else:
    n_clusters = y.nunique()
    HAS_LABELS = True
    print(f"clusters: {n_clusters}")
NNODES = len(G)
print(f"Nodes: {NNODES}\nEdges: {len(G.edges)}\nconnected: {nx.is_connected(G_train)}")

multilabels: 39
Nodes: 10312
Edges: 333983
connected: True


In [None]:
# GGVec hyperparameters, shared by the train-graph fit (link prediction)
# and the full-graph fit (labeled tests).
ggvec_params = {
    "n_components": N_COMPONENTS,
    "order": 1,
    "tol": 0.1,
    "tol_samples": 100,
    "max_epoch": 6_000,
    "learning_rate": 0.1,
    "negative_ratio": 0.05,
    "exponent": 0.33,
    "verbose": True,
}

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
w_train = nodevectors.GGVec(**ggvec_params).fit_transform(G_train)

print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# NOTE(review): presumably gives the progress-bar output time to flush — confirm
time.sleep(0.1)
# Labeled tests use a fresh model fitted on the full graph.
if HAS_LABELS:
    w = nodevectors.GGVec(**ggvec_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

Loss: 0.0654	:   2%|▏         | 102/6000 [00:01<01:52, 52.55it/s]


Converged! Loss: 0.0651
Time: 4.0844
Link Prediction:


In [None]:
# Node2Vec hyperparameters; `w2vparams` is handed to nodevectors.Node2Vec as-is.
n2v_params = {
    "n_components": N_COMPONENTS,
    "epochs": 5,
    "walklen": 30,
    "return_weight": 1.,
    "neighbor_weight": 1.,
    "w2vparams": {
        "window": 3,
        "negative": 5,
        "iter": 2,
        "batch_words": 128,
    },
}

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
w_train = nodevectors.Node2Vec(**n2v_params).fit_transform(G_train)
print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# Labeled tests use a fresh model fitted on the full graph.
if HAS_LABELS:
    w = nodevectors.Node2Vec(**n2v_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

Making walks... Done, T=2.40
Mapping Walk Names... Done, T=2.20
Training W2V... Done, T=7.31
Time: 14.3655
Link Prediction:
	(logit) AUC-ROC: 0.951, AUC-PR: 0.948, Acc: 0.882, F1: 0.883
	(lgbm)  AUC-ROC: 0.954, AUC-PR: 0.950, Acc: 0.888, F1: 0.890
Making walks... Done, T=0.33
Mapping Walk Names... Done, T=2.15
Training W2V... 

In [None]:
# ProNE hyperparameters, shared by both fits below.
pne_params = {
    "n_components": N_COMPONENTS,
    "step": 5,
    "mu": 0.2,
    "theta": 0.5,
}

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
pne = nodevectors.ProNE(**pne_params)
w_train = pne.fit_transform(G_train)
print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# Labeled tests: `pne` is rebound to a fresh model fitted on the full graph.
if HAS_LABELS:
    pne = nodevectors.ProNE(**pne_params)
    w = pne.fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
# GraRep with a TruncatedSVD embedder; per-order results are combined by the
# `merger` callable, here an element-wise sum over axis 0.
grarep_params = dict(
    n_components=N_COMPONENTS,
    order=2,
    embedder=TruncatedSVD(
        n_iter=10,
        # Use the notebook-wide SEED instead of a separate hard-coded 42 so
        # changing the config cell changes this cell too.
        random_state=SEED),
    merger=(lambda x : np.sum(x, axis=0)),
)

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)

print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)
# Labeled tests use a fresh GraRep fitted on the full graph. The TruncatedSVD
# instance inside grarep_params is the same object for both fits.
if HAS_LABELS:
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
##### GraRep + GGVec ####
# Same GraRep wrapper as the SVD variant, but with a GGVec model as the
# embedder; per-order results are summed over axis 0 by `merger`.
grarep_params = {
    "n_components": N_COMPONENTS,
    "order": 2,
    "embedder": nodevectors.GGVec(
        n_components=N_COMPONENTS,
        tol=0.1,
        tol_samples=200,
        max_epoch=6_000,
        learning_rate=0.02,
        negative_ratio=0.6,
        exponent=0.33,
        verbose=True,
    ),
    "verbose": False,
    "merger": lambda x: np.sum(x, axis=0),
}

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
w_train = nodevectors.GraRep(**grarep_params).fit_transform(G_train)

print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
time.sleep(0.1)
# Labeled tests use a fresh GraRep fitted on the full graph.
if HAS_LABELS:
    w = nodevectors.GraRep(**grarep_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
# UMAP wrapped in nodevectors.SKLearnEmbedder; the UMAP class itself (not an
# instance) is passed as `embedder`, together with the remaining settings.
ump_params = {
    "embedder": umap.UMAP,
    "n_neighbors": 3,
    "min_dist": 0.,
    "metric": 'cosine',
    "normalize_graph": True,
    "n_components": N_COMPONENTS,
}

# Timed embedding of the training graph, evaluated with link prediction.
start_t = time.time()
w_train = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G_train)
print(f"Time: {time.time() - start_t :.4f}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# Labeled tests use a fresh embedder fitted on the full graph.
if HAS_LABELS:
    w = nodevectors.SKLearnEmbedder(**ump_params).fit_transform(G)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
### GLoVe with random walks ###
# Glove is fitted on the graph returned by random_walk_resample, not on the
# original edges.
glove_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.0005,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "max_loss": 10.,
    "max_count": 50,
    "exponent": 0.5,
}

# The timer covers both the resampling step and the Glove fit.
start_t = time.time()
wg = CSRGraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = nodevectors.Glove(**glove_params).fit_transform(wg)

print(f"Time: {time.time() - start_t :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# Labeled tests: resample the full graph (rebinding `wg`) and fit a fresh model.
if HAS_LABELS:
    wg = CSRGraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = nodevectors.Glove(**glove_params).fit_transform(wg)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
### GGVec with random walks ###
# Redefines `ggvec_params` for this cell; the embedding is produced by the
# resampled graph's own `ggvec` method.
ggvec_params = {
    "n_components": N_COMPONENTS,
    "tol": 0.02,
    "tol_samples": 200,
    "max_epoch": 6_000,
    "learning_rate": 0.02,
    "negative_ratio": 0.3,
    "exponent": 0.35,
    "verbose": True,
}

# The timer covers both the resampling step and the GGVec fit.
start_t = time.time()
wg = CSRGraph(G_train).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
w_train = wg.ggvec(**ggvec_params)

print(f"Time: {time.time() - start_t :.4f}")
print(f"Virtual edges: {wg.dst.size}")
result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
# Labeled tests: resample the full graph (rebinding `wg`) and embed it again.
if HAS_LABELS:
    wg = CSRGraph(G).random_walk_resample(walklen=RESAMPLE_LEN, epochs=RESAMPLE_WALKS)
    w = wg.ggvec(**ggvec_params)
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)

In [None]:
###### Walklets (karateclub) -- disabled because it is very slow ########
# walklets_params = dict(
#     walk_number=10, 
#     walk_length=30, 
#     dimensions=N_COMPONENTS,
#     window_size=4,
#     epochs=1, 
#     learning_rate=0.05
# )

# try: # Karateclub models don't handle certain graphs
#     start_t = time.time()
#     model = Walklets(**walklets_params)
#     model.fit(G_train)
#     print(f"Time: {time.time() - start_t :.3f}")
#     w_train = model.get_embedding()
#     result = link_pred.LinkPrediction(w_train, G, G_train, testing_pos_edges)
#     if HAS_LABELS:
#         model = Walklets(**walklets_params)
#         model.fit(G)
#         w = model.get_embedding()
#         graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)
# except: pass

In [None]:
### Completely random baseline ###

# Gaussian-noise "embedding" with one row per node. It is drawn from a local
# generator seeded with SEED so the baseline numbers are repeatable across
# runs and the global numpy RNG state is left untouched.
w = np.random.RandomState(SEED).randn(len(G), N_COMPONENTS)

result = link_pred.LinkPrediction(w, G, G_train, testing_pos_edges)
# Same HAS_LABELS guard as the embedding cells; a blanket try/except here
# would hide any real evaluation error.
if HAS_LABELS:
    graph_eval.print_labeled_tests(w, y, test_size=TEST_SIZE, seed=SEED)