In [1]:
from typing import Dict
from ensmallen_graph import EnsmallenGraph
from ensmallen_experiments import MeasureResources
import compress_json
from tqdm.auto import tqdm, trange
import os
from time import perf_counter, sleep
from humanize import naturaldelta
import matplotlib.pyplot as plt
import pandas as pd
from sanitize_ml_labels import sanitize_ml_labels
import networkx as nx
from glob import glob
from node2vec import Node2Vec
from multiprocessing import cpu_count

## Methods to load the graphs using each library

In [2]:
def load_graph_ensmallen(edge_path:str):
    """Build an undirected EnsmallenGraph from the edge list at `edge_path`.

    Source nodes are read from column 0 and destination nodes from column 1.
    """
    loader_options = dict(
        sources_column_number=0,
        destinations_column_number=1,
        directed=False,
    )
    return EnsmallenGraph.from_csv(edge_path, **loader_options)

In [3]:
def load_graph_networkx(edge_path:str):
    """Read the tab-separated edge list at `edge_path` with NetworkX.

    Edge attributes are not parsed (`data=False`).
    """
    graph = nx.read_edgelist(edge_path, delimiter="\t", data=False)
    return graph

## Methods to compute first and second order walks in each library

In [4]:
def ensmallen_first_order_walks(graph: EnsmallenGraph, walk_length:int, num_walks:int, **kwargs):
    """Execute first order walks using EnsmallenGraph.complete_walks.

    `walk_length` is forwarded as `length` and `num_walks` as `iterations`.
    Any additional keyword argument (e.g. `p` and `q`) is accepted and ignored.
    """
    walks = graph.complete_walks(
        length=walk_length,
        iterations=num_walks,
    )
    return walks

def ensmallen_second_order_walks(graph: EnsmallenGraph, p:float, q:float, walk_length:int, num_walks:int):
    """Execute second order walks using EnsmallenGraph.complete_walks.

    `walk_length` is forwarded as `length` and `num_walks` as `iterations`;
    `p` and `q` are passed as their reciprocals, respectively as
    `return_weight` and `explore_weight`.
    """
    walks = graph.complete_walks(
        length=walk_length,
        iterations=num_walks,
        return_weight=1/p,
        explore_weight=1/q,
    )
    return walks

In [5]:
def node2vec_first_order_walks(graph: nx.Graph, walk_length:int, num_walks:int, **kwargs):
    """Execute first order walks using the NetworkX-based node2vec walker.

    `Node2Vec` is built without `p` and `q`, using one worker per CPU, and
    its `walks` attribute is returned. Any additional keyword argument
    (e.g. `p` and `q`) is accepted and ignored.
    """
    walker = Node2Vec(
        graph,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=cpu_count(),
    )
    return walker.walks

def node2vec_second_order_walks(graph: nx.Graph, p:float, q:float, walk_length:int, num_walks:int):
    """Execute second order walks using the NetworkX-based node2vec walker.

    `Node2Vec` is built with the given `p` and `q`, using one worker per CPU,
    and its `walks` attribute is returned.
    """
    walker = Node2Vec(
        graph,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=cpu_count(),
        p=p,
        q=q,
    )
    return walker.walks

In [6]:
def build_path(data:Dict, root:str):
    """Return the path of the `sanitized.tsv` edge file of a graph.

    The file is looked up inside the `data["folder_name"]` directory under `root`.
    """
    graph_directory = os.path.join(root, data["folder_name"])
    return os.path.join(graph_directory, "sanitized.tsv")

def get_graph_report(data:Dict, root:str)->Dict:
    """Load the `report.json` of a graph from given metadata.

    The file is read from the `data["folder_name"]` directory under `root`.
    """
    report_path = os.path.join(root, data["folder_name"], "report.json")
    return compress_json.load(report_path)

## The parameters for the experiments

In [7]:
# Number of times each benchmark is repeated for every graph/library pair.
loops = 1
# Directory containing one folder per graph.
graph_root = "graphs"
# Keyword arguments handed to every walker.
walk_parameters = {
    "walk_length": 30,
    "num_walks": 1,  # This is iterations in ensmallen.
    "p": 2,
    "q": 2,
}

# For each library: [graph loader, first order walker, second order walker].
libraries = dict(
    ensmallen=[
        load_graph_ensmallen,
        ensmallen_first_order_walks,
        ensmallen_second_order_walks,
    ],
    networkx=[
        load_graph_networkx,
        node2vec_first_order_walks,
        node2vec_second_order_walks,
    ],
)

## Execute the benchmarks

In [8]:
def run_benchmark(tracker:MeasureResources, metadata:Dict, walk_parameters:Dict, path:str):
    """Measure graph loading and both walk steps for one library on one graph.

    The loader and the two walkers are looked up in the `libraries` registry
    under `metadata["library"]`, so the function does not rely on loop
    variables that merely happen to be bound at module scope when it is called.

    Parameters
    ----------
    tracker: MeasureResources
        Called as `tracker(**metadata, step=...)` to obtain the context
        manager wrapping each measured step.
    metadata: Dict
        Keyword arguments forwarded to the tracker for every step.
        Must contain the "library" key, naming an entry of `libraries`.
    walk_parameters: Dict
        Keyword arguments forwarded to both walkers.
    path: str
        Path of the edge list handed to the library loader.

    Raises
    ------
    KeyError
        If `metadata` has no "library" key or its value is not in `libraries`.
    """
    builder, first_order_walker, second_order_walker = libraries[metadata["library"]]
    with tracker(**metadata, step="builder"):
        graph = builder(path)
    # The walks themselves are discarded: only the resources used matter here.
    with tracker(**metadata, step="first_order_walks"):
        first_order_walker(graph, **walk_parameters)
    with tracker(**metadata, step="second_order_walks"):
        second_order_walker(graph, **walk_parameters)

In [9]:
# Only the first graph listed in the metadata file is benchmarked.
graphs_data = [compress_json.load("graphs.json")[0]]
tracker = MeasureResources(verbose=False)

progress = tqdm(graphs_data, desc="Computing benchmarks")
for graph_data in progress:
    name = graph_data["graph"]
    path = build_path(graph_data, graph_root)
    report = get_graph_report(graph_data, graph_root)
    # The unpacked callables stay bound at module scope while the benchmark runs.
    for library, (builder, first_order_walker, second_order_walker) in libraries.items():
        for loop in range(loops):
            # Each measurement is tagged with library, graph, loop and the report fields.
            metadata = dict(
                library=library,
                graph=name,
                loop=loop,
                **report
            )
            run_benchmark(
                tracker,
                metadata,
                walk_parameters,
                path
            )

# Collect everything the tracker measured.
results = tracker.get_results()

HBox(children=(FloatProgress(value=0.0, description='Computing benchmarks', max=1.0, style=ProgressStyle(descr…

Computing transition probabilities: 100%|██████████| 16073/16073 [13:18<00:00, 20.12it/s]  
Process Process-6:
Traceback (most recent call last):
  File "/home/lucacappelletti/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/lucacappelletti/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lucacappelletti/ensmallen_experiments/ensmallen_experiments/measure_resources.py", line 72, in resources_logger
    sleep(refresh_delay)
KeyboardInterrupt
ERRO:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERRO:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/lucacappelletti/anaconda3/lib/python3.7/site-packages/joblib/parallel.py", line 909, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/lucacappelletti/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 562, in wrap_future_result
    return future.result(timeout=timeout)
  File "/home/lucacappelletti/anaconda3/lib/python3.7/concurrent/futures/_base.py", line 430, in result
    self._condition.wait(timeout)
  File "/home/lucacappelletti/anaconda3/lib/python3.7/threading.py", line 296, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-8-e07b0cdebf4c>", line 5, in run_benchmark
    _ = first_order_walker(graph, **walk_parameters)
  File "<ipython-input-5-4c26d3b51f8d>", line 3, in node2vec_first_order_walks
    return Node2Vec(graph, walk_length=walk_length, num_wal


KeyboardInterrupt



In [None]:
# Display the measurements returned by the tracker after the benchmark loop.
results

## Visualize the results

### Plot RAM over time for each task and each graph

In [None]:
# One subplot row per graph and one column per step.
graphs = results.graph.unique()
# Kept under its own name: assigning to `libraries` would overwrite the
# registry of loaders and walkers, and re-running the benchmark cell
# would then fail.
library_names = results.library.unique()
steps = results.step.unique()

fig, axes = plt.subplots(
    ncols=len(steps),
    nrows=len(graphs),
    squeeze=False,
    figsize=(5*len(steps), 4*len(graphs))
)
for graph, sub_axes in zip(graphs, axes):
    for step, axis in zip(steps, sub_axes):
        filtered = results[(results.graph==graph) & (results.step==step)]
        # Draw one line per library, so samples of different libraries
        # are not joined into a single curve.
        has_lines = False
        for library_name in library_names:
            samples = filtered[filtered.library==library_name]
            if len(samples) == 0:
                continue
            axis.plot(samples.delta, samples.ram, label=library_name)
            has_lines = True
        axis.set_xlabel("Seconds")
        axis.set_ylabel("RAM used")
        # Include the step, otherwise every column of a row has the same title.
        axis.set_title(f"{graph} - {step}")
        if has_lines:
            axis.legend()
fig.tight_layout()