# Configure
Set `jvm_path` to the full path of your Java virtual machine (JVM)

In [None]:
# Path of the Java virtual machine; it is forwarded to `start_jvm` further down.
# NOTE(review): None presumably lets `start_jvm` fall back to a default - confirm
jvm_path = None

Set up logging

In [None]:
import logging
# Root logger: INFO level, records carry a timestamp, the logger name
# and the level name in front of the message
logging.basicConfig(
  format='%(asctime)s %(name)-12s %(levelname)-8s: %(message)s',
  datefmt='%Y-%m-%d %H:%M:%S',
  level=logging.INFO,
)

Install package and dependencies

In [None]:
import sys
import os

# Set `do_install` to True to install the package located at `package_root`
# and the notebook requirements with the pip of the running interpreter
do_install = False
package_root = ".."

if do_install:
  import subprocess

  # Commands are argument lists executed without a shell, so an interpreter
  # path or package path containing spaces is passed through intact
  pip_install = [sys.executable, "-m", "pip", "install"]
  install_commands = (
    pip_install + ["-U", package_root, "--use-feature=in-tree-build"],
    pip_install + [
      "-Ur", os.path.join(package_root, "notebooks-requirements.txt"),
    ],
  )
  for cmd in install_commands:
    logging.info(" ".join(cmd))
    # check=True raises CalledProcessError when pip fails, instead of
    # silently carrying on with a broken environment
    subprocess.run(cmd, check=True)

Start jvm

In [None]:
from featgraph.jwebgraph import start_jvm
# Start the JVM through featgraph's helper, forwarding the configured `jvm_path`
start_jvm(jvm_path=jvm_path)

Import the java library

In [None]:
from it.unimi.dsi import webgraph

# Working on Wikipedia
## Download graph

In [None]:
from featgraph import jwebgraph

# Local directory for the graph files (created when missing)
dest_dir = "graphs"
os.makedirs(dest_dir, exist_ok=True)

wiki_basename = "enwiki-2013"
# File names and URLs are both derived from `wiki_basename`, so that changing
# the basename cannot leave the URLs pointing at a different dataset
wiki_base_url = "http://data.law.di.unimi.it/webdata/{0}/{0}".format(
  wiki_basename
)
urls = {
  "{}.{}".format(wiki_basename, ext): "{}.{}".format(wiki_base_url, ext)
  for ext in ("graph", "properties")
}

# `urls` maps each file name to its source URL
# NOTE(review): presumably the files are stored under `root` - confirm in featgraph
jwebgraph.download_dependencies(
  deps=urls,
  root=dest_dir,
)

# Extension-less path used as the prefix of all the graph files
wiki_basepath = os.path.join(dest_dir, wiki_basename)

Reconstruct offsets

In [None]:
# The offsets file sits next to the graph files, with the `.offsets` extension
offsets_path = wiki_basepath + ".offsets"
if os.path.isfile(offsets_path):
  logging.info("Found '%s'. Skipping", offsets_path)
else:
  # Only run BVGraph with the -O option when the offsets file is missing
  webgraph.BVGraph.main(["-O", wiki_basepath])

Load and check number of nodes and arcs

In [None]:
# Load the graph from its basepath with BVGraph.load
graph = webgraph.BVGraph.load(wiki_basepath)

def pretty_print_int(n: int, k: int = 3) -> str:
  """Format an integer with its digits split into space-separated groups

  Groups are formed starting from the least significant digit, so only the
  leftmost group can be shorter than `k`. A leading minus sign is kept
  attached to the first group and is never counted as a digit.

  Args:
    n: Integer to format
    k: Number of digits per group. If not positive, no grouping is applied

  Returns:
    The formatted string, e.g. `"1 234 567"` for `n=1234567` and `k=3`"""
  text = str(n)
  if k <= 0:
    return text
  # Split off the sign, so that e.g. -123 is not rendered as "- 123"
  sign, digits = ("-", text[1:]) if text.startswith("-") else ("", text)
  # Length of the (possibly empty) short group on the left
  head = len(digits) % k
  groups = [digits[:head]] if head else []
  groups.extend(digits[i:i + k] for i in range(head, len(digits), k))
  return sign + " ".join(groups)

# Sizes reported by the loaded graph
nnodes = graph.numNodes()
narcs = graph.numArcs()

# Sizes that the notebook expects for this graph
expected_nnodes = 4206785
expected_narcs = 101355853

print(
  "Graph '{}' has\n{:>13} nodes\n{:>13} arcs".format(
    wiki_basename, *map(pretty_print_int, (nnodes, narcs))
  )
)

# check correctness
assert nnodes == expected_nnodes
assert narcs == expected_narcs

## Degree correlation
Compute degree files

In [None]:
import glob

stats_basename = wiki_basepath + ".stats"

# Any existing file whose path starts with the stats basename
# makes the computation be skipped
stats_files = glob.glob(stats_basename + "*")
if not stats_files:
  webgraph.Stats.main(["--save-degrees", wiki_basepath, stats_basename])
else:
  logging.info("Found stats files: %s. Skipping", stats_files)

Compute Kendall's $\tau$

In [None]:
from it.unimi.dsi import law
import java.lang

def get_degrees(s: str, prefix: str = stats_basename):
  """Load a degrees file with `KendallTau.loadAsDoubles`

  Args:
    s: Which degrees to load. Either `"in"` or `"out"`
    prefix: Path prefix of the file, which is read from
      `<prefix>.<s>degrees`. The default is the value that
      `stats_basename` had when this function was defined

  Returns:
    The value returned by `loadAsDoubles` for the degrees file"""
  assert s == "in" or s == "out"
  degrees_path = ".".join([prefix, s + "degrees"])
  return law.stat.KendallTau.loadAsDoubles(
    degrees_path, java.lang.String, False
  )

# Kendall's tau between the in-degrees and the out-degrees (loaded in this order)
kendall_tau = law.stat.KendallTau.INSTANCE.compute(
  *map(get_degrees, ("in", "out"))
)
print("KendallTau:", kendall_tau)

Degree scatterplot

In [None]:
from matplotlib import pyplot as plt

# Out-degrees on the x axis, in-degrees on the y axis.
# With alpha = 2**(-7) a single marker is almost transparent
plt.scatter(
  get_degrees("out"), get_degrees("in"),
  c="k", marker=".", alpha=2**(-7),
)

# Both axes use a logarithmic scale
plt.xscale("log")
plt.yscale("log")

plt.xlabel("out-degree")
plt.ylabel("in-degree")
# The trailing semicolon hides the repr of the title object
plt.title(
  r"""{} node degree log-log scatterplot
(Kendall $\tau$ = {:.5f})""".format(wiki_basename, kendall_tau)
);

## PageRank
Compute transpose

In [None]:
transpose_basepath = wiki_basepath + ".transpose"

# Any existing file whose path starts with the transpose basepath
# makes the computation be skipped
transpose_files = glob.glob(transpose_basepath + "*")
if not transpose_files:
  webgraph.Transform.main(
    ["transposeOffline", wiki_basepath, transpose_basepath]
  )
else:
  logging.info("Found transpose files: %s. Skipping", transpose_files)

Compute PageRank

In [None]:
rank_basepath = wiki_basepath + ".pagerank"
# File checked to decide whether PageRank has already been computed
rank_filepath = rank_basepath + ".ranks"

if not os.path.isfile(rank_filepath):
  # The transposed graph is the input, the rank basepath is the output prefix
  law.rank.PageRankParallelGaussSeidel.main(
    [transpose_basepath, rank_basepath]
  )
else:
  logging.info("Found pagerank file: %s. Skipping", rank_filepath)

Plot pagerank against indegree

In [None]:
import numpy as np

def scatter_data():
  """Load the data for the pagerank vs in-degree plot

  Both files are read again at every call.

  Returns:
    A pair made of the in-degrees (from `get_degrees`) and of the
    values loaded from `rank_filepath` with `KendallTau.loadAsDoubles`"""
  in_degrees = get_degrees("in")
  ranks = law.stat.KendallTau.loadAsDoubles(
    rank_filepath, java.lang.Double, False
  )
  return in_degrees, ranks

# Pearson correlation coefficient: off-diagonal entry of the 2x2 matrix
pcc = np.corrcoef(*scatter_data())[0, 1]

plt.scatter(*scatter_data(), marker=".", c="k")

plt.xlabel("indegree")
plt.ylabel("pagerank")
# The trailing semicolon hides the repr of the title object
plt.title(
  r"""{} pagerank vs in-degree
(Pearson CC = {:.3f})""".format(wiki_basename, pcc)
);

## HyperBall
Compute HyperBall on the transposed graph to compute the incoming-distances distribution

In [None]:
# Text file where HyperBall writes the neighbourhood function (`-n` option)
nf_filepath = wiki_basepath + ".nf.txt"
# Value passed to HyperBall's `--log2m` option
hb_nbits = 8

if not os.path.isfile(nf_filepath):
  # The transposed graph is passed as first basename and the original
  # graph as second one
  hyperball_args = [
    "--log2m", str(hb_nbits),
    "--offline",
    "--external",
    "-n", nf_filepath,
    transpose_basepath,
    wiki_basepath,
  ]
  webgraph.algo.HyperBall.main(hyperball_args)
else:
  logging.info("Found neighbourhood function file: %s. Skipping", nf_filepath)

Plot the neighbourhood function estimate

In [None]:
# Neighbourhood function values read from the file written by HyperBall
nf = law.stat.KendallTau.loadAsDoubles(nf_filepath, java.lang.String, False)
# First differences of `nf`; the prepended 0 keeps the same length as `nf`
df = np.diff([0, *nf])

# Right limit of the x axis in both panels
last_distance = len(nf) - 1

# Top panel: neighbourhood function
ax = plt.subplot(211)
ax.plot(nf, c="k")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim(ax.get_xlim()[0], last_distance)
ax.set_ylabel("cumulative frequency (#pairs)")
ax.set_title("neighbourhood function")

# Bottom panel: distance function, sharing the x axis with the top panel
ax_df = plt.subplot(212, sharex=ax)
ax_df.plot(df, c="k")
ax_df.set_xscale("log")
ax_df.set_yscale("log")
ax_df.set_xlim(ax_df.get_xlim()[0], last_distance)
ax_df.set_xlabel("distance")
ax_df.set_ylabel("frequency (#pairs)")
ax_df.set_title("distance function")

fig = plt.gcf()
fig.suptitle("{}\nHyperBall ($log_2m$ = {})".format(wiki_basename, hb_nbits))
# Double the figure height to make room for the two stacked panels
fig_width, fig_height = fig.get_size_inches()
fig.set_size_inches([fig_width, 2 * fig_height]);

Compute statistics

In [None]:
from scipy import stats

# Largest index of `df`
df_n = len(df) - 1

# Last value of the neighbourhood function, used to normalise `df`
total_pairs = nf[-1]

# Discrete random variable over 0..df_n with the normalised `df` as probabilities
df_rv = stats.rv_discrete(values=(np.arange(df_n + 1), df / total_pairs))

# Index of the largest entry of `df` and its share in percent
df_mode = np.argmax(df)
mode_share = 100 * df[df_mode] / total_pairs
print("""Distance
  mode: {} ({:.2f}% of pairs)
  mean: {:.3f}
  std:  {:.3f}""".format(df_mode, mode_share, df_rv.mean(), df_rv.std()))