# Configure
Set `jvm_path` to the full path of your Java virtual machine (JVM)

In [None]:
# Full path of the Java virtual machine; forwarded to `start_jvm` further down.
# NOTE(review): `None` presumably lets `start_jvm` pick a default JVM — confirm.
jvm_path = None

Set up logging

In [None]:
import logging
# Root logger: INFO threshold, timestamped records with aligned
# logger-name and level columns
_logging_options = dict(
  level=logging.INFO,
  format='%(asctime)s %(name)-12s %(levelname)-8s: %(message)s',
  datefmt='%Y-%m-%d %H:%M:%S',
)
logging.basicConfig(**_logging_options)

Install package and dependencies

In [None]:
import os
import subprocess
import sys

# Switch to True to (re)install the package and the notebook requirements
do_install = False
# Location of the package root, relative to the working directory
package_root = ".."

if do_install:
  # Commands are argument lists executed without a shell, so an interpreter
  # or package path containing spaces is passed to pip verbatim
  for cmd in (
    [
      sys.executable, "-m", "pip", "install", "-U", package_root,
      "--use-feature=in-tree-build",
    ],
    [
      sys.executable, "-m", "pip", "install", "-Ur",
      os.path.join(package_root, "notebooks-requirements.txt"),
    ],
  ):
    logging.info(" ".join(cmd))
    # Raises CalledProcessError if pip exits with a non-zero status, rather
    # than letting the notebook continue with a broken installation
    subprocess.check_call(cmd)

Start jvm

In [None]:
from featgraph.jwebgraph import start_jvm
# Start the JVM through featgraph's helper, using the path configured above
start_jvm(jvm_path=jvm_path)

Import the java library

In [None]:
from it.unimi.dsi import webgraph

# Working on Wikipedia
## Download graph

In [None]:
from featgraph import jwebgraph

# Local folder that receives the downloaded graph files
dest_dir = "graphs"
wiki_basename = "enwiki-2013"
# Common path prefix (no extension) of every file derived from the graph
wiki_basepath = os.path.join(dest_dir, wiki_basename)

os.makedirs(dest_dir, exist_ok=True)

# Local file name -> remote URL
urls = dict([
  (
    wiki_basename + ".graph",
    "http://data.law.di.unimi.it/webdata/enwiki-2013/enwiki-2013.graph",
  ),
  (
    wiki_basename + ".properties",
    "http://data.law.di.unimi.it/webdata/enwiki-2013/enwiki-2013.properties",
  ),
])

jwebgraph.download_dependencies(deps=urls, root=dest_dir)

Reconstruct offsets

In [None]:
def notisfile(filepath: str, func=os.path.isfile, log=True):
  """Tell whether the work that produces `filepath` still has to be done.

  Args:
    filepath: Value handed to `func` and shown in the log message.
      The notebook also passes lists of paths (with `func=len`).
    func: Either a boolean that is used directly as the "found" flag,
      or a callable applied to `filepath` whose result is taken as
      that flag (by truthiness).
    log: When true, an INFO record is emitted if the target was found.

  Returns:
    `True` if the target was NOT found, `False` otherwise."""
  found = func if isinstance(func, bool) else func(filepath)
  if found and log:
    logging.info("Found '%s'. Skipping", filepath)
  return not found

# Run BVGraph with "-O" on the graph basename only when the
# ".offsets" file is not on disk yet
offsets_path = wiki_basepath + ".offsets"
offsets_missing = notisfile(offsets_path)
if offsets_missing:
  webgraph.BVGraph.main(["-O", wiki_basepath])

Load and check number of nodes and arcs

In [None]:
# Load the graph stored under `wiki_basepath` as a WebGraph BVGraph
graph = webgraph.BVGraph.load(wiki_basepath)

def pretty_print_int(n: int, k: int = 3) -> str:
  """Format an integer with its digits in space-separated groups.

  Groups are formed from the least significant digit, so only the
  leading group may be shorter than `k` (4206785 -> "4 206 785").

  Args:
    n: Integer to format. A minus sign is kept in front of the first
      group and never counted as a digit.
    k: Number of digits per group. Values below 1 disable grouping.

  Returns:
    The formatted string."""
  sign = "-" if n < 0 else ""
  digits = str(abs(n))
  if k < 1:
    return sign + digits
  # Length of the leading group: the remainder, or a full group
  # when the digit count is a multiple of k
  head = len(digits) % k or k
  groups = [digits[:head]]
  groups.extend(digits[i:i + k] for i in range(head, len(digits), k))
  return sign + " ".join(groups)

# Reference sizes used to check correctness of the loaded graph
expected_nnodes = 4206785
expected_narcs = 101355853

nnodes, narcs = graph.numNodes(), graph.numArcs()

# One summary line per quantity, counts right-aligned on 13 columns
summary_lines = (
  "Graph '{}' has".format(wiki_basename),
  "{:>13} nodes".format(pretty_print_int(nnodes)),
  "{:>13} arcs".format(pretty_print_int(narcs)),
)
print("\n".join(summary_lines))

# check correctness
assert nnodes == expected_nnodes
assert narcs == expected_narcs

## Degree correlation
Compute degree files

In [None]:
import glob

stats_basename = wiki_basepath + ".stats"

# Any existing file whose name starts with the stats basename is taken
# as evidence of a previous run, in which case Stats is skipped
stats_files = glob.glob(stats_basename + "*")
if notisfile(stats_files, len):
  stats_args = ["--save-degrees", wiki_basepath, stats_basename]
  webgraph.Stats.main(stats_args)

Compute Kendall's $\tau$

In [None]:
from it.unimi.dsi import law
import java.lang

def get_degrees(s: str, prefix: str = stats_basename):
  """Load the file `<prefix>.<s>degrees` through LAW's `loadAsDoubles`.

  Args:
    s: Which degrees to load, either "in" or "out" (asserted).
    prefix: Basename of the stats files. The default is the value
      `stats_basename` had when this function was defined.

  Returns:
    Whatever `law.stat.KendallTau.loadAsDoubles` returns for that file."""
  assert s in ("in", "out")
  degrees_path = prefix + "." + s + "degrees"
  # NOTE(review): java.lang.String presumably selects the text input
  # format and False the non-reversed order — confirm against LAW's Javadoc
  return law.stat.KendallTau.loadAsDoubles(
    degrees_path, java.lang.String, False
  )

# Kendall's tau between the in-degree and out-degree sequences
in_degrees = get_degrees("in")
out_degrees = get_degrees("out")
kendall_tau = law.stat.KendallTau.INSTANCE.compute(in_degrees, out_degrees)
print("KendallTau:", kendall_tau)

Degree scatterplot

In [None]:
from matplotlib import pyplot as plt
import functools

# Scale name -> (use a log x axis, use a log y axis)
logplots_dict = dict(
  lin=(False, False),
  loglog=(True, True),
  semilogx=(True, False),
  semilogy=(False, True),
)

def kt_helper(xfunc, yfunc):
  """Compute Kendall's tau between two lazily loaded score vectors.

  Args:
    xfunc: Zero-argument callable returning the first vector.
    yfunc: Zero-argument callable returning the second vector.

  Returns:
    The result of `law.stat.KendallTau.INSTANCE.compute` on the two vectors."""
  xs = xfunc()
  ys = yfunc()
  return law.stat.KendallTau.INSTANCE.compute(xs, ys)

def scatter_helper(
  xfunc, yfunc,
  scale="lin",
  compute_kt=True,
  ax=None,
  xlabel=None,
  ylabel=None,
  label=wiki_basename,
  marker=".", c="k", **kwargs
):
  """Scatter-plot two lazily loaded series and give the axes a title.

  Args:
    xfunc: Zero-argument callable returning the x values.
    yfunc: Zero-argument callable returning the y values.
    scale: Key of `logplots_dict` choosing which axes are log-scaled.
    compute_kt: If true, Kendall's tau between the two series is
      computed (via `kt_helper`) and appended to the title.
    ax: Axes to draw on. Defaults to the current axes (`plt.gca()`).
    xlabel: Label of the x axis, also used in the title. Optional.
    ylabel: Label of the y axis, also used in the title. Optional.
    label: First word(s) of the title. The default is the value
      `wiki_basename` had when this function was defined.
    marker: Forwarded to `ax.scatter`.
    c: Forwarded to `ax.scatter`.
    **kwargs: Extra keyword arguments forwarded to `ax.scatter`."""
  if ax is None:
    ax = plt.gca()

  # Call each loader exactly once and reuse the data for both the plot and
  # the correlation, instead of reading the same files from disk twice
  x, y = xfunc(), yfunc()
  ax.scatter(x, y, marker=marker, c=c, **kwargs)

  log_x, log_y = logplots_dict[scale]
  if log_x:
    ax.set_xscale("log")
  if log_y:
    ax.set_yscale("log")

  kt = kt_helper(lambda: x, lambda: y) if compute_kt else None

  # First title line: "<label> <ylabel> vs <xlabel>", skipping missing parts
  tit_li = []
  if label is not None:
    tit_li.append(label)
  if ylabel is not None:
    ax.set_ylabel(ylabel)
    tit_li.append(ylabel)
  if xlabel is not None:
    ax.set_xlabel(xlabel)
    if ylabel is not None:
      tit_li.append("vs")
    tit_li.append(xlabel)
  if len(tit_li):
    tit_li = [" ".join(tit_li)]
  # Second title line: the correlation, when it was requested
  if kt is not None:
    tit_li.append(r"(Kendall $\tau$ = {:.5f})".format(kt))
  if len(tit_li):
    ax.set_title("\n".join(tit_li))

# In-degree against out-degree on log-log axes; the very low opacity
# keeps dense regions of the cloud readable
scatter_helper(
  xfunc=functools.partial(get_degrees, "out"),
  yfunc=functools.partial(get_degrees, "in"),
  scale="loglog",
  xlabel="out-degree",
  ylabel="in-degree",
  alpha=2 ** -7,
)

## PageRank
Compute transpose

In [None]:
transpose_basepath = wiki_basepath + ".transpose"

# Skip the transposition if any file with the transpose basename exists
transpose_files = glob.glob(transpose_basepath + "*")
if notisfile(transpose_files, len):
  transform_args = ["transposeOffline", wiki_basepath, transpose_basepath]
  webgraph.Transform.main(transform_args)

Compute PageRank

In [None]:
def rank_path(alpha=0.85, fp=True):
  """Build the path of the PageRank output for a given damping factor.

  Args:
    alpha: PageRank alpha. Only its first two decimals end up in the
      name (0.85 -> "pagerank-85"), so values sharing them map to the
      same path.
    fp: If true, return the path of the ranks file (basename plus
      ".ranks"); otherwise return the basename only.

  Returns:
    The requested path, rooted at `wiki_basepath`."""
  # "0.85" -> "85": drop the leading "0." of the two-decimal rendering
  alpha_str = "{:.2f}".format(alpha)[2:]
  parts = [wiki_basepath, "pagerank-" + alpha_str]
  if fp:
    parts.append("ranks")
  return ".".join(parts)

def compute_pagerank(alpha=0.85):
  """Run PageRank for `alpha` unless its ranks file is already on disk.

  The computation is delegated to LAW's `PageRankParallelGaussSeidel`
  command-line entry point, fed with the transposed graph basename and
  the output basename given by `rank_path(alpha, False)`.

  Args:
    alpha: PageRank alpha, passed as `--alpha`."""
  ranks_file = rank_path(alpha, True)
  if not notisfile(ranks_file):
    return
  law.rank.PageRankParallelGaussSeidel.main([
    "--alpha", str(alpha),
    transpose_basepath, rank_path(alpha, False),
  ])

compute_pagerank()

Plot pagerank against indegree

In [None]:
# Loader for the PageRank scores computed with the default alpha
load_pagerank = functools.partial(
  law.stat.KendallTau.loadAsDoubles,
  rank_path(), java.lang.Double, False
)

scatter_helper(
  xfunc=functools.partial(get_degrees, "in"),
  yfunc=load_pagerank,
  xlabel="in-degree",
  ylabel="pagerank",
)

## HyperBall
Run HyperBall on the transposed graph to compute the incoming-distances distribution

In [None]:
# Value passed to HyperBall's "--log2m" option
hb_nbits = 8
# Output file of the neighbourhood function ("-n")
nf_filepath = wiki_basepath + ".nf.txt"

if notisfile(nf_filepath):
  hyperball_nf_args = [
    "--log2m", str(hb_nbits),
    "--offline", "--external",
    "-n", nf_filepath,
    transpose_basepath, wiki_basepath,
  ]
  webgraph.algo.HyperBall.main(hyperball_nf_args)

Plot the neighbourhood function estimate

In [None]:
import numpy as np

nf = law.stat.KendallTau.loadAsDoubles(nf_filepath, java.lang.String, False)
# First differences of the cumulative counts, with an implicit 0 in front
df = np.diff([0, *nf])
last_distance = len(nf) - 1

# Top panel: cumulative counts
ax = plt.subplot(211)
ax.plot(nf, c="k")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_xlim(ax.get_xlim()[0], last_distance)
ax.set_ylabel("cumulative frequency (#pairs)")
ax.set_title("neighbourhood function")

# Bottom panel: per-distance counts, sharing the x axis with the top one
ax_df = plt.subplot(212, sharex=ax)
ax_df.plot(df, c="k")
ax_df.set_xscale("log")
ax_df.set_yscale("log")
ax_df.set_xlim(ax_df.get_xlim()[0], last_distance)
ax_df.set_xlabel("distance")
ax_df.set_ylabel("frequency (#pairs)")
ax_df.set_title("distance function")

fig = plt.gcf()
fig.suptitle("{}\nHyperBall ($log_2m$ = {})".format(
  wiki_basename, hb_nbits,
))
# Keep the width and double the height to make room for the two panels
fig_width, fig_height = fig.get_size_inches()
fig.set_size_inches([fig_width, 2*fig_height]);

Compute statistics

In [None]:
from scipy import stats

df_n = len(df) - 1
# Last value of the cumulative neighbourhood function, used to normalise
total_pairs = nf[-1]

# Discrete random variable over the distances 0..df_n, weighted by the
# normalised per-distance counts
df_rv = stats.rv_discrete(values=(
  np.arange(df_n + 1),
  df / total_pairs,
))

df_mode = np.argmax(df)
report_template = "\n".join((
  "Distance",
  "  mode: {} ({:.2f}% of pairs)",
  "  mean: {:.3f}",
  "  std:  {:.3f}",
))
print(report_template.format(
  df_mode, 100 * df[df_mode] / total_pairs,
  df_rv.mean(),
  df_rv.std(),
))

## Harmonic Centrality

In [None]:
# Value passed to HyperBall's "--log2m" option
hb_nbits = 8
hc_filepath = wiki_basepath + ".hc.ranks"

if notisfile(hc_filepath):
  # NOTE(review): "-h" presumably selects the harmonic-centrality output
  # file — confirm against HyperBall's command-line help
  hyperball_hc_args = [
    "--log2m", str(hb_nbits),
    "--offline", "--external",
    "-h", hc_filepath,
    transpose_basepath, wiki_basepath,
  ]
  webgraph.algo.HyperBall.main(hyperball_hc_args)

In [None]:
# Loaders for the two rankings being compared
load_pagerank = functools.partial(
  law.stat.KendallTau.loadAsDoubles,
  rank_path(), java.lang.Double, False
)
load_harmonic = functools.partial(
  law.stat.KendallTau.loadAsDoubles,
  hc_filepath, java.lang.Float, False
)

scatter_helper(
  xfunc=load_pagerank,
  yfunc=load_harmonic,
  scale="semilogx",
  xlabel="pagerank",
  ylabel="harmonic centrality",
)

## PageRank changing $\alpha$
Compute PageRank for different values of $\alpha$

In [None]:
# Step between consecutive alphas; the grid starts at `da` and stops before 1
da = 0.1
alphas = np.linspace(da, 1, int(1/da - 1), endpoint=False)

# Kendall's tau between PageRank and harmonic centrality, one per alpha
kt_hc_ranks_a = np.zeros(len(alphas))
for i, a in enumerate(alphas):
  compute_pagerank(a)
  load_pagerank_a = functools.partial(
    law.stat.KendallTau.loadAsDoubles,
    rank_path(a), java.lang.Double, False
  )
  load_harmonic = functools.partial(
    law.stat.KendallTau.loadAsDoubles,
    hc_filepath, java.lang.Float, False
  )
  kt_hc_ranks_a[i] = kt_helper(load_pagerank_a, load_harmonic)

In [None]:
# Clamp the x axis to the first and last alpha of the sweep
alpha_first, alpha_last = alphas[[0, -1]]

plt.plot(alphas, kt_hc_ranks_a, c="k")
plt.xlim(alpha_first, alpha_last)
plt.xlabel(r"PageRank $\alpha$")
plt.ylabel(r"Kendall $\tau$")
plt.title("Correlation between Harmonic Centrality and PageRank");

The 10 nodes that have the largest PageRank at $\alpha=0.90$

In [None]:
def best_ranked_pr(n, alpha=.90):
  """Return the indices of the `n` nodes with the largest PageRank.

  Args:
    n: How many nodes to return; expected to be non-negative.
    alpha: PageRank alpha whose ranks file (see `rank_path`) is loaded.

  Returns:
    Array of node indices, ordered from the highest score downwards.
    Empty when `n` is 0."""
  ranks = law.stat.KendallTau.loadAsDoubles(
    rank_path(alpha), java.lang.Double, False
  )
  # Reverse the ascending order first and then take the head: slicing with
  # `[-n:]` would return every node for n == 0, since `-0` is just 0
  return np.argsort(ranks)[::-1][:n]

best_ranked_pr(10)

The 10 nodes that have the largest Harmonic Centrality

In [None]:
def best_ranked_hc(n: int):
  """Return the indices of the `n` nodes with the largest harmonic centrality.

  Scores are read from `hc_filepath`.

  Args:
    n: How many nodes to return; expected to be non-negative.

  Returns:
    Array of node indices, ordered from the highest score downwards.
    Empty when `n` is 0."""
  scores = law.stat.KendallTau.loadAsDoubles(
    hc_filepath, java.lang.Float, False
  )
  # Reverse the ascending order first and then take the head: slicing with
  # `[-n:]` would return every node for n == 0, since `-0` is just 0
  return np.argsort(scores)[::-1][:n]

best_ranked_hc(10)

Jaccard coefficient between the top-10

In [None]:
def jaccard(a, b):
  """Jaccard index |A ∩ B| / |A ∪ B| of two collections, seen as sets.

  Duplicated elements are counted once, so the result always lies
  in [0, 1].

  Args:
    a: Iterable of hashable items.
    b: Iterable of hashable items.

  Returns:
    The Jaccard index as a float. Two empty collections are treated as
    identical and yield 1.0 instead of a division by zero."""
  set_a, set_b = set(a), set(b)
  union_size = len(set_a | set_b)
  if union_size == 0:
    return 1.0
  return len(set_a & set_b) / union_size

# Overlap between the two top-10 lists, as a percentage
top10_jaccard = jaccard(best_ranked_pr(10), best_ranked_hc(10))
print("Jaccard index: {:.2f}%".format(100*top10_jaccard))

Jaccard coefficient between the top-100

In [None]:
# Overlap between the two top-100 lists, as a percentage
top100_jaccard = jaccard(best_ranked_pr(100), best_ranked_hc(100))
print("Jaccard index: {:.2f}%".format(100*top100_jaccard))