# Configure

Set the dataset base path

In [None]:
# Basename of the Spotify dataset; handed to BVGraph when the graph is loaded
spotify_basename = "graphs/spotify-2018"

Set `jvm_path` to the full path of your Java virtual machine

In [None]:
# Forwarded to start_jvm(jvm_path=...) in the "Start jvm" cell.
# NOTE(review): None presumably lets featgraph locate the JVM by itself --
# confirm against the start_jvm documentation
jvm_path = None

Set up logging

In [None]:
import logging
# Record layout: timestamp, logger name and level padded into columns, message
LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s: %(message)s'
LOG_DATEFMT = '%Y-%m-%d %H:%M:%S'

logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATEFMT)

Start jvm

In [None]:
from featgraph.jwebgraph import start_jvm
# Start the JVM through featgraph's helper, using the path set in the configuration cell
start_jvm(jvm_path=jvm_path)

# Working on Spotify
## Load graph

In [None]:
import featgraph.jwebgraph.utils
from featgraph import jwebgraph
import os

graph = jwebgraph.utils.BVGraph(spotify_basename)
print(graph)

# Stop at the first dataset file that is not on disk
for suffix in ("graph", "properties", "ids.txt"):
  required_file = graph.path(suffix)
  if not os.path.isfile(required_file):
    raise FileNotFoundError(required_file)

Reconstruct offsets

In [None]:
# Rebuild the offsets of the loaded graph.
# NOTE(review): presumably writes an offsets file next to the graph files -- confirm
graph.reconstruct_offsets()

Load and check number of nodes and arcs

In [None]:
from featgraph.misc import pretty_print_int

# Gather the three fields first, then fill the report template in one go
graph_label = graph.basename
node_count = pretty_print_int(graph.numNodes())
arc_count = pretty_print_int(graph.numArcs())
size_report = "Graph '{}' has\n{:>11} nodes\n{:>11} arcs".format(
  graph_label, node_count, arc_count,
)
print(size_report)

## Degree correlation
Compute degree files

In [None]:
# Run the degree computation on the graph.
# NOTE(review): presumably produces what graph.outdegrees / graph.indegrees
# read in the scatter plots below -- confirm
graph.compute_degrees()

Prepare the reference artists to highlight in the scatter plots

In [None]:
from matplotlib import pyplot as plt
import featgraph.plots

# Artists that are highlighted by name on every scatter plot
refnames = (
  "Ludwig van Beethoven",
  "Robert Farnon Orchestra",
  "Ed Sheeran",
  "Rick Ross",
  "Vulfpeck",
)
# Node index of each reference artist, in the same order as refnames
refidx = tuple(graph.artist(name=artist_name).index for artist_name in refnames)

def scatter_refs(x, y, **kwargs):
  """Draw one labelled point per reference artist and show the legend

  Args:
    x: Values for the horizontal axis, indexable by node index, or a
      zero-argument callable that returns them
    y: Values for the vertical axis, indexable by node index, or a
      zero-argument callable that returns them
    **kwargs: Forwarded to every :func:`matplotlib.pyplot.scatter` call"""
  x_values = x() if callable(x) else x
  ref_x = [x_values[node] for node in refidx]
  y_values = y() if callable(y) else y
  ref_y = [y_values[node] for node in refidx]
  # One scatter call per artist so that each one gets its own legend entry
  for artist_name, point_x, point_y in zip(refnames, ref_x, ref_y):
    plt.scatter(point_x, point_y, label=artist_name, **kwargs)
  plt.legend()

Degree scatterplot

In [None]:
# Look the two degree attributes up once and reuse them for both layers
degree_axes = (graph.outdegrees, graph.indegrees)

# Full point cloud first, reference artists on top
featgraph.plots.scatter(
  *degree_axes,
  marker=".", c="k", alpha=2**-5,
  label=graph.basename, xlabel="out-degree", ylabel="in-degree",
)
scatter_refs(*degree_axes)
plt.show()

## PageRank
Compute transpose

In [None]:
# Build the transpose of the graph.
# NOTE(review): the HyperBall section states it runs on the transposed graph;
# confirm whether PageRank needs the transpose as well
graph.compute_transpose()

Compute PageRank

In [None]:
# No argument is given here; the alpha sweep further down passes explicit values.
# NOTE(review): presumably this uses the library's default alpha -- confirm
graph.compute_pagerank()

Plot pagerank against indegree

In [None]:
# Look the two attributes up once and reuse them for both layers
indegree_pagerank_axes = (graph.indegrees, graph.pagerank)

# Full point cloud first, reference artists on top
featgraph.plots.scatter(
  *indegree_pagerank_axes,
  marker=".", c="k", alpha=2**-5,
  label=graph.basename, xlabel="in-degree", ylabel="pagerank",
)
scatter_refs(*indegree_pagerank_axes)
plt.show()

## HyperBall
Compute HyperBall on the transposed graph to estimate the distribution of incoming distances

In [None]:
# Run the neighbourhood-function computation.
# NOTE(review): presumably produces what graph.neighborhood() and
# graph.distances() return in the cells below -- confirm
graph.compute_neighborhood()

Plot the neighbourhood function estimate

In [None]:
# Top panel: neighbourhood function
ax = plt.subplot(211)
ax.plot(graph.neighborhood(), c="k")
ax.set_ylabel("cumulative frequency (#pairs)")
ax.set_title("neighborhood function")

# Bottom panel: distance function, sharing its x axis with the top panel
distance_ax = plt.subplot(212, sharex=ax)
distance_ax.plot(graph.distances(), c="k")
distance_ax.set_xlabel("distance")
distance_ax.set_ylabel("frequency (#pairs)")
distance_ax.set_title("distance function")

fig = plt.gcf()
fig.suptitle("{}\nHyperBall ($log_2m$ = 8)".format(graph.basename))
# Keep the current width and double the current height of the figure
width, height = fig.get_size_inches()
fig.set_size_inches([width, 2 * height])
plt.show()

Compute statistics

In [None]:
from scipy import stats
import numpy as np

nf = graph.neighborhood()
df = graph.distances()

# The last value of the neighbourhood function is the normaliser that turns
# the distance counts into the probabilities of a discrete random variable
total_pairs = nf[-1]
distance_values = np.arange(len(df))
df_rv = stats.rv_discrete(values=(distance_values, df / total_pairs))

df_mode = np.argmax(df)
mode_share = 100 * df[df_mode] / total_pairs
report_template = """Distance
  mode: {} ({:.2f}% of pairs)
  mean: {:.3f}
  std:  {:.3f}
  max:  {}"""
print(report_template.format(
  df_mode, mode_share, df_rv.mean(), df_rv.std(), len(df) - 1,
))

## Harmonic Centrality

In [None]:
# Run the harmonic centrality computation.
# NOTE(review): presumably produces what graph.harmonicc reads in the cells below -- confirm
graph.compute_harmonicc()

In [None]:
# Look the two attributes up once and reuse them for both layers
pagerank_harmonic_axes = (graph.pagerank, graph.harmonicc)

# Full point cloud first (log-scaled pagerank axis), reference artists on top
featgraph.plots.scatter(
  *pagerank_harmonic_axes,
  xscale="log",
  marker=".", c="k", alpha=2**-5,
  label=graph.basename, xlabel="pagerank", ylabel="harmonic centrality",
)
scatter_refs(*pagerank_harmonic_axes)
plt.show()

## PageRank changing $\alpha$
Compute PageRank for different values of $\alpha$

In [None]:
from tqdm.notebook import tqdm

# Alpha grid: starts at da and stops before 1 (endpoint excluded)
da = 0.1
alphas = np.linspace(da, 1, int(1/da - 1), endpoint=False)

# One Kendall tau (PageRank at that alpha vs harmonic centrality) per alpha
kt_hc_ranks_a = np.zeros(alphas.shape)
progress = tqdm(alphas)
for slot, alpha in enumerate(progress):
  graph.compute_pagerank(alpha)
  kt_hc_ranks_a[slot] = jwebgraph.utils.kendall_tau(
    graph.pagerank(alpha), graph.harmonicc
  )

In [None]:
# Build the two-line title up front: graph basename, then the plot description
correlation_title = "{}\nCorrelation between Harmonic Centrality and PageRank".format(
  graph.basename
)

plt.plot(alphas, kt_hc_ranks_a, c="k")
plt.title(correlation_title)
plt.xlabel(r"PageRank $\alpha$")
plt.ylabel(r"Kendall $\tau$")
# Clip the x axis to the first and last alpha
plt.xlim(alphas[0], alphas[-1])
plt.show()

The 10 nodes that have the largest PageRank at $\alpha=0.90$

In [None]:
# Numbered list, starting at 1; artists without a name print as an empty string
top_by_pagerank = graph.best(10, graph.pagerank(alpha=0.90))
for position, artist in enumerate(top_by_pagerank, start=1):
  print("{:>2}) {}".format(position, artist.name or ""))

The 10 nodes that have the largest Harmonic Centrality

In [None]:
# Numbered list, starting at 1; artists without a name print as an empty string
top_by_harmonic = graph.best(10, graph.harmonicc)
for position, artist in enumerate(top_by_harmonic, start=1):
  print("{:>2}) {}".format(position, artist.name or ""))

Jaccard coefficient between the top-10

In [None]:
# Compare the two top-10 selections and report the Jaccard index as a percentage
top10_pagerank = graph.best(10, graph.pagerank(alpha=0.90))
top10_harmonic = graph.best(10, graph.harmonicc)
top10_jaccard = jwebgraph.utils.jaccard(top10_pagerank, top10_harmonic)
print("Jaccard index: {:.2f}%".format(100 * top10_jaccard))

Jaccard coefficient between the top-100

In [None]:
# Compare the two top-100 selections and report the Jaccard index as a percentage
top100_pagerank = graph.best(100, graph.pagerank(alpha=0.90))
top100_harmonic = graph.best(100, graph.harmonicc)
top100_jaccard = jwebgraph.utils.jaccard(top100_pagerank, top100_harmonic)
print("Jaccard index: {:.2f}%".format(100 * top100_jaccard))

## Bonus: plot popularity vs graph analytics

In [None]:
# Placeholder plotted for nodes whose popularity line is empty
missing_value = -20

def popularity(missing_value=str(missing_value)):
  """Read the popularity values from the graph's popularity text file

  Args:
    missing_value (str): Text parsed in place of an empty line. Defaults
      to the string form of the module-level ``missing_value``

  Returns:
    list of float: One value per line of the file, in file order"""
  values = []
  with open(graph.path("popularity", "txt"), "r") as f:
    for line in f:
      text = line.rstrip("\n")
      values.append(float(text if text else missing_value))
  return values

# y-axis ticks: the placeholder first (labelled "no data"), then 0 to 100 in steps of 20
tick_positions = np.arange(6) * 20
popularity_ticks = (
  [missing_value, *tick_positions],
  ["no data", *("{:.0f}".format(position) for position in tick_positions)],
)

In [None]:
# Full point cloud first (log-scaled pagerank axis), reference artists on top
pagerank_popularity_axes = (graph.pagerank, popularity)
featgraph.plots.scatter(
  *pagerank_popularity_axes,
  kendall_tau=False,
  xscale="log",
  marker=".", c="k", alpha=2**-5,
  label=graph.basename, xlabel="pagerank", ylabel="popularity",
)
scatter_refs(*pagerank_popularity_axes)

# Apply the popularity ticks, including the "no data" placeholder
tick_locations, tick_labels = popularity_ticks
plt.yticks(tick_locations, tick_labels)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Full point cloud first, reference artists on top
harmonic_popularity_axes = (graph.harmonicc, popularity)
featgraph.plots.scatter(
  *harmonic_popularity_axes,
  kendall_tau=False,
  marker=".", c="k", alpha=2**-5,
  label=graph.basename, xlabel="harmonic centrality", ylabel="popularity",
)
scatter_refs(*harmonic_popularity_axes)

# Apply the popularity ticks, including the "no data" placeholder
tick_locations, tick_labels = popularity_ticks
plt.yticks(tick_locations, tick_labels)
plt.legend(loc="upper left")
plt.show()