# Transitions in Centralities
Comparison of centrality transitions in the Spotify graph and in the SGC model

## Configure

Set the dataset base path

In [None]:
# Basename handed to BVGraph when the Spotify graph is loaded further down
spotify_basename = "graphs/spotify-2018"

Set the random graph base path

In [None]:
# Basename used both to open the SGC BVGraph and as the target of sgc.to_bv
sgc_basename = "graphs/sgc"

Set the centralities CSV path

In [None]:
# NOTE(review): this name is not read anywhere in the visible notebook; the
# dataframe cell passes the literal "centralities_transitions.csv" instead.
# Confirm which file name is intended.
csv_path = "./centralities_comparison.csv"

Set `jvm_path` to the full path of your Java virtual machine

In [None]:
# Forwarded to start_jvm(jvm_path=...); None presumably lets featgraph pick a
# default JVM -- TODO confirm
jvm_path = None

Set up logging

In [None]:
import logging
# Root logger: INFO and above, each record prefixed with a timestamp,
# the logger name and the level
logging.basicConfig(
  datefmt="%Y-%m-%d %H:%M:%S",
  format="%(asctime)s %(name)-12s %(levelname)-8s: %(message)s",
  level=logging.INFO,
)

Start the JVM

In [None]:
from featgraph.jwebgraph import start_jvm
# Start the JVM using the path configured above (the jwebgraph calls in the
# following cells presumably need it running -- TODO confirm)
start_jvm(jvm_path=jvm_path)

## Load the Spotify graph

In [None]:
import featgraph.jwebgraph.utils
from featgraph import jwebgraph
import os

# Open the Spotify graph by basename and print its summary
graph = jwebgraph.utils.BVGraph(spotify_basename)
print(graph)

# Stop at the first required file that is not on disk
for suffix in ("graph", "properties", "ids.txt"):
  required_file = graph.path(suffix)
  if os.path.isfile(required_file):
    continue
  raise FileNotFoundError(required_file)

## Sample a graph from the SGC model

In [None]:
from featgraph import sgc, pathutils

# Seed passed to the model call so the sampled graph is repeatable
seed = 42
sgc_model = sgc.SGCModel()
sgc_graph = jwebgraph.utils.BVGraph(sgc_basename)
# Sample and convert only when notisglob returns a truthy value for the
# graph's "*" path pattern (presumably: no file with this basename exists yet,
# with `msg` logged otherwise -- TODO confirm against featgraph.pathutils)
if pathutils.notisglob(sgc_graph.path("*"), msg="Found: %.40s... Skipping"):
  logging.info("Sampling SGC graph")
  sgc_nxgraph = sgc_model(seed=seed)
  logging.info("Converting nxgraph to BVGraph")
  # Write the sampled graph under sgc_basename, the same basename sgc_graph uses
  sgc.to_bv(sgc_nxgraph, sgc_basename)

## Compute subgraphs for different thresholds
Define data structure

In [None]:
from featgraph.sgc import ThresholdComparison

# Comparison object built from both base graphs: SGC first, Spotify second
tc = ThresholdComparison(
  ThresholdComparison.sgc_graph(sgc_graph),
  ThresholdComparison.spotify_graph(graph),
)

# Report the thresholding attribute, then the thresholds on an indented line
print(
  f"Thresholding based on {tc.attribute} at thresholds:",
  "  " + ", ".join(str(th) for th in tc.thresholds),
  sep="\n",
)

Perform thresholding

In [None]:
from tqdm.notebook import tqdm

# Build the thresholded subgraphs; the notebook tqdm is passed in, presumably
# to display progress -- TODO confirm
tc.threshold_graphs(tqdm=tqdm)

Compute centralities

In [None]:
# Compute the centralities through the comparison object, again passing the
# notebook tqdm
tc.compute_centralities(tqdm=tqdm)

Build dataframe

In [None]:
# Build the dataframe read by the plotting cells below.
# NOTE(review): the CSV name is hard-coded here and differs from the
# `csv_path` configured at the top of the notebook -- confirm which is intended.
df = tc.dataframe("centralities_transitions.csv", tqdm=tqdm)
# Bare expression so the notebook displays the dataframe
df

## Plot centrality transitions

In [None]:
# prepare plots
from matplotlib import pyplot as plt
import matplotlib as mpl
import numpy as np
# Colours of the active matplotlib property cycle
_default_cols = mpl.rcParams["axes.prop_cycle"].by_key()["color"]

# The first three cycle colours are used twice: once for the three genre
# labels and once for the three SGC labels, in the same order
_gen_cols = {
  label: _default_cols[position % 3]
  for position, label in enumerate((
    "classical",
    "hip-hop",
    "rock",
    "community leaders",
    "celebrities",
    "masses",
  ))
}

def plot_centrality_comparison(
  centrality_name: str,
  norm = None,
  logy: bool = False,
  save: bool = False,
  aspect: float = 8/9,
  width: float = 12,
  figext: str = "svg",
  gen_cols = _gen_cols,
  std_scale: float = 0.7,
  fill_alpha: float = 0.1,
  median: bool = True,
):
  """Plot a centrality against the popularity threshold, one line per node type.

  Two vertically stacked axes sharing the x axis are drawn: the first for the
  rows of the notebook-level dataframe ``df`` whose "graph" column is
  "spotify-2018", the second for "sgc". Within each axis, every distinct
  "type_value" gets a line (median or mean) and a shaded band around it.

  Args:
    centrality_name: Value matched against the "centrality" column of ``df``.
      Also used as the y-axis label and in the saved file name.
    norm: Optional name of a ``df`` column. When truthy, the line and both
      band edges are divided element-wise by that column.
    logy: If true, use a logarithmic y axis.
    save: If true, save the figure as
      ``compare-<centrality_name>[-norm_<norm>][-median][-semilogy].<figext>``.
    aspect: Together with ``width`` sets the figure size, which is
      ``(width * aspect, width)`` inches.
    width: See ``aspect``.
    figext: Extension of the saved figure file.
    gen_cols: Mapping from "type_value" to line/band colour. Values missing
      from the mapping are drawn in black ("k").
    std_scale: Multiplier applied to the spread when ``median`` is false.
    fill_alpha: Opacity of the shaded band.
    median: If true, plot the "median" column with a band between
      "quartile-1" and "quartile-3". Otherwise plot the "mean" column with a
      symmetric band of half-width ``ks`` (see the note in the body).
  """
  fig, ax = plt.subplots(2, 1, sharex=True)
  for a, graph_name in zip(ax, ("spotify-2018", "sgc")):
    dff = df[
      (df["graph"] == graph_name) & (df["centrality"] == centrality_name)
    ]
    # unique() keeps first-appearance order, so legend and draw order are
    # reproducible (iterating a set of strings is not stable across runs)
    for k in dff["type_value"].unique():
      dffk = dff[dff["type_value"] == k]
      kx = dffk["threshold"].to_numpy()
      # sort every series by threshold so the line is drawn left to right
      idx = np.argsort(kx)
      kx = kx[idx]

      if median:
        kq1 = dffk["quartile-1"].to_numpy()[idx]
        kq2 = dffk["median"].to_numpy()[idx]
        kq3 = dffk["quartile-3"].to_numpy()[idx]
      else:
        kq2 = dffk["mean"].to_numpy()[idx]
        # NOTE(review): the spread is derived from the "mean" column, as in
        # the original code; a standard-deviation column was presumably
        # intended -- confirm against the columns produced by tc.dataframe
        ks = dffk["mean"].to_numpy()[idx] * std_scale
        # the band half-width is the scaled spread, not the bare scale factor
        kq1 = kq2 - ks
        kq3 = kq2 + ks
      if norm:
        kn = dffk[norm].to_numpy()[idx]
        # out-of-place division: in-place "/=" raises on integer-typed columns
        kq1 = kq1 / kn
        kq2 = kq2 / kn
        kq3 = kq3 / kn
      colour = gen_cols.get(k, "k")
      a.plot(
        kx, kq2,
        label=k,
        c=colour,
      )
      a.fill_between(
        kx,
        kq1,
        kq3,
        facecolor=colour,
        alpha=fill_alpha,
      )
    if logy:
      a.set_yscale("log")
    a.legend()
    a.set_title(graph_name)
    a.set_ylabel(centrality_name)
  # the x axis is shared, so only the bottom axes carries the label
  ax[-1].set_xlabel("popularity threshold")
  fig.set_size_inches(width * np.array([aspect, 1]))
  if save:
    # each suffix reflects the option that actually shaped the figure
    suffixes = (
      (f"-norm_{norm}" if norm else "")
      + ("-median" if median else "")
      + ("-semilogy" if logy else "")
    )
    fig.savefig(
      f"compare-{centrality_name}{suffixes}.{figext}",
      bbox_inches="tight",
    )

In [None]:
import seaborn as sns

# Apply seaborn's theme before drawing
sns.set()
# Harmonic centrality divided by the "narcs" column
plot_centrality_comparison("Harmonic Centrality", norm="narcs")

## Plot boxplots

In [None]:
# prepare plots
from IPython.display import SVG, display
import itertools
import pygal

def plot_centrality_boxes(
  c: str,
  t: float,
  basegraph: str = "spotify",
  savefig: bool = False,
  figext: str = "svg"
):
  """Display a pygal box plot of one centrality, with one box per node type.

  The base graph whose ``label`` equals ``basegraph`` is looked up in
  ``tc.basegraphs`` and its subgraph at threshold ``t`` is taken from
  ``tc.subgraph``. For each of the base graph's ``type_values`` a box is
  added with the centrality values of the nodes selected by
  ``check_fn(type_value)`` applied to the node attribute named ``type_key``.

  Args:
    c: Name of a no-argument method of the subgraph returning the per-node
      centrality values (e.g. "harmonicc"). Also used in the plot title and
      in the saved file name.
    t: Popularity threshold selecting the subgraph. It is shown in the title
      rounded to zero decimals.
    basegraph: Label of the base graph to plot.
    savefig: If true, also write the plot to
      ``boxplot-<basegraph>-<c>.<figext>``. The threshold is not part of the
      file name, so plots for different thresholds share one file.
    figext: Extension of the saved file. NOTE(review): the file is written
      with pygal's ``render_to_file``, which presumably emits SVG regardless
      of the extension -- confirm.

  Raises:
    ValueError: If no base graph in ``tc.basegraphs`` has the given label.
  """
  basegraphs = list(tc.basegraphs)
  baseg = next((g for g in basegraphs if g.label == basegraph), None)
  if baseg is None:
    # a bare next() would surface as a message-less StopIteration
    raise ValueError(
      f"Unknown base graph {basegraph!r}; available labels: "
      + ", ".join(repr(g.label) for g in basegraphs)
    )
  gen_k = baseg.type_key
  gen_vals = baseg.type_values
  check_fn = baseg.check_fn

  g = tc.subgraph(baseg, t)

  box_plot = pygal.Box()
  box_plot.title = f"{c}\npopularity > {t:.0f}\n{basegraph}"

  for gen in gen_vals:
    # keep only the centrality values of the nodes whose type attribute
    # satisfies the check for this type value
    box_plot.add(gen, np.array(list(itertools.compress(
      getattr(g, c)(),
      map(check_fn(gen), getattr(g, gen_k)())
    ))))

  display(SVG(box_plot.render()))
  if savefig:
    box_plot.render_to_file(
      f"boxplot-{basegraph}-{c}.{figext}"
    )

In [None]:
# Harmonic centrality boxes for the Spotify graph at popularity threshold 50
plot_centrality_boxes(c="harmonicc", t=50, basegraph="spotify-2018")