## 1. Setup

In [148]:
import tensorflow as tf
import numpy as np
import gpflow
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, clear_output
import networkx as nx
from gpflow.utilities import print_summary
import tensorflow_probability as tfp
import seaborn as sns
import math
from tqdm import tqdm

In [149]:
import sys
import os
# Put the directory two levels above the working directory on the import path
# (presumably where `efficient_graph_gp` and `utils` live — verify for your checkout).
project_root = os.path.abspath("../..")
# Guard the append so re-running this cell in the same kernel does not
# accumulate duplicate entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [150]:
from efficient_graph_gp.graph_kernels import diffusion_kernel, get_normalized_laplacian, generate_noisy_samples
from efficient_graph_gp.gpflow_kernels import GraphDiffusionKernel, GraphDiffusionFastGRFKernel, GraphDiffusionPoFMKernel, GraphGeneralPoFMKernel
from utils import plot_network_graph, plot_gp_fit, compute_fro

In [151]:
def generate_random_graph_by_degree(num_nodes, average_degree, seed=42):
    """Sample an undirected Erdos-Renyi graph with a target average degree.

    Args:
        num_nodes: Number of nodes in the graph.
        average_degree: Desired expected degree; converted to the edge
            probability ``average_degree / (num_nodes - 1)``.
        seed: Seed forwarded to ``nx.erdos_renyi_graph``.

    Returns:
        The adjacency matrix produced by ``nx.to_numpy_array``.
    """
    # A single node has no possible neighbours, so the edge probability is
    # irrelevant; using 0.0 avoids the division by zero at num_nodes == 1.
    if num_nodes > 1:
        probability = average_degree / (num_nodes - 1)
    else:
        probability = 0.0
    G = nx.erdos_renyi_graph(num_nodes, probability, seed=seed, directed=False)
    return nx.to_numpy_array(G)

def gp_inference(X, Y, X_new, graph_kernel):
    """Fit a zero-mean GPR with `graph_kernel` and predict at `X_new`.

    All trainable variables of the model are optimized with GPflow's Scipy
    optimizer against the model's training loss.

    Returns:
        (model, mean, stddev): the fitted model, the predictive mean from
        ``predict_f`` and the square root of its predictive variance.
    """
    gpr = gpflow.models.GPR(data=(X, Y), kernel=graph_kernel, mean_function=None)
    # Optional prior on the noise variance, left disabled:
    # gpr.likelihood.variance.prior = tfp.distributions.LogNormal(loc=np.log(0.07), scale=0.5)
    optimizer = gpflow.optimizers.Scipy()
    optimizer.minimize(gpr.training_loss, gpr.trainable_variables)
    predictive_mean, predictive_var = gpr.predict_f(X_new)
    return gpr, predictive_mean, tf.sqrt(predictive_var)

def gp_inference_fixed_noise(X, Y, X_new, graph_kernel, noise_variance):
    """Fit a zero-mean GPR whose likelihood variance is held at `noise_variance`.

    The likelihood variance is assigned and marked non-trainable before the
    remaining trainable variables are optimized with GPflow's Scipy optimizer.

    Returns:
        (model, mean, stddev): the fitted model, the predictive mean from
        ``predict_f`` and the square root of its predictive variance.
    """
    gpr = gpflow.models.GPR(data=(X, Y), kernel=graph_kernel, mean_function=None)

    # Pin the observation noise so that only kernel parameters are learned.
    noise_param = gpr.likelihood.variance
    noise_param.assign(noise_variance)
    gpflow.utilities.set_trainable(noise_param, False)

    optimizer = gpflow.optimizers.Scipy()
    optimizer.minimize(gpr.training_loss, gpr.trainable_variables)

    predictive_mean, predictive_var = gpr.predict_f(X_new)
    return gpr, predictive_mean, tf.sqrt(predictive_var)

def lengthscale2modulator(beta, max_expansion):
    """Return the first `max_expansion` coefficients (-beta/2)^i / i!.

    These are the power-series coefficients of exp(-beta/2 * x) for
    i = 0, ..., max_expansion - 1, returned as a NumPy array.
    """
    base = -beta / 2
    coefficients = []
    for order in range(max_expansion):
        coefficients.append(base ** order / math.factorial(order))
    return np.array(coefficients)

In [152]:
def gp_log_marginal_likelihood_terms(y, K, sigma_n):
    """
    Compute the three parts of the Gaussian Process log marginal likelihood
    (assuming mean = 0):
      1) data_fit    = -1/2 * y^T (K + sigma_n^2 I)^-1 y
      2) complexity  = -1/2 * log |K + sigma_n^2 I|
      3) constant    = -n/2 * log(2 pi)

    Args:
        y: Observations as a NumPy array of length n (1-D or an (n, 1) column).
        K: (n, n) kernel matrix evaluated at the training inputs.
        sigma_n: Noise standard deviation (it is squared internally).

    Returns:
        (data_fit, complexity, constant). `data_fit` is a scalar for 1-D `y`
        and a (1, 1) array for a column-vector `y`.
    """
    n = len(y)
    Ky = K + sigma_n**2 * np.eye(n)

    # Solve Ky @ alpha = y rather than forming the explicit inverse: it is
    # cheaper and numerically better conditioned.
    alpha = np.linalg.solve(Ky, y)
    # Only the log-magnitude of the determinant is used; the sign is discarded.
    _, logdet = np.linalg.slogdet(Ky)

    data_fit   = -0.5 * y.T @ alpha
    complexity = -0.5 * logdet
    constant   = -0.5 * n * np.log(2 * np.pi)

    return data_fit, complexity, constant


## 2. Data Generation

In [153]:
# Generate a random graph
# (Erdos-Renyi; the helper's default seed=42 is used since no seed is passed here.)
num_nodes = 1000
average_degree = 10
adjacency_matrix = generate_random_graph_by_degree(num_nodes, average_degree)


In [154]:
# Generate noisy samples on the graph by sampling from a GP
beta_sample = 3.0
noise_std = 0.01

# Covariance of the generating process: diffusion kernel at the sampling beta.
K_true = diffusion_kernel(adjacency_matrix, beta_sample)
# NOTE(review): no seed is passed here, so the draw may differ between runs —
# confirm how generate_noisy_samples seeds its RNG.
Y_noisy = generate_noisy_samples(K_true, noise_std=noise_std)
# GP inputs are the node indices 0..num_nodes-1 as a float64 column vector.
X = tf.convert_to_tensor(np.arange(num_nodes, dtype=np.float64).reshape(-1, 1))
# Predictions are made at the same node indices as the training inputs.
X_new = tf.convert_to_tensor(np.arange(num_nodes, dtype=np.float64).reshape(-1, 1))
# Targets as a float64 column vector.
Y = tf.reshape(tf.convert_to_tensor(Y_noisy, dtype=tf.float64), (-1, 1))

## 3. GP Inference

In [155]:
# Number of series coefficients produced by lengthscale2modulator (i = 0..MAX_EXPANSION-1);
# also passed as `max_walk_length` to GraphGeneralPoFMKernel below.
MAX_EXPANSION = 3

### 3.1. Analytical Ground Truth PoFM Hyperparameter $\theta_0$

In [156]:
# Coefficients implied by the beta used to generate the data, truncated to MAX_EXPANSION terms.
theta_0 = lengthscale2modulator(beta_sample, MAX_EXPANSION)
print(theta_0)

[ 1.    -1.5    1.125]


### 3.2. Best Fit PoFM Hyperparameter using Exact Diffusion Kernel $\theta_1$

In [157]:
exact_kernel = GraphDiffusionKernel(adjacency_matrix, beta=0.5) # The initial beta value for optimization
# Noise variance is fixed to the value used for data generation; only the kernel is optimized.
model, mean, stddev = gp_inference_fixed_noise(X, Y, X_new, exact_kernel, noise_variance=noise_std**2)
print_summary(model)
# Keep the optimized beta for conversion to series coefficients in the next cell.
learned_beta_exact = model.kernel.beta.numpy()

╒═════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤═════════╕
│ name                    │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │   value │
╞═════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪═════════╡
│ GPR.kernel.beta         │ Parameter │ Softplus         │         │ True        │ ()      │ float64 │ 3.05433 │
├─────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼─────────┤
│ GPR.likelihood.variance │ Parameter │ Softplus + Shift │         │ False       │ ()      │ float64 │ 0.0001  │
╘═════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧═════════╧═════════╧═════════╛


In [158]:
# Coefficients implied by the beta learned with the exact diffusion kernel.
theta_1 = lengthscale2modulator(learned_beta_exact, MAX_EXPANSION)
print(theta_1)
# `model` is still the exact-diffusion fit from the previous cell.
print(model.log_marginal_likelihood().numpy())

[ 1.         -1.52716686  1.16611931]
103.35114714879683


### 3.3. Best Fit PoFM Hyperparameter using General PoFM Kernel $\hat{\theta}$

In [159]:
# No modulator_vector is passed, so the kernel's own default initialisation is used.
general_pofm_kernel = GraphGeneralPoFMKernel(adjacency_matrix, max_walk_length=MAX_EXPANSION)
# Rebinds `model`, `mean` and `stddev`: from here on they refer to the PoFM fit.
model, mean, stddev = gp_inference_fixed_noise(X, Y, X_new, general_pofm_kernel, noise_variance=noise_std**2)
print_summary(model)

╒═════════════════════════════╤═══════════╤══════════════════╤═════════╤═════════════╤═════════╤═════════╤═══════════════════════════════════════╕
│ name                        │ class     │ transform        │ prior   │ trainable   │ shape   │ dtype   │ value                                 │
╞═════════════════════════════╪═══════════╪══════════════════╪═════════╪═════════════╪═════════╪═════════╪═══════════════════════════════════════╡
│ GPR.kernel.modulator_vector │ Parameter │ Identity         │         │ True        │ (3,)    │ float64 │ [-0.88681834  0.97648526 -0.30462473] │
├─────────────────────────────┼───────────┼──────────────────┼─────────┼─────────────┼─────────┼─────────┼───────────────────────────────────────┤
│ GPR.likelihood.variance     │ Parameter │ Softplus + Shift │         │ False       │ ()      │ float64 │ 0.00010000000000000011                │
╘═════════════════════════════╧═══════════╧══════════════════╧═════════╧═════════════╧═════════╧═════════╧═══════════════════════════════════════╛

In [160]:
# Learned modulator vector of the general PoFM kernel (`model` is the PoFM fit here).
theta_hat = model.kernel.modulator_vector.numpy()
print(theta_hat)
print(model.log_marginal_likelihood().numpy())

[-0.88681834  0.97648526 -0.30462473]
103.02230154786344


### 3.4. Grid Search to Visualize the Posterior Distribution of $\hat{\theta}$

In [161]:
def evaluate_ml_diffusion(adjacency_matrix, X, Y, beta_values, noise_variance, return_log=False):
    """Evaluate the GPR marginal likelihood of the exact diffusion kernel on a beta grid.

    For every beta a fresh GraphDiffusionKernel / GPR pair is built, the
    likelihood variance is set to `noise_variance`, and the marginal
    likelihood is evaluated without any optimization.

    Args:
        adjacency_matrix: Graph adjacency matrix passed to the kernel.
        X, Y: Training inputs and targets passed to ``gpflow.models.GPR``.
        beta_values: Iterable of beta values to evaluate.
        noise_variance: Value assigned to the likelihood variance.
        return_log: If True, return log marginal likelihoods. The default
            (False) keeps the original behaviour of returning exp(log ML),
            which overflows to inf / underflows to 0 in float64 once the log
            value leaves roughly [-745, 709].

    Returns:
        A list with one value per entry of `beta_values`.
    """
    ml_values = []
    for beta in tqdm(beta_values, desc="Evaluating ML over beta (Exact Diffusion)"):
        kernel = GraphDiffusionKernel(adjacency_matrix, beta=beta)
        model = gpflow.models.GPR(data=(X, Y), kernel=kernel, mean_function=None)
        model.likelihood.variance.assign(noise_variance)
        log_ml = model.log_marginal_likelihood().numpy()
        ml_values.append(log_ml if return_log else np.exp(log_ml))
    return ml_values

def evaluate_ml_constrained_pofm(adjacency_matrix, X, Y, beta_values, noise_variance, return_log=False):
    """Evaluate the GPR marginal likelihood of the PoFM kernel constrained to a beta grid.

    For every beta the modulator vector is set to
    ``lengthscale2modulator(beta, MAX_EXPANSION)`` (MAX_EXPANSION is the
    notebook-level constant), a fresh GraphGeneralPoFMKernel / GPR pair is
    built, the likelihood variance is set to `noise_variance`, and the
    marginal likelihood is evaluated without any optimization.

    Args:
        adjacency_matrix: Graph adjacency matrix passed to the kernel.
        X, Y: Training inputs and targets passed to ``gpflow.models.GPR``.
        beta_values: Iterable of beta values to evaluate.
        noise_variance: Value assigned to the likelihood variance.
        return_log: If True, return log marginal likelihoods. The default
            (False) keeps the original behaviour of returning exp(log ML),
            which overflows to inf / underflows to 0 in float64 once the log
            value leaves roughly [-745, 709].

    Returns:
        A list with one value per entry of `beta_values`.
    """
    ml_values = []
    for beta in tqdm(beta_values, desc="Evaluating ML over beta (PoFM)"):
        theta = lengthscale2modulator(beta, MAX_EXPANSION)
        kernel = GraphGeneralPoFMKernel(adjacency_matrix, max_walk_length=MAX_EXPANSION, modulator_vector=theta)
        model = gpflow.models.GPR(data=(X, Y), kernel=kernel, mean_function=None)
        model.likelihood.variance.assign(noise_variance)
        log_ml = model.log_marginal_likelihood().numpy()
        ml_values.append(log_ml if return_log else np.exp(log_ml))
    return ml_values

In [162]:
# # Plot the marginal likelihood distribution to verify the correctness of the optimization

# beta_values = np.linspace(0.1, 5, 300)

# ml_values_diff = evaluate_ml_diffusion(adjacency_matrix, X, Y, beta_values, noise_std**2)
# ml_values_diff_normalized = ml_values_diff / np.sum(ml_values_diff)

# ml_values_constrained_pofm = evaluate_ml_constrained_pofm(adjacency_matrix, X, Y, beta_values, noise_std**2)
# ml_values_constrained_pofm_normalized = ml_values_constrained_pofm / np.sum(ml_values_constrained_pofm)

# plt.plot(beta_values, ml_values_diff_normalized, label="Diffusion Kernel")
# plt.plot(beta_values, ml_values_constrained_pofm_normalized, label="Constrained PoFM Kernel")
# plt.xlabel("Beta")
# plt.ylabel("Normalized Marginal Likelihood")
# plt.title("Marginal Likelihood v.s. Beta")
# plt.legend()
# plt.show()

### 3.5. Break Down the Marginal Likelihood


My guess is that the PoFM model achieves a better data-fit term, but at the cost of a larger (potentially much larger) model-complexity penalty. If so, we can think about how to penalize that complexity.

In [163]:
ground_truth_kernel = GraphDiffusionKernel(adjacency_matrix, beta=beta_sample) # Kernel fixed at the sampling beta; no optimization is run in this cell
model = gpflow.models.GPR(data=(X, Y), kernel=ground_truth_kernel, mean_function=None)
model.likelihood.variance.assign(noise_std**2)
K_ground_truth = model.kernel.K(X).numpy()
# The helper expects a noise standard deviation, hence the square root of the variance.
sigma_ground_truth = np.sqrt(model.likelihood.variance.numpy())
# The NumPy targets (Y_noisy) are passed rather than the tensor Y, since the helper works in NumPy.
data_fit, complexity, constant = gp_log_marginal_likelihood_terms(Y_noisy, K_ground_truth, sigma_ground_truth)

In [164]:
# `complexity` already carries the minus sign (-0.5 * log|K + sigma_n^2 I|),
# which is why it is labelled "negative complexity" below.
print('Inspecting the Log Marginal Likelihood Terms of the Ground Truth Kernel')
print('data fit: ', data_fit.squeeze())
print('negative complexity: ', complexity)
print('noise std: ', sigma_ground_truth)
print('constant: ', constant)
# The three terms sum to the log marginal likelihood.
print('log marginal likelihood: ', data_fit.squeeze() + complexity + constant)

Inspecting the Log Marginal Likelihood Terms of the Ground Truth Kernel
data fit:  -476.97826229003465
negative complexity:  1498.4804831356607
noise std:  0.010000000000000005
constant:  -918.9385332046727
log marginal likelihood:  102.56368764095339


In [165]:
# Re-fit the exact diffusion kernel with the same settings as in section 3.2.
exact_kernel = GraphDiffusionKernel(adjacency_matrix, beta=0.5) # The initial beta value for optimization
model, _, _ = gp_inference_fixed_noise(X, Y, X_new, exact_kernel, noise_variance=noise_std**2)
K_exact = model.kernel.K(X).numpy()
# The helper expects a noise standard deviation, hence the square root of the variance.
sigma_exact = np.sqrt(model.likelihood.variance.numpy())
data_fit, complexity, constant = gp_log_marginal_likelihood_terms(Y_noisy, K_exact, sigma_exact)

In [166]:
# `complexity` already carries the minus sign (-0.5 * log|K + sigma_n^2 I|),
# which is why it is labelled "negative complexity" below.
print('Inspecting the Log Marginal Likelihood Terms of the Exact Diffusion Kernel')
print('data fit: ', data_fit.squeeze())
print('negative complexity: ', complexity)
print('noise std: ', sigma_exact)
print('constant: ', constant)
# The three terms sum to the log marginal likelihood.
print('log marginal likelihood: ', data_fit.squeeze() + complexity + constant)

Inspecting the Log Marginal Likelihood Terms of the Exact Diffusion Kernel
data fit:  -503.2503033257732
negative complexity:  1525.5399836792446
noise std:  0.010000000000000005
constant:  -918.9385332046727
log marginal likelihood:  103.35114714879876


In [167]:
# Re-fit the general PoFM kernel with the same settings as in section 3.3.
general_pofm_kernel = GraphGeneralPoFMKernel(adjacency_matrix, max_walk_length=MAX_EXPANSION)
model, _, _, = gp_inference_fixed_noise(X, Y, X_new, general_pofm_kernel, noise_variance=noise_std**2)
K_pofm = model.kernel.K(X).numpy()
# The helper expects a noise standard deviation, hence the square root of the variance.
sigma_pofm = np.sqrt(model.likelihood.variance.numpy())
data_fit, complexity, constant = gp_log_marginal_likelihood_terms(Y_noisy, K_pofm, sigma_pofm)

In [168]:
# `complexity` already carries the minus sign (-0.5 * log|K + sigma_n^2 I|),
# which is why it is labelled "negative complexity" below.
print('Inspecting the Log Marginal Likelihood Terms of the PoFM Kernel')
print('data fit: ', data_fit.squeeze())
print('negative complexity: ', complexity)
print('noise std: ', sigma_pofm)
print('constant: ', constant)
# The three terms sum to the log marginal likelihood.
print('log marginal likelihood: ', data_fit.squeeze() + complexity + constant)

Inspecting the Log Marginal Likelihood Terms of the PoFM Kernel
data fit:  -500.00440856313617
negative complexity:  1521.9652433156712
noise std:  0.010000000000000005
constant:  -918.9385332046727
log marginal likelihood:  103.0223015478623
