## Debugging the `LabelModel` with deps + higher-order cliques

In [1]:
%matplotlib inline
import sys
sys.path.append("../") # go to parent dir

import numpy as np
import torch
import matplotlib.pyplot as plt
import mpmath

from synthetic.generate import SingleTaskTreeDepsGenerator
from metal.label_model import LabelModel
from metal.label_model.utils import (
    compute_mu,
    compute_covariance,
    compute_inv_covariance,
    print_matrix,
    visualize_matrix
)

### Generating a dataset

In [2]:
# Build the synthetic dataset. NumPy's global RNG is seeded first
# (presumably the generator draws from it -- confirm in synthetic.generate).
np.random.seed(1)
N, M, K = 50, 5, 2
EDGE_PROB = 1.0
data = SingleTaskTreeDepsGenerator(
    N,
    M,
    k=K,
    edge_prob=EDGE_PROB,
)

(0, 1)   [0.44911967 0.70277143]
(0, 2)   [0.94198516 0.86167983]
(1, 1)   [0.38194617 0.57209334]
(1, 2)   [0.4991076  0.30661949]
(2, 1)   [0.58097252 0.92256582]
(2, 2)   [0.51148433 0.48762871]
(3, 1)   [0.94521501 0.80055031]
(3, 2)   [0.74437346 0.82248175]
(4, 1)   [0.18352073 0.56633729]
(4, 2)   [0.87851823 0.84623222]
((0, 1), 1, 1)   [0.84664302 0.34574498]
((0, 1), 1, 2)   [0.15331888 0.70347524]
((0, 1), 2, 1)   [0.63375897 0.70448869]
((0, 1), 2, 2)   [0.47060909 0.27779581]
((1, 2), 1, 1)   [0.36066668 0.22790812]
((1, 2), 1, 2)   [0.80498303 0.47128496]
((1, 2), 2, 1)   [0.13075418 0.66162699]
((1, 2), 2, 2)   [0.69457216 0.36864577]
((0, 3), 1, 1)   [0.50152106 0.29991209]
((0, 3), 1, 2)   [0.16602775 0.52231467]
((0, 3), 2, 1)   [0.18655503 0.91303315]
((0, 3), 2, 2)   [0.20754142 0.57231944]
((0, 4), 1, 1)   [0.1752607 0.9251752]
((0, 4), 1, 2)   [0.91940354 0.3690371 ]
((0, 4), 2, 1)   [0.62595021 0.60932082]
((0, 4), 2, 2)   [0.65254449 0.96088209]
Labeler =  0
P(L

### Training the `LabelModel`

Note that:
* The `train` method also assembles supporting data structures, such as the junction tree of the dependencies.
* The `higher_order_cliques` kwarg controls whether higher-order cliques are included.

In [3]:
# Construct the label model from the generated data's `k` and `p`
# (passed as the class balance).
lm = LabelModel(k=data.k, class_balance=data.p)

In [4]:
# Set the `higher_order_cliques` entry of the model config to True.
lm.config['higher_order_cliques'] = True

In [5]:
# Generate the "correct" mu from the augmented label matrix and the true labels.
lm._set_constants(data.L)
lm._set_dependencies(data.E)
mu = compute_mu(lm._get_augmented_label_matrix(data.L.todense()), data.Y, K, data.p)

# Compute O, O_inv, P based on L
lm._generate_O(data.L.todense())
O = lm.O.numpy()
print(O)
d, d = O.shape
try:
    O_inv = np.linalg.inv(O)
except np.linalg.LinAlgError as err:
    # Re-raise the same exception type with enough context to diagnose the
    # failure (a bare "Singular matrix" says nothing about how degenerate O is).
    raise np.linalg.LinAlgError(
        f"O ({d}x{d}) is singular (numerical rank {np.linalg.matrix_rank(O)}), "
        "so O_inv, JJT and ZZT cannot be computed from this label matrix"
    ) from err
P = np.diag(data.p)

# JJT = (P^{-1} - mu^T O^{-1} mu)^{-1}
JJT = np.linalg.inv(np.linalg.inv(P) - mu.T @ O_inv @ mu)
# ZZT = O^{-1} mu JJT mu^T O^{-T}
ZZT = O_inv @ mu @ JJT @ mu.T @ O_inv.T

[[0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.   0.
  0.56 0.   0.   0.   0.56 0.   0.   0.   0.56 0.   0.   0.  ]
 [0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.   0.   0.44
  0.   0.   0.   0.44 0.   0.   0.   0.44 0.   0.   0.   0.44]
 [0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.   0.
  0.56 0.   0.   0.   0.56 0.   0.   0.   0.56 0.   0.   0.  ]
 [0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.   0.   0.44
  0.   0.   0.   0.44 0.   0.   0.   0.44 0.   0.   0.   0.44]
 [0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.   0.
  0.56 0.   0.   0.   0.56 0.   0.   0.   0.56 0.   0.   0.  ]
 [0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.   0.   0.44
  0.   0.   0.   0.44 0.   0.   0.   0.44 0.   0.   0.   0.44]
 [0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.56 0.   0.   0.
  0.56 0.   0.   0.   0.56 0.   0.   0.   0.56 0.   0.   0.  ]
 [0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.44 0.   0.   0

LinAlgError: Singular matrix

In [None]:
# Check that JJT is indeed PSD ==> ZZT is rank k
# (inspect the eigenvalues returned by the general eigen-solver)
eigvals_JJT = np.linalg.eig(JJT)[0]
eigvals_JJT

In [None]:
# The argument is symmetric by construction, so use the symmetric solver:
# it always returns real eigenvalues, in ascending order.
np.linalg.eigvalsh((ZZT + ZZT.T) / 2)

In [None]:
# Keyword arguments forwarded to LabelModel.train.
train_kwargs = dict(
    deps=data.E,
    all_unary_cliques=True,
    higher_order_cliques=True,
    n_epochs=50000,
    print_every=5000,
    lr=0.0001,
    l2=0,
    O_inv_prec=1024,
    #O_inv=ZZT
)
lm.train(data.L, **train_kwargs)

# Compare the learned parameters with the reference mu computed earlier.
mu_est = lm.mu.detach().numpy()
mean_abs_err = np.mean(np.abs(mu_est - mu))
print(f"Average absolute error: {mean_abs_err}")

### Sanity check that the true $Z$ gets lower loss

In [None]:
# Eigenvalues of O, shown from largest to smallest.
eigvals_O = np.linalg.eig(O)[0]
sorted(eigvals_O)[::-1]

In [None]:
# Pull the model's matrices out as NumPy arrays.
O, O_inv, mask, P = (t.numpy() for t in (lm.O, lm.O_inv, lm.mask, lm.P))
d, d = O.shape

# Rebuild ZZT from the reference mu, this time using the model's own O_inv and P.
inner = np.linalg.inv(P) - mu.T @ O_inv @ mu
JJT = np.linalg.inv(inner)
ZZT = O_inv @ mu @ JJT @ mu.T @ O_inv.T

# Squared norm of the masked residual at this ZZT.
masked_residual = (O_inv + ZZT) * mask
np.linalg.norm(masked_residual)**2

#### Seed=2

In [None]:
# Plot `mask` (taken from lm.mask in an earlier cell).
visualize_matrix(mask)

In [None]:
#visualize_matrix(np.abs((O_inv + ZZT) * mask))

In [None]:
# Display lm.c_data (presumably the model's clique bookkeeping -- see LabelModel).
lm.c_data

In [None]:
# Masked residual |(O_inv + Z Z^T) * mask| at the Z held by the model.
Z_est = lm.Z.detach().numpy()
residual_est = (O_inv + Z_est @ Z_est.T) * mask
visualize_matrix(np.abs(residual_est))

#### Seed=1

In [None]:
# Plot `mask` (taken from lm.mask in an earlier cell).
visualize_matrix(mask)

In [None]:
# Masked residual |(O_inv + ZZT) * mask| for the ZZT built from the reference mu.
residual_true = (O_inv + ZZT) * mask
visualize_matrix(np.abs(residual_true))

In [None]:
# Masked residual |(O_inv + Z Z^T) * mask| at the Z held by the model.
Z_est = lm.Z.detach().numpy()
residual_est = (O_inv + Z_est @ Z_est.T) * mask
visualize_matrix(np.abs(residual_est))

#### Seed=0

In [None]:
# Plot `mask` (taken from lm.mask in an earlier cell).
visualize_matrix(mask)

In [None]:
# Masked residual |(O_inv + ZZT) * mask| for the ZZT built from the reference mu.
residual_true = (O_inv + ZZT) * mask
visualize_matrix(np.abs(residual_true))

In [None]:
# Masked residual |(O_inv + Z Z^T) * mask| at the Z held by the model.
Z_est = lm.Z.detach().numpy()
residual_est = (O_inv + Z_est @ Z_est.T) * mask
visualize_matrix(np.abs(residual_est))

### Visualizing matrices

In [None]:
# Entry-wise absolute error between the estimated and the reference mu.
mu_abs_err = np.abs(mu_est - mu)
visualize_matrix(mu_abs_err)

In [None]:
# Same plot as the previous cell: entry-wise |mu_est - mu|.
mu_abs_err = np.abs(mu_est - mu)
visualize_matrix(mu_abs_err)

In [None]:
# Display lm.c_data (presumably the model's clique bookkeeping -- see LabelModel).
lm.c_data

In [None]:
# Display data.E, the value passed as `deps` to lm.train and to
# lm._set_dependencies in earlier cells.
data.E

### Trying to solve with `scipy.optimize.minimize`

In [None]:
from scipy.optimize import minimize

O_inv = lm.O_inv.numpy()
mask = lm.mask.numpy()

# Random starting point, flattened; reshaped to (-1, data.k) inside the callbacks.
z0 = np.random.randn(lm.d * lm.k)

def objective_fn(z):
    """Return ||(O_inv + Z Z^T) * mask||_F^2 for the flattened parameter vector z."""
    Z = z.reshape(-1, data.k)
    return np.linalg.norm( (O_inv + Z @ Z.T) * mask )**2

def gradient_fn(z):
    """Return the gradient of objective_fn with respect to z, flattened like z.

    With X = (O_inv + Z Z^T) * mask and G = X * mask, the derivative of
    ||X||_F^2 with respect to Z is 2 (G + G^T) Z. (For a symmetric 0/1 mask
    this equals 4 X Z; returning just X Z under-scales the gradient and makes
    it inconsistent with the objective handed to the optimizer.)
    """
    Z = z.reshape(-1, data.k)
    X = (O_inv + Z @ Z.T) * mask
    G = X * mask
    return np.ravel(2 * (G + G.T) @ Z)

res = minimize(objective_fn, z0, jac=gradient_fn, method='BFGS')
Z = res['x'].reshape(-1, data.k)
res['fun']

In [None]:
O = lm.O.numpy()
P = lm.P.numpy()
I_k = np.eye(data.k)
# Q = O Z (I_k + Z^T O Z)^{-1} Z^T O, using the Z currently bound in the notebook.
OZ = O @ Z
Q = OZ @ np.linalg.inv(I_k + Z.T @ O @ Z) @ Z.T @ O

mu0 = np.random.randn(lm.d * lm.k)

def objective_fn_2(mu):
    """Return ||Q - M P M^T||^2 + ||rowsum(M P) - diag(O)||^2 for flattened mu."""
    mu_mat = mu.reshape(-1, data.k)
    recon_term = np.linalg.norm(Q - mu_mat @ P @ mu_mat.T)**2
    marginal_term = np.linalg.norm(np.sum(mu_mat @ P, 1) - np.diag(O))**2
    return recon_term + marginal_term

res_2 = minimize(objective_fn_2, mu0, method='BFGS')
# NOTE(review): this rebinds the notebook-level name `M` (an int earlier on)
# to the fitted matrix; the following cell reads it.
M = res_2.x.reshape(-1, data.k)
res_2.fun

In [None]:
# Compare the scipy-fitted matrix M with the reference mu.
mean_abs_err_scipy = np.mean(np.abs(M - mu))
print(f"Average absolute error: {mean_abs_err_scipy}")

## Visualizing the inverse covariance matrix

In [None]:
# Inverse covariance computed from the augmented label matrix and the true
# labels; plotted entry-wise in absolute value.
L_aug = lm._get_augmented_label_matrix(data.L.todense())
J = compute_inv_covariance(L_aug, data.Y, data.k, data.p)
visualize_matrix(np.abs(J))

In [None]:
# Plot the model's mask, passing fig_size=[5,5] to the plotting helper.
visualize_matrix(lm.mask.numpy(), fig_size=[5,5])

In [None]:
# Masked residual |(O_inv + Z Z^T) * mask| using the model's current O_inv, Z and mask.
O_inv = lm.O_inv.numpy()
Z = lm.Z.detach().numpy()
mask = lm.mask.numpy()
residual_model = (O_inv + Z @ Z.T) * mask
visualize_matrix(np.abs(residual_model))

### Looking at the internal 'bookkeeping' of cliques...

In [None]:
# Display lm.c_data (presumably the model's clique bookkeeping -- see LabelModel).
lm.c_data

In [None]:
# Pair every dependency edge (i, j) in data.E with its entry in data.theta.
edge_weights = []
for i, j in data.E:
    edge = (i, j)
    edge_weights.append((edge, data.theta[edge]))
edge_weights