In [89]:
import numpy as np
import scipy as sp
from scipy import linalg as sp_linalg
import matplotlib.pyplot as plt
%matplotlib inline

# Ground-truth model: a random symmetric joint distribution P_ij over V items,
# its marginals p_i, the ratio P_ij / (p_i * p_j) ("similarities") and its
# logarithm (the PMI matrix).
V = 1000
rng = np.random.RandomState(0)

# Per-item scale factors. With a=1.0 the power distribution is the uniform
# distribution on [0, 1].
power_law_seed = rng.power(a=1.0, size=V)
# Strictly upper-triangular log-normal weights (k=1 keeps the diagonal at zero).
P_ij = np.triu(rng.lognormal(mean=1, sigma=1, size=[V, V]), k=1)
# Broadcasting a length-V vector against a (V, V) array scales column j by
# power_law_seed[j].
P_ij *= power_law_seed
# Symmetrise so that P_ij == P_ji. Done out-of-place: an in-place add of an
# overlapping transposed view is only reliable on NumPy >= 1.13.
P_ij = P_ij + P_ij.T
P_ij /= P_ij.sum()

p_i = P_ij.sum(axis=1)
# Dense diagonal forms are retained as module-level names in case later cells
# use them; the computation below no longer needs them.
P_i = np.diag(p_i)
P_i_inv = np.diag(1.0 / p_i)

# similarities[i, j] = P_ij[i, j] / (p_i[i] * p_i[j]). Broadcasting yields the
# same values as P_i_inv.dot(P_ij).dot(P_i_inv) in O(V^2) rather than O(V^3).
inv_p_i = 1.0 / p_i
similarities = P_ij * inv_p_i[:, np.newaxis] * inv_p_i[np.newaxis, :]

# The diagonal of P_ij is zero, so log() produces -inf there; silence the
# expected divide-by-zero warning and map those entries to 0.
with np.errstate(divide="ignore"):
    pmis = np.log(similarities)
pmis[pmis == -np.inf] = 0

# Sanity checks on the constructed model.
assert np.allclose(similarities, similarities.T)
# sum_j similarities[i, j] * p_i[j] == 1 (same quantity as similarities.dot(P_i)).
assert np.allclose((similarities * p_i).sum(axis=1), 1)
assert np.allclose(P_ij, P_ij.T)
assert np.allclose(pmis, pmis.T)
assert np.allclose(P_i.sum(), 1)
assert np.allclose(P_ij.sum(), 1)



In [99]:
# Empirical estimate: draw n co-occurrence events from P_ij and recompute the
# marginals, the similarity ratio, PMI and positive PMI from the sample.
# NOTE(review): this re-seeds with the same seed (0) that generated the
# ground truth, so the sampler restarts the same random stream; confirm that
# is intended.
rng = np.random.RandomState(0)
n = V**2 * 1.0

# multinomial documents `n` as an int, so pass an explicit integer count
# (`n` itself stays a float in case later cells use it in arithmetic).
emp_ij = rng.multinomial(int(n), P_ij.ravel()).reshape([V, V])
emp_ij = emp_ij / emp_ij.sum()
# NOTE(review): the sampled matrix is not symmetrised, yet the row marginals
# below are used to normalise both rows and columns; verify this is intended.

emp_i = emp_ij.sum(axis=1)
# Keep only items whose row received at least one sample, so 1 / emp_i is finite.
non_zeros = emp_i != 0
emp_ij = emp_ij[non_zeros][:, non_zeros]
emp_i = emp_i[non_zeros]

# Dense diagonal form retained as a module-level name in case later cells use it.
emp_i_inv = np.diag(1.0 / emp_i)
# emp_sim[i, j] = emp_ij[i, j] / (emp_i[i] * emp_i[j]); broadcasting gives the
# same values as emp_i_inv.dot(emp_ij).dot(emp_i_inv) without O(V^3) matmuls.
inv_emp_i = 1.0 / emp_i
emp_sim = emp_ij * inv_emp_i[:, np.newaxis] * inv_emp_i[np.newaxis, :]

# Unobserved pairs have emp_sim == 0 and therefore log == -inf; silence the
# expected divide-by-zero warning and map those entries to 0.
with np.errstate(divide="ignore"):
    emp_eps_pmi = np.log(emp_sim)
emp_eps_pmi[emp_eps_pmi == -np.inf] = 0

# Positive PMI: negative entries clipped to zero.
emp_ppmi = emp_eps_pmi.copy()
emp_ppmi[emp_ppmi < 0] = 0

# Same clipping applied to the ground-truth PMI, restricted to the kept items.
positive_pmis = (pmis[non_zeros][:, non_zeros]).copy()
positive_pmis[positive_pmis < 0] = 0

  from ipykernel import kernelapp as app


In [113]:
# Ground-truth PMI restricted to the rows and columns selected by `non_zeros`.
true_pmis = pmis[non_zeros, :][:, non_zeros]
# The SVD of the ground-truth matrix is currently disabled:
# true_u, true_s, true_v = np.linalg.svd(true_pmis)

# Full SVDs of the empirical PMI and PPMI matrices. np.linalg.svd returns
# (U, singular values, V^H), so the *_v names hold the conjugate-transposed
# right factor, not V itself.
emp_u, emp_s, emp_v = np.linalg.svd(
    emp_eps_pmi, full_matrices=True, compute_uv=True
)
emp_ppmi_u, emp_ppmi_s, emp_ppmi_v = np.linalg.svd(
    emp_ppmi, full_matrices=True, compute_uv=True
)

In [114]:
# from scipy.stats import pearsonr
from sklearn.preprocessing import normalize

def get_neighbors(sims, k=10):
    """Return, for each row of ``sims``, the column indices of its ``k`` largest entries.

    Parameters
    ----------
    sims : array_like, shape (n_rows, n_cols)
        Score matrix; a larger value ranks earlier.
    k : int, optional
        Number of indices kept per row (default 10). When ``k`` exceeds
        ``n_cols`` every column index is returned.

    Returns
    -------
    numpy.ndarray, shape (n_rows, min(k, n_cols))
        Column indices ordered from the highest to the lowest score. A row's
        own index is not excluded; callers that want to skip it must mask
        the diagonal beforehand.

    Raises
    ------
    ValueError
        If ``sims`` is not two-dimensional.
    """
    sims = np.asarray(sims)
    if sims.ndim != 2:
        raise ValueError(
            "sims must be a 2-D array, got %d dimension(s)" % sims.ndim
        )
    # One row-wise sort instead of a Python loop over rows; negating the
    # scores turns the ascending argsort into a descending ranking.
    ranking = np.argsort(-sims, axis=1)
    # Copy so the result does not keep the full (n_rows, n_cols) index array alive.
    return ranking[:, :k].copy()

def recall(neighbors1, neighbors2):
    """Mean fraction of each row of ``neighbors1`` that also appears in the matching row of ``neighbors2``.

    Rows are compared as sets, pairing row ``i`` of ``neighbors1`` with row
    ``i`` of ``neighbors2``. The total overlap is divided by the number of
    entries in ``neighbors1`` (rows times columns), and the result is a float.
    """
    # Start the sum at 0.0 so the accumulated overlap is a float.
    shared = sum(
        (
            len(set(neighbors1[row]).intersection(neighbors2[row]))
            for row in range(neighbors1.shape[0])
        ),
        0.0,
    )
    total_slots = neighbors1.shape[0] * neighbors1.shape[1]
    return shared / total_slots

# Rank of the truncated SVD. This is not the neighbour count: every
# get_neighbors call below is made without k and so uses that function's default.
k = 5
true_neighbors = get_neighbors(true_pmis)

# Rank-k reconstruction U_k . diag(s_k) . Vh_k of the empirical PMI matrix,
# with the diagonal (self-similarity) zeroed before ranking.
emp_sims = np.dot(np.dot(emp_u[:, :k], np.diag(emp_s[:k])), emp_v[:k, :])
np.fill_diagonal(emp_sims, 0)
emp_neighbors = get_neighbors(emp_sims)

# Similarities between the normalised leading-k left singular vectors.
# NOTE(review): normalize is called with its defaults, presumably row-wise
# L2, which would make this a cosine similarity; confirm against scikit-learn.
norm_emp_u = normalize(emp_u[:, :k])
norm_emp_sims = np.dot(norm_emp_u, norm_emp_u.T)
np.fill_diagonal(norm_emp_sims, 0)
norm_emp_neighbors = get_neighbors(norm_emp_sims)

# Displayed cell result: (recall of the rank-k reconstruction, recall of the
# normalised-embedding similarities), both measured against the true neighbours.
(recall(true_neighbors, emp_neighbors), recall(true_neighbors, norm_emp_neighbors))

(0.0453, 0.0082)