In [135]:
import numpy as np
import scipy as sp
from scipy import linalg as sp_linalg
import scipy.sparse
import scipy.sparse.linalg
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import normalize
%matplotlib inline

def lognormal_similarity_seed(rng, V):
    """Draw a V-by-V matrix of log-normal similarity seeds.

    Parameters
    ----------
    rng : object exposing ``lognormal(mean, sigma, size)``
        Random generator to draw from (e.g. ``np.random.RandomState``).
    V : int
        Number of rows and columns of the returned matrix.

    Returns
    -------
    The ``rng.lognormal`` sample with ``mean=1``, ``sigma=1`` and shape ``(V, V)``.
    """
    shape = (V, V)
    return rng.lognormal(mean=1, sigma=1, size=shape)

def gaussian_mixture_similarity_seed_factory(num_centers=10):
    """Build a similarity-seed function based on inverse distances between blob samples.

    Parameters
    ----------
    num_centers : int, default 10
        Passed to ``make_blobs`` both as the number of centers and as the
        number of features of the sampled points.

    Returns
    -------
    callable
        ``inner(rng, V)``, which samples ``V`` points with ``make_blobs``
        (using ``rng`` as ``random_state``) and returns the ``V x V`` matrix of
        inverse pairwise Euclidean distances, with infinite entries replaced
        by 0 and the whole matrix rescaled to have mean 1.
    """
    def inner(rng, V):
        X, _ = make_blobs(n_samples=V,
                          n_features=num_centers,
                          centers=num_centers,
                          cluster_std=1.0,
                          shuffle=False,
                          random_state=rng)

        dist = euclidean_distances(X)
        # Zero distances (e.g. a point to itself) would emit a divide-by-zero
        # RuntimeWarning on every call; the resulting +inf entries are zeroed
        # right below, so the warning is silenced for this one division only.
        with np.errstate(divide="ignore"):
            inv_dist = 1.0 / dist
        inv_dist[inv_dist == np.inf] = 0
        # Rescale so that the average entry equals 1.
        inv_dist /= inv_dist.mean()
        return inv_dist
    return inner

def generate_data(V=1000, power_law_alpha=1.0, similarity_seed_fun=lognormal_similarity_seed, seed=0):
    """Generate a synthetic symmetric joint distribution and its PMI matrix.

    Parameters
    ----------
    V : int, default 1000
        Number of items; all returned matrices are ``V x V``.
    power_law_alpha : float, default 1.0
        ``a`` parameter of ``RandomState.power`` used to draw one scale per item.
    similarity_seed_fun : callable, default ``lognormal_similarity_seed``
        Called as ``similarity_seed_fun(rng, V)``; must return a ``V x V`` array.
    seed : int, default 0
        Seed of the ``np.random.RandomState`` used for all draws.

    Returns
    -------
    P_ij : ndarray, shape (V, V)
        Symmetric matrix with zero diagonal whose entries sum to 1.
    p_i : ndarray, shape (V,)
        Row sums of ``P_ij``.
    similarities : ndarray, shape (V, V)
        ``P_ij[i, j] / (p_i[i] * p_i[j])``.
    pmis : ndarray, shape (V, V)
        ``log(similarities)`` with ``-inf`` entries (zero similarity) set to 0.

    Raises
    ------
    AssertionError
        If one of the sanity checks on the generated matrices fails.
    """
    rng = np.random.RandomState(seed)
    power_law_seed = rng.power(a=power_law_alpha, size=V)

    # Only the strict upper triangle of the seed is used; the diagonal stays 0.
    P_ij = np.triu(similarity_seed_fun(rng, V), k=1)
    # Broadcasting over rows: column j is scaled by power_law_seed[j].
    P_ij *= power_law_seed
    # Symmetrize (P_ij == P_ji). A fresh array avoids an in-place update whose
    # right-hand side is a transposed view of the array being written.
    P_ij = P_ij + P_ij.T
    P_ij /= P_ij.sum()

    p_i = P_ij.sum(axis=1)
    inv_p_i = 1.0 / p_i
    # Elementwise equivalent of diag(1/p) @ P @ diag(1/p) without building
    # dense V x V diagonal matrices and doing O(V^3) matrix products.
    similarities = inv_p_i[:, np.newaxis] * P_ij * inv_p_i[np.newaxis, :]

    # log(0) on the zero entries would emit a RuntimeWarning on every call;
    # those -inf values are replaced by 0 right below.
    with np.errstate(divide="ignore"):
        pmis = np.log(similarities)
    pmis[pmis == -np.inf] = 0

    assert np.allclose(similarities, similarities.T)
    assert np.allclose(similarities.dot(p_i), 1)
    assert np.allclose(P_ij, P_ij.T)
    assert np.allclose(pmis, pmis.T)
    assert np.allclose(p_i.sum(), 1)
    assert np.allclose(P_ij.sum(), 1)
    return P_ij, p_i, similarities, pmis

def sample_and_estimate(P_ij, sampling_factor=1.0, seed=0):
    """Sample pair counts from ``P_ij`` and estimate an empirical PMI matrix.

    Parameters
    ----------
    P_ij : ndarray, shape (V, V)
        Joint distribution over pairs; its flattened entries are used as the
        multinomial probabilities.
    sampling_factor : float, default 1.0
        The number of sampled pairs is ``int(V**2 * sampling_factor)``.
    seed : int, default 0
        Seed of the ``np.random.RandomState`` used for sampling.

    Returns
    -------
    emp_pmi : ndarray
        Empirical PMI restricted to the rows/columns with a non-zero empirical
        marginal; pairs that were never sampled get 0.
    non_zeros : ndarray of bool, shape (V,)
        Mask of the items kept in ``emp_pmi``.

    Raises
    ------
    ValueError
        If the resulting number of samples is not positive.
    """
    rng = np.random.RandomState(seed)
    # Take the size from the input instead of relying on a notebook-level
    # global ``V`` that may be undefined or stale when this function is called.
    V = P_ij.shape[0]
    # multinomial expects an integer number of trials.
    n = int(V ** 2 * sampling_factor)
    if n <= 0:
        raise ValueError("sampling_factor yields no samples (n=%d)" % n)

    emp_ij = rng.multinomial(n, P_ij.flatten()).reshape([V, V])
    emp_ij = emp_ij / emp_ij.sum()

    # Drop items that were never sampled so their marginal can be inverted.
    emp_i = emp_ij.sum(axis=1)
    non_zeros = emp_i != 0
    emp_ij = emp_ij[non_zeros][:, non_zeros]
    emp_i = emp_i[non_zeros]

    inv_emp_i = 1.0 / emp_i
    # Elementwise equivalent of diag(1/e) @ E @ diag(1/e), without O(V^3) products.
    emp_sim = inv_emp_i[:, np.newaxis] * emp_ij * inv_emp_i[np.newaxis, :]

    # Unobserved pairs give log(0) = -inf; silence the warning and map them to 0.
    with np.errstate(divide="ignore"):
        emp_pmi = np.log(emp_sim)
    emp_pmi[emp_pmi == -np.inf] = 0
    return emp_pmi, non_zeros

def get_neighbors(sims, k=10):
    """Return, for every row of ``sims``, the indices of its ``k`` largest entries.

    Parameters
    ----------
    sims : ndarray
        Score matrix; row ``i`` holds the scores of item ``i`` against all columns.
    k : int, default 10
        Number of indices kept per row.

    Returns
    -------
    ndarray
        One row per row of ``sims``, with column indices sorted by decreasing score.
    """
    # Sorting the negated scores ascending yields the highest scores first.
    top_k = [np.argsort(-sims[row])[:k] for row in range(sims.shape[0])]
    return np.asarray(top_k)

def recall(neighbors1, neighbors2):
    """Average fraction of row-wise neighbors shared by two neighbor tables.

    Parameters
    ----------
    neighbors1, neighbors2 : ndarray
        2-D index arrays; row ``i`` of each lists the neighbors of item ``i``.

    Returns
    -------
    float
        Total size of the per-row set intersections divided by the number of
        entries of ``neighbors1``.
    """
    num_rows = neighbors1.shape[0]
    shared = sum(
        (len(set(neighbors1[row]) & set(neighbors2[row])) for row in range(num_rows)),
        0.0,
    )
    return shared / (num_rows * neighbors1.shape[1])

In [144]:
# Experiment: how well do low-rank factorizations of the empirical PMI matrix
# recover the true PMI nearest neighbors, for several power-law exponents?
V = 1000
rank = 3
num_neighbors = 100
sampling_factor = 10.0

for alpha in [0.1, 1, 10]:
    P_ij, _p_i, _similarities, pmis = generate_data(V, power_law_alpha=alpha, 
                                                    similarity_seed_fun=gaussian_mixture_similarity_seed_factory(5))
    emp_pmi, non_zeros = sample_and_estimate(P_ij, sampling_factor=sampling_factor)
    sparse_emp_pmi = scipy.sparse.csr_matrix(emp_pmi)

    # Ground truth restricted to the items that were actually sampled.
    # Boolean indexing returns a copy, so masking its diagonal leaves `pmis` intact.
    true_pmis = pmis[non_zeros][:, non_zeros]
    # Exclude self-matches here as well: both predicted rankings below put -inf
    # on the diagonal, so leaving the true diagonal at 0 would let an item count
    # as its own true neighbor, which no prediction can ever match.
    np.fill_diagonal(true_pmis, -np.inf)
    true_neighbors = get_neighbors(true_pmis, num_neighbors)
    
    # svds is asked for exactly `rank` components, so no further slicing is needed.
    emp_u, emp_s, emp_v = scipy.sparse.linalg.svds(sparse_emp_pmi, k=rank)
    
    # Honest SVD reconstruction-based ranking with -inf on diagonal
    svd_sims = (emp_u * emp_s).dot(emp_v)
    np.fill_diagonal(svd_sims, -np.inf)
    svd_neighbors = get_neighbors(svd_sims, num_neighbors)

    # Cosine between rows of eigenvectors with -inf on diagonal
    normed_emp_u = normalize(emp_u)
    cos_sims = normed_emp_u.dot(normed_emp_u.T)
    np.fill_diagonal(cos_sims, -np.inf)
    cos_neighbors = get_neighbors(cos_sims, num_neighbors)
    
    print(alpha, recall(true_neighbors, svd_neighbors), recall(true_neighbors, cos_neighbors))



0.1 0.48624 0.20653




1 0.32514 0.43927




10 0.33235 0.54825


In [141]:
# Display the singular values left in `emp_s` by the most recent `svds` call.
# NOTE(review): the saved output lists 10 values although the experiment cell
# requests k=rank=3, and this cell's execution count (141) precedes that cell's
# (144) -- the output appears to come from an earlier run; re-run the notebook
# top to bottom to refresh it.
emp_s

array([ 24.80956045,  25.1572714 ,  27.60591235,  27.82407685,
        29.10719434, 192.7616492 , 262.09520684, 428.74410465,
       431.0396813 , 594.29977161])