In [1]:
import numpy as np
import scipy.sparse
from scipy.sparse.linalg import lobpcg, eigsh
from scipy.sparse import csr_matrix
import time

In [2]:
# Load the sparse co-occurrence ("adjacency") matrix saved in .npz form.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable before sharing the notebook.
adj_path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.adj"
adjacency_matrix = scipy.sparse.load_npz(adj_path + ".npz")

# Dampen the stored weights by raising every non-zero entry to the power 0.1.
# Alternative kept from an earlier experiment (binary / unweighted graph):
#   adjacency_matrix.data = np.ones_like(adjacency_matrix.data, dtype=np.float64)
adjacency_matrix.data = np.power(adjacency_matrix.data, 0.1)

In [3]:
# Symmetric normalized Laplacian:  L = D^{-1/2} (D - A) D^{-1/2},
# where D is the diagonal matrix of row sums of the adjacency matrix A.
n = adjacency_matrix.shape[0]
degrees = np.asarray(adjacency_matrix.sum(axis=1)).ravel()
degrees_sqrt = np.sqrt(degrees)

D = scipy.sparse.spdiags(degrees, [0], n, n, format='csr')
# Rows that sum to zero yield an infinite entry here (numpy emits a warning).
D_inv_sqrt = scipy.sparse.spdiags(1.0 / degrees_sqrt, [0], n, n, format='csr')

# Scale columns first, then rows, of the combinatorial Laplacian D - A.
L = D_inv_sqrt.dot((D - adjacency_matrix).dot(D_inv_sqrt))

In [4]:
# n-by-n sparse identity in CSR form (used by the commented-out `L-I` solver call).
I = scipy.sparse.identity(n, dtype=np.float64, format='csr')

In [4]:
# Row sums of the current adjacency matrix, as a flat 1-D array.
degrees = np.asarray(adjacency_matrix.sum(axis=1)).ravel()

In [3]:
# PMI-adjacency
# Re-weight every stored entry a_ij as max(log(a_ij * total / (d_i * d_j)), 0),
# i.e. a positive-PMI transform, then drop the entries that became zero.
# NOTE: this overwrites `adjacency_matrix`, so running the cell twice applies
# the transform twice.
n = adjacency_matrix.shape[0]
degrees = np.asarray(adjacency_matrix.sum(axis=1)).ravel()
total_count = degrees.sum()

inverse_degrees = 1.0 / degrees
D_inv = scipy.sparse.spdiags(inverse_degrees, [0], n, n, format='csr')
adjacency_matrix = D_inv.dot(adjacency_matrix.dot(D_inv))

adjacency_matrix.data = np.maximum(np.log(total_count * adjacency_matrix.data), 0)
adjacency_matrix.eliminate_zeros()

# Update degrees so they reflect the re-weighted matrix.
degrees = np.asarray(adjacency_matrix.sum(axis=1), dtype=np.float64).ravel()

In [5]:
# Weighted Bethe Hessian  Hr = D - A_r  built from the current adjacency weights w_ij:
#   (A_r)_ij = r * w_ij / (r^2 - w_ij^2)
#   D_ii     = 1 + sum_j w_ij^2 / (r^2 - w_ij^2)
#
# The original cell overwrote `adjacency_matrix.data` in place, so re-running it
# (or running cells out of order) silently built Hr from already-transformed
# weights. All work now happens on a copy: `adjacency_matrix` is left untouched
# and the cell is idempotent. The values of r, n, dt, bethe_diagonal, D and Hr
# are unchanged.

# Regularizer r estimated from the degree sequence: sqrt(<d^2>/<d> - 1).
# A previously hard-coded value is kept for reference:
#r = np.sqrt(5176.292450)
r = np.sqrt(np.mean(degrees**2) / np.mean(degrees) - 1)
n = adjacency_matrix.shape[0]

weights = adjacency_matrix.data          # alias only; never written to below
denominator = r**2 - weights**2          # NOTE: zero/negative if some |w_ij| >= r

# Off-diagonal weights r * w / (r^2 - w^2).
dt = weights * r
dt /= denominator

# One structural copy is reused for both the diagonal sums and the off-diagonal part.
transformed = adjacency_matrix.copy()
transformed.data = weights**2 / denominator
bethe_diagonal = 1 + np.asarray(transformed.sum(axis=1)).flatten()
D = scipy.sparse.spdiags(bethe_diagonal, [0], n, n, format='csr')

transformed.data = dt
Hr = D - transformed

# Release the temporaries; they are as large as the adjacency matrix itself.
del transformed, denominator

In [6]:
# Compute the `dim` algebraically smallest eigenpairs of the Bethe Hessian Hr.
#
# Earlier experiments, kept for reference:
#   rng = np.random.RandomState(0)
#   init = rng.rand(n, dim)
#   init[:,0] = degrees_sqrt
#   vals, vecs = lobpcg(L, X=init, maxiter=100, largest=False, verbosityLevel=1)
#   vals, vecs = eigsh(L-I, dim, which='LM', tol=tol)
dim = 100
# Solver tolerance scales with the matrix size n.
tol = n * np.sqrt(1e-15)

start = time.time()
vals, vecs = eigsh(Hr, k=dim, which='SA', tol=tol)
elapsed = time.time() - start
print("time elapsed: %d" % elapsed)

time elapsed: 382


In [7]:
# Display the eigenvalues returned by the eigsh call above.
vals

array([ -4.53101092e+01,  -1.08895323e+01,  -8.19233865e+00,
        -6.11187696e+00,  -4.94279585e+00,  -4.15455813e+00,
        -4.08836478e+00,  -3.63634506e+00,  -3.12352071e+00,
        -2.95109532e+00,  -2.79151716e+00,  -2.66331401e+00,
        -2.33386005e+00,  -2.26894258e+00,  -2.08696911e+00,
        -1.85999982e+00,  -1.79711046e+00,  -1.67084156e+00,
        -1.64233173e+00,  -1.62821491e+00,  -1.47430340e+00,
        -1.39976119e+00,  -1.35298976e+00,  -1.30205807e+00,
        -1.24198053e+00,  -1.15406503e+00,  -1.10311912e+00,
        -1.05981124e+00,  -1.05633489e+00,  -1.00502024e+00,
        -9.85664442e-01,  -9.38488954e-01,  -8.79238747e-01,
        -8.53475171e-01,  -8.20144019e-01,  -8.07808641e-01,
        -7.90999241e-01,  -7.77533479e-01,  -7.18181170e-01,
        -7.03102983e-01,  -6.66527986e-01,  -6.60826533e-01,
        -6.14422837e-01,  -5.93851640e-01,  -5.67799416e-01,
        -5.58837706e-01,  -5.50974785e-01,  -5.12038111e-01,
        -4.97828179e-01,

In [8]:
# Persist eigenvectors, eigenvalues and degrees; np.save appends ".npy" to each name.
# NOTE(review): "pow=0.10" and "dim=100" in the file name presumably mirror the
# 0.1 exponent and `dim` used earlier - keep them in sync by hand.
output_path = "../data/wiki/win=1_weighted_bethe_hessian_pow=0.10_dim=100"
for suffix, array in ((".vecs", vecs), (".vals", vals), (".degrees", degrees)):
    np.save(output_path + suffix, array)

In [11]:
# Load eigenvectors / eigenvalues saved by another run (note: `output_path` is reused).
output_path = "../data/wiki/win=1_bethe_hessian_small_rhoB_est_pow=0.00_dim=100"
vecs2, vals2 = (np.load(output_path + suffix) for suffix in (".vecs.npy", ".vals.npy"))

In [14]:
# Pool both runs: eigenvalues end-to-end, eigenvectors side by side (column-wise).
all_vals = np.concatenate((vals, vals2))
all_vecs = np.hstack((vecs, vecs2))

In [19]:
# Keep the 100 smallest pooled eigenvalues (argsort is ascending) with their
# eigenvector columns, and store them next to the current `degrees`.
top_vals_inds = np.argsort(all_vals)[:100]
output_path = "../data/wiki/win=1_bethe_hessian_combo_rhoB_est_pow=0.00_dim=100"
combined_outputs = (
    (".vecs", all_vecs[:, top_vals_inds]),
    (".vals", all_vals[top_vals_inds]),
    (".degrees", degrees),
)
for suffix, array in combined_outputs:
    np.save(output_path + suffix, array)