In [3]:
import numpy as np
import scipy.sparse
from scipy.sparse.linalg import lobpcg
from scipy.sparse import csr_matrix

In [8]:
### Build adjacency matrix from counts
# Step 1: read the vocabulary. Each line is "<word>,<field>"; the trailing field
# is discarded and the 0-based line number becomes the word's matrix index.
vocab_path = "/Users/i.lobov/hyperwords/data/counts.words.vocab"
word2id = {}
with open(vocab_path, 'r') as f:
    for idx, line in enumerate(f):
        # Split on the LAST comma only, so a token that itself contains a comma
        # (e.g. "," or "1,000") no longer breaks the two-value unpacking.
        # assumes the discarded trailing field never contains a comma — TODO confirm
        word, _ = line.strip().rsplit(",", 1)
        word2id[word] = idx

print("Built the word2id index")

# Step 2: read the co-occurrence counts as (value, row, col) triplets.
N = len(word2id)
counts_path = "/Users/i.lobov/hyperwords/data/counts"
data = []
rows = []
cols = []

with open(counts_path, 'r') as f:
    for line in f:
        # Each line holds three whitespace-separated fields: count, word, word.
        # A word missing from the vocabulary raises KeyError here.
        count, word_a, word_b = line.strip().split()
        word_a_id = word2id[word_a]
        word_b_id = word2id[word_b]

        data.append(float(count))
        rows.append(word_a_id)
        cols.append(word_b_id)

print("Read count data")

# Step 3: assemble the N x N sparse matrix. Triplets that share the same
# (row, col) position are summed by the CSR constructor.
adjacency_matrix = csr_matrix((data, (rows, cols)), shape=(N, N))
print("Built adjacency matrix")

Built the word2id index
Read count data
Built adjacency matrix


In [22]:
### Find spectral embeddings
# Builds the normalized Laplacian D^{-1/2} (D - A) D^{-1/2} of `adjacency_matrix`
# and asks LOBPCG for its d + 1 smallest eigenpairs.
d=100          # embedding dimension; one extra eigenpair is requested below
max_iter=1000  # iteration cap passed to lobpcg
verbosity=0    # lobpcg verbosityLevel
seed=0         # seed for the random initial block

# Row sums of the adjacency matrix, as a (1, n) np.matrix.
degrees = adjacency_matrix.sum(axis=1).flatten()
n = adjacency_matrix.shape[0]
D = scipy.sparse.spdiags(degrees, [0], n, n, format='csr')
L = D - adjacency_matrix

# Square roots of the degrees as a plain (1, n) ndarray. np.sqrt replaces the
# deprecated top-level alias scipy.sqrt.
degrees_row = np.asarray(degrees, dtype=float)
degrees_root = np.sqrt(degrees_row)
# Rows with zero degree would give 1/sqrt(0) = inf and then NaN entries in
# L_norm; using 1 instead leaves those all-zero rows/columns unscaled.
degrees_root[degrees_row == 0] = 1.0
degrees_sqrt = 1.0 / degrees_root  # holds D^{-1/2} (name kept from the original)
DH = scipy.sparse.spdiags(degrees_sqrt, [0], n, n, format='csr')
L_norm = DH.dot(L.dot(DH))

# NOTE(review): lobpcg targets symmetric problems — confirm that
# adjacency_matrix is symmetric, otherwise the results are not meaningful.
rng = np.random.RandomState(seed)
init = rng.rand(n, d + 1)
vals, vecs = lobpcg(A=L_norm, X=init, largest=False, maxiter=max_iter, verbosityLevel=verbosity)

# Scale column k by 1/sqrt(vals[k]) and row i by 1/sqrt(degree_i).
# NOTE(review): the smallest eigenvalue of a normalized Laplacian is expected to
# be ~0, so column 0 of the rescaled result may be inf/NaN; downstream code
# presumably drops that column — confirm before using it.
eigen_scaling = 1.0 / vals
rescaled_eigenvectors = np.sqrt(eigen_scaling) * vecs / degrees_root.T
rescaled_eigenvectors = np.ascontiguousarray(rescaled_eigenvectors)

In [25]:
# Save the unscaled eigenvectors, skipping column 0 (the eigenvector paired with
# the first eigenvalue returned by lobpcg in the previous cell).
embeddings_unscaled = vecs[:, 1:]
# Take the dimension in the file name from the array actually written, so the
# name cannot drift from the data if `d` is changed (yields "d=100" for d=100).
save_path = "/Users/i.lobov/hyperwords/data/spectral_embeddings_unscaled_d={}.words".format(embeddings_unscaled.shape[1])
# np.save appends ".npy" because the path does not already end with it, so the
# file on disk is "<save_path>.npy".
np.save(save_path, embeddings_unscaled)