In [1]:
import numpy as np
import scipy.sparse
from scipy.sparse.linalg import lobpcg, eigsh
from scipy.sparse import csr_matrix
import time

In [2]:
# Base path of the stored adjacency matrix; the sparse matrix itself is read
# from "<adj_path>.npz" (presumably window-1 co-occurrence counts, judging by
# the file name -- TODO confirm).
adj_path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.adj"
adjacency_matrix = scipy.sparse.load_npz("".join((adj_path, ".npz")))

# Binarise the graph: every stored entry is replaced by 1.0 (float64), so only
# the sparsity pattern of the loaded matrix is kept.
adjacency_matrix.data = np.ones(adjacency_matrix.data.shape, dtype=np.float64)

In [3]:
# Build the symmetric normalised Laplacian  L = D^{-1/2} (D - A) D^{-1/2},
# where A is `adjacency_matrix` and D is the diagonal matrix of row sums.
n = adjacency_matrix.shape[0]
degrees = np.asarray(adjacency_matrix.sum(axis=1)).flatten()
D = scipy.sparse.spdiags(degrees, [0], n, n, format='csr')
degrees_sqrt = np.sqrt(degrees)

# Invert only the strictly positive entries. A node with degree 0 would
# otherwise give 1/0 = inf on the diagonal, and inf * 0 would put NaN into L.
# Leaving a 0 there keeps the row/column of an isolated node identically zero.
inv_degrees_sqrt = np.divide(
    1.0,
    degrees_sqrt,
    out=np.zeros_like(degrees_sqrt),
    where=degrees_sqrt > 0,
)
D_inv_sqrt = scipy.sparse.spdiags(inv_degrees_sqrt, [0], n, n, format='csr')

L = D_inv_sqrt.dot((D - adjacency_matrix).dot(D_inv_sqrt))

In [4]:
### Initial step
# Number of eigenpairs requested from LOBPCG (the block size).
dim = 51

# Seeded generator so the random starting block is reproducible.
rng = np.random.RandomState(0)
init = rng.random_sample((n, dim))

# sqrt(degrees) is annihilated by D^{-1/2} (D - A) D^{-1/2}, so seeding the
# first column with it hands the solver the eigenvalue-0 vector up front.
init[:, 0] = degrees_sqrt

# Smallest eigenpairs of L (largest=False), at most 100 iterations.
vals, vecs = lobpcg(L, init, largest=False, maxiter=100, verbosityLevel=1)

Solving generalized eigenvalue problem with preconditioning

matrix size 189533
block size 51

No constraints


iteration 0
current block size: 50
eigenvalue: [ -9.67921317e-15   9.98312152e-01   9.98380793e-01   9.98457002e-01
   9.98487424e-01   9.98520206e-01   9.98532051e-01   9.98583051e-01
   9.98644007e-01   9.98681632e-01   9.98688627e-01   9.98721160e-01
   9.98778709e-01   9.98823126e-01   9.98835939e-01   9.98875172e-01
   9.98895839e-01   9.98907764e-01   9.98947802e-01   9.98961920e-01
   9.98973479e-01   9.99031479e-01   9.99076097e-01   9.99101508e-01
   9.99112562e-01   9.99137453e-01   9.99163813e-01   9.99200518e-01
   9.99213717e-01   9.99221323e-01   9.99257599e-01   9.99307719e-01
   9.99321386e-01   9.99364384e-01   9.99383241e-01   9.99405335e-01
   9.99424025e-01   9.99471946e-01   9.99517710e-01   9.99560318e-01
   9.99576117e-01   9.99602373e-01   9.99649524e-01   9.99657233e-01
   9.99697431e-01   9.99742160e-01   9.99768698e-01   9.99795616e-01
   9.99920054

iteration 5
current block size: 46
eigenvalue: [ -9.67921317e-15   5.43916752e-01   6.05717982e-01   6.48894446e-01
   6.68231644e-01   6.84534688e-01   6.95105727e-01   7.11702575e-01
   7.21188043e-01   7.29103190e-01   7.37606639e-01   7.41279521e-01
   7.52174019e-01   7.58741057e-01   7.63617042e-01   7.72940744e-01
   7.77169491e-01   7.78427400e-01   7.82031484e-01   7.84517247e-01
   7.89743870e-01   7.92832871e-01   7.96616723e-01   7.96760176e-01
   8.02011462e-01   8.03841642e-01   8.06380642e-01   8.06891882e-01
   8.08705946e-01   8.09169038e-01   8.10877082e-01   8.11621210e-01
   8.14074078e-01   8.19316225e-01   8.20109597e-01   8.23314287e-01
   8.26903023e-01   8.29411190e-01   8.31947693e-01   8.35354997e-01
   8.36128413e-01   8.38138228e-01   8.40066418e-01   8.42488399e-01
   8.44922915e-01   8.46172231e-01   8.48191760e-01   8.49112111e-01
   8.52760848e-01   8.53839851e-01   8.57967202e-01]
residual norms: [  4.51402916e-13   1.57594516e-03   3.04221479e-03   3.

iteration 10
current block size: 5
eigenvalue: [ -9.67921317e-15   5.43911658e-01   6.05693230e-01   6.48848483e-01
   6.68182777e-01   6.84355498e-01   6.94948223e-01   7.11358221e-01
   7.20768311e-01   7.28554556e-01   7.36702693e-01   7.40450073e-01
   7.50891318e-01   7.56966443e-01   7.62115718e-01   7.70768596e-01
   7.74797037e-01   7.75431930e-01   7.78017225e-01   7.82025832e-01
   7.85380230e-01   7.87665208e-01   7.90503113e-01   7.91734890e-01
   7.92769433e-01   7.97270879e-01   7.98190173e-01   8.00271294e-01
   8.01679225e-01   8.02916446e-01   8.03494006e-01   8.04006381e-01
   8.04811628e-01   8.05144145e-01   8.10413977e-01   8.12496594e-01
   8.14564099e-01   8.15481403e-01   8.16356855e-01   8.19451542e-01
   8.21637441e-01   8.24054531e-01   8.26279162e-01   8.26965953e-01
   8.29452931e-01   8.31043371e-01   8.32688388e-01   8.33674584e-01
   8.34442352e-01   8.34936069e-01   8.35808903e-01]
residual norms: [  4.51478053e-13   5.63883094e-04   9.05304807e-04   9.

In [6]:
from representations.matrix_serializer import load_vocabulary

# Vocabulary for the corpus. `wi` is used below as a word -> row-index mapping;
# `iw` is presumably the inverse (index -> word) -- TODO confirm against
# load_vocabulary.
path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.words.vocab"
wi, iw = load_vocabulary(path)

# Word-pair similarity test set: each non-blank line holds two words and a
# numeric score, separated by whitespace. Only pairs whose two words are both
# in the vocabulary are kept, as ((word1, word2), score).
path = "/Users/i.lobov/hyperwords/testsets/ws/bruni_men.txt"
test = []
with open(path) as f:
    for line in f:
        fields = line.strip().lower().split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the three-way unpack below.
            continue
        x, y, sim = fields
        if x in wi and y in wi:
            test.append(((x, y), float(sim)))

In [9]:
# Import from the public namespace; `scipy.stats.stats` is a deprecated alias.
from scipy.stats import spearmanr

def evaluate(vecs):
    """Print the Spearman correlation between embedding and gold similarities.

    Each row of ``vecs`` is scaled to unit L2 norm, so the dot product of two
    rows is their cosine similarity. For every ``((word1, word2), score)``
    entry of the module-level ``test`` list, the similarity of the rows
    ``wi[word1]`` and ``wi[word2]`` is computed; the Spearman rank correlation
    between those similarities and the gold scores is printed.

    Parameters
    ----------
    vecs : 2-D array
        One row per vocabulary index (rows are looked up through ``wi``).
        The array is not modified.

    Returns
    -------
    None
        The correlation coefficient is printed, not returned.

    Raises
    ------
    ValueError
        If ``test`` is empty, since no correlation can be computed.
    """
    if not test:
        raise ValueError("no test pairs available to evaluate")

    # The division allocates a new array, so the caller's `vecs` is untouched
    # without needing an explicit copy first.
    m = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)

    actual = [m[wi[x], :].dot(m[wi[y], :]) for (x, y), _ in test]
    expected = [sim for _, sim in test]
    print(spearmanr(actual, expected)[0])

In [10]:
# Print the Spearman correlation on the test pairs for the current `vecs`
# (at this point, the eigenvectors returned by the initial LOBPCG solve).
evaluate(vecs)

0.401791812264


In [2]:
# Display the eigenvalues from the most recent LOBPCG solve.
# NOTE(review): the recorded output for this cell is a NameError and its
# execution count restarts at 2, so it was apparently run in a fresh kernel
# before the cell that defines `vals` -- re-run the notebook top to bottom.
vals

NameError: name 'vals' is not defined

In [3]:
# Iteratively penalise the most localised eigenvector.
#
# `ipr` holds, per eigenvector (column of `vecs`), the sum of the fourth powers
# of its entries (presumably the inverse participation ratio -- TODO confirm).
# Each round takes the column with the largest value, adds
# learning_rate * (its squared entries) to the diagonal of L, and re-solves
# with the previous eigenvectors as the starting block. Note that L, vals and
# vecs are updated in place, so re-running this cell continues from the
# current state rather than starting over.
learning_rate = 10
# Float literal on purpose: with an integer n, `5/n` floor-divides to 0 under
# Python 2, and then the stopping test below could never succeed.
threshold = 5.0 / n

ipr = (vecs**4).sum(axis=0)
print("Max ipr is %f" % np.max(ipr))

for i in range(100):
    print("Iteration %d" % i)
    max_ipr_index = np.argmax(ipr)
    update_vec = vecs[:, max_ipr_index]**2 * learning_rate
    update_matrix = scipy.sparse.spdiags(update_vec, [0], n, n, format='csr')
    L = L + update_matrix
    vals, vecs = lobpcg(L, X=vecs, maxiter=100, largest=False, verbosityLevel=1)

    ipr = (vecs**4).sum(axis=0)
    max_ipr = np.max(ipr)
    print("Max ipr is %f" % max_ipr)
    evaluate(vecs)
    # Stop once every eigenvector is below the threshold.
    if max_ipr < threshold:
        break

NameError: name 'vecs' is not defined

In [19]:
# Persist the eigenvectors, eigenvalues and node degrees next to each other,
# one file per array, all sharing the `output_path` prefix (np.save adds the
# ".npy" extension itself).
# NOTE(review): the prefix says dim=50 while the solve above uses dim = 51 --
# confirm which is intended.
output_path = "../data/wiki/win=1_x-laplacin_pow=0.00_dim=50"
for suffix, array in ((".vecs", vecs), (".vals", vals), (".degrees", degrees)):
    np.save(output_path + suffix, array)