In [1]:
import numpy as np
import scipy.sparse
from scipy.sparse.linalg import lobpcg, eigsh
from scipy.sparse import csr_matrix

In [8]:
### Build adjacency matrix from counts
# Step 1: give every vocabulary entry the index of the line it appears on.
print("Building the word2id index..")
vocab_path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.words.vocab"
word2id = {}
with open(vocab_path, 'r') as f:
    for idx, line in enumerate(f):
        # Only the last comma-separated field is dropped, so a word that itself
        # contains commas is reassembled intact.
        fields = line.strip().split(",")
        word2id[",".join(fields[:-1])] = idx

# Step 2: collect (count, row, col) triplets from the pair-count file.
print("Reading the count data...")
N = len(word2id)
counts_path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1"
data, rows, cols = [], [], []

with open(counts_path, 'r') as f:
    for line in f:
        # Every line carries three whitespace-separated tokens: count, word, word.
        count, word_a, word_b = line.strip().split()
        # Resolve both ids before appending so the three lists stay aligned
        # even if a lookup fails.
        row_id, col_id = word2id[word_a], word2id[word_b]

        data.append(float(count))
        rows.append(row_id)
        cols.append(col_id)

# Step 3: assemble the N x N sparse matrix from the triplets.
print("Building the adjacency matrix...")
adjacency_matrix = csr_matrix((data, (rows, cols)), shape=(N, N))
print("Done!")

Building the word2id index..
Reading the count data...
Building the adjacency matrix...
Done!


In [2]:
# Base path of the cached adjacency matrix; the ".npz" suffix is added on load.
adj_path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.adj"
# Uncomment to (re)write the cache from a freshly built matrix:
#scipy.sparse.save_npz(adj_path, adjacency_matrix)
adjacency_matrix = scipy.sparse.load_npz("{}.npz".format(adj_path))

In [3]:
# Per-row totals of the adjacency matrix (displayed as the cell output).
adjacency_matrix.sum(axis=1)

matrix([[  1.88991825e+08],
        [  9.56194520e+07],
        [  8.25782250e+07],
        ..., 
        [  1.03000000e+02],
        [  1.03000000e+02],
        [  1.01000000e+02]])

In [3]:
# Alternative weighting (binary edges), kept for reference:
#adjacency_matrix.data = np.ones_like(adjacency_matrix.data)
# Compress the stored weights with a 0.1 power.
# NOTE: this overwrites the matrix's stored values, so running the cell twice
# applies the power twice.
adjacency_matrix.data = np.power(adjacency_matrix.data, 0.1)

In [5]:
### Find spectral embeddings
# Computes the d+1 smallest eigenpairs of the symmetric normalised Laplacian
#     L_norm = D^{-1/2} (D - A) D^{-1/2}
# with LOBPCG, where A is `adjacency_matrix` and D its diagonal degree matrix.
d=500            # number of eigenpairs wanted in addition to the first column
max_iter=1000    # LOBPCG iteration cap
verbosity=1      # LOBPCG prints per-iteration eigenvalues at this level
seed=0           # seed for the random initial block

# Row sums, kept as the (1, n) matrix that `.sum(axis=1).flatten()` returns:
# other cells transpose `degrees` into a column, so its shape must not change.
degrees = adjacency_matrix.sum(axis=1).flatten()
n = adjacency_matrix.shape[0]

# Plain ndarray view of the same values for element-wise math.
degree_values = np.asarray(degrees)
if not np.all(degree_values > 0):
    # A zero degree would put inf into D^{-1/2} and NaN into L_norm, which
    # LOBPCG then fails on far from the real cause.
    raise ValueError("adjacency_matrix has rows with non-positive degree; "
                     "the normalised Laplacian is undefined for them")

D = scipy.sparse.spdiags(degrees, [0], n, n, format='csr')
L = D - adjacency_matrix
D_inv = scipy.sparse.spdiags(1.0 / degrees, [0], n, n, format='csr')
# np.sqrt replaces scipy.sqrt, a deprecated NumPy alias that newer SciPy
# releases no longer expose.
degrees_sqrt = 1.0 / np.sqrt(degree_values)
DH = scipy.sparse.spdiags(degrees_sqrt, [0], n, n, format='csr')
L_norm = DH.dot(L.dot(DH))
#N = DH.dot(adjacency_matrix.dot(DH))

rng = np.random.RandomState(seed)
init = rng.rand(n, d + 1)
# Seed the first column with sqrt(degree): for a symmetric A this is the
# eigenvector of L_norm with eigenvalue 0, so the solver starts with it exact.
# The row sums are reused instead of being recomputed, and flattened to 1-D so
# the column assignment does not rely on broadcasting a (1, n) matrix.
init[:,0] = np.sqrt(degree_values).ravel()
vals, vecs = lobpcg(A=L_norm, X=init, largest=False, maxiter=max_iter, verbosityLevel=verbosity)

# eigen_scaling = 1.0 / (vals[1:])
# rescaled_eigenvectors = np.sqrt(eigen_scaling) * vecs[:, 1:] / np.sqrt(np.asarray(degrees).T)
# rescaled_eigenvectors = np.ascontiguousarray(rescaled_eigenvectors)

Solving generalized eigenvalue problem with preconditioning

matrix size 189533
block size 501

No constraints


iteration 0
current block size: 500
eigenvalue: [  1.57869665e-15   9.96168366e-01   9.96301968e-01   9.96342566e-01
   9.96390059e-01   9.96441330e-01   9.96459059e-01   9.96487080e-01
   9.96495263e-01   9.96519646e-01   9.96552748e-01   9.96578597e-01
   9.96609938e-01   9.96618002e-01   9.96639895e-01   9.96661260e-01
   9.96675602e-01   9.96687112e-01   9.96703984e-01   9.96714638e-01
   9.96744939e-01   9.96759648e-01   9.96769586e-01   9.96783347e-01
   9.96796962e-01   9.96813231e-01   9.96828684e-01   9.96839109e-01
   9.96865975e-01   9.96882415e-01   9.96903585e-01   9.96911910e-01
   9.96918727e-01   9.96937943e-01   9.96953276e-01   9.96960931e-01
   9.96975789e-01   9.96996372e-01   9.97004348e-01   9.97016080e-01
   9.97022882e-01   9.97034348e-01   9.97041771e-01   9.97061806e-01
   9.97071265e-01   9.97081901e-01   9.97099829e-01   9.97117756e-01
   9.971332

iteration 1
current block size: 500
eigenvalue: [  1.57869665e-15   6.95526172e-01   7.65414637e-01   8.15987855e-01
   8.78894477e-01   8.95838398e-01   9.04304704e-01   9.09643509e-01
   9.18610570e-01   9.20683627e-01   9.25661565e-01   9.29788001e-01
   9.34737812e-01   9.35646226e-01   9.38269556e-01   9.40265215e-01
   9.42090202e-01   9.43540836e-01   9.44168526e-01   9.45875606e-01
   9.47274436e-01   9.47609422e-01   9.47772182e-01   9.48856616e-01
   9.49523712e-01   9.50004882e-01   9.50566587e-01   9.51465129e-01
   9.51886745e-01   9.52243430e-01   9.52899339e-01   9.53686551e-01
   9.54049115e-01   9.54269617e-01   9.54511179e-01   9.54664873e-01
   9.55432204e-01   9.55813662e-01   9.56013925e-01   9.56471637e-01
   9.56496826e-01   9.56684530e-01   9.57383120e-01   9.57635803e-01
   9.57870937e-01   9.58306004e-01   9.58497731e-01   9.58611892e-01
   9.58819608e-01   9.59096471e-01   9.59380532e-01   9.59617538e-01
   9.59795914e-01   9.59978439e-01   9.60119547e-01   9

iteration 2
current block size: 500
eigenvalue: [  1.57869665e-15   5.19774615e-01   5.47225672e-01   6.56410063e-01
   6.76573335e-01   6.94532153e-01   7.06598507e-01   7.22631522e-01
   7.35942309e-01   7.41175138e-01   7.52539981e-01   7.58498213e-01
   7.70559959e-01   7.76145253e-01   7.83706503e-01   7.95088431e-01
   7.99751619e-01   8.03010473e-01   8.05365761e-01   8.11433498e-01
   8.12090875e-01   8.18637773e-01   8.19635435e-01   8.22603914e-01
   8.26442225e-01   8.28811405e-01   8.30272568e-01   8.34214067e-01
   8.35961814e-01   8.38019763e-01   8.39977516e-01   8.41651230e-01
   8.44044551e-01   8.46326619e-01   8.48461375e-01   8.49757399e-01
   8.51946074e-01   8.53424649e-01   8.58934211e-01   8.59844546e-01
   8.62701987e-01   8.63805330e-01   8.64538362e-01   8.66309847e-01
   8.68108886e-01   8.71365549e-01   8.73557432e-01   8.74529462e-01
   8.75558334e-01   8.77094787e-01   8.79350725e-01   8.79912179e-01
   8.80456815e-01   8.81625714e-01   8.82861176e-01   8

iteration 3
current block size: 498
eigenvalue: [  1.57869665e-15   5.16160907e-01   5.42929098e-01   6.49698927e-01
   6.68734226e-01   6.83029298e-01   6.93981955e-01   7.09393795e-01
   7.20688709e-01   7.24787964e-01   7.36118957e-01   7.39662109e-01
   7.49677009e-01   7.56537314e-01   7.60915482e-01   7.69647306e-01
   7.72857467e-01   7.75804627e-01   7.78223354e-01   7.85874452e-01
   7.87882754e-01   7.89949920e-01   7.91549593e-01   7.93652193e-01
   7.97637447e-01   7.98403616e-01   7.99419611e-01   8.01773423e-01
   8.02955387e-01   8.04394341e-01   8.04947970e-01   8.07736595e-01
   8.10762888e-01   8.11917937e-01   8.12993256e-01   8.14759269e-01
   8.16055273e-01   8.18273474e-01   8.21975348e-01   8.24638488e-01
   8.26770222e-01   8.28524667e-01   8.29100674e-01   8.30432751e-01
   8.31094928e-01   8.32698089e-01   8.34181933e-01   8.35453347e-01
   8.37286498e-01   8.38737821e-01   8.40181327e-01   8.40779829e-01
   8.42110289e-01   8.43755465e-01   8.45569854e-01   8

iteration 4
current block size: 477
eigenvalue: [  1.57869665e-15   5.16140885e-01   5.42891060e-01   6.49546775e-01
   6.68531193e-01   6.82711908e-01   6.93616500e-01   7.08945009e-01
   7.20159839e-01   7.24112394e-01   7.35431540e-01   7.38838624e-01
   7.48668858e-01   7.55562652e-01   7.59646823e-01   7.67976473e-01
   7.71065458e-01   7.74175351e-01   7.76401688e-01   7.84067966e-01
   7.86107962e-01   7.87880083e-01   7.89409207e-01   7.91413755e-01
   7.95177298e-01   7.95765205e-01   7.96842685e-01   7.98875139e-01
   8.00060167e-01   8.00817088e-01   8.01788832e-01   8.04592563e-01
   8.07663429e-01   8.08257075e-01   8.09409517e-01   8.10957728e-01
   8.12425572e-01   8.14754000e-01   8.17811924e-01   8.20090735e-01
   8.22043933e-01   8.23981003e-01   8.24564358e-01   8.25677336e-01
   8.26365088e-01   8.27529157e-01   8.29167028e-01   8.29881690e-01
   8.31635970e-01   8.32668896e-01   8.34219718e-01   8.35080693e-01
   8.36384820e-01   8.36730712e-01   8.38904362e-01   8

iteration 5
current block size: 414
eigenvalue: [  1.57869665e-15   5.16138828e-01   5.42888123e-01   6.49544241e-01
   6.68527422e-01   6.82705622e-01   6.93608124e-01   7.08933971e-01
   7.20145123e-01   7.24090850e-01   7.35410891e-01   7.38812641e-01
   7.48636817e-01   7.55530429e-01   7.59602860e-01   7.67904159e-01
   7.70999939e-01   7.74111090e-01   7.76321982e-01   7.83974141e-01
   7.86031504e-01   7.87788597e-01   7.89310766e-01   7.91301343e-01
   7.94971073e-01   7.95558497e-01   7.96637571e-01   7.98645577e-01
   7.99816541e-01   8.00498719e-01   8.01523326e-01   8.04318961e-01
   8.07385500e-01   8.07937530e-01   8.09072222e-01   8.10584432e-01
   8.12082178e-01   8.14397812e-01   8.17366404e-01   8.19595613e-01
   8.21513293e-01   8.23423681e-01   8.24071452e-01   8.25097535e-01
   8.25790873e-01   8.26961171e-01   8.28518175e-01   8.29232243e-01
   8.30920281e-01   8.31878464e-01   8.33408482e-01   8.34332496e-01
   8.35525052e-01   8.35873306e-01   8.37923004e-01   8

iteration 6
current block size: 324
eigenvalue: [  1.57869665e-15   5.16138180e-01   5.42887220e-01   6.49544025e-01
   6.68527129e-01   6.82705088e-01   6.93607391e-01   7.08932982e-01
   7.20143776e-01   7.24089020e-01   7.35408826e-01   7.38809797e-01
   7.48633024e-01   7.55526691e-01   7.59597733e-01   7.67895740e-01
   7.70991187e-01   7.74101621e-01   7.76313153e-01   7.83961380e-01
   7.86021720e-01   7.87774820e-01   7.89295160e-01   7.91284073e-01
   7.94959459e-01   7.95548760e-01   7.96626387e-01   7.98632876e-01
   7.99803549e-01   8.00480634e-01   8.01507207e-01   8.04302598e-01
   8.07370435e-01   8.07919193e-01   8.09051364e-01   8.10561544e-01
   8.12060504e-01   8.14375575e-01   8.17334919e-01   8.19561532e-01
   8.21477438e-01   8.23384293e-01   8.24033575e-01   8.25056895e-01
   8.25745246e-01   8.26919077e-01   8.28469748e-01   8.29182208e-01
   8.30862853e-01   8.31813838e-01   8.33344366e-01   8.34272356e-01
   8.35453168e-01   8.35802755e-01   8.37835788e-01   8

iteration 7
current block size: 217
eigenvalue: [  1.57869665e-15   5.16136788e-01   5.42886153e-01   6.49543945e-01
   6.68526993e-01   6.82704952e-01   6.93607117e-01   7.08932688e-01
   7.20143415e-01   7.24088544e-01   7.35408407e-01   7.38809274e-01
   7.48632343e-01   7.55525548e-01   7.59596575e-01   7.67894425e-01
   7.70989780e-01   7.74100267e-01   7.76311594e-01   7.83959218e-01
   7.86020108e-01   7.87772718e-01   7.89292823e-01   7.91281792e-01
   7.94958491e-01   7.95547876e-01   7.96625531e-01   7.98631639e-01
   7.99802187e-01   8.00479035e-01   8.01505466e-01   8.04300941e-01
   8.07368957e-01   8.07917261e-01   8.09049549e-01   8.10559068e-01
   8.12058385e-01   8.14373290e-01   8.17331789e-01   8.19557717e-01
   8.21473013e-01   8.23379659e-01   8.24028833e-01   8.25051798e-01
   8.25739556e-01   8.26913704e-01   8.28462485e-01   8.29175999e-01
   8.30855000e-01   8.31804543e-01   8.33334574e-01   8.34264178e-01
   8.35442696e-01   8.35792683e-01   8.37823617e-01   8

iteration 8
current block size: 117
eigenvalue: [  1.57869665e-15   5.16136216e-01   5.42885418e-01   6.49543882e-01
   6.68526906e-01   6.82704745e-01   6.93606803e-01   7.08932392e-01
   7.20142955e-01   7.24088019e-01   7.35407737e-01   7.38808536e-01
   7.48631213e-01   7.55524378e-01   7.59595118e-01   7.67891980e-01
   7.70987127e-01   7.74097393e-01   7.76309032e-01   7.83956215e-01
   7.86017361e-01   7.87769151e-01   7.89289227e-01   7.91277791e-01
   7.94958244e-01   7.95547619e-01   7.96625300e-01   7.98631319e-01
   7.99801896e-01   8.00478751e-01   8.01505163e-01   8.04300558e-01
   8.07368565e-01   8.07916849e-01   8.09049095e-01   8.10558453e-01
   8.12057920e-01   8.14372746e-01   8.17331071e-01   8.19556957e-01
   8.21472400e-01   8.23378862e-01   8.24027935e-01   8.25050742e-01
   8.25738372e-01   8.26912612e-01   8.28461145e-01   8.29174781e-01
   8.30853630e-01   8.31803389e-01   8.33333053e-01   8.34262752e-01
   8.35440783e-01   8.35791164e-01   8.37821597e-01   8

iteration 9
current block size: 43
eigenvalue: [  1.57869665e-15   5.16135960e-01   5.42885137e-01   6.49543863e-01
   6.68526867e-01   6.82704692e-01   6.93606720e-01   7.08932274e-01
   7.20142829e-01   7.24087854e-01   7.35407505e-01   7.38808307e-01
   7.48630940e-01   7.55524002e-01   7.59594583e-01   7.67891187e-01
   7.70986443e-01   7.74096541e-01   7.76308163e-01   7.83955288e-01
   7.86016529e-01   7.87767718e-01   7.89287987e-01   7.91276550e-01
   7.94958097e-01   7.95547444e-01   7.96625187e-01   7.98631115e-01
   7.99801649e-01   8.00478543e-01   8.01504909e-01   8.04300300e-01
   8.07368322e-01   8.07916568e-01   8.09048846e-01   8.10558132e-01
   8.12057579e-01   8.14372383e-01   8.17330667e-01   8.19556402e-01
   8.21471892e-01   8.23378241e-01   8.24027374e-01   8.25049978e-01
   8.25737622e-01   8.26911903e-01   8.28459913e-01   8.29174048e-01
   8.30852692e-01   8.31802374e-01   8.33331828e-01   8.34261472e-01
   8.35439557e-01   8.35789835e-01   8.37820099e-01   8.

iteration 10
current block size: 5
eigenvalue: [  1.57869665e-15   5.16135872e-01   5.42885042e-01   6.49543859e-01
   6.68526859e-01   6.82704679e-01   6.93606696e-01   7.08932246e-01
   7.20142793e-01   7.24087809e-01   7.35407455e-01   7.38808251e-01
   7.48630823e-01   7.55523900e-01   7.59594433e-01   7.67890904e-01
   7.70986258e-01   7.74096177e-01   7.76307891e-01   7.83954910e-01
   7.86016219e-01   7.87767263e-01   7.89287615e-01   7.91276295e-01
   7.94958074e-01   7.95547417e-01   7.96625160e-01   7.98631056e-01
   7.99801597e-01   8.00478503e-01   8.01504848e-01   8.04300236e-01
   8.07368256e-01   8.07916479e-01   8.09048748e-01   8.10558045e-01
   8.12057461e-01   8.14372296e-01   8.17330570e-01   8.19556314e-01
   8.21471791e-01   8.23378126e-01   8.24027213e-01   8.25049853e-01
   8.25737392e-01   8.26911673e-01   8.28459728e-01   8.29173847e-01
   8.30852482e-01   8.31802167e-01   8.33331672e-01   8.34261125e-01
   8.35439185e-01   8.35789557e-01   8.37819863e-01   8.

iteration 11
current block size: 1
eigenvalue: [  1.57869665e-15   5.16135868e-01   5.42885025e-01   6.49543859e-01
   6.68526859e-01   6.82704678e-01   6.93606695e-01   7.08932244e-01
   7.20142787e-01   7.24087807e-01   7.35407454e-01   7.38808247e-01
   7.48630812e-01   7.55523883e-01   7.59594430e-01   7.67890901e-01
   7.70986247e-01   7.74096169e-01   7.76307846e-01   7.83954894e-01
   7.86016202e-01   7.87767253e-01   7.89287560e-01   7.91276283e-01
   7.94958070e-01   7.95547412e-01   7.96625158e-01   7.98631053e-01
   7.99801593e-01   8.00478501e-01   8.01504840e-01   8.04300230e-01
   8.07368253e-01   8.07916477e-01   8.09048743e-01   8.10558035e-01
   8.12057457e-01   8.14372288e-01   8.17330562e-01   8.19556306e-01
   8.21471775e-01   8.23378112e-01   8.24027210e-01   8.25049842e-01
   8.25737385e-01   8.26911671e-01   8.28459676e-01   8.29173833e-01
   8.30852466e-01   8.31802160e-01   8.33331665e-01   8.34261107e-01
   8.35439179e-01   8.35789527e-01   8.37819856e-01   8.

iteration 12
final eigenvalue: [  1.57869665e-15   5.16135868e-01   5.42885025e-01   6.49543859e-01
   6.68526859e-01   6.82704678e-01   6.93606695e-01   7.08932244e-01
   7.20142787e-01   7.24087807e-01   7.35407454e-01   7.38808247e-01
   7.48630812e-01   7.55523883e-01   7.59594430e-01   7.67890899e-01
   7.70986246e-01   7.74096169e-01   7.76307840e-01   7.83954894e-01
   7.86016196e-01   7.87767240e-01   7.89287552e-01   7.91276282e-01
   7.94958070e-01   7.95547412e-01   7.96625158e-01   7.98631052e-01
   7.99801593e-01   8.00478499e-01   8.01504838e-01   8.04300229e-01
   8.07368252e-01   8.07916476e-01   8.09048743e-01   8.10558035e-01
   8.12057456e-01   8.14372288e-01   8.17330562e-01   8.19556304e-01
   8.21471775e-01   8.23378111e-01   8.24027199e-01   8.25049837e-01
   8.25737385e-01   8.26911666e-01   8.28459674e-01   8.29173833e-01
   8.30852465e-01   8.31802155e-01   8.33331664e-01   8.34261107e-01
   8.35439178e-01   8.35789524e-01   8.37819850e-01   8.39279356e-01
   

final residual norms: [  8.48442221e-14   2.64603876e-03   2.25881837e-03   5.13593940e-04
   6.08089458e-04   8.05949614e-04   9.27192118e-04   9.92331830e-04
   1.20006481e-03   1.25777215e-03   1.34258457e-03   1.42076361e-03
   1.67372707e-03   1.74046359e-03   1.97155049e-03   2.32111072e-03
   2.46154633e-03   2.36973006e-03   2.59106192e-03   2.48943101e-03
   2.64206421e-03   2.91467363e-03   2.76988337e-03   2.88599325e-03
   7.65573633e-04   7.93533713e-04   7.53262206e-04   9.21731430e-04
   8.78981105e-04   8.98292326e-04   9.75356604e-04   9.29262390e-04
   1.02352937e-03   1.01525771e-03   1.02605842e-03   1.05785951e-03
   1.07202594e-03   1.10373381e-03   1.25397772e-03   1.26427715e-03
   1.41036758e-03   1.44252024e-03   1.32566256e-03   1.45432449e-03
   1.44764162e-03   1.51521198e-03   1.64358577e-03   1.56023439e-03
   1.65172223e-03   1.67461554e-03   1.79614721e-03   1.80304429e-03
   1.93677774e-03   1.87564234e-03   2.03840001e-03   1.98861682e-03
   2.0246571

In [6]:
# save_path = "/Users/i.lobov/hyperwords/data/wiki/spectral_embeddings_d=300.words"
# np.save(save_path, rescaled_eigenvectors)

# save_path = "/Users/i.lobov/hyperwords/data/wiki/eigenvectors_pow=0.1_d=500.words"
# np.save(save_path, vecs)

# save_path = "/Users/i.lobov/hyperwords/data/wiki/eigenvalues_pow=0.1_d=500.words"
# np.save(save_path, vals)

In [188]:
# Drop the first eigenpair, then scale column k by 1/sqrt(vals[k]) and each row
# by 1/sqrt(degree of that row).
eigen_scaling = 1.0 / (vals[1:])
column_scale = np.sqrt(eigen_scaling)
# `degrees` is expected to be the (1, n) row-sum matrix; transposing turns it
# into a column so the division applies per row.
row_scale = np.sqrt(np.asarray(degrees).T)
rescaled_eigenvectors = np.ascontiguousarray(column_scale * vecs[:, 1:] / row_scale)

# Bring every row to unit Euclidean length.
row_norms = np.linalg.norm(rescaled_eigenvectors, axis=1, keepdims=True)
rescaled_eigenvectors /= row_norms

# np.save appends ".npy" because the path does not already end with it.
save_path = "/Users/i.lobov/hyperwords/data/wiki/eigenscaled_d=500.words"
np.save(save_path, rescaled_eigenvectors)

In [4]:
# Row sums of the adjacency matrix currently in memory (independent of the
# two loads below, so computed first).
degrees = adjacency_matrix.sum(axis=1).flatten()

# Previously saved eigenvectors ...
save_path = "/Users/i.lobov/hyperwords/data/wiki/eigenvectors_pow=0.1_d=500.words.npy"
vecs = np.load(save_path)

# ... and the matching eigenvalues.
save_path = "/Users/i.lobov/hyperwords/data/wiki/eigenvalues_pow=0.1_d=500.words.npy"
vals = np.load(save_path)

In [5]:
# Same rescaling as the saved "eigenscaled" embeddings, without the final row
# normalisation: column k is multiplied by 1/sqrt(vals[k]) and every row is
# divided by sqrt(degree).
eigen_scaling = 1.0 / (vals[1:])
column_scale = np.sqrt(eigen_scaling)
# assumes `degrees` is a (1, n) matrix so that .T yields a column
row_scale = np.sqrt(np.asarray(degrees).T)
rescaled_eigenvectors = np.ascontiguousarray(column_scale * vecs[:, 1:] / row_scale)

In [6]:
# Word-similarity benchmark: three whitespace-separated tokens per line
# (two words and a score), lower-cased before parsing.
path = "/Users/i.lobov/hyperwords/testsets/ws/luong_rare.txt"
test = []
with open(path) as f:
    test.extend(
        ((x, y), float(sim))
        for x, y, sim in (line.strip().lower().split() for line in f)
    )

In [7]:
# NOTE(review): `representations` is a project-local package (presumably the
# hyperwords repo, given the data paths) and must be importable from the
# notebook's working directory.
from representations.matrix_serializer import load_vocabulary

# Same vocabulary file that the adjacency-building cell reads as `vocab_path`.
path = "/Users/i.lobov/hyperwords/data/wiki/wikipedia.corpus.nodups_counts_win=1.words.vocab"
# Later cells use `wi` as a word -> row-index lookup (`wi[w]`, `w in wi`) and
# `iw` as the reverse index -> word lookup (`iw[i]`).
wi, iw = load_vocabulary(path)

array([  2.72080744e+05,   2.53217468e+05,   2.79508573e+05, ...,
         8.57843653e+00,   4.26408305e+01,   1.19142927e+01])

In [21]:
# Word-similarity evaluation: Spearman correlation between cosine similarities
# of the spectral embeddings and the gold scores in `test`.
# Imported from the public namespace: `scipy.stats.stats` is a deprecated
# private module.
from scipy.stats import spearmanr

#m = rescaled_eigenvectors.copy()
# Embedding matrix: all eigenvectors except the first, rows scaled to unit
# length so that a dot product between rows is a cosine similarity.
m = vecs[:,1:].copy()
m = m / np.linalg.norm(m, axis=1, keepdims=True)
dim = m.shape[1]
# Only used by the commented-out log-based similarity below.
C = degrees.sum() / 10.0

def represent(w):
    """Return the embedding row for word `w`, or a zero vector if `w` is not in `wi`."""
    if w in wi:
        return m[wi[w], :]
    else:
        return np.zeros(dim)

# Alternative (disabled) similarity based on logs of scaled inner products:
# def similarity(w1, w2):
#     rep_w1 = represent(w1)
#     rep_w2 = represent(w2)
#     inner_product = np.log(max(C * rep_w1.dot(rep_w2), 1.0))
#     w1_l2 = np.log(C * rep_w1.dot(rep_w1))
#     w2_l2 = np.log(C * rep_w2.dot(rep_w2))
#     #print(inner_product, rep_w1.dot(rep_w2))
    
#     return inner_product/np.sqrt(w1_l2*w2_l2)

def similarity(w1, w2):
    """Dot product of the two words' embeddings (cosine, since rows of `m` are unit length)."""
    return represent(w1).dot(represent(w2))

# Score only the pairs whose two words are both in the vocabulary; count the rest.
results = []
not_known = 0
for (x, y), sim in test:
    if x not in wi or y not in wi:
        not_known += 1
    else:
        results.append((similarity(x, y), sim))
        
actual, expected = zip(*results)
# Displayed result: (Spearman rho, fraction of test pairs skipped as out-of-vocabulary).
spearmanr(actual, expected)[0], not_known / len(test)

(0.41475239343495807, 0.3352999016715831)

In [175]:
# Average of the first `window` powers of (1 - eigenvalue), skipping the first
# eigenvalue:  eigenvals[k] = (1/window) * sum_{r=1..window} (1 - vals[k+1])**r
window = 5

one_minus_vals = 1 - vals[1:]
eigenvals = np.zeros_like(vals[1:])
for power in range(1, window+1):
    eigenvals = eigenvals + one_minus_vals**power
eigenvals = eigenvals / window

# Analogy task

In [66]:
# Embeddings for the analogy task: every eigenvector column except the first,
# copied so `vecs` stays untouched, then each row divided by its L2 norm.
m = vecs[:, 1:].copy()
m /= np.linalg.norm(m, axis=1, keepdims=True)

In [21]:
# Analogy questions: each line is lower-cased and split into its tokens.
path = "../testsets/analogy/google.txt"
with open(path) as f:
    test = [line.strip().lower().split() for line in f]

# Sorted list of every distinct token in the questions, plus lookups both ways:
# xi maps token -> position in `vocab`; ix is the position -> token list itself.
vocab = sorted({token for analogy in test for token in analogy})
xi = {token: position for position, token in enumerate(vocab)}
ix = vocab

In [67]:
# One embedding row per analogy-vocabulary token.
vocab_representation = np.zeros((len(vocab), m.shape[1]))
for row, token in enumerate(vocab):
    if token not in wi:
        continue  # tokens missing from the embedding vocabulary stay all-zero
    vocab_representation[row] = m[wi[token]]

# Dot product of every analogy token with every embedded word, then mapped
# through (x + 1) / 2.
sims = vocab_representation.dot(m.T)
sims += 1
sims /= 2

In [70]:
# Additive analogy evaluation: for "a is to a_ as b is to ?", score every
# embedded word by  -sim(a, .) + sim(a_, .) + sim(b, .)  and predict the argmax.
correct_add = 0.0
for i, (a, a_, b, b_) in enumerate(test):
    sa = sims[xi[a]]
    sa_ = sims[xi[a_]]
    sb = sims[xi[b]]

    # The unary minus allocates a new array, so the masking below never
    # touches the rows of `sims`.
    add_sim = -sa+sa_+sb
#     add = -vocab_representation[xi[a]]
#     add += vocab_representation[xi[a_]]
#     add += vocab_representation[xi[b]]
#     add_sim = m.dot(add)

    # Exclude the three query words from the candidates. Scores can be
    # negative, so masking with 0 would still let a query word win whenever
    # every other score is <= 0; -inf can never be the argmax.
    for query_word in (a, a_, b):
        if query_word in wi:
            add_sim[wi[query_word]] = -np.inf
    b_add = iw[np.argmax(add_sim)]
    if b_add == b_:
        correct_add += 1
        
    # Progress: questions processed so far and running number of correct answers.
    if i % 1000 == 0: print(i, correct_add)
        
# Displayed result: fraction of questions answered correctly.
correct_add / len(test)

0 1.0
1000 584.0
2000 1141.0
3000 1609.0
4000 2067.0
5000 2611.0


KeyboardInterrupt: 

In [68]:
# Number of entries in the currently loaded `test` list — the denominator of
# the accuracy computed in the analogy loop.
len(test)

19544

In [4]:
import re
import shutil
import subprocess


def total_rss_bytes(ps_output):
    """Sum the first column of `ps -o rss,...` output and return it in bytes.

    The first line is treated as the header and skipped. The values are
    multiplied by 1024 (ps reports RSS in kilobytes). Rows whose first field
    is not a number (blank lines, stray text) are ignored.
    """
    total = 0.0
    for row in ps_output.split('\n')[1:]:
        fields = re.split(r'\s+', row.strip())
        try:
            total += float(fields[0]) * 1024
        except ValueError:
            continue  # not a data row
    return total


def parse_vm_stat(vm_output, default_page_size=4096):
    """Parse `vm_stat` output into a dict mapping counter name -> bytes.

    The page size is read from the header line ("... (page size of N bytes)")
    instead of being assumed, because it is not 4096 on every machine;
    `default_page_size` is used only when the header does not state it.
    Lines that are not of the form "name: <integer>[.]" are skipped.
    """
    lines = vm_output.split('\n')
    header_match = re.search(r'page size of (\d+) bytes', lines[0])
    page_size = int(header_match.group(1)) if header_match else default_page_size

    stats = {}
    for row in lines[1:]:
        parts = re.split(r':\s+', row.strip(), maxsplit=1)
        if len(parts) != 2:
            continue
        try:
            pages = int(parts[1].strip('.'))  # counters are printed with a trailing dot
        except ValueError:
            continue
        stats[parts[0]] = pages * page_size
    return stats


if shutil.which('vm_stat') is None:
    # vm_stat is a macOS tool; report that instead of failing the whole notebook run.
    print('vm_stat not found: this memory report is only available on macOS')
else:
    # Get process info
    ps = subprocess.run(['ps', '-caxm', '-orss,comm'], stdout=subprocess.PIPE).stdout.decode()
    vm = subprocess.run(['vm_stat'], stdout=subprocess.PIPE).stdout.decode()

    rssTotal = total_rss_bytes(ps)  # bytes
    vmStats = parse_vm_stat(vm)     # bytes per counter

    print('Wired Memory:\t\t%d MB' % ( vmStats["Pages wired down"]/1024/1024))
    print('Active Memory:\t\t%d MB' % ( vmStats["Pages active"]/1024/1024 ))
    print('Inactive Memory:\t%d MB' % ( vmStats["Pages inactive"]/1024/1024 ))
    print('Free Memory:\t\t%d MB' % ( vmStats["Pages free"]/1024/1024 ))
    print('Real Mem Total (ps):\t%.3f MB' % ( rssTotal/1024/1024 ))

Wired Memory:		2509 MB
Active Memory:		4920 MB
Inactive Memory:	2941 MB
Free Memory:		4866 MB
Real Mem Total (ps):	9195.434 MB
