In [34]:
import logging
import time
from sparsesvd import sparsesvd

import numpy as np
import os
from docopt import docopt
from scipy.sparse import csr_matrix, dok_matrix, load_npz
from sklearn.preprocessing import normalize
from scipy import linalg

from representations.matrix_serializer import save_vocabulary, load_vocabulary
from utils.randomized import randomized_eigh, normalized_embedder

In [35]:
def load_adjacency_matrix(counts_path):
    """Load the sparse adjacency (co-occurrence) matrix stored under ``counts_path``.

    Two on-disk layouts are supported:

    * ``<counts_path>.adjacency.npz`` -- a scipy sparse matrix, read with
      ``load_npz`` and returned as stored (format and dtype are whatever
      the file contains).
    * ``<counts_path>.data.npz``, ``.row_inds.npz`` and ``.col_inds.npz`` --
      three NumPy archives whose ``arr_0`` entries hold the values and the
      row/column indices; they are assembled into a float64 ``csr_matrix``.

    Args:
        counts_path: Path prefix (string) shared by the files above.

    Returns:
        A scipy sparse matrix.
    """
    cached_path = counts_path + ".adjacency.npz"
    if os.path.exists(cached_path):
        return load_npz(cached_path)

    def read_array(suffix):
        # np.load on an .npz file returns an NpzFile that keeps the archive
        # open; the context manager closes it as soon as the array is read.
        with np.load(counts_path + suffix) as archive:
            return archive["arr_0"]

    data = read_array(".data.npz")
    row_inds = read_array(".row_inds.npz")
    col_inds = read_array(".col_inds.npz")
    # No explicit shape is given, so it is inferred from the largest indices.
    return csr_matrix((data, (row_inds, col_inds)), dtype=np.float64)


def build_ppmi_matrix(adjacency_matrix, cds, neg, pos):
    """Build a shifted positive PMI matrix from a sparse co-occurrence matrix.

    With ``w`` the row sums of ``adjacency_matrix`` and ``total`` their sum,
    every stored entry ``a[i, j]`` is mapped to::

        max(0, log(a[i, j] * total / (w[i] * w[j] ** cds)) - log(neg) + log(pos))

    Entries that end up at zero are removed from the sparse structure.

    The context sums are derived from the *row* sums, which presumably
    relies on the matrix being symmetric -- confirm against the caller.

    Args:
        adjacency_matrix: scipy sparse matrix of co-occurrence counts. It is
            not modified.
        cds: Exponent applied to the context sums.
        neg: Shift subtracted as ``log(neg)``.
        pos: Shift added as ``log(pos)``.

    Returns:
        A new sparse matrix with non-negative entries.
    """
    # Cast to float64: np.reciprocal / division on integer arrays performs
    # integer division, which would turn every sum > 1 into 0 when the
    # matrix on disk has an integer dtype.
    sum_w = np.asarray(adjacency_matrix.sum(axis=1), dtype=np.float64).flatten()
    sum_c = sum_w ** cds
    sum_total = sum_w.sum()

    # Reciprocals with 1/0 mapped to 0 instead of inf: a zero sum then simply
    # contributes no PMI mass rather than producing inf entries and warnings.
    inv_w = np.zeros_like(sum_w)
    nonzero_w = sum_w != 0
    inv_w[nonzero_w] = 1.0 / sum_w[nonzero_w]

    inv_c = np.zeros_like(sum_c)
    nonzero_c = sum_c != 0
    inv_c[nonzero_c] = 1.0 / sum_c[nonzero_c]

    pmi = multiply_by_rows(adjacency_matrix, inv_w)
    pmi = multiply_by_columns(pmi, inv_c)
    pmi = pmi * sum_total

    # Stored zeros would become -inf under log and be clipped to 0 anyway;
    # dropping them first gives the same result without evaluating log(0).
    pmi.eliminate_zeros()
    pmi.data = np.log(pmi.data)

    pmi.data = pmi.data - np.log(neg) + np.log(pos)
    pmi.data[pmi.data < 0] = 0
    pmi.eliminate_zeros()

    return pmi


def multiply_by_rows(matrix, row_coefs):
    """Scale row ``i`` of ``matrix`` by ``row_coefs[i]``.

    The scaling is carried out as a left-multiplication by the square
    diagonal matrix whose diagonal is ``row_coefs``.

    Args:
        matrix: Matrix to scale; it must have ``len(row_coefs)`` rows.
        row_coefs: One coefficient per row.

    Returns:
        The product ``diag(row_coefs) . matrix``.
    """
    size = len(row_coefs)
    diagonal = dok_matrix((size, size))
    diagonal.setdiag(row_coefs)
    row_scaler = diagonal.tocsr()
    return row_scaler.dot(matrix)


def multiply_by_columns(matrix, col_coefs):
    """Scale column ``j`` of ``matrix`` by ``col_coefs[j]``.

    The scaling is carried out as a right-multiplication by the square
    diagonal matrix whose diagonal is ``col_coefs``.

    Args:
        matrix: Matrix to scale; it must have ``len(col_coefs)`` columns.
        col_coefs: One coefficient per column.

    Returns:
        The product ``matrix . diag(col_coefs)``.
    """
    size = len(col_coefs)
    diagonal = dok_matrix((size, size))
    diagonal.setdiag(col_coefs)
    col_scaler = diagonal.tocsr()
    return matrix.dot(col_scaler)

def orthogonalize_normalize(A):
    """Orthogonalize the columns of ``A`` via QR, then scale every row to unit L2 norm.

    Note that after the row scaling the columns are in general no longer
    orthonormal; the result is the Q factor with unit-length rows.

    Args:
        A: 2-D array. It is not modified (``linalg.qr`` works on a copy by
            default and the scaling is applied to the returned Q factor).

    Returns:
        Array with the shape of the economic Q factor of ``A``. Rows of Q
        that are exactly zero are returned as zero rows instead of NaN.
    """
    Q, _ = linalg.qr(A, mode='economic')
    row_norms = np.linalg.norm(Q, axis=1, keepdims=True)
    # An all-zero row (e.g. produced by an all-zero input row) would give
    # 0 / 0 = NaN, and those NaNs would spread through later matrix products.
    row_norms[row_norms == 0] = 1.0
    Q /= row_norms
    return Q

In [4]:
# Experiment configuration and data loading.
# NOTE(review): counts_path is an absolute path on one machine, while the
# save cell further down writes to "../data/..."; consider a single
# configurable data directory so the notebook runs elsewhere.
counts_path = "/Users/i.lobov/hyperwords/data/wikipedia.corpus.nodups.clean_win=2_thr=100"
dim = 100  # number of columns of the random basis Q built in the cells below
neg = 1  # enters the PPMI shift as -log(neg)
pos = 1  # enters the PPMI shift as +log(pos)
cds = 0.75  # exponent applied to the context sums in build_ppmi_matrix

# NOTE(review): `iw` is not used by any cell visible in this notebook.
_, iw = load_vocabulary(counts_path + '.words.vocab')
adjacency_matrix = load_adjacency_matrix(counts_path)
ppmi = build_ppmi_matrix(adjacency_matrix, cds, neg, pos)

In [37]:
# Baseline: five steps of subspace (power) iteration on the PPMI matrix with
# its diagonal zeroed. Note that setdiag modifies `ppmi` in place.
ppmi.setdiag(0)
rng = np.random.RandomState(0)  # fixed seed so the starting basis is reproducible
Q = orthogonalize_normalize(rng.normal(size=(ppmi.shape[0], dim)))

for i in range(5):
    res = ppmi * Q  # sparse-matrix times dense-matrix product
    # Median row norm of the product, printed once per step as a progress indicator.
    print(np.median(np.linalg.norm(res, axis=1)))
    Q = orthogonalize_normalize(res)

129.91264221420187
482.39175861311213
485.3392797281142
540.1761688992946
559.4338753559784


In [56]:
# Experiment: power iteration with a per-row negative diagonal shift.
# setdiag modifies `ppmi` in place; resetting the diagonal first makes the
# cell start from the same matrix every time it is re-run.
ppmi.setdiag(0)
rng = np.random.RandomState(0)
Q = orthogonalize_normalize(rng.normal(size=(ppmi.shape[0], dim)))
update = np.zeros(ppmi.shape[0])

# Warm-up: one multiplication to measure the row norms of the product, then
# place -0.3 * norm on the diagonal of ppmi.
for i in range(1):
    res = ppmi * Q if i % 2 == 0 else ppmi.T * Q
    norms = np.linalg.norm(res, axis=1)
    # Reuse `norms` instead of computing the row norms a second time.
    print(np.median(norms))
    update -= 0.3 * norms
    ppmi.setdiag(update)
    Q = orthogonalize_normalize(res)

# Restart from a fresh random basis (drawn from the same rng stream) on the
# shifted matrix.
Q = orthogonalize_normalize(rng.normal(size=(ppmi.shape[0], dim)))

# Alternate between ppmi (even steps) and its transpose (odd steps).
for i in range(6):
    res = ppmi * Q if i % 2 == 0 else ppmi.T * Q
    norms = np.linalg.norm(res, axis=1)
    print(np.median(norms))
    Q = orthogonalize_normalize(res)

129.91264221420187
135.4521668485238
335.2389313120066
455.28873494562754
423.66391928993676
518.1042764569506
446.2348474790255


In [57]:
# Save the current basis Q as a .npy file.
# NOTE(review): the hyper-parameters in the file name (dim, neg, pos, cds,
# power_iter=3) are typed by hand rather than derived from the variables
# above -- confirm they match the run that produced Q.
output_path = "../data/wikipedia.corpus.nodups.clean_win=2_thr=100_" + \
    "svd_dim=100_neg=1_pos=1_cds=0.75_normalized_power_iter=3"
np.save(output_path + '.vecs.npy', Q)

In [29]:
# Row-wise L2 norms of the most recent product `res`.
# NOTE(review): this cell carries execution count 29, lower than the cells
# above it, so the displayed output may come from an earlier kernel state.
np.linalg.norm(res, axis=1)

array([10465.99276693, 14126.05859754, 14250.2168913 , ...,
        1190.98663033,  1087.13432493,   686.70041216])

In [31]:
# Inspect the diagonal shift vector, scaled by 10.
# NOTE(review): the displayed values are positive, whereas the cell that
# defines `update` above only subtracts norms from zeros; the output appears
# to come from an earlier version of that cell (execution count 31).
update * 10

array([ 7890.98274729,  9631.81881784, 11041.07314667, ...,
         795.91483903,   789.87678825,   529.25342911])

In [22]:
# Inspect the diagonal currently stored in `ppmi` (set in place by setdiag).
# NOTE(review): execution count 22 precedes the cells above, so the output
# may not reflect the current state of `ppmi`.
ppmi.diagonal()

array([ -79.5780177 ,  -95.37280851, -111.31386341, ...,   -7.86358397,
         -7.90179061,   -5.3329202 ])