In [8]:
import numpy as np
from scipy.sparse import csr_matrix, dok_matrix
import scipy.sparse
from scipy.sparse import linalg as slinalg

# Path of the sparse matrix stored in SciPy's .npz format.
adjacency_path = "./data/netflix.adjacency.npz"
# Load it back as a scipy.sparse matrix; every later cell derives from this.
# NOTE(review): rows are indexed as users and columns as movies further down
# (see the `user_ids, movie_ids = ...nonzero()` cell) -- confirm against the
# code that produced this file.
adjacency_matrix = scipy.sparse.load_npz(adjacency_path)

In [12]:
# Keep only the rows (users) whose entries sum to at least 5.
users_mask = np.ravel(adjacency_matrix.sum(axis=1)) >= 5
adjacency_matrix = adjacency_matrix[users_mask]

In [55]:
# rng = np.random.RandomState(2)
# user_indices = rng.randint(0, adjacency_matrix.shape[0], size=10000)
# movie_indices = rng.randint(0, adjacency_matrix.shape[1], size=5000)

# adjacency_matrix = adjacency_matrix[user_indices, :][:, movie_indices]

In [13]:
# Build the test split: hold out one randomly chosen stored entry per user.
user_ids, movie_ids = adjacency_matrix.nonzero()

# One (user, movie) pair per row, so shuffling rows keeps each pair intact.
indices = np.column_stack((user_ids, movie_ids))

# Fixed seed keeps the split reproducible between runs.
rng = np.random.RandomState(10)
rng.shuffle(indices)

# For every user, np.unique reports the position of its first appearance in
# the shuffled order.  That is the same pair a sequential scan keeping "the
# first pair seen per user" would select, but it avoids a Python-level loop
# over every stored entry and per-element writes into a dict-backed matrix.
first_positions = np.unique(indices[:, 0], return_index=True)[1]
held_out = indices[first_positions]

# NOTE(review): held-out entries are stored as 1, which assumes the matrix is
# binary.  If the stored values are weights (e.g. ratings), subtracting this
# matrix from the full one leaves `value - 1` behind in the training data --
# confirm the contents of the input file.
adjacency_test = csr_matrix(
    (
        np.ones(len(held_out), dtype=np.float32),
        (held_out[:, 0], held_out[:, 1]),
    ),
    shape=adjacency_matrix.shape,
    dtype=np.float32,
)

In [14]:
# Training data is everything that was not held out for testing.
adjacency_train = adjacency_matrix - adjacency_test
# The subtraction leaves explicitly stored zeros where entries cancelled;
# drop them so they no longer count as stored entries.
adjacency_train.eliminate_zeros()

# Discard users whose training row sums to zero, applying the identical row
# selection to the test matrix so both stay aligned.
nonzero_users_mask = np.asarray(adjacency_train.sum(axis=1)).ravel() != 0
adjacency_train, adjacency_test = (
    adjacency_train[nonzero_users_mask],
    adjacency_test[nonzero_users_mask],
)

In [15]:
# Sanity check: train and test must describe the same users and movies.
assert adjacency_train.shape == adjacency_test.shape

In [23]:
def get_pmi_matrix(adjacency, cds, neg, pos):
    """Compute a shifted PMI transform of the stored entries of ``adjacency``.

    Each stored entry ``a[i, j]`` is replaced by::

        log(a[i, j] * total / (row_sum[i] * col_sum[j] ** cds))
            - log(neg) + log(pos)

    where ``row_sum`` / ``col_sum`` are the row and column sums of
    ``adjacency`` and ``total`` is the sum of ``col_sum ** cds``.  Entries
    that are not stored in the sparse matrix are left untouched (implicit
    zeros), so the logarithm is only taken over stored values.

    Parameters
    ----------
    adjacency : scipy.sparse matrix
        Input matrix.  Stored values are expected to be positive; an
        explicitly stored zero yields ``-inf`` after the logarithm.
    cds : float
        Exponent applied to the column sums before normalisation
        (``1`` leaves them unchanged).
    neg, pos : positive numbers
        Constant shift: ``log(pos) - log(neg)`` is added to every stored
        entry.

    Returns
    -------
    scipy.sparse matrix
        Result of scaling ``adjacency`` by rows and columns, with the
        transformed values in its ``data`` array.
    """
    # Work in float64: np.reciprocal on an integer array performs integer
    # division and would silently turn every sum larger than 1 into 0.
    # ravel() copes with both the (n, 1) and (1, n) results of sparse sums.
    sum_w = np.asarray(adjacency.sum(axis=1), dtype=np.float64).ravel()
    sum_c = np.asarray(adjacency.sum(axis=0), dtype=np.float64).ravel()
    if cds != 1:
        sum_c = sum_c ** cds
    sum_total = sum_c.sum()

    # Rows/columns with a zero sum get a coefficient of 0 rather than inf;
    # this avoids divide-by-zero warnings for empty rows and columns.
    inv_w = np.divide(1.0, sum_w, out=np.zeros_like(sum_w), where=sum_w != 0)
    inv_c = np.divide(1.0, sum_c, out=np.zeros_like(sum_c), where=sum_c != 0)

    pmi = multiply_by_rows(adjacency, inv_w)
    pmi = multiply_by_columns(pmi, inv_c)
    pmi = pmi * sum_total
    # Only the stored entries are transformed, keeping the matrix sparse.
    pmi.data = np.log(pmi.data) - np.log(neg) + np.log(pos)
    return pmi

def multiply_by_rows(matrix, row_coefs):
    """Scale row ``i`` of ``matrix`` by ``row_coefs[i]``.

    Implemented as a left-multiplication by a sparse diagonal matrix, so a
    sparse ``matrix`` stays sparse.

    Parameters
    ----------
    matrix : scipy.sparse matrix with ``len(row_coefs)`` rows
    row_coefs : 1-D sequence of numbers
        One coefficient per row; converted to float64.

    Returns
    -------
    The product ``diag(row_coefs) @ matrix``.
    """
    # diags() builds the diagonal scaler in one vectorised step instead of
    # inserting the entries one at a time into a dict-backed matrix.
    coefs = np.asarray(row_coefs, dtype=np.float64)
    return scipy.sparse.diags(coefs, format="csr").dot(matrix)


def multiply_by_columns(matrix, col_coefs):
    """Scale column ``j`` of ``matrix`` by ``col_coefs[j]``.

    Implemented as a right-multiplication by a sparse diagonal matrix, so a
    sparse ``matrix`` stays sparse.

    Parameters
    ----------
    matrix : scipy.sparse matrix with ``len(col_coefs)`` columns
    col_coefs : 1-D sequence of numbers
        One coefficient per column; converted to float64.

    Returns
    -------
    The product ``matrix @ diag(col_coefs)``.
    """
    # diags() builds the diagonal scaler in one vectorised step instead of
    # inserting the entries one at a time into a dict-backed matrix.
    coefs = np.asarray(col_coefs, dtype=np.float64)
    return matrix.dot(scipy.sparse.diags(coefs, format="csr"))

# Parameters of the PMI transform:
#   cds -- exponent applied to the column sums (1.0 leaves them unchanged)
#   neg, pos -- every stored entry is shifted by log(pos) - log(neg)
cds = 1.0
neg = 1
pos = 100
pmi = get_pmi_matrix(adjacency_train, cds=cds, neg=neg, pos=pos)

  


In [24]:
# Smallest stored value of the transformed matrix, displayed as a quick check.
np.min(pmi.data)

1.1151063657977658

In [25]:
# Number of singular triplets to compute for the truncated SVD.
dim = 100
U, s, Vt = slinalg.svds(pmi, k=dim)
# Alternative (disabled): factorise the training matrix directly.
# U, s, Vt = slinalg.svds(adjacency_train, dim)

In [26]:
# Alternative output names (disabled):
# np.save("./data/netflix_users2_pos=100", U)
# np.save("./data/netflix_movies2_pos=100", Vt.T)
# np.save("./data/netflix_singular_vals2_pos=100", s)

# Persist the factors: U as the user side, the transpose of Vt as the movie
# side, plus the singular values.
for out_path, factor in (
    ("./data/netflix_users_pmi", U),
    ("./data/netflix_movies_pmi", Vt.T),
    ("./data/netflix_singular_vals_pmi", s),
):
    np.save(out_path, factor)

In [18]:
# Store both halves of the split in SciPy's sparse .npz format.
for split_path, split_matrix in (
    ("./data/netflix.adjacency.train.npz", adjacency_train),
    ("./data/netflix.adjacency.test.npz", adjacency_test),
):
    scipy.sparse.save_npz(split_path, split_matrix)