In [1]:
import os
import shutil
import sys
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
sn.set()
import pandas as pd
import tensorflow as tf
from tensorflow.contrib.layers import apply_regularization, l2_regularizer
import bottleneck as bn

In [2]:
pro_dir = "./data/ml-20m/pro_sg/"

# Read unique_sid.txt into a list, one whitespace-stripped entry per line;
# the number of entries is used as the item count (matrix width) below.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'r') as sid_file:
    unique_sid = [raw_line.strip() for raw_line in sid_file]

n_items = len(unique_sid)

In [3]:
def load_tr_te_data(csv_file_tr, csv_file_te, n_cols=None):
    '''
    Load a pair of interaction CSVs into two row-aligned sparse matrices.

    Both files must have integer `uid` and `sid` columns. Rows of the returned
    matrices are `uid - min_uid`, where min/max uid are taken over BOTH files,
    so row i refers to the same user in both matrices. Columns are `sid`.
    Every listed (uid, sid) pair contributes 1.0 (repeated pairs are summed by
    the sparse constructor).

    Parameters
    ----------
    csv_file_tr, csv_file_te : path or file-like accepted by `pd.read_csv`
    n_cols : int, optional
        Number of columns (items). Defaults to the notebook-level `n_items`,
        which keeps existing two-argument calls working unchanged.

    Returns
    -------
    (data_tr, data_te) : pair of float64 `scipy.sparse.csr_matrix`, both of
        shape (max_uid - min_uid + 1, n_cols).
    '''
    if n_cols is None:
        # fall back to the global defined by the unique_sid cell
        n_cols = n_items

    tp_tr = pd.read_csv(csv_file_tr)
    tp_te = pd.read_csv(csv_file_te)

    # shared uid range so that both matrices use the same row numbering
    start_idx = min(tp_tr['uid'].min(), tp_te['uid'].min())
    end_idx = max(tp_tr['uid'].max(), tp_te['uid'].max())
    n_rows = end_idx - start_idx + 1

    def _to_csr(tp):
        # one unit entry per (uid, sid) row of the frame
        rows = (tp['uid'] - start_idx).to_numpy()
        cols = tp['sid'].to_numpy()
        return sparse.csr_matrix((np.ones(rows.shape[0]), (rows, cols)),
                                 dtype='float64', shape=(n_rows, n_cols))

    return _to_csr(tp_tr), _to_csr(tp_te)

# Build the two row-aligned sparse matrices of the test split from its CSVs.
test_tr_csv = os.path.join(pro_dir, 'test_tr.csv')
test_te_csv = os.path.join(pro_dir, 'test_te.csv')
test_data_tr, test_data_te = load_tr_te_data(test_tr_csv, test_te_csv)

In [4]:
def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    normalized discounted cumulative gain@k for binary relevance
    ASSUMPTIONS: all the 0's in heldout_data indicate 0 relevance

    X_pred: dense 2-D array of scores, one row per user, one column per item
        (higher score = ranked earlier).
    heldout_batch: scipy sparse matrix with the same shape; non-zero entries
        mark the relevant items.
    k: cut-off rank. If k is not smaller than the number of columns, the whole
        row is ranked (bn.argpartition would otherwise raise).

    Returns a 1-D array with one NDCG value per row. A row without any stored
    held-out entry has IDCG == 0 and therefore yields NaN (0 / 0).
    '''
    batch_users, n_candidates = X_pred.shape
    k_eff = min(k, n_candidates)
    row_idx = np.arange(batch_users)[:, np.newaxis]

    if k_eff < n_candidates:
        # unordered top-k via partial partition, then sort only those k
        idx_topk_part = bn.argpartition(-X_pred, k_eff, axis=1)[:, :k_eff]
        topk_part = X_pred[row_idx, idx_topk_part]
        idx_part = np.argsort(-topk_part, axis=1)
        # X_pred[row_idx, idx_topk] is the sorted topk predicted score
        idx_topk = idx_topk_part[row_idx, idx_part]
    else:
        # k covers every column: a plain full sort is the top-k
        idx_topk = np.argsort(-X_pred, axis=1)

    # build the discount template
    tp = 1. / np.log2(np.arange(2, k_eff + 2))

    DCG = (heldout_batch[row_idx, idx_topk].toarray() * tp).sum(axis=1)
    # ideal DCG: all of a user's held-out items placed at the top ranks
    IDCG = np.array([(tp[:min(n, k_eff)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG

def Recall_at_k_batch(X_pred, heldout_batch, k=100):
    '''
    recall@k for binary relevance, normalised by min(k, #held-out items)

    X_pred: dense 2-D array of scores, one row per user, one column per item.
    heldout_batch: scipy sparse matrix with the same shape; entries > 0 mark
        the relevant items.
    k: cut-off rank. If k is not smaller than the number of columns, every
        item counts as retrieved (bn.argpartition would otherwise raise).

    Returns a 1-D array with one recall value per row. A row without held-out
    items has a zero denominator and therefore yields NaN (0 / 0).
    '''
    batch_users, n_candidates = X_pred.shape

    if k < n_candidates:
        # mark the (unordered) k highest-scoring items of every row
        idx = bn.argpartition(-X_pred, k, axis=1)
        X_pred_binary = np.zeros_like(X_pred, dtype=bool)
        X_pred_binary[np.arange(batch_users)[:, np.newaxis], idx[:, :k]] = True
    else:
        X_pred_binary = np.ones_like(X_pred, dtype=bool)

    X_true_binary = (heldout_batch > 0).toarray()
    hits = (np.logical_and(X_true_binary, X_pred_binary).sum(axis=1)).astype(
        np.float32)
    recall = hits / np.minimum(k, X_true_binary.sum(axis=1))
    return recall

In [100]:
# Load the embedding matrix that later cells use as `movies`
# (test_data_tr.dot(movies), so it needs one row per item column).
# The commented-out lines are alternative set-ups kept from earlier experiments.
# users = np.load("./data/movielens20M_users_svd.npy")
# users /= np.linalg.norm(users, axis=1, keepdims=True)

# vals = np.load("./data/movielens20M_singular_vals_svd.npy")
# NOTE(review): `movies` is read from the *users* SVD file -- the same path the
# commented-out `users` load above uses. Presumably intentional (e.g. the SVD
# was taken on the transposed matrix) -- TODO confirm the file holds item factors.
movies = np.load("./data/movielens20M_users_svd.npy")
# movies *= np.sqrt(vals)
# movies = np.load("./data/movielens20M_movies_pic_pmi.npy")
# movies /= np.linalg.norm(movies, axis=1, keepdims=True)
# movies[np.isnan(movies)] = 10000

In [101]:
# `pop` is presumably a per-movie popularity count (going by the file name) --
# TODO confirm. scaling = pop ** -0.5, with the +inf produced by zero entries
# replaced by a large finite value.
pop = np.load("./data/movielens20M_movies_pop.npy")
inv_sqrt_pop = 1.0 / pop**0.5
scaling = np.where(inv_sqrt_pop == np.inf, 10000, inv_sqrt_pop)

  


In [102]:
# pop = np.asarray(test_data_tr.sum(axis=0)).flatten()
# scaling = 1.0 / np.log(2+pop)

# User vectors: sum of the vectors of the movies each test user interacted with
# (rows of test_data_tr times `movies`), padded with one zero column so they
# match the extra scaling column appended to the movie matrix in the
# evaluation cell, then scaled to unit L2 norm per row.
# NOTE: `movies` is normalised in place at the end of this cell, so re-running
# the cell rebuilds `users` from the already-normalised movie vectors.
user_sums = test_data_tr.dot(movies)
zero_column = np.zeros((user_sums.shape[0], 1))
users = np.concatenate([user_sums, zero_column], axis=1)
users = users / np.linalg.norm(users, axis=1, keepdims=True)
# users = test_data_tr.dot(movies)

# unit L2 norm per movie row, written back into the same array
np.divide(movies, np.linalg.norm(movies, axis=1, keepdims=True), out=movies)

In [104]:
from sklearn.metrics.pairwise import euclidean_distances


def _print_metric(label, values):
    """Print mean and standard error (std / sqrt(n)) of a per-user metric array."""
    print("Test %s=%.5f (%.5f)" % (label, np.mean(values), np.std(values) / np.sqrt(len(values))))


# Evaluation set-up (independent of the scaling factor, so computed once).
# N_test = test_data_tr.shape[0]
N_test = 2000
idxlist_test = range(N_test)
batch_size_test = 2000

# The test users are taken to be the LAST `holdout_size` rows of `users`.
# Deriving the size from test_data_tr (instead of a hard-coded 10000) keeps
# users[shift + i] aligned with row i of test_data_tr / test_data_te whatever
# the split size is; when `users` was built from test_data_tr, shift is 0.
holdout_size = test_data_tr.shape[0]
shift = len(users) - holdout_size

# f is the weight of the extra scaling column appended to the movie vectors
for f in [0.0, 1.0, 3.0, 5.0]:
    print(f)
    movies_pop = np.hstack([movies, f * scaling.reshape(-1, 1)])

    n100_list, r20_list, r50_list = [], [], []

    for bnum, st_idx in enumerate(range(0, N_test, batch_size_test)):
        print(bnum)
        end_idx = min(st_idx + batch_size_test, N_test)
        batch_rows = idxlist_test[st_idx:end_idx]
        X = test_data_tr[batch_rows]

        if sparse.isspmatrix(X):
            X = X.toarray()
        X = X.astype('float32')

        X_users = users[shift+st_idx:shift+end_idx]
    #     pred_val = X_users.dot((movies).T)
    #         scaling = np.abs(vals)**0.0
    #         pred_val = (X_users * scaling).dot((movies * scaling).T)
        # score = inverse Euclidean distance between user and movie vectors
        pred_val = 1.0 / euclidean_distances(X_users, movies_pop)

        # exclude examples from training and validation (if any)
        # NOTE(review): the mask below is disabled, so items already present in
        # test_data_tr can occupy top-k slots -- confirm this is intended.
#         pred_val[X.nonzero()] = -np.inf
        heldout = test_data_te[batch_rows]
        n100_list.append(NDCG_binary_at_k_batch(pred_val, heldout, k=100))
        r20_list.append(Recall_at_k_batch(pred_val, heldout, k=20))
        r50_list.append(Recall_at_k_batch(pred_val, heldout, k=50))

    n100_list = np.concatenate(n100_list)
    r20_list = np.concatenate(r20_list)
    r50_list = np.concatenate(r50_list)

    _print_metric("NDCG@100", n100_list)
    _print_metric("Recall@20", r20_list)
    _print_metric("Recall@50", r50_list)

0.0
0
Test NDCG@100=0.15412 (0.00226)
Test Recall@20=0.09164 (0.00377)
Test Recall@50=0.25577 (0.00571)
1.0
0
Test NDCG@100=0.15708 (0.00225)
Test Recall@20=0.09395 (0.00383)
Test Recall@50=0.26040 (0.00572)
3.0
0


KeyboardInterrupt: 

In [None]:
# Best: MovieLens 20M, PMI on movies, positive=30, 100 dims, unnormalized movies and users
#     popularity (1/log(2+pop)) scaling for users, cds=0.75, right singular vectors
# 
# Test NDCG@100=0.28199 (0.00407)
# Test Recall@20=0.23678 (0.00475)
# Test Recall@50=0.33990 (0.00551)


# PIC, iters=3, unnormalized movies, users = sum of movies, cds=0.75
# rng = np.random.RandomState(0)
# U = rng.normal(size=[pmi.shape[0], dim])
# U = pmi.T.dot(U)
# U, _ = np.linalg.qr(U)

# for _ in range(3):
#     U = pmi.dot(U)
#     U, _ = np.linalg.qr(U)
    
#     U = pmi.T.dot(U)
#     U, _ = np.linalg.qr(U)

# Test NDCG@100=0.28099 (0.00405)
# Test Recall@20=0.23463 (0.00474)
# Test Recall@50=0.33948 (0.00549)

# Iterations of PIC
# 1
# Test NDCG@100=0.22021 (0.00347)
# Test Recall@20=0.17242 (0.00410)
# Test Recall@50=0.27906 (0.00514)
# 2
# Test NDCG@100=0.26983 (0.00390)
# Test Recall@20=0.22301 (0.00462)
# Test Recall@50=0.32750 (0.00534)
# 3
# Test NDCG@100=0.27299 (0.00401)
# Test Recall@20=0.22580 (0.00470)
# Test Recall@50=0.33259 (0.00544)
# 4
# Test NDCG@100=0.27602 (0.00402)
# Test Recall@20=0.22859 (0.00466)
# Test Recall@50=0.33790 (0.00547)
# 5
# Test NDCG@100=0.27313 (0.00399)
# Test Recall@20=0.22673 (0.00465)
# Test Recall@50=0.33048 (0.00544)