# Variational autoencoders for collaborative filtering 

This notebook accompanies the paper "*Variational autoencoders for collaborative filtering*" by Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara, in The Web Conference (aka WWW) 2018.

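In this notebook, we will show a complete self-contained example of training a variational autoencoder (as well as a denoising autoencoder) with multinomial likelihood (described in the paper), including both data preprocessing and model training. The paper's original example uses the public Movielens-20M dataset; the preprocessing code below is instead set up for the MovieLens-1M (`./ml1m`) and Amazon Video Games (`./amazon_videogames`) datasets, selected through `DATA_DIR`.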

In [1]:
import os
import shutil
import sys

import numpy as np
from scipy import sparse

import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:

# Directory of the dataset to preprocess. Exactly one assignment must be
# active: the cells below branch on this value and read their input from /
# write their output to this directory. Swap the comment marker to switch
# datasets.
DATA_DIR = './ml1m'
# DATA_DIR = './amazon_videogames'

In [None]:
# Load the raw interactions of the selected dataset into `raw_data`, with the
# columns 'userId', 'itemId' and 'rating' that the rest of the notebook uses.
if DATA_DIR == './ml1m':
    # '::' is a multi-character separator, which only the python parsing
    # engine supports. Requesting it explicitly avoids the ParserWarning that
    # pandas emits when it has to fall back from the default C engine.
    raw_data = pd.read_csv(
        os.path.join(DATA_DIR, 'ratings.dat'),
        names=['userId', 'itemId', 'rating', 'timestamp'],
        delimiter='::',
        engine='python',
    )
elif DATA_DIR == './amazon_videogames':
    # One JSON record per line; keep only the three columns needed here and
    # rename them to the shared column names.
    raw_data = pd.read_json(os.path.join(DATA_DIR, 'Video_Games_5.json'), lines=True)[['reviewerID', 'asin', 'overall']]
    raw_data = raw_data.rename(columns={"reviewerID": "userId", "asin": "itemId", "overall": "rating"})
else:
    raise ValueError("Unrecognized dataname")

In [None]:
def get_count(tp, id):
    """Return the number of rows of ``tp`` for each distinct value of column ``id``.

    The result is a Series indexed by the distinct values of ``tp[id]``.
    """
    return tp.groupby(id).size()

In [None]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop the interactions of rarely-seen items and of low-activity users.

    Items are filtered first, then users, each in a single pass.

    Parameters
    ----------
    tp : DataFrame
        Interactions with at least the columns 'userId' and 'itemId'.
    min_uc : int
        Keep only users with at least this many rows; 0 disables the filter.
    min_sc : int
        Keep only items with at least this many rows; 0 disables the filter.

    Returns
    -------
    (tp, usercount, itemcount)
        The filtered interactions, plus the per-user and per-item row counts
        recomputed on the filtered interactions.
    """
    # Only keep the triplets for items which were clicked on by at least min_sc users.
    if min_sc > 0:
        itemcount = get_count(tp, 'itemId')
        tp = tp[tp['itemId'].isin(itemcount[itemcount >= min_sc].index)]

    # Only keep the triplets for users who clicked on at least min_uc items.
    # After doing this, some of the items will have fewer than min_sc users,
    # but this should only be a small proportion.
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount[usercount >= min_uc].index)]

    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'itemId')
    return tp, usercount, itemcount

In [None]:
# Filter with the default thresholds of filter_triplets (min_uc=5, min_sc=0).
# NOTE: this rebinds raw_data to the filtered frame, so the unfiltered
# interactions are no longer reachable after this cell.
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

In [None]:
# Share of the users x items grid that holds an observed interaction
# (reported under the label "sparsity" in the message below).
sparsity = float(len(raw_data)) / (len(user_activity) * len(item_popularity))

print(
    "After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)"
    % (len(raw_data), len(user_activity), len(item_popularity), sparsity * 100)
)


In [None]:
# Take the ids of the users that survived filtering and put them in a random
# order.
unique_uid = user_activity.index

np.random.seed(98765)  # fixed seed: the same permutation is produced on every run
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [None]:
# create train/validation/test users
n_users = len(unique_uid)

# Number of users set aside for validation, and the same number again for test.
if DATA_DIR not in ('./ml1m', './amazon_videogames'):
    raise ValueError("Unrecognized dataname")
n_heldout_users = 1000 if DATA_DIR == './ml1m' else 3000

# Cut the shuffled ids into three consecutive segments:
# [ train | validation | test ], the last two of n_heldout_users ids each.
tr_users = unique_uid[:(n_users - 2 * n_heldout_users)]
vd_users = unique_uid[(n_users - 2 * n_heldout_users):(n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [None]:
# All interactions that belong to the training users.
train_plays = raw_data[raw_data['userId'].isin(tr_users)]

In [None]:
# Distinct item ids, in order of first appearance in the filtered interactions.
unique_sid = raw_data['itemId'].unique()

In [None]:
# Map every raw item id / user id to its position in unique_sid / unique_uid,
# giving contiguous 0-based integer ids.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [None]:
def split_train_test_proportion(data, heldout=0.1, unbias=0.1, seed=98765):
    """Split each user's interactions into train, held-out and unbias parts.

    For every user with at least 5 interactions (``n`` rows),
    ``int((heldout + unbias) * n)`` rows are drawn at random without
    replacement. The first ``int(heldout * n)`` of them form the held-out
    part, the remaining drawn rows form the unbias part, and all rows that
    were not drawn stay in the train part. Users with fewer than 5
    interactions contribute all their rows to the train part.

    Parameters
    ----------
    data : DataFrame
        Interactions with a 'userId' column.
    heldout : float
        Fraction of each user's rows that goes to the held-out part.
    unbias : float
        Fraction of each user's rows that goes to the unbias part.
    seed : int
        Seed passed to ``np.random.seed`` before sampling. Note that this
        reseeds NumPy's global random state on every call.

    Returns
    -------
    (data_tr, data_te, data_tb)
        Three DataFrames with the columns of ``data``. A part that receives no
        rows at all (for example when no user reaches 5 interactions, or when
        ``data`` is empty) is returned as an empty frame instead of raising.
    """
    tr_list, te_list, tb_list = [], [], []

    np.random.seed(seed)

    for i, (_, group) in enumerate(data.groupby('userId')):
        n_items_u = len(group)

        if n_items_u >= 5:
            n_te = int(heldout * n_items_u)
            n_drawn = int((heldout + unbias) * n_items_u)

            # Draw both parts in one call so they can never overlap.
            drawn = np.random.choice(n_items_u, size=n_drawn, replace=False)

            idx_te = np.zeros(n_items_u, dtype='bool')
            idx_bias = np.zeros(n_items_u, dtype='bool')
            idx_te[drawn[:n_te]] = True
            idx_bias[drawn[n_te:]] = True

            tr_list.append(group[~(idx_te | idx_bias)])
            te_list.append(group[idx_te])
            tb_list.append(group[idx_bias])
        else:
            tr_list.append(group)

        if i % 500 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises on an empty list; fall back to an empty frame that
    # keeps the columns and dtypes of the input.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty
    data_tb = pd.concat(tb_list) if tb_list else empty

    return data_tr, data_te, data_tb

In [None]:
# Interactions of the validation users, restricted to items listed in unique_sid.
vad_plays = raw_data[raw_data['userId'].isin(vd_users) & raw_data['itemId'].isin(unique_sid)]

In [None]:
# Split each validation user's interactions into the three parts returned by
# split_train_test_proportion: train (tr), held-out (te) and unbias (tb).
vad_plays_tr, vad_plays_te, vad_plays_tb = split_train_test_proportion(vad_plays)

In [None]:
# Interactions of the test users, restricted to items listed in unique_sid.
test_plays = raw_data[raw_data['userId'].isin(te_users) & raw_data['itemId'].isin(unique_sid)]

In [None]:
# Split each test user's interactions into the three parts returned by
# split_train_test_proportion: train (tr), held-out (te) and unbias (tb).
test_plays_tr, test_plays_te, test_plays_tb = split_train_test_proportion(test_plays)

In [None]:
def numerize(tp):
    """Translate raw user/item ids of ``tp`` into their integer ids.

    'userId' values are looked up in ``profile2id`` and 'itemId' values in
    ``show2id``; an id missing from either mapping raises KeyError. The
    result has the columns 'uid', 'sid' and 'rating', with 'rating' taken
    unchanged from ``tp``.
    """
    uid = [profile2id[user] for user in tp['userId']]
    sid = [show2id[item] for item in tp['itemId']]
    columns = {'uid': uid, 'sid': sid, 'rating': tp['rating']}
    return pd.DataFrame(data=columns, columns=['uid', 'sid', 'rating'])

In [None]:
# Shape shared by the exported matrices: one more than the largest integer
# user id and item id, so that every id is a valid row / column index.
full_data = numerize(raw_data)
size = tuple(full_data[col].max() + 1 for col in ('uid', 'sid'))
size

In [32]:
# Training matrix: all interactions of the training users plus the train (tr)
# part of the validation and test users.
data = pd.concat(
    [numerize(plays) for plays in (train_plays, vad_plays_tr, test_plays_tr)],
    ignore_index=True,
)
# If a (uid, sid) pair occurs more than once, keep its last rating, which is
# what assigning the ratings one after another would produce. Without this
# step the sparse constructor below would add the duplicates together.
data = data.drop_duplicates(subset=['uid', 'sid'], keep='last')

# Build the CSR matrix directly from (row, column, value) triplets instead of
# first filling a dense users x items array, whose memory footprint grows with
# n_users * n_items regardless of how few ratings there are.
sparse_urm = sparse.csr_array(
    (
        data['rating'].to_numpy(dtype=np.float64),
        (data['uid'].to_numpy(dtype=np.int64), data['sid'].to_numpy(dtype=np.int64)),
    ),
    shape=size,
    dtype=np.float64,
)
# A rating equal to 0 would be indistinguishable from "no rating" in a dense
# array, so do not keep it as an explicitly stored entry either.
sparse_urm.eliminate_zeros()
sparse.save_npz(os.path.join(DATA_DIR, 'URM_train.npz'), sparse_urm)

In [33]:
# Held-out matrix: the held-out (te) part of the test and validation users.
data = pd.concat(
    [numerize(plays) for plays in (test_plays_te, vad_plays_te)],
    ignore_index=True,
)
# If a (uid, sid) pair occurs more than once, keep its last rating, which is
# what assigning the ratings one after another would produce. Without this
# step the sparse constructor below would add the duplicates together.
data = data.drop_duplicates(subset=['uid', 'sid'], keep='last')

# Build the CSR matrix directly from (row, column, value) triplets instead of
# first filling a dense users x items array, whose memory footprint grows with
# n_users * n_items regardless of how few ratings there are.
sparse_urm = sparse.csr_array(
    (
        data['rating'].to_numpy(dtype=np.float64),
        (data['uid'].to_numpy(dtype=np.int64), data['sid'].to_numpy(dtype=np.int64)),
    ),
    shape=size,
    dtype=np.float64,
)
# A rating equal to 0 would be indistinguishable from "no rating" in a dense
# array, so do not keep it as an explicitly stored entry either.
sparse_urm.eliminate_zeros()
sparse.save_npz(os.path.join(DATA_DIR, 'URM_heldout.npz'), sparse_urm)

In [34]:
# Unbias matrix: the unbias (tb) part of the test and validation users.
data = pd.concat(
    [numerize(plays) for plays in (test_plays_tb, vad_plays_tb)],
    ignore_index=True,
)
# If a (uid, sid) pair occurs more than once, keep its last rating, which is
# what assigning the ratings one after another would produce. Without this
# step the sparse constructor below would add the duplicates together.
data = data.drop_duplicates(subset=['uid', 'sid'], keep='last')

# Build the CSR matrix directly from (row, column, value) triplets instead of
# first filling a dense users x items array, whose memory footprint grows with
# n_users * n_items regardless of how few ratings there are.
sparse_urm = sparse.csr_array(
    (
        data['rating'].to_numpy(dtype=np.float64),
        (data['uid'].to_numpy(dtype=np.int64), data['sid'].to_numpy(dtype=np.int64)),
    ),
    shape=size,
    dtype=np.float64,
)
# A rating equal to 0 would be indistinguishable from "no rating" in a dense
# array, so do not keep it as an explicitly stored entry either.
sparse_urm.eliminate_zeros()
sparse.save_npz(os.path.join(DATA_DIR, 'URM_unbias.npz'), sparse_urm)