In [1]:
import os
import shutil
import sys
import numpy as np
from scipy import sparse
import pandas as pd
import pickle

In [2]:
# Load the Pinterest pickles and flatten them into one (userId, movieId, rating)
# row per pin: the board's position in `data` is used as the user id, the pin's
# image name as the item id, and every rating is the constant 1.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
pinterest_dir = os.path.join('data', 'pinterest_iccv')
with open(os.path.join(pinterest_dir, 'board_to_info.pkl'), 'rb') as f:
    data = pickle.load(f)
with open(os.path.join(pinterest_dir, 'pin_id_to_image_name.pkl'), 'rb') as f:
    b = pickle.load(f)

# pin id -> image file name
pid_to_imname = {}
for entry in b:
    pid_to_imname[entry['pin_id']] = entry['im_name']

uids, iids, rats = [], [], []
for e, row in enumerate(data):
    x = row['board_id']
    y = row['pins']
    n_pins = len(y)
    uids += [e] * n_pins
    iids += [pid_to_imname[pin] for pin in y]
    rats += [1] * n_pins

raw_data = pd.DataFrame({'userId': uids, 'movieId': iids, 'rating': rats})


In [3]:
# Default filtering thresholds; some dataset branches below override min_sc.
min_uc = 5
min_sc = 3
# Which dataset to load. Every branch leaves a `raw_data` frame with at least
# the columns 'userId' and 'movieId'.
dset = 'ml-1m'
if dset == 'ml-20m':
    # The 'rating' lookup below needs named columns; header=None would leave
    # integer labels and raise KeyError, so the first row is read as the header.
    # NOTE(review): assumes ratings.csv starts with a header row naming
    # userId/movieId/rating - confirm for your copy of the data.
    raw_data = pd.read_csv(os.path.join('data', 'ml-20m', 'ratings.csv'), header=0)
    raw_data = raw_data[raw_data['rating'] > 3.5]
    n_heldout_users = 10000

elif dset == "ml-1m":
    # '::' is a multi-character separator, which only the python parser engine
    # supports; naming the engine explicitly avoids the fallback ParserWarning.
    raw_data = pd.read_csv(os.path.join('data', 'ml-1m', 'ratings.dat'),
                           header=None, sep='::', engine='python')
    raw_data.columns = ['userId', 'movieId', 'rating', 'ts']
    raw_data = raw_data[raw_data['rating'] > 3.5]
    n_heldout_users = 1000
elif dset == 'melon':
    raw_data = pd.read_json(os.path.join('data', 'melon', 'train.json'))
    rows = []
    cols = []
    # One (row index, song) pair per entry of each row's `songs` list.
    for i, r in raw_data.iterrows():
        rows.extend([i] * len(r.songs))
        cols.extend(r.songs)
    raw_data = pd.DataFrame({"userId": rows, "movieId": cols})
    min_sc = 30
    n_heldout_users = 10000

elif dset == 'anime':
    raw_data = pd.read_csv(os.path.join("data", "anime", "rating.csv"))
    raw_data.columns = ['userId', 'movieId', 'rating']
    n_heldout_users = 10000
elif dset == 'epinion':
    import scipy.io
    rat = scipy.io.loadmat("data/epinion/rating_with_timestamp.mat")['rating_with_timestamp']
    # Columns 0, 1 and 3 of the matrix are taken as user, item and rating.
    u = rat[:, 0]
    i = rat[:, 1]
    r = rat[:, 3]
    raw_data = pd.DataFrame({
        'userId': u,
        'movieId': i,
        'rating': r,
    })
elif dset == 'pinterest':
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(os.path.join('data', 'pinterest_iccv', 'board_to_info.pkl'), 'rb') as f:
        data = pickle.load(f)
    with open(os.path.join('data', 'pinterest_iccv', 'pin_id_to_image_name.pkl'), 'rb') as f:
        b = pickle.load(f)
        pid_to_imname = {x['pin_id']: x['im_name'] for x in b}
    uids = []
    iids = []
    rats = []
    # The board's position is the user id, the pin's image name the item id,
    # and every interaction gets the constant rating 1.
    for e, row in enumerate(data):
        x = row['board_id']
        y = row['pins']
        uids.extend([e for _ in range(len(y))])
        iids.extend([pid_to_imname[x] for x in y])
        rats.extend([1 for _ in range(len(y))])
    raw_data = pd.DataFrame({
        'userId': uids,
        'movieId': iids,
        'rating': rats,
    })
    min_sc = 10
else:
    # Without this, a typo in `dset` would silently reuse whatever `raw_data`
    # an earlier cell left in the kernel.
    raise ValueError("unknown dataset: %r" % (dset,))


  # Remove the CWD from sys.path while we load stuff.


In [5]:
def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table containing the column named by ``id``.
    id : str
        Name of the column to group by (the name shadows the builtin but is
        kept so existing keyword callers continue to work).

    Returns
    -------
    pandas.Series
        Row counts indexed by the distinct values of ``tp[id]``.
    """
    # Group with the key as the index: with ``as_index=False`` newer pandas
    # returns a DataFrame from ``size()``, which breaks callers that do
    # ``count.index[count >= n]`` and expect the ids in the index.
    return tp.groupby(id).size()

In [6]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rare items, then inactive users, from the interaction table.

    Items are filtered first (kept when they occur in at least ``min_sc``
    rows), then users (kept when they have at least ``min_uc`` remaining
    rows). Because the user filter runs second, a few surviving items can end
    up below ``min_sc`` again. A threshold of 0 or less skips that filter.

    Returns the filtered table together with the per-user and per-item counts
    recomputed on it.
    """
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        popular_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(popular_items)]

    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        active_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(active_users)]

    # Counts are recomputed so they describe the filtered table.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

Keep only items that were clicked on by at least `min_sc` users (10 in the cell below), and only users who clicked on at least 5 items.

In [7]:
# NOTE(review): this assignment replaces whatever min_sc the dataset-loading
# cell chose; confirm that a fixed threshold of 10 is intended for every dataset.
min_sc = 10
_raw_data, user_activity, item_popularity = filter_triplets(
    tp=raw_data,
    min_sc=min_sc,
    min_uc=5,
)

In [8]:
# Size of the filtered data. The value printed as "sparsity" is the share of
# user/item pairs that have an interaction (events / (users * items)).
n_events = _raw_data.shape[0]
n_active_users = user_activity.shape[0]
n_kept_items = item_popularity.shape[0]
sparsity = 1. * n_events / (n_active_users * n_kept_items)

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (n_events, n_active_users, n_kept_items, sparsity * 100))

After filtering, there are 572194 watching events from 6034 users and 2811 movies (sparsity: 3.373%)


In [9]:
# Continue with the filtered interactions. An explicit copy is taken because
# later cells add columns to `raw_data`; assigning into the boolean-mask
# selection returned by filter_triplets would raise pandas'
# SettingWithCopyWarning and leave it unclear which frame is modified.
raw_data = _raw_data.copy()

In [10]:
# Shuffle the surviving user ids reproducibly: fix the seed, draw a random
# permutation of positions, and reorder the index of the per-user counts.
np.random.seed(98765)
idx_perm = np.random.permutation(len(user_activity.index))
unique_uid = user_activity.index[idx_perm]

In [11]:
# Display the shuffled user ids as a sanity check.
unique_uid

Int64Index([1867, 5662,  646, 3942, 1412, 5374, 4889, 1496, 2999, 3557,
            ...
            4483,  922, 1413, 2058, 4833, 5046, 4015, 1389, 3572, 3863],
           dtype='int64', name='userId', length=6034)

In [12]:
# Number of users that survived filtering (no train/validation/test split is
# made in this cell).
n_users = len(unique_uid)

In [13]:
# Alias only: train_plays refers to the same DataFrame object as raw_data,
# so every filtered interaction is treated as training data here.
train_plays = raw_data

In [14]:
# Distinct item ids, in order of first appearance in the interactions.
unique_sid = train_plays['movieId'].unique()

In [15]:
# Map raw item ids and (shuffled) raw user ids to contiguous 0-based indices.
show2id = {sid: idx for idx, sid in enumerate(unique_sid)}
profile2id = {pid: idx for idx, pid in enumerate(unique_uid)}

In [16]:
# Attach the contiguous user/item indices as new columns; ids without a
# mapping get -1.
raw_data['uid'] = raw_data['userId'].map(lambda user: profile2id.get(user, -1))
raw_data['iid'] = raw_data['movieId'].map(lambda item: show2id.get(item, -1))

In [17]:
# One row per user index: column 'uid' plus column 'iid' holding the list of
# that user's item indices.
ret = raw_data.groupby('uid')['iid'].apply(list).reset_index()

In [18]:
from scipy.sparse import *

def leavek(data, k=1, tots=100, n_items=None):
    """Build leave-k-out evaluation candidates for every user.

    For each row of ``data`` (one user), ``k`` of the user's items are drawn
    as held-out items, and the list is padded with randomly drawn item indices
    the user has not interacted with until it holds ``tots`` entries. The
    held-out draws come first in each candidate list.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per user; the ``iid`` column holds that user's item indices.
    k : int
        Number of held-out draws per user. They are drawn with
        ``np.random.choice`` defaults (with replacement), so for ``k > 1`` the
        same item can be drawn more than once.
    tots : int
        Length of every candidate list (held-out draws plus unseen items).
    n_items : int, optional
        Unseen items are sampled from ``range(n_items)``. Defaults to
        ``len(show2id)``, the notebook-level item index.

    Returns
    -------
    tuple(list, list)
        ``(_pos, _neg)``: per user, the items that are not in the candidate
        list, and the candidate list itself.

    Raises
    ------
    ValueError
        If padding is needed for a user who has already interacted with every
        item in ``range(n_items)`` (the sampling loop could never finish).
    """
    if n_items is None:
        n_items = len(show2id)
    _pos, _neg = [], []
    for i, t in enumerate(data.itertuples()):
        iids = t.iid
        # Set lookups keep the rejection test O(1) per sampled item.
        seen = set(iids)
        negs = np.random.choice(iids, k).tolist()
        if len(negs) < tots:
            n_unseen = n_items - sum(1 for x in seen if 0 <= x < n_items)
            if n_unseen <= 0:
                raise ValueError(
                    "row %d has no unseen item among %d items to sample from"
                    % (i, n_items))
        while len(negs) < tots:
            # Oversample, then drop anything the user has interacted with.
            negs += [x for x in np.random.choice(n_items, tots * 2).tolist() if x not in seen]
        negs = negs[:tots]
        candidates = set(negs)
        poss = [x for x in iids if x not in candidates]
        _pos.append(poss)
        _neg.append(negs)
        if (i+1) % 1000 == 0:
            print(i+1, "user sampled!")
    return _pos, _neg

def lltomat(pos):
    """Turn per-user item lists into a sparse user x item indicator matrix.

    ``pos[u]`` lists the item indices of the user in row ``u``; those cells
    are set to 1. The shape comes from the notebook-level ``profile2id`` and
    ``show2id`` mappings, and the result is returned in CSR format.
    """
    shape = (len(profile2id), len(show2id))
    mat = lil_matrix(shape)
    for row, cols in enumerate(pos):
        mat[row, cols] = 1
    return mat.tocsr()

In [19]:
# Sample the leave-k-out candidates and pickle (positive matrix, candidate
# array) for each (k, tots) setting.
out_dir = os.path.join('data', 'parsed')
# Create the output directory up front; open(..., 'wb') does not create
# missing parent directories and would fail with FileNotFoundError.
os.makedirs(out_dir, exist_ok=True)
for tots in [100]:#, 300, 500]:
    for k in [1]:#, 3, 5]:
        pos, neg = leavek(ret, k=k, tots=tots)
        pos_mat = lltomat(pos)
        p = os.path.join(out_dir, '%s-l-%d-%d' % (dset, k, tots))
        with open(p, 'wb') as f:
            pickle.dump((pos_mat, np.array(neg, dtype=np.int32)), f)

1000 user sampled!
2000 user sampled!
3000 user sampled!
4000 user sampled!
5000 user sampled!
6000 user sampled!


In [20]:
# Show the working directory that the relative 'data/...' paths resolve
# against. Plain Python is used instead of the `pwd` automagic so the cell
# also runs when automagic is disabled or the notebook is exported as a script.
os.getcwd()

'/Users/ita/Dropbox/code/neurank/code'