In [31]:
import os
import shutil
import sys
import csv
import numpy as np
from scipy import sparse
import pandas as pd
import json

In [44]:
# Minimum number of interactions a user needs in order to be kept.
min_uc = 5
# Minimum number of users an item needs in order to be kept
# (some dataset branches below override this default).
min_sc = 3
# Name of the dataset to preprocess; must match one of the branches below.
dset = 'epinion'

# Every branch has to define:
#   raw_data        -- DataFrame with at least 'userId' and 'movieId' columns
#   n_heldout_users -- number of users reserved for validation and for test (each)
if dset == 'ml-20m':
    raw_data = pd.read_csv(os.path.join('data', 'ml-20m', 'ratings.csv'))
    raw_data.columns = ['userId', 'movieId', 'rating', 'ts']
    print(raw_data)
    # Keep only the ratings strictly above 3.5.
    raw_data = raw_data[raw_data['rating'] > 3.5]
    n_heldout_users = 10000

# elif dset == "ml-1m":
#     raw_data = pd.read_csv(os.path.join('data', 'ml-1m', 'ratings.dat'), header=None, sep='::')
#     raw_data.columns = ['userId', 'movieId', 'rating', 'ts']
#     print(raw_data)
#     raw_data = raw_data[raw_data['rating'] > 3.5]
#     n_heldout_users = 1000

elif dset == 'sk':
    raw_data = pd.read_csv(os.path.join('data', 'sk', 'model_likes_anon.psv'),
                           delimiter='|',
                           quotechar='\\')
    # Expose the 'uid' / 'mid' columns under the names the rest of the notebook uses.
    raw_data['userId'] = raw_data['uid'].astype("object")
    raw_data['movieId'] = raw_data['mid'].astype("object")
    print(raw_data)
    n_heldout_users = 1500
    min_sc = 4

elif dset == 'melon':
    raw_data = pd.read_json(os.path.join('data', 'melon', 'train.json'))
    # Flatten each row's `songs` list into one (row index, song) pair per song;
    # the row index plays the role of the user id.
    rows = []
    cols = []
    for i, r in raw_data.iterrows():
        rows.extend([i] * len(r.songs))
        cols.extend(r.songs)
    raw_data = pd.DataFrame({"userId": rows, "movieId": cols})
    min_sc = 10
    n_heldout_users = 10000

elif dset == 'epinion':
    import scipy.io
    rat = scipy.io.loadmat("data/epinion/rating_with_timestamp.mat")['rating_with_timestamp']
    # Columns 0, 1 and 3 are used as user, item and rating respectively
    # (NOTE(review): column meaning is taken from the variable names -- confirm
    # against the dataset description).
    u = rat[:, 0]
    i = rat[:, 1]
    r = rat[:, 3]
    raw_data = pd.DataFrame({
        'userId' : u,
        'movieId': i,
        'rating': r,
    })
    n_heldout_users = 2000

# elif dset == 'ciao':
#     import scipy.io
#     rat = scipy.io.loadmat("data/ciao/rating_with_timestamp.mat")['rating']
#     u = rat[:, 0]
#     i = rat[:, 1]
#     r = rat[:, 3]
#     raw_data = pd.DataFrame({
#         'userId' : u,
#         'movieId': i,
#         'rating': r,
#     })
#     n_heldout_users = 1000

# elif dset == 'pinterest':
#     import json
#     with open(os.path.join('data', 'pinterest_iccv', '_pins2.json'),'r') as f:
#         data = json.load(f)
#     with open(os.path.join('data', 'pinterest_iccv', '_pin_im2.json'),'r') as f:
#         b = json.load(f)
#         pid_to_imname = {x['pin_id']:x['im_name'] for x in b}
#     uids = []
#     iids = []
#     rats = []
#     for e, row in enumerate(data): 
#         x = row['board_id']
#         y = row['pins']
#         uids.extend([e for _ in range(len(y))])
#         iids.extend([pid_to_imname[x] for x in y])
#         rats.extend([1 for _ in range(len(y))])
#     raw_data = pd.DataFrame({
#         'userId' : uids,
#         'movieId': iids,
#         'rating': rats,
#     })
#     min_sc = 10

else:
    # Fail loudly instead of continuing with an undefined (or stale, from a
    # previous kernel run) `raw_data` / `n_heldout_users`.
    raise ValueError(f"Unknown dataset {dset!r}: no loading branch is defined for it")



In [45]:
def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table containing a column named ``id``.
    id : str
        Name of the column to group by (e.g. ``'userId'`` or ``'movieId'``).

    Returns
    -------
    pandas.Series
        Row counts indexed by the distinct values of ``id``; the index is
        named ``id``.
    """
    # Group with the default as_index=True. Since pandas 1.1,
    # groupby(..., as_index=False).size() returns a DataFrame (key column plus
    # a 'size' column) instead of a Series, which breaks callers that rely on
    # `count.index` and on `count >= threshold` being a boolean Series.
    return tp[[id]].groupby(id).size()

In [46]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rare items and inactive users from an interaction table.

    Items are filtered first, then users, each in a single pass.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with 'userId' and 'movieId' columns.
    min_uc : int
        Keep only users with at least this many interactions (skipped if <= 0).
    min_sc : int
        Keep only items with at least this many interactions (skipped if <= 0).

    Returns
    -------
    tuple
        ``(filtered_tp, usercount, itemcount)`` where the two counts are
        recomputed by ``get_count`` on the filtered table.
    """
    # Item pass: keep interactions whose item was clicked on by >= min_sc users.
    if min_sc > 0:
        item_sizes = get_count(tp, 'movieId')
        kept_items = item_sizes.index[item_sizes >= min_sc]
        tp = tp[tp['movieId'].isin(kept_items)]

    # User pass: keep interactions whose user clicked on >= min_uc items.
    # Removing users can push a few items back below min_sc; no second item
    # pass is made to correct for that.
    if min_uc > 0:
        user_sizes = get_count(tp, 'userId')
        kept_users = user_sizes.index[user_sizes >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recompute both counts on what survived the filtering.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

Only keep items that were clicked on by at least `min_sc` users, and users who clicked on at least `min_uc` items

In [47]:
# Pass both thresholds from the configuration cell; the user threshold used to be
# hard-coded to 5 here, so changing `min_uc` in the config had no effect.
_raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=min_uc, min_sc=min_sc)

In [48]:
# Share of the (users x items) grid that holds no interaction.
sparsity = 1.0 - len(_raw_data) / (len(user_activity) * len(item_popularity))

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" %
      (len(_raw_data), len(user_activity), len(item_popularity), sparsity * 100))

After filtering, there are 640918 watching events from 21396 users and 59377 movies (sparsity: 99.950%)


In [49]:
# From here on `raw_data` refers to the filtered interactions; the unfiltered
# frame is no longer reachable under this name.
raw_data = _raw_data

In [50]:
# Shuffle the ids of the surviving users with a fixed seed so that the
# train/validation/test user split is reproducible.
np.random.seed(98765)
unique_uid = user_activity.index
idx_perm = np.random.permutation(len(unique_uid))
unique_uid = unique_uid.take(idx_perm)

In [42]:
# Display the shuffled user ids.
unique_uid

Int64Index([ 76156,  11200,  62087,  76767,  90790,  52510,  76919,   1617,
             58161,  93044,
            ...
            121718, 110462,  20836,  37744, 117129,   2082,  38309, 128556,
             70000, 136725],
           dtype='int64', name='userId', length=136675)

In [43]:
# Create train/validation/test users: the last `n_heldout_users` shuffled users
# form the test set, the `n_heldout_users` before them the validation set, and
# everything else is training.
n_users = unique_uid.size
if n_users <= 2 * n_heldout_users:
    # With too few users the slice bounds below become zero or negative, which
    # silently yields an empty training set or overlapping splits.
    raise ValueError(
        "Not enough users (%d) to hold out %d for validation and %d for test"
        % (n_users, n_heldout_users, n_heldout_users)
    )
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [15]:
# Sizes of the train / validation / test user groups.
tuple(map(len, (tr_users, vd_users, te_users)))

(116675, 10000, 10000)

In [16]:
# All interactions that belong to training users.
train_plays = raw_data[raw_data['userId'].isin(tr_users)]

In [17]:
# Item vocabulary: the distinct items seen in the training interactions.
unique_sid = train_plays['movieId'].unique()

In [18]:
# Map raw item ids and raw user ids to contiguous 0-based indices.
show2id = {sid: idx for idx, sid in enumerate(unique_sid)}
profile2id = {pid: idx for idx, pid in enumerate(unique_uid)}

In [19]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5, seed=98765):
    """Split each user's interactions into a fold-in part and a held-out part.

    For every user with at least ``min_items`` interactions,
    ``int(test_prop * n_items)`` randomly chosen rows go to the held-out part
    and the remaining rows to the fold-in part. Users with fewer interactions
    are placed entirely in the fold-in part.

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with a 'userId' column.
    test_prop : float
        Fraction of each eligible user's rows to hold out.
    min_items : int
        Minimum number of rows a user needs for any of them to be held out.
    seed : int
        Seed passed to ``np.random.seed`` before sampling. This reseeds
        NumPy's global generator, exactly as the hard-coded seed did before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_tr, data_te)``. Either may be an empty frame with the columns
        of ``data`` when no rows fall into it.
    """
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = [], []

    np.random.seed(seed)
    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)
        if n_items_u >= min_items:
            # Boolean mask over the user's rows: True marks a held-out row.
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[picked.astype('int64')] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if (i + 1) % 100 == 0:
            print("%d users sampled" % (i + 1))
            sys.stdout.flush()

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # keeps the input's columns when there is nothing to concatenate.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [20]:
# Interactions of validation users, restricted to items present in the training vocabulary.
vad_plays = raw_data[raw_data['userId'].isin(vd_users) & raw_data['movieId'].isin(unique_sid)]

In [17]:
# Split every validation user's interactions into a fold-in part (`_tr`) and a
# held-out part (`_te`) using the default test proportion.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

100 users sampled
200 users sampled
300 users sampled
400 users sampled
500 users sampled
600 users sampled
700 users sampled
800 users sampled
900 users sampled
1000 users sampled
1100 users sampled
1200 users sampled
1300 users sampled
1400 users sampled
1500 users sampled


In [18]:
# Interactions of test users, restricted to items present in the training vocabulary.
test_plays = raw_data[raw_data['userId'].isin(te_users) & raw_data['movieId'].isin(unique_sid)]

In [19]:
# Split every test user's interactions into a fold-in part (`_tr`) and a
# held-out part (`_te`) using the default test proportion.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

100 users sampled
200 users sampled
300 users sampled
400 users sampled
500 users sampled
600 users sampled
700 users sampled
800 users sampled
900 users sampled
1000 users sampled
1100 users sampled
1200 users sampled
1300 users sampled
1400 users sampled
1500 users sampled


### Save the data into (user_index, item_index) format

In [20]:
def numerize(tp):
    """Turn an interaction table into a sparse user x item matrix.

    Raw ids in the 'userId' and 'movieId' columns are translated through the
    module-level ``profile2id`` and ``show2id`` dictionaries; each row of
    ``tp`` contributes a 1 at (user index, item index).

    Returns a CSR matrix of shape ``(len(profile2id), len(show2id))``.
    """
    user_rows = [profile2id[uid] for uid in tp['userId']]
    item_cols = [show2id[sid] for sid in tp['movieId']]
    n_rows, n_cols = len(profile2id), len(show2id)
    interactions = sparse.coo_matrix(
        (np.ones_like(user_rows), (user_rows, item_cols)),
        shape=(n_rows, n_cols),
    )
    return interactions.tocsr()

In [21]:
# Sparse matrices for the training users and both parts of the validation split.
train_data, val_tr, val_te = map(numerize, (train_plays, vad_plays_tr, vad_plays_te))

In [22]:
# Sparse matrices for both parts of the test split.
te_tr, te_te = map(numerize, (test_plays_tr, test_plays_te))

In [23]:
# Number of users (rows) with at least one interaction in each matrix, one per line.
print(*(np.arange(n_users)[np.asarray(m.sum(axis=1)).ravel() > 0].size
        for m in (train_data, val_tr, te_tr)), sep="\n")

12562
1500
1500


In [24]:
# Row indices 0 .. n_users-1 of the sparse matrices.
r = np.arange(n_users)
# For each matrix, keep the row indices whose row sum is positive.
# NOTE(review): `tr_users` and `te_users` are rebound here from raw user ids
# (as produced by the user-split cell) to matrix row indices. Re-running an
# earlier cell that filters `raw_data` by these names after this cell has run
# would therefore filter on the wrong values.
tr_users = r[np.asarray(train_data.sum(1)).ravel() > 0]
val_users = r[np.asarray(val_tr.sum(1)).ravel() > 0]
te_users = r[np.asarray(te_tr.sum(1)).ravel() > 0]

In [25]:
# Ratio of stored entries in the held-out validation matrix to those in the fold-in one.
val_te.nnz / val_tr.nnz

0.23660119482528905

In [26]:
# Ratio of stored entries in the held-out test matrix to those in the fold-in one.
te_te.nnz / te_tr.nnz

0.23533681425460676

In [27]:
import pickle

# open() does not create missing parent directories, so make sure the output
# directory exists before writing.
os.makedirs(os.path.join('data', 'parsed'), exist_ok=True)
# Persist the user index arrays together with the five sparse matrices.
with open(f"data/parsed/{dset}-new", 'wb') as f:
    pickle.dump((tr_users, val_users, te_users, train_data, val_tr, val_te, te_tr, te_te), f)

In [28]:
from implicit.evaluation import train_test_split
# Add all five matrices back together into a single user x item matrix.
total = (train_data + val_tr + te_tr + val_te + te_te)
# NOTE(review): presumably the second argument is the training fraction, which
# would make this roughly a 70/10/20 train/val/test split of the entries
# (0.8 * 0.875 = 0.7) -- confirm against the installed `implicit` version.
# No random state is passed, so the split may differ between runs -- verify.
train, te = train_test_split(total, 0.8)
tr, val = train_test_split(train, 0.875)
# NOTE(review): this writes to '...-712-' (trailing dash), whereas the loading
# cell further down opens f'data/parsed/{dset}-712' -- confirm which file name
# is intended.
with open(f'data/parsed/{dset}-712-', 'wb') as f:
    pickle.dump((tr, val, te), f)

In [1]:
ls

 1                                     mp-cml-pt.py
 adam_clr.pkl                          mp-our-ae.py
 adam_dot.pkl                          mp-ours-pt.py
 adam_warp.pkl                         ncf-pt.py
 a.out                                 [0m[01;34mneuralsort[0m/
 asdasd                                [01;34mold[0m/
 autoencoder-pt.py                     opop.py
 backup.sh                             ours-fb.py
 [01;34mbest_res[0m/                             ours-pt.py
 cdae-fb.py                            ours-trace.py
 [01;34mciao[0m/                                'Remark test.ipynb'
 CMAE.ipynb                            res1541.pkl
 cml-fb.py                             res2.pkl
 cml-pt.py                             res33123323.pkl
 cmltemp                               res33323.pkl
 cml-trace.py                          res.pkl
 DAE-ff.ipynb                          rmsprop_dot.pkl
 DAE-Finetuning.ipynb                  rmsprop_warp.pkl
 DAE.ip

In [34]:
import torch
import pickle

# Inspect a previously written split for the chosen dataset.
dset = 'melon'
# pickle.load can execute arbitrary code; only open files produced by this notebook.
with open(f'data/parsed/{dset}-712', 'rb') as f:
    data = pickle.load(f)
print(data)

(<104645x81219 sparse matrix of type '<class 'numpy.int64'>'
	with 2873874 stored elements in Compressed Sparse Row format>, <104645x81219 sparse matrix of type '<class 'numpy.int64'>'
	with 411065 stored elements in Compressed Sparse Row format>, <104645x81219 sparse matrix of type '<class 'numpy.int64'>'
	with 821230 stored elements in Compressed Sparse Row format>)
