In [5]:
import os
import shutil
import sys
import csv
import numpy as np
from scipy import sparse
import pandas as pd
import json

In [6]:
# Print the current working directory; the relative `drm_data/...` paths used
# throughout this notebook are resolved against it.
!pwd

/home/ita/code/neurank/code


In [7]:
# Set `dset` to one of: ['ml-20m', 'sk', 'epinion', 'melon'].
#
# os.makedirs(..., exist_ok=True) creates drm_data/ and drm_data/parsed/ in one
# call and does nothing when they already exist, so re-running this cell no
# longer prints "cannot create directory ... File exists".
os.makedirs(os.path.join('drm_data', 'parsed'), exist_ok=True)
dset = 'ml-20m'

mkdir: cannot create directory ‘drm_data’: File exists
mkdir: cannot create directory ‘drm_data/parsed’: File exists


In [8]:
min_uc=5
min_sc=3

if dset == 'ml-20m':
    !wget -c -O drm_data/ml-20m.zip http://files.grouplens.org/datasets/movielens/ml-20m.zip
    !unzip -d drm_data/ drm_data/ml-20m.zip
    raw_data = pd.read_csv(os.path.join('drm_data', 'ml-20m', 'ratings.csv'))
    print(raw_data)
    raw_data.columns = ['userId', 'movieId', 'rating', 'ts']
    raw_data = raw_data[raw_data['rating'] > 3.5]
    n_heldout_users = 10000
    print(raw_data)
    
    
elif dset == 'sk':
    !mkdir drm_data/sk
    !wget -c -O drm_data/sk/model_likes_anon.psv https://raw.githubusercontent.com/EthanRosenthal/rec-a-sketch/master/data/model_likes_anon.psv
    raw_data = pd.read_csv(os.path.join('drm_data', 'sk', 'model_likes_anon.psv'), 
                           delimiter='|', 
                           quotechar='\\')
    print(raw_data)
    raw_data['userId'] = raw_data['uid'].astype("object")
    raw_data['movieId'] = raw_data['mid'].astype("object")
    n_heldout_users = 1500
    min_sc = 4
    
elif dset == 'melon':
    raw_data = pd.read_json(os.path.join('drm_data', 'melon', 'train.json'))
    print(raw_data)
    rows = []
    cols = []
    for i, r in raw_data.iterrows():
        rows.extend([i] * len(r.songs))
        cols.extend(r.songs)
    raw_data = pd.DataFrame({"userId": rows, "movieId": cols})
    min_sc = 10
    n_heldout_users = 10000

elif dset == 'epinion':
    !wget -c -O drm_data/epinion.zip https://www.cse.msu.edu/~tangjili/datasetcode/epinions_with_rating_timestamp.zip
#     !unzip -d drm_data/ drm_data/epinion.zip
    import scipy.io
    rat = scipy.io.loadmat("drm_data/epinion_with_rating_timestamp/rating_with_timestamp.mat")['rating_with_timestamp']
    u = rat[:, 0]
    i = rat[:, 1]
    r = rat[:, 3]
    raw_data = pd.DataFrame({
        'userId' : u,
        'movieId': i,
        'rating': r,
    })
    print(raw_data)
    n_heldout_users = 2000

          userId  movieId  rating   timestamp
0              1        2     3.5  1112486027
1              1       29     3.5  1112484676
2              1       32     3.5  1112484819
3              1       47     3.5  1112484727
4              1       50     3.5  1112484580
...          ...      ...     ...         ...
20000258  138493    68954     4.5  1258126920
20000259  138493    69526     4.5  1259865108
20000260  138493    69644     3.0  1260209457
20000261  138493    70286     5.0  1258126944
20000262  138493    71619     2.5  1255811136

[20000263 rows x 4 columns]
          userId  movieId  rating          ts
6              1      151     4.0  1094785734
7              1      223     4.0  1112485573
8              1      253     4.0  1112484940
9              1      260     4.0  1112484826
10             1      293     4.0  1112484703
...          ...      ...     ...         ...
20000256  138493    66762     4.5  1255805408
20000257  138493    68319     4.5  1260209720
20000

In [9]:
def get_count(tp, id):
    """Count how many rows of `tp` carry each distinct value of column `id`.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interaction table; must contain the column named by `id`.
    id : str
        Name of the column to count by (e.g. 'userId' or 'movieId').

    Returns
    -------
    pandas.Series
        Number of rows per distinct value, indexed by those values.
    """
    # Group with the default as_index=True so the result is always a Series
    # whose index holds the ids. With as_index=False, recent pandas returns a
    # DataFrame with a RangeIndex from .size(), which breaks callers that do
    # `count.index[count >= threshold]` or read the ids from `count.index`.
    return tp.groupby(id).size()

In [10]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop rarely-seen items, then low-activity users, from an interaction table.

    Parameters
    ----------
    tp : pandas.DataFrame
        Interactions with 'userId' and 'movieId' columns.
    min_uc : int
        Minimum number of rows a user needs to be kept; 0 or less skips the
        user filter.
    min_sc : int
        Minimum number of rows an item needs to be kept; 0 or less skips the
        item filter.

    Returns
    -------
    tuple
        (filtered table, per-user counts, per-item counts), with both counts
        recomputed on the filtered table.

    Notes
    -----
    This is a single pass: removing users after the item filter can push a few
    items back below `min_sc`, and that is not corrected. The index lookups
    below rely on get_count returning counts indexed by the id values.
    """
    # Items first: keep rows whose item appears in at least min_sc rows.
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        popular_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(popular_items)]

    # Then users: keep rows whose user appears in at least min_uc rows.
    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        active_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(active_users)]

    # Recount on the surviving rows so the returned statistics match `tp`.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

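Keep only items that were clicked on by at least `min_sc` users, and users who clicked on at least `min_uc` items (here `min_uc = 5`; `min_sc` depends on the dataset).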

In [11]:
# Apply the thresholds chosen in the dataset cell. The result is stored under a
# separate name, so `raw_data` itself is not modified by this cell.
_raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=min_uc, min_sc=min_sc)

In [12]:
# Share of the (users x items) grid that has no recorded interaction.
sparsity = 1. - len(_raw_data) / (len(user_activity) * len(item_popularity))

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % (
    len(_raw_data),
    len(user_activity),
    len(item_popularity),
    sparsity * 100,
))

After filtering, there are 9983789 watching events from 136675 users and 15529 movies (sparsity: 99.530%)


In [13]:
# From here on `raw_data` refers to the filtered interactions.
raw_data = _raw_data

In [14]:
# Ids of the users that survived filtering.
# NOTE(review): assumes user_activity is indexed by userId -- confirm against
# what get_count returns on the installed pandas version.
unique_uid = user_activity.index

# Shuffle the users with a fixed seed so the train/validation/test assignment
# made from this order is the same on every run.
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

In [15]:
# create train/validation/test users
n_users = unique_uid.size
tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

In [16]:
# Sizes of the train / validation / test user groups.
len(tr_users), len(vd_users), len(te_users)

(116675, 10000, 10000)

In [17]:
# Interactions that belong to training users.
train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]
# The item vocabulary is whatever the training users interacted with; items
# seen only by validation/test users are filtered out of those splits later.
unique_sid = pd.unique(train_plays['movieId'])

In [18]:
# Map raw item ids to column indices and raw user ids to row indices, numbered
# by position in unique_sid / unique_uid respectively.
show2id = {sid: col for col, sid in enumerate(unique_sid)}
profile2id = {pid: row for row, pid in enumerate(unique_uid)}

In [19]:
def split_train_test_proportion(data, test_prop=0.2, min_items=None):
    """Split each user's interactions into a fold-in part and a held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Interactions with a 'userId' column.
    test_prop : float
        Fraction of each eligible user's rows moved to the held-out part; the
        per-user count is int(test_prop * n_rows), i.e. rounded down.
    min_items : int or None
        Users with fewer rows than this keep everything in the fold-in part.
        None (the default) uses the notebook-level `min_uc`, which is what the
        function read implicitly before this parameter existed.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        (fold-in rows, held-out rows). Either one is an empty frame with the
        columns of `data` when no rows fall into it.

    Notes
    -----
    The global NumPy RNG is re-seeded with 98765 on every call, so every call
    draws from the same random stream.
    """
    if min_items is None:
        min_items = min_uc  # notebook-level threshold, resolved at call time

    tr_list, te_list = [], []

    np.random.seed(98765)
    for i, (_, group) in enumerate(data.groupby('userId')):
        n_items_u = len(group)
        if n_items_u >= min_items:
            # Boolean mask marking the rows of this user that are held out.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if (i+1) % 100 == 0:
            print("%d users sampled" % (1+i))
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list (no users at all, or no user
    # reaching min_items); return an empty, correctly-shaped frame instead.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [20]:
# Interactions of validation users, restricted to items present in unique_sid
# (the items seen among training users).
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

In [21]:
# Per-user split of the validation interactions into fold-in (_tr) and
# held-out (_te) parts, using the function's default test_prop.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

100 users sampled
200 users sampled
300 users sampled
400 users sampled
500 users sampled
600 users sampled
700 users sampled
800 users sampled
900 users sampled
1000 users sampled
1100 users sampled
1200 users sampled
1300 users sampled
1400 users sampled
1500 users sampled
1600 users sampled
1700 users sampled
1800 users sampled
1900 users sampled
2000 users sampled
2100 users sampled
2200 users sampled
2300 users sampled
2400 users sampled
2500 users sampled
2600 users sampled
2700 users sampled
2800 users sampled
2900 users sampled
3000 users sampled
3100 users sampled
3200 users sampled
3300 users sampled
3400 users sampled
3500 users sampled
3600 users sampled
3700 users sampled
3800 users sampled
3900 users sampled
4000 users sampled
4100 users sampled
4200 users sampled
4300 users sampled
4400 users sampled
4500 users sampled
4600 users sampled
4700 users sampled
4800 users sampled
4900 users sampled
5000 users sampled
5100 users sampled
5200 users sampled
5300 users sampled
54

In [22]:
# Interactions of test users, restricted to items present in unique_sid
# (the items seen among training users).
test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

In [23]:
# Per-user split of the test interactions into fold-in (_tr) and held-out
# (_te) parts, using the function's default test_prop.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

100 users sampled
200 users sampled
300 users sampled
400 users sampled
500 users sampled
600 users sampled
700 users sampled
800 users sampled
900 users sampled
1000 users sampled
1100 users sampled
1200 users sampled
1300 users sampled
1400 users sampled
1500 users sampled
1600 users sampled
1700 users sampled
1800 users sampled
1900 users sampled
2000 users sampled
2100 users sampled
2200 users sampled
2300 users sampled
2400 users sampled
2500 users sampled
2600 users sampled
2700 users sampled
2800 users sampled
2900 users sampled
3000 users sampled
3100 users sampled
3200 users sampled
3300 users sampled
3400 users sampled
3500 users sampled
3600 users sampled
3700 users sampled
3800 users sampled
3900 users sampled
4000 users sampled
4100 users sampled
4200 users sampled
4300 users sampled
4400 users sampled
4500 users sampled
4600 users sampled
4700 users sampled
4800 users sampled
4900 users sampled
5000 users sampled
5100 users sampled
5200 users sampled
5300 users sampled
54

### Save the data into (user_index, item_index) format

In [24]:
def numerize(tp):
    """Turn an interaction table into a sparse user-by-item matrix.

    Each row of `tp` contributes a 1 at (profile2id[userId], show2id[movieId]).
    The matrix always has len(profile2id) rows and len(show2id) columns,
    whichever users and items actually occur in `tp`. An id missing from
    either mapping raises KeyError.

    Returns a scipy.sparse CSR matrix.
    """
    user_rows = [profile2id[uid] for uid in tp.userId]
    item_cols = [show2id[sid] for sid in tp.movieId]
    matrix_shape = (len(profile2id), len(show2id))
    interactions = sparse.coo_matrix(
        (np.ones_like(user_rows), (user_rows, item_cols)),
        shape=matrix_shape,
    )
    return interactions.tocsr()

In [25]:
# Sparse matrices for the training interactions and for the validation
# fold-in (val_tr) / held-out (val_te) parts; all share the same shape.
train_data = numerize(train_plays)
val_tr = numerize(vad_plays_tr)
val_te = numerize(vad_plays_te)

In [26]:
# Sparse matrices for the test fold-in (te_tr) and held-out (te_te) parts.
te_tr = numerize(test_plays_tr)
te_te = numerize(test_plays_te)

In [27]:
# Number of rows (users) with at least one interaction in each matrix.
print(np.count_nonzero(np.asarray(train_data.sum(1)).ravel() > 0))
print(np.count_nonzero(np.asarray(val_tr.sum(1)).ravel() > 0))
print(np.count_nonzero(np.asarray(te_tr.sum(1)).ravel() > 0))

116675
10000
10000


In [28]:
# Row indices (positions in the matrices, not raw user ids) of the users that
# have at least one interaction in each matrix.
# NOTE(review): this rebinds tr_users and te_users, which held raw user ids
# until now, and introduces val_users next to the earlier vd_users. Re-running
# the cells that filter raw_data by tr_users / te_users after this cell would
# therefore match against row indices instead of ids.
r = np.arange(n_users)
tr_users = r[np.asarray(train_data.sum(1)).ravel() > 0]
val_users = r[np.asarray(val_tr.sum(1)).ravel() > 0]
te_users = r[np.asarray(te_tr.sum(1)).ravel() > 0]

In [29]:
# Ratio of stored held-out entries to stored fold-in entries for validation users.
val_te.nnz / val_tr.nnz

0.2414057886427456

In [30]:
# Ratio of stored held-out entries to stored fold-in entries for test users.
te_te.nnz / te_tr.nnz

0.24155100167441546

In [31]:
import pickle
# with open(f"drm_data/parsed/{dset}-new", 'wb') as f:
#     pickle.dump((tr_users, val_users, te_users, train_data, val_tr, val_te, te_tr, te_te), f)
from implicit.evaluation import train_test_split
# Add all five matrices back together into one user x item matrix, then split
# that matrix again with implicit's train_test_split.
total = (train_data + val_tr + te_tr + val_te + te_te)
# NOTE(review): no seed is passed to train_test_split, so these two splits are
# presumably different on every run -- confirm against the implicit docs.
# Assuming the second argument is the train fraction, 0.8 followed by 0.875
# yields roughly 70% / 10% / 20% of the entries for tr / val / te.
train, te = train_test_split(total, 0.8)
tr, val = train_test_split(train, 0.875)
# Persist the three matrices as a single pickled tuple (tr, val, te).
with open(f'drm_data/parsed/{dset}-parsed', 'wb') as f:
    pickle.dump((tr, val, te), f)

In [86]:
# from implicit.evaluation import train_test_split
# total = (train_data + val_tr + te_tr + val_te + te_te)
# train, te = train_test_split(total, 0.8)
# tr, val = train_test_split(train, 0.875)
# with open(f'data/parsed/{dset}-712-', 'wb') as f:
#     pickle.dump((tr, val, te), f)

In [32]:
# Sanity check: read the file written above back in and print its contents.
# pickle.load can execute arbitrary code, so only use it on files this
# notebook produced itself.
with open(f'drm_data/parsed/{dset}-parsed', 'rb') as f:
    print(pickle.load(f))

(<136675x15525 sparse matrix of type '<class 'numpy.int64'>'
	with 6990505 stored elements in Compressed Sparse Row format>, <136675x15525 sparse matrix of type '<class 'numpy.int64'>'
	with 997690 stored elements in Compressed Sparse Row format>, <136675x15525 sparse matrix of type '<class 'numpy.int64'>'
	with 1995581 stored elements in Compressed Sparse Row format>)
