### Importing packages

In [21]:
import os
import shutil
import sys
import csv
import numpy as np
from scipy import sparse
import pandas as pd
import json
import pickle

### Check the current location

In [4]:
# Show the working directory; the relative paths used below (drm_data/, data/parsed/) resolve against it.
!pwd

/home/ita/drm_test


### Choose the dataset
#### Set `dset` in the cell below to one of: `['ml-20m', 'sketchfab', 'epinion', 'melon']`

In [30]:
# Create the working directories for raw downloads and parsed output.
# os.makedirs(..., exist_ok=True) creates missing parents and is a no-op when
# the directory already exists, so re-running this cell no longer prints
# "mkdir: cannot create directory ...: File exists".
os.makedirs(os.path.join('drm_data', 'parsed'), exist_ok=True)

# Dataset to process; must be one of 'ml-20m', 'sketchfab', 'epinion', 'melon'.
dset = 'epinion'

mkdir: cannot create directory ‘drm_data’: File exists
mkdir: cannot create directory ‘drm_data/parsed’: File exists


### Download the raw dataset

In [31]:
min_uc=5
min_sc=3

if dset == 'ml-20m':
    !wget -c -O drm_data/ml-20m.zip http://files.grouplens.org/datasets/movielens/ml-20m.zip
    if not os.path.isdir('./drm_data/ml-20m'):
        !unzip -d drm_data/ drm_data/ml-20m.zip
    raw_data = pd.read_csv(os.path.join('drm_data', 'ml-20m', 'ratings.csv'))
    raw_data.columns = ['userId', 'movieId', 'rating', 'ts']
    raw_data = raw_data[raw_data['rating'] > 3.5]

elif dset == 'sketchfab':
    !mkdir drm_data/sketchfab
    !wget -c -O drm_data/sketchfab/model_likes_anon.psv https://raw.githubusercontent.com/EthanRosenthal/rec-a-sketch/master/data/model_likes_anon.psv
    raw_data = pd.read_csv(os.path.join('drm_data', 'sketchfab', 'model_likes_anon.psv'), 
                           delimiter='|', 
                           quotechar='\\')
    raw_data['userId'] = raw_data['uid'].astype("object")
    raw_data['movieId'] = raw_data['mid'].astype("object")
    min_sc = 5
    
elif dset == 'melon':
    raw_data = pd.read_json(os.path.join('drm_data', 'melon', 'train.json'))
    print(raw_data)
    rows = []
    cols = []
    for i, r in raw_data.iterrows():
        rows.extend([i] * len(r.songs))
        cols.extend(r.songs)
    raw_data = pd.DataFrame({"userId": rows, "movieId": cols})
    min_sc = 10

elif dset == 'epinion':
    !wget -c -O drm_data/epinion.zip https://www.cse.msu.edu/~tangjili/datasetcode/epinions_with_rating_timestamp.zip
    if not os.path.isdir('./drm_data/epinion_with_rating_timestamp'):
        !unzip -d drm_data/ drm_data/epinion.zip
    import scipy.io
    rat = scipy.io.loadmat("drm_data/epinion_with_rating_timestamp/rating_with_timestamp.mat")['rating_with_timestamp']
    u = rat[:, 0]
    i = rat[:, 1]
    r = rat[:, 3]
    raw_data = pd.DataFrame({
        'userId' : u,
        'movieId': i,
        'rating': r,
    })
    print(raw_data)

--2020-09-15 01:21:50--  https://www.cse.msu.edu/~tangjili/datasetcode/epinions_with_rating_timestamp.zip
Resolving www.cse.msu.edu (www.cse.msu.edu)... 35.9.20.103
Connecting to www.cse.msu.edu (www.cse.msu.edu)|35.9.20.103|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6289772 (6.0M) [application/zip]
Saving to: ‘drm_data/epinion.zip’


2020-09-15 01:24:07 (45.6 KB/s) - ‘drm_data/epinion.zip’ saved [6289772/6289772]

Archive:  drm_data/epinion.zip
   creating: drm_data/epinion_with_rating_timestamp/
  inflating: drm_data/epinion_with_rating_timestamp/rating_with_timestamp.mat  
  inflating: drm_data/epinion_with_rating_timestamp/trust.mat  
        userId  movieId  rating
0            1        1       2
1            1        2       2
2            1        3       2
3            1        4       5
4            1        5       3
...        ...      ...     ...
922262   22166    83922       4
922263   22166    23442       4
922264   22166    43538       5
92

### Parsing the data

In [32]:
def get_count(tp, id):
    """Count how many rows of ``tp`` carry each distinct value of column ``id``.

    Args:
        tp: DataFrame of interactions.
        id: Name of the column to count by (e.g. 'userId' or 'movieId').

    Returns:
        A pandas Series indexed by the distinct values of ``tp[id]``; each
        value is the number of rows with that id.
    """
    # Group with the default as_index=True. With as_index=False, newer pandas
    # returns a DataFrame with a fresh RangeIndex from .size(), so callers that
    # select ids via `count.index[count >= k]` no longer get ids back.
    return tp.groupby(id).size()

In [33]:
def filter_triplets(tp, min_uc=5, min_sc=0):
    """Drop interactions that belong to rare items or inactive users.

    Args:
        tp: DataFrame with 'userId' and 'movieId' columns.
        min_uc: Minimum number of interactions a user needs to be kept;
            values <= 0 disable the user filter.
        min_sc: Minimum number of interactions an item needs to be kept;
            values <= 0 disable the item filter.

    Returns:
        Tuple ``(tp, usercount, itemcount)``: the filtered frame plus the
        per-user and per-item counts recomputed on the filtered frame.
    """
    # Item filter runs first.
    if min_sc > 0:
        per_item = get_count(tp, 'movieId')
        kept_items = per_item.index[per_item >= min_sc]
        tp = tp[tp['movieId'].isin(kept_items)]

    # User filter runs second, on the already item-filtered frame. Removing
    # users can push a few items back below min_sc; there is no second pass.
    if min_uc > 0:
        per_user = get_count(tp, 'userId')
        kept_users = per_user.index[per_user >= min_uc]
        tp = tp[tp['userId'].isin(kept_users)]

    # Recompute both counts so they describe the frame that is returned.
    usercount = get_count(tp, 'userId')
    itemcount = get_count(tp, 'movieId')
    return tp, usercount, itemcount

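Keep only users with at least `min_uc` interactions and items with at least `min_sc` interactions (the thresholds set in the download cell; 5 and 3 for `epinion`).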

In [34]:
# Apply the activity thresholds chosen in the download cell; keep the filtered
# frame together with the per-user and per-item interaction counts.
_raw_data, user_activity, item_popularity = filter_triplets(
    raw_data,
    min_uc=min_uc,
    min_sc=min_sc,
)

In [35]:
# Share of empty cells in the (kept users) x (kept items) grid.
num_events = _raw_data.shape[0]
num_users_kept = user_activity.shape[0]
num_items_kept = item_popularity.shape[0]
sparsity = 1. - num_events / (num_users_kept * num_items_kept)

summary_values = (num_events, num_users_kept, num_items_kept, sparsity * 100)
print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % summary_values)

After filtering, there are 640918 watching events from 21396 users and 59377 movies (sparsity: 99.950%)


In [36]:
# From here on `raw_data` refers to the filtered interactions; the unfiltered
# frame is no longer reachable under this name.
raw_data = _raw_data

In [37]:
# Seed NumPy's global RNG, then put the kept user ids into a random order.
# NOTE(review): `unique_uid` is reassigned from plays['userId'] in the next
# cell, so this shuffled order looks unused in the visible cells; the seed and
# the permutation draw still advance the global RNG state.
np.random.seed(98765)
idx_perm = np.random.permutation(len(user_activity.index))
unique_uid = user_activity.index[idx_perm]

In [38]:
# Build dense 0-based indices for users and items, numbered in order of first
# appearance in the filtered interactions.
plays = raw_data
unique_sid = pd.unique(plays['movieId'])
unique_uid = pd.unique(plays['userId'])
n_users = len(unique_uid)

# raw item id -> column index, raw user id -> row index
show2id = {sid: position for position, sid in enumerate(unique_sid)}
profile2id = {pid: position for position, pid in enumerate(unique_uid)}

### Save the data into (user_index, item_index) format

In [39]:
def numerize(tp):
    """Turn an interaction frame into a user x item sparse count matrix.

    Raw ids in ``tp.userId`` / ``tp.movieId`` are translated through the
    module-level ``profile2id`` / ``show2id`` maps (an id missing from a map
    raises ``KeyError``). Each row of ``tp`` contributes a 1 at its
    (user, item) cell; repeated pairs are summed when the COO matrix is
    converted to CSR.

    Returns:
        CSR matrix of shape ``(len(profile2id), len(show2id))``.
    """
    row = [profile2id[uid] for uid in tp.userId]
    col = [show2id[sid] for sid in tp.movieId]
    ones = np.ones_like(row)
    matrix_shape = (len(profile2id), len(show2id))
    interactions = sparse.coo_matrix((ones, (row, col)), shape=matrix_shape)
    return interactions.tocsr()

In [40]:
# Sparse user x item matrix built from the filtered interactions.
data = numerize(plays)

In [46]:
# Sanity check: list the row indices of users with at least one interaction
# next to the total user count; every user should show up.
r = np.arange(n_users)
has_interaction = np.asarray(data.sum(1)).ravel() > 0
r[has_interaction], len(r)

(array([    0,     1,     2, ..., 21393, 21394, 21395]), 21396)

In [43]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell at the top.
from implicit.evaluation import train_test_split
total = data
# Two-stage random split of the interactions. Assuming the second argument is
# the training fraction (TODO confirm against the installed implicit version):
# first 80% train+val / 20% test, then 87.5% of that 80% (= 70% overall) goes
# to `tr` and the remainder (= 10% overall) to `val`.
# NOTE(review): no seed is passed here - confirm the split is reproducible.
train, te = train_test_split(total, 0.8)
tr, val = train_test_split(train, 0.875)

In [44]:
# Make sure the output directory exists. os.makedirs also creates the missing
# parent `data/` (plain `mkdir ./data/parsed` fails in that case) and is a
# no-op when the directory is already there.
# NOTE(review): the setup cell creates `drm_data/parsed`, but the splits are
# written under `data/parsed` - confirm which location is intended.
os.makedirs('data/parsed', exist_ok=True)

# Persist the three CSR matrices as one pickled (train, validation, test) tuple.
with open(f'data/parsed/{dset}-parsed', 'wb') as f:
    pickle.dump((tr, val, te), f)

In [45]:
# Read the pickle back and print it to confirm the three splits round-trip.
# Only unpickle files you produced yourself: pickle.load executes arbitrary code.
with open(f'data/parsed/{dset}-parsed', 'rb') as f:
    reloaded_splits = pickle.load(f)
    print(reloaded_splits)

(<21396x59377 sparse matrix of type '<class 'numpy.int64'>'
	with 442122 stored elements in Compressed Sparse Row format>, <21396x59377 sparse matrix of type '<class 'numpy.int64'>'
	with 62852 stored elements in Compressed Sparse Row format>, <21396x59377 sparse matrix of type '<class 'numpy.int64'>'
	with 126448 stored elements in Compressed Sparse Row format>)
