In [1]:
import os
import sys

import numpy as np
from scipy import sparse
import pandas as pd

In [2]:
dataset = 'dataset/ml-20m/ratings.csv'
output_dir = 'dataset/afterPre'
threshold = 3.5
min_uc = 5
min_sc = 0
n_heldout_users = 10000

raw_data = pd.read_csv(dataset, header=0)
raw_data = raw_data[raw_data['rating'] > threshold]
raw_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
6,1,151,4.0,1094785734
7,1,223,4.0,1112485573
8,1,253,4.0,1112484940
9,1,260,4.0,1112484826
10,1,293,4.0,1112484703


In [3]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count


def filter_triplets(tp, min_uc=min_uc, min_sc=min_sc): 
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        itemcount = itemcount['movieId']
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]
    
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        usercount = usercount['userId']
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]
    
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId') 
    return tp, usercount, itemcount

In [4]:
itemcount = get_count(raw_data, 'movieId')
itemcount

Unnamed: 0,movieId,size
0,1,33294
1,2,7272
2,3,4015
3,4,694
4,5,3288
...,...,...
20715,131250,1
20716,131252,1
20717,131254,1
20718,131256,1


In [5]:
itemcount = itemcount['movieId']
itemcount

0             1
1             2
2             3
3             4
4             5
          ...  
20715    131250
20716    131252
20717    131254
20718    131256
20719    131262
Name: movieId, Length: 20720, dtype: int64

In [6]:
raw_data = raw_data[raw_data['movieId'].isin(itemcount.index[itemcount >= min_sc])]
print(raw_data)

          userId  movieId  rating   timestamp
6              1      151     4.0  1094785734
7              1      223     4.0  1112485573
8              1      253     4.0  1112484940
9              1      260     4.0  1112484826
10             1      293     4.0  1112484703
...          ...      ...     ...         ...
20000175  138493     8636     4.0  1258135133
20000177  138493     8884     4.0  1255817148
20000178  138493     8905     5.0  1255856919
20000179  138493     8907     4.0  1255816947
20000181  138493     8973     5.0  1255811257

[8861522 rows x 4 columns]


In [16]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data)

sparsity = 1. * raw_data.shape[0] / (user_activity.shape[0] * item_popularity.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (raw_data.shape[0], user_activity.shape[0], item_popularity.shape[0], sparsity * 100))

After filtering, there are 8844835 watching events from 137964 users and 8242 movies (sparsity: 0.778%)


In [17]:
unique_uid = user_activity.index

np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

n_users = unique_uid.size

tr_users = unique_uid[:(n_users - n_heldout_users * 2)]
vd_users = unique_uid[(n_users - n_heldout_users * 2): (n_users - n_heldout_users)]
te_users = unique_uid[(n_users - n_heldout_users):]

train_plays = raw_data.loc[raw_data['userId'].isin(tr_users)]

unique_sid = pd.unique(train_plays['movieId'])

show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

In [28]:
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
profile2id

{116813: 0,
 39630: 1,
 112214: 2,
 60683: 3,
 110911: 4,
 124910: 5,
 71585: 6,
 8359: 7,
 109411: 8,
 23359: 9,
 120163: 10,
 108432: 11,
 127224: 12,
 99183: 13,
 131257: 14,
 41973: 15,
 5785: 16,
 22994: 17,
 118286: 18,
 135793: 19,
 33205: 20,
 55988: 21,
 132425: 22,
 16725: 23,
 4436: 24,
 90137: 25,
 14496: 26,
 132055: 27,
 126512: 28,
 74236: 29,
 112897: 30,
 70077: 31,
 784: 32,
 133455: 33,
 47579: 34,
 80421: 35,
 135294: 36,
 127597: 37,
 103148: 38,
 67329: 39,
 90830: 40,
 9543: 41,
 95399: 42,
 54705: 43,
 79358: 44,
 128180: 45,
 56900: 46,
 74358: 47,
 13240: 48,
 66251: 49,
 118937: 50,
 70280: 51,
 91126: 52,
 116628: 53,
 45622: 54,
 99388: 55,
 43226: 56,
 88275: 57,
 118008: 58,
 69689: 59,
 118564: 60,
 89947: 61,
 113684: 62,
 82403: 63,
 93038: 64,
 97870: 65,
 63281: 66,
 25540: 67,
 30648: 68,
 11659: 69,
 101980: 70,
 69053: 71,
 118255: 72,
 63069: 73,
 36146: 74,
 73206: 75,
 98831: 76,
 56754: 77,
 5685: 78,
 39677: 79,
 4724: 80,
 121774: 81,
 13659

In [29]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

with open(os.path.join(output_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)
        
with open(os.path.join(output_dir, 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)


def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('userId')
    tr_list, te_list = list(), list()

    np.random.seed(98765)

    for i, (_, group) in enumerate(data_grouped_by_user):
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)
    
    return data_tr, data_te

In [30]:
vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]

vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

test_plays = raw_data.loc[raw_data['userId'].isin(te_users)]
test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]

test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

def numerize(tp):
    uid = list(map(lambda x: profile2id[x], tp['userId']))
    sid = list(map(lambda x: show2id[x], tp['movieId']))
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])


train_data = numerize(train_plays)
train_data.to_csv(os.path.join(output_dir, 'train.csv'), index=False)

vad_data_tr = numerize(vad_plays_tr)
vad_data_tr.to_csv(os.path.join(output_dir, 'validation_tr.csv'), index=False)

vad_data_te = numerize(vad_plays_te)
vad_data_te.to_csv(os.path.join(output_dir, 'validation_te.csv'), index=False)

test_data_tr = numerize(test_plays_tr)
test_data_tr.to_csv(os.path.join(output_dir, 'test_tr.csv'), index=False)

test_data_te = numerize(test_plays_te)
test_data_te.to_csv(os.path.join(output_dir, 'test_te.csv'), index=False)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled
0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled
