In [9]:
!pip3 install torch tqdm



In [10]:
import sys

# Directory of the NCF example checkout (presumably where the `load` module
# imported in the next cell lives). The path is machine-specific: adjust it
# when running elsewhere.
NCF_DIR = "/mnt/dldata/vinhn/DeepLearningExamples/PyTorch/Recommendation/NCF"

# Put NCF_DIR at the front of the import path exactly once, so re-running
# this cell does not pile up duplicate entries in sys.path.
sys.path[:] = [entry for entry in sys.path if entry != NCF_DIR]
sys.path.insert(0, NCF_DIR)

In [11]:
from argparse import ArgumentParser
import pandas as pd
from load import implicit_load
import torch
import tqdm

In [12]:
# Users with fewer than this many ratings are dropped from the dataset.
MIN_RATINGS = 20
# Names of the DataFrame columns holding the user id and the item id.
USER_COLUMN = 'user_id'
ITEM_COLUMN = 'item_id'

In [13]:
class _TestNegSampler:
    """Sample random negative items for every user.

    For a user ``u`` a *negative* is an item id ``j`` for which the pair
    ``(u, j)`` is absent from the ratings passed to the constructor.
    Candidates are drawn from ``[0, nb_items)`` and rejected while they are
    known positives; nothing prevents the same item from being returned
    more than once for a user.
    """

    def __init__(self, train_ratings, nb_neg):
        """Index the known (user, item) pairs for fast membership tests.

        Args:
            train_ratings: 2-D numpy array or torch tensor; column 0 holds
                user ids and column 1 holds item ids (non-negative
                integers). The number of users/items is taken to be the
                largest id + 1.
            nb_neg: number of negatives to generate per user.
        """
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # Encode every (user, item) pair as the single integer
        # user * nb_items + item. The arithmetic is done in int64 so that
        # 32-bit inputs cannot overflow, and the keys are stored as plain
        # Python ints so that lookups work no matter whether the ratings
        # came in as a numpy array or as a torch tensor.
        pairs = torch.as_tensor(train_ratings).to(torch.int64)
        ids = pairs[:, 0] * self.nb_items + pairs[:, 1]
        self.set = set(ids.tolist())

    def _has_free_item(self, user):
        """Return True if at least one item is not a known positive of `user`."""
        base = user * self.nb_items
        return any(base + item not in self.set for item in range(self.nb_items))

    def generate(self, batch_size=128*1024):
        """Draw ``nb_neg`` negatives for each user.

        Args:
            batch_size: number of candidate item ids requested from the
                random generator at a time.

        Returns:
            1-D ``torch.LongTensor`` of length ``nb_users * nb_neg``. The
            first ``nb_neg`` entries belong to user 0, the next ``nb_neg``
            to user 1, and so on.

        Raises:
            ValueError: if every item is a known positive for some user,
                in which case no negative exists (previously this looped
                forever).
        """
        nb_items = self.nb_items
        positives = self.set
        # Each user id repeated nb_neg times, in increasing user order.
        users = [u for u in range(self.nb_users) for _ in range(self.nb_neg)]

        items = []
        random_items = torch.LongTensor(batch_size).random_(0, nb_items).tolist()
        print('Generating validation negatives...')
        for u in tqdm.tqdm(users):
            base = u * nb_items
            rejections = 0
            while True:
                if not random_items:
                    # Candidate pool exhausted: draw a fresh batch.
                    random_items = torch.LongTensor(batch_size).random_(0, nb_items).tolist()
                j = random_items.pop()
                if base + j not in positives:
                    break
                rejections += 1
                # After nb_items consecutive rejections, check once whether
                # a negative exists at all instead of spinning forever.
                if rejections == nb_items and not self._has_free_item(u):
                    raise ValueError(
                        'user {} has interacted with all {} items; '
                        'no negative can be sampled'.format(u, nb_items))
            items.append(j)
        return torch.LongTensor(items)

In [14]:
# Load the raw MovieLens ratings through the project's loader.
df = implicit_load('/mnt/dldata/vinhn/DeepLearningExamples/PyTorch/Recommendation/NCF/data/ml-20m/ratings.csv', sort=False)

print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
# Vectorised equivalent of grouped.filter(lambda x: len(x) >= MIN_RATINGS):
# broadcast each user's row count back onto the rows and keep the rows of
# users that reach the threshold. This avoids one Python call per user.
# The explicit copy makes the column assignments below operate on an
# independent frame rather than on a slice of the loaded one.
has_enough_ratings = (grouped[USER_COLUMN].transform('size') >= MIN_RATINGS).values
df = df[has_enough_ratings].copy()

print("Mapping original user and item IDs to new sequential IDs")
# pd.factorize returns (codes, uniques): the codes are the new zero-based
# ids, and uniques[code] gives back the original id.
df[USER_COLUMN], unique_users = pd.factorize(df[USER_COLUMN])
df[ITEM_COLUMN], unique_items = pd.factorize(df[ITEM_COLUMN])


20000263 ratings on 26744 items from 138493 users from 1995-01-09 11:46:44 to 2015-03-31 06:40:02
Filtering out users with less than 20 ratings
Mapping original user and item IDs to new sequential IDs


In [15]:
# Inspect the item column after it has been remapped to sequential ids.
df[ITEM_COLUMN]

0              0
1              1
2              2
3              3
4              4
            ... 
20000258    1814
20000259    1037
20000260    3950
20000261    1818
20000262    4010
Name: item_id, Length: 20000263, dtype: int64

In [16]:
# Original user ids as returned by pd.factorize: position i holds the
# original id of the user that now has id i.
unique_users

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            138484, 138485, 138486, 138487, 138488, 138489, 138490, 138491,
            138492, 138493],
           dtype='int64', length=138493)

In [17]:
# Original item ids as returned by pd.factorize: position i holds the
# original id of the item that now has id i.
unique_items

Int64Index([     2,     29,     32,     47,     50,    112,    151,    223,
               253,    260,
            ...
            104307, 106170, 106401, 113539, 118856, 121017, 121019, 121021,
            110167, 110510],
           dtype='int64', length=26744)

In [18]:
import pickle

# Persist the lookup tables from the new sequential ids back to the original
# user/item ids, so that results can be translated back later.
with open('./mappings.pickle', 'wb') as handle:
    pickle.dump({"users": unique_users, "items": unique_items}, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Sort chronologically so that the last row of every user's group is that
# user's most recent interaction.
df = df.sort_values(by='timestamp')

# From here on only the (user, item) pair is needed.
df = df.drop(columns=['rating', 'timestamp'])
df = df.drop_duplicates()  # keeps the first occurrence, so row order is preserved

# Leave-one-out split: the most recent interaction of each user becomes the
# test example, everything before it is training data.
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)

# cumcount(ascending=False) numbers the rows of each group counting down to
# 0, so 0 marks each user's last row. Dropping those rows replaces
# apply(lambda x: x.iloc[:-1]), which calls Python once per user and relies
# on apply seeing the grouping column (deprecated in recent pandas).
is_last = (grouped_sorted.cumcount(ascending=False) == 0).values
# The stable sort by user reproduces the row order apply() gave: users in
# increasing id order, chronological within each user.
train_data = df[~is_last].sort_values(by=USER_COLUMN, kind='mergesort')



In [20]:
# Convert both splits to tensors and store them on disk as the fixed
# train / validation data.
# Note: the row ordering of the reference training data cannot be reproduced
# here (the reference relies on a Python set and multi-processing). That
# should not matter, because the data is randomized again later.
train_ratings = torch.from_numpy(train_data.values)
test_ratings = torch.from_numpy(test_data.values)

torch.save(train_ratings, './train_ratings.pt')
torch.save(test_ratings, './test_ratings.pt')



In [21]:
# Sanity check: one row per training interaction, one column each for user and item.
train_ratings.shape

torch.Size([19861770, 2])

In [22]:
# One held-out (user, item) row per user, ordered by user id.
test_ratings

tensor([[     0,     62],
        [     1,     15],
        [     2,    336],
        ...,
        [138490,    173],
        [138491,    204],
        [138492,    695]])

In [23]:
# Draw the evaluation negatives for every user and store them as a tensor
# with one row per user and one column per negative.
# NOTE(review): the sampler is built from the training pairs only, so a
# user's held-out test item is not excluded and may be drawn as a negative.
# Confirm this is intended.
# NOTE(review): .cuda() means this cell needs a GPU and the saved tensor
# lives on a CUDA device.
sampler = _TestNegSampler(train_ratings.cpu().numpy(), 100)  # using 100 negative samples
test_negs = sampler.generate().cuda().reshape(-1, sampler.nb_neg)
torch.save(test_negs, './test_negatives.pt')

  1%|          | 147596/13849300 [00:00<00:09, 1475954.37it/s]

Generating validation negatives...


100%|██████████| 13849300/13849300 [00:08<00:00, 1688484.61it/s]


In [24]:
# Sanity check: one row of negatives per user.
test_negs.shape

torch.Size([138493, 100])

In [25]:
# Negatives sampled for the user with id 1.
test_negs[1]

tensor([13526,  6010,  2580, 20795, 17019, 13412,  9002,   278,  2323, 15147,
         6525,  9238,  6427,  5638, 21305,  8535, 22253,   680,  6121, 23959,
        12321,  4001,  1903,  4116,  4869, 20119, 12954,  2814, 20602, 12807,
        15583,  1976, 26648,  6573,  8468, 21415, 24641, 13358,  9048, 23819,
        17701,   986, 11426, 11518,  8002, 18193,  5919,  2840, 20801, 22531,
        22351, 18287, 22100,  5961, 24353, 18816, 13567,   585,  7582, 10957,
        12966, 12392, 16984, 23337, 23803, 10115,  9488, 23602, 26272, 12293,
        19981, 16574, 16295,  7151, 10912,  3921,  8372, 16605, 21906,  7044,
         8300, 14418,  2519,  8738,  8490, 24165, 20467, 12010,  9091, 20323,
        13288, 18053,  5985, 17934, 11163, 15424,  5963, 13823,  1443, 25891],
       device='cuda:0')