In [1]:
!pip3 install torch tqdm
!apt install unzip

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Collecting tqdm
  Downloading tqdm-4.48.2-py2.py3-none-any.whl (68 kB)
[K     |################################| 68 kB 4.6 MB/s eta 0:00:011
Installing collected packages: tqdm
Successfully installed tqdm-4.48.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
Suggested packages:
  zip
The following NEW packages will be installed:
  unzip
0 upgraded, 1 newly installed, 0 to remove and 2 not upgraded.
Need to get 167 kB of archives.
After this operation, 558 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 unzip amd64 6.0-21ubuntu1 [167 kB]
Fetched 167 kB in 1s (182 kB/s)[0m[33m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package 

In [2]:
%%bash
mkdir -p data
cd data
if [ ! -f "ml-20m.zip" ]; then
    echo "Downloading data"
    wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
fi
unzip -f ml-20m.zip

Archive:  ml-20m.zip


In [3]:
from argparse import ArgumentParser
import pandas as pd
import numpy as np
import torch
import tqdm

In [4]:
MIN_RATINGS = 20
USER_COLUMN = 'userId'
ITEM_COLUMN = 'movieId'

In [5]:
class _TestNegSampler:
    def __init__(self, train_ratings, nb_neg):
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128*1024):
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()

            items[idx] = j
        items = torch.LongTensor(items)
        return items

In [23]:
df = pd.read_csv('./data/ml-20m/ratings.csv')
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

print("Mapping original user and item IDs to new sequential IDs")
df[USER_COLUMN], unique_users = pd.factorize(df[USER_COLUMN])
df[ITEM_COLUMN], unique_items = pd.factorize(df[ITEM_COLUMN])


Filtering out users with less than 20 ratings
Mapping original user and item IDs to new sequential IDs


In [24]:
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)

# clean up data
del df['rating'], df['timestamp']
df = df.drop_duplicates() # assuming it keeps order

# now we have filtered and sorted by time data, we can split test data out
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
# need to pop for each group
train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

In [25]:
train_data['target']=1
test_data['target']=1
train_data.head()

Unnamed: 0,userId,movieId,target
20,0,20,1
19,0,19,1
86,0,86,1
61,0,61,1
23,0,23,1


In [26]:
test_data.head()

Unnamed: 0,userId,movieId,target
62,0,62,1
184,1,15,1
389,2,336,1
445,3,381,1
515,4,50,1


In [42]:
sampler = _TestNegSampler(df.values, 200)  # using 200 negative samples
train_negs = sampler.generate()
train_negs = train_negs.reshape(-1, 200)

sampler = _TestNegSampler(df.values, 100)  # using 100 negative samples
test_negs = sampler.generate()
test_negs = test_negs.reshape(-1, 100)

  0%|          | 99992/27698600 [00:00<00:27, 999916.19it/s]

Generating validation negatives...


100%|██████████| 27698600/27698600 [00:17<00:00, 1612585.26it/s]
  1%|          | 147479/13849300 [00:00<00:09, 1474787.89it/s]

Generating validation negatives...


100%|██████████| 13849300/13849300 [00:08<00:00, 1639083.71it/s]


In [43]:
# generating negative samples for training
train_data_neg = np.zeros((train_negs.shape[0]*train_negs.shape[1],3), dtype=int)
idx = 0
for i in tqdm.tqdm(range(train_negs.shape[0])):
    for j in range(train_negs.shape[1]):
        train_data_neg[idx, 0] = i # user ID
        train_data_neg[idx, 1] = train_negs[i, j] # negative item ID
        idx += 1

100%|██████████| 138493/138493 [02:07<00:00, 1083.38it/s]


In [44]:
# generating negative samples for testing
test_data_neg = np.zeros((test_negs.shape[0]*test_negs.shape[1],3), dtype=int)
idx = 0
for i in tqdm.tqdm(range(test_negs.shape[0])):
    for j in range(test_negs.shape[1]):
        test_data_neg[idx, 0] = i
        test_data_neg[idx, 1] = test_negs[i, j]
        idx += 1

100%|██████████| 138493/138493 [01:05<00:00, 2130.65it/s]


In [45]:
train_data_np= np.concatenate([train_data_neg, train_data.values])
np.random.shuffle(train_data_np)

test_data_np= np.concatenate([test_data_neg, test_data.values])
np.random.shuffle(test_data_np)


In [46]:
train_data_np

array([[ 34445,   1084,      1],
       [  3522,   1929,      1],
       [ 75910,  14211,      0],
       ...,
       [ 71244,    147,      1],
       [120159,   8188,      0],
       [ 74903,  19952,      0]])

In [47]:
test_data_np

array([[113722,  21868,      0],
       [ 93078,  19998,      0],
       [ 91055,  24771,      0],
       ...,
       [ 97670,  18205,      0],
       [122801,  24227,      0],
       [ 29252,   6700,      0]])

In [48]:
np.sum(test_data_np[:,2])

138493

In [49]:
np.sum(train_data_np[:,2])

19861770

# Write HugeCTR data files

## Train data

In [50]:
from ctypes import c_longlong as ll
from ctypes import c_uint
from ctypes import c_float
from ctypes import c_int

def write_hugeCTR_data(huge_ctr_data, filename='huge_ctr_data.dat'):
    print("Writing %d samples"%huge_ctr_data.shape[0])
    with open(filename, 'wb') as f:
        #write header
        f.write(ll(0)) # 0: no error check; 1: check_num
        f.write(ll(huge_ctr_data.shape[0])) # the number of samples in this data file
        f.write(ll(1)) # dimension of label
        f.write(ll(1)) # dimension of dense feature
        f.write(ll(2)) # long long slot_num
        for _ in range(3): f.write(ll(0)) # reserved for future use

        for i in tqdm.tqdm(range(huge_ctr_data.shape[0])):
            f.write(c_float(huge_ctr_data[i,2])) # float label[label_dim];
            f.write(c_float(0)) # dummy dense feature
            f.write(c_int(1)) # slot 1 nnz: user ID
            f.write(c_uint(huge_ctr_data[i,0]))
            f.write(c_int(1)) # slot 2 nnz: item ID
            f.write(c_uint(huge_ctr_data[i,1]))

In [51]:
!mkdir ./data/hugeCTR
for i, data_arr in enumerate(np.array_split(train_data_np,10)):
    write_hugeCTR_data(data_arr, filename='./data/hugeCTR/huge_ctr_data_%d.dat'%i)

mkdir: cannot create directory './data/hugeCTR': File exists
Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 501618.24it/s]
  2%|▏         | 101399/4756037 [00:00<00:09, 506853.90it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 523266.60it/s]
  1%|          | 49308/4756037 [00:00<00:09, 493074.59it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 526751.91it/s]
  1%|          | 49593/4756037 [00:00<00:09, 495928.11it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 521966.02it/s]
  1%|          | 50883/4756037 [00:00<00:09, 508824.42it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 522543.23it/s]
  1%|          | 51114/4756037 [00:00<00:09, 511139.27it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 521857.35it/s]
  1%|          | 51048/4756037 [00:00<00:09, 510473.18it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 517325.40it/s]
  1%|          | 51057/4756037 [00:00<00:09, 510565.62it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 526401.42it/s]
  2%|▏         | 101642/4756037 [00:00<00:09, 508595.72it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 525027.54it/s]
  1%|          | 50464/4756037 [00:00<00:09, 504636.87it/s]

Writing 4756037 samples


100%|██████████| 4756037/4756037 [00:09<00:00, 526082.15it/s]


In [52]:
with open('./data/hugeCTR/filelist.txt', 'wt') as f:
    f.write('10\n');
    for i in range(10):
        f.write('./data/hugeCTR/huge_ctr_data_%d.dat\n'%i)

In [53]:
!cat ./data/hugeCTR/filelist.txt

10
./data/hugeCTR/huge_ctr_data_0.dat
./data/hugeCTR/huge_ctr_data_1.dat
./data/hugeCTR/huge_ctr_data_2.dat
./data/hugeCTR/huge_ctr_data_3.dat
./data/hugeCTR/huge_ctr_data_4.dat
./data/hugeCTR/huge_ctr_data_5.dat
./data/hugeCTR/huge_ctr_data_6.dat
./data/hugeCTR/huge_ctr_data_7.dat
./data/hugeCTR/huge_ctr_data_8.dat
./data/hugeCTR/huge_ctr_data_9.dat


## Test data


In [54]:
for i, data_arr in enumerate(np.array_split(test_data_np,10)):
    write_hugeCTR_data(data_arr, filename='./data/hugeCTR/test_huge_ctr_data_%d.dat'%i)

  0%|          | 0/1398780 [00:00<?, ?it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:02<00:00, 510143.69it/s]
  0%|          | 0/1398780 [00:00<?, ?it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:02<00:00, 523339.13it/s]
  7%|▋         | 101146/1398780 [00:00<00:02, 505430.76it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:02<00:00, 515810.91it/s]
  4%|▎         | 50587/1398779 [00:00<00:02, 505860.83it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 517477.87it/s]
  7%|▋         | 101771/1398779 [00:00<00:02, 510141.77it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 506829.12it/s]
  4%|▎         | 51161/1398779 [00:00<00:02, 511603.17it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 523018.81it/s]
  4%|▎         | 52195/1398779 [00:00<00:02, 521944.28it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 520899.57it/s]
  4%|▎         | 49561/1398779 [00:00<00:02, 495601.02it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 522423.51it/s]
  4%|▎         | 51535/1398779 [00:00<00:02, 515349.26it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 524258.63it/s]
  4%|▎         | 52054/1398779 [00:00<00:02, 520534.29it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:02<00:00, 521648.21it/s]


In [55]:
with open('./data/hugeCTR/test_filelist.txt', 'wt') as f:
    f.write('10\n');
    for i in range(10):
        f.write('./data/hugeCTR/test_huge_ctr_data_%d.dat\n'%i)