In [2]:
!pip3 install torch tqdm
!apt install unzip

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Collecting torch
  Downloading torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl (748.8 MB)
[K     |################################| 748.8 MB 4.1 kB/s  eta 0:00:01    |###                             | 85.7 MB 73.8 MB/s eta 0:00:09     |#########                       | 223.2 MB 74.1 MB/s eta 0:00:08     |#######################         | 541.9 MB 81.5 MB/s eta 0:00:03     |########################        | 575.6 MB 81.5 MB/s eta 0:00:03     |#############################   | 694.0 MB 72.7 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.48.2-py2.py3-none-any.whl (68 kB)
[K     |################################| 68 kB 4.4 MB/s  eta 0:00:01
[?25hCollecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |################################| 829 kB 67.6 MB/s eta 0:00:01
Building wheels fo

In [8]:
%%bash
mkdir -p data
cd data
if [ ! -f "ml-20m.zip" ]; then
    echo "Downloading data"
    wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
fi
unzip -f ml-20m.zip

Archive:  ml-20m.zip


In [10]:
from argparse import ArgumentParser
import pandas as pd
import torch
import tqdm

In [20]:
MIN_RATINGS = 20
USER_COLUMN = 'userId'
ITEM_COLUMN = 'movieId'

In [12]:
class _TestNegSampler:
    def __init__(self, train_ratings, nb_neg):
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128*1024):
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()

            items[idx] = j
        items = torch.LongTensor(items)
        return items

In [24]:
df = pd.read_csv('./data/ml-20m/ratings.csv')
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

print("Mapping original user and item IDs to new sequential IDs")
df[USER_COLUMN], unique_users = pd.factorize(df[USER_COLUMN])
df[ITEM_COLUMN], unique_items = pd.factorize(df[ITEM_COLUMN])


Filtering out users with less than 20 ratings
Mapping original user and item IDs to new sequential IDs


In [25]:
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)

# clean up data
del df['rating'], df['timestamp']
df = df.drop_duplicates() # assuming it keeps order

# now we have filtered and sorted by time data, we can split test data out
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
# need to pop for each group
train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

In [26]:
train_data['target']=1
train_data.head()

Unnamed: 0,userId,movieId,target
20,0,20,1
19,0,19,1
86,0,86,1
61,0,61,1
23,0,23,1


In [27]:
sampler = _TestNegSampler(train_data.values, 100)  # using 100 negative samples
test_negs = sampler.generate().cuda()
test_negs = test_negs.reshape(-1, 100)

  1%|          | 85397/13849300 [00:00<00:16, 853964.71it/s]

Generating validation negatives...


100%|██████████| 13849300/13849300 [00:14<00:00, 970617.69it/s] 


In [28]:
import numpy as np
train_data_neg = np.zeros((test_negs.shape[0]*test_negs.shape[1],3), dtype=int)

In [29]:
idx = 0
for i in range(test_negs.shape[0]):
    for j in range(test_negs.shape[1]):
        train_data_neg[idx, 0] = i
        train_data_neg[idx, 1] = j
        idx += 1

In [30]:
import numpy as np

def gen_test_data(test_data):
    sampler = _TestNegSampler(test_data.values, 1)  # using 100 negative samples
    test_negs = sampler.generate()
    test_negs = test_negs.reshape(-1, 1)


    test_data_neg = np.zeros((test_negs.shape[0]*test_negs.shape[1],3), dtype=int)
    
    idx = 0
    for i in range(test_negs.shape[0]):
        for j in range(test_negs.shape[1]):
            test_data_neg[idx, 0] = i
            test_data_neg[idx, 1] = j
            idx += 1
            
    test_data['target'] = 1
    return np.concatenate([test_data_neg, test_data.values])

test_data = gen_test_data(test_data)

100%|██████████| 138493/138493 [00:00<00:00, 975720.84it/s]

Generating validation negatives...





"""
typedef struct DataSetHeader_ {
  long long error_check;        // 0: no error check; 1: check_num
  long long number_of_records;  // the number of samples in this data file
  long long label_dim;          // dimension of label
  long long dense_dim;          // dimension of dense feature
  long long slot_num;           // slot_num for each embedding
  long long reserved[3];        // reserved for future use
} DataSetHeader;
"""

In [32]:
from ctypes import c_longlong as ll
from ctypes import c_uint as uint
from ctypes import c_float as float
from ctypes import c_int as int

def write_hugeCTR_data(huge_ctr_data, filename='huge_ctr_data.dat'):
    print("Writing %d samples"%huge_ctr_data.shape[0])
    with open(filename, 'wb') as f:
        #write header
        f.write(ll(0)) # 0: no error check; 1: check_num
        f.write(ll(huge_ctr_data.shape[0])) # the number of samples in this data file
        f.write(ll(1)) # dimension of label
        f.write(ll(1)) # dimension of dense feature
        f.write(ll(2)) # long long slot_num
        for _ in range(3): f.write(ll(0)) # reserved for future use

        for i in tqdm.tqdm(range(huge_ctr_data.shape[0])):
            f.write(float(huge_ctr_data[i,2])) # float label[label_dim];
            # skip float dense[dense_dim];
            f.write(float(0)) # dummy dense feature
            f.write(int(1)) # slot 1 nnz: user ID
            f.write(uint(huge_ctr_data[i,0]))
            f.write(int(1)) # slot 2 nnz: item ID
            f.write(uint(huge_ctr_data[i,1]))

In [33]:
data= np.concatenate([train_data_neg, train_data.values])

In [34]:
data.shape

(33711070, 3)

In [35]:
np.random.shuffle(data)

In [36]:
data

array([[ 83223,   1007,      1],
       [ 68531,      2,      1],
       [  7057,    410,      1],
       ...,
       [114155,    230,      1],
       [ 29355,     67,      1],
       [115519,     84,      0]])

In [37]:
for i, data_arr in enumerate(np.split(data,10)):
    write_hugeCTR_data(data_arr, filename='huge_ctr_data_%d.dat'%i)

  1%|          | 23512/3371107 [00:00<00:14, 235110.13it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 248304.64it/s]
  1%|          | 23105/3371107 [00:00<00:14, 231040.86it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 246376.43it/s]
  1%|          | 23406/3371107 [00:00<00:14, 234052.97it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 246399.96it/s]
  1%|          | 23200/3371107 [00:00<00:14, 231996.35it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 246532.97it/s]
  1%|          | 23207/3371107 [00:00<00:14, 232061.92it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 246710.48it/s]
  1%|          | 23144/3371107 [00:00<00:14, 231434.15it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 247659.21it/s]
  1%|          | 22997/3371107 [00:00<00:14, 229960.35it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244852.51it/s]
  1%|          | 23259/3371107 [00:00<00:14, 232583.57it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244783.08it/s]
  1%|          | 23159/3371107 [00:00<00:14, 231583.04it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244629.61it/s]
  1%|          | 23036/3371107 [00:00<00:14, 230343.74it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244177.54it/s]


In [38]:
with open('filelist.txt', 'wt') as f:
    f.write('10\n');
    for i in range(10):
        f.write('huge_ctr_data_%d.dat\n'%i)

In [39]:
!cat filelist.txt

10
huge_ctr_data_0.dat
huge_ctr_data_1.dat
huge_ctr_data_2.dat
huge_ctr_data_3.dat
huge_ctr_data_4.dat
huge_ctr_data_5.dat
huge_ctr_data_6.dat
huge_ctr_data_7.dat
huge_ctr_data_8.dat
huge_ctr_data_9.dat


# Test data


In [40]:
write_hugeCTR_data(test_data, filename='test_huge_ctr_data.dat')

  0%|          | 0/276986 [00:00<?, ?it/s]

Writing 276986 samples


100%|██████████| 276986/276986 [00:01<00:00, 240041.61it/s]
