In [1]:
!pip3 install torch tqdm
!apt install unzip

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-21ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.


In [2]:
%%bash
mkdir -p data
cd data
if [ ! -f "ml-20m.zip" ]; then
    echo "Downloading data"
    wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
fi
unzip -f ml-20m.zip

Archive:  ml-20m.zip


In [3]:
from argparse import ArgumentParser
import pandas as pd
import numpy as np
import torch
import tqdm

In [4]:
MIN_RATINGS = 20
USER_COLUMN = 'userId'
ITEM_COLUMN = 'movieId'

In [5]:
class _TestNegSampler:
    def __init__(self, train_ratings, nb_neg):
        self.nb_neg = nb_neg
        self.nb_users = int(train_ratings[:, 0].max()) + 1
        self.nb_items = int(train_ratings[:, 1].max()) + 1

        # compute unique ids for quickly created hash set and fast lookup
        ids = (train_ratings[:, 0] * self.nb_items) + train_ratings[:, 1]
        self.set = set(ids)

    def generate(self, batch_size=128*1024):
        users = torch.arange(0, self.nb_users).reshape([1, -1]).repeat([self.nb_neg, 1]).transpose(0, 1).reshape(-1)

        items = [-1] * len(users)

        random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
        print('Generating validation negatives...')
        for idx, u in enumerate(tqdm.tqdm(users.tolist())):
            if not random_items:
                random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
            j = random_items.pop()
            while u * self.nb_items + j in self.set:
                if not random_items:
                    random_items = torch.LongTensor(batch_size).random_(0, self.nb_items).tolist()
                j = random_items.pop()

            items[idx] = j
        items = torch.LongTensor(items)
        return items

In [6]:
def gen_negative_data(data, num_neg=100):
    # Generate negative data from a data frame
    sampler = _TestNegSampler(data.values, num_neg)  # using 100 negative samples
    test_negs = sampler.generate()
    test_negs = test_negs.reshape(-1, 1)


    test_data_neg = np.zeros((test_negs.shape[0]*test_negs.shape[1],3), dtype=int)
    
    idx = 0
    for i in range(test_negs.shape[0]):
        for j in range(test_negs.shape[1]):
            test_data_neg[idx, 0] = i
            test_data_neg[idx, 1] = j
            idx += 1
            
    return test_data_neg        
    

In [7]:
df = pd.read_csv('./data/ml-20m/ratings.csv')
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

print("Mapping original user and item IDs to new sequential IDs")
df[USER_COLUMN], unique_users = pd.factorize(df[USER_COLUMN])
df[ITEM_COLUMN], unique_items = pd.factorize(df[ITEM_COLUMN])


Filtering out users with less than 20 ratings
Mapping original user and item IDs to new sequential IDs


In [8]:
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)

# clean up data
del df['rating'], df['timestamp']
df = df.drop_duplicates() # assuming it keeps order

# now we have filtered and sorted by time data, we can split test data out
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
# need to pop for each group
train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

In [9]:
train_data['target']=1
test_data['target']=1
train_data.head()

Unnamed: 0,userId,movieId,target
20,0,20,1
19,0,19,1
86,0,86,1
61,0,61,1
23,0,23,1


In [10]:
test_data.head()

Unnamed: 0,userId,movieId,target
62,0,62,1
184,1,15,1
389,2,336,1
445,3,381,1
515,4,50,1


In [11]:
sampler = _TestNegSampler(df.values, 100)  # using 100 negative samples
train_negs = sampler.generate()
train_negs = train_negs.reshape(-1, 100)

sampler = _TestNegSampler(df.values, 100)  # using 100 negative samples
test_negs = sampler.generate()
test_negs = test_negs.reshape(-1, 10)

  1%|          | 85084/13849300 [00:00<00:16, 850838.78it/s]

Generating validation negatives...


100%|██████████| 13849300/13849300 [00:13<00:00, 1019308.80it/s]
  1%|          | 86215/13849300 [00:00<00:15, 862148.77it/s]

Generating validation negatives...


100%|██████████| 13849300/13849300 [00:13<00:00, 1006633.86it/s]


In [12]:
train_data_neg = np.zeros((train_negs.shape[0]*train_negs.shape[1],3), dtype=int)
idx = 0
for i in range(test_negs.shape[0]):
    for j in range(test_negs.shape[1]):
        train_data_neg[idx, 0] = i
        train_data_neg[idx, 1] = j
        idx += 1

In [13]:
test_data_neg = np.zeros((test_negs.shape[0]*test_negs.shape[1],3), dtype=int)
idx = 0
for i in range(test_negs.shape[0]):
    for j in range(test_negs.shape[1]):
        test_data_neg[idx, 0] = i
        test_data_neg[idx, 1] = j
        idx += 1

In [14]:
train_data_np= np.concatenate([train_data_neg, train_data.values])
np.random.shuffle(train_data_np)

test_data_np= np.concatenate([test_data_neg, test_data.values])
np.random.shuffle(test_data_np)


# Write HugeCTR data files

## Train data

In [15]:
from ctypes import c_longlong as ll
from ctypes import c_uint
from ctypes import c_float
from ctypes import c_int

def write_hugeCTR_data(huge_ctr_data, filename='huge_ctr_data.dat'):
    print("Writing %d samples"%huge_ctr_data.shape[0])
    with open(filename, 'wb') as f:
        #write header
        f.write(ll(0)) # 0: no error check; 1: check_num
        f.write(ll(huge_ctr_data.shape[0])) # the number of samples in this data file
        f.write(ll(1)) # dimension of label
        f.write(ll(1)) # dimension of dense feature
        f.write(ll(2)) # long long slot_num
        for _ in range(3): f.write(ll(0)) # reserved for future use

        for i in tqdm.tqdm(range(huge_ctr_data.shape[0])):
            f.write(c_float(huge_ctr_data[i,2])) # float label[label_dim];
            f.write(c_float(0)) # dummy dense feature
            f.write(c_int(1)) # slot 1 nnz: user ID
            f.write(c_uint(huge_ctr_data[i,0]))
            f.write(c_int(1)) # slot 2 nnz: item ID
            f.write(c_uint(huge_ctr_data[i,1]))

In [16]:
!mkdir ./data/hugeCTR
for i, data_arr in enumerate(np.array_split(train_data_np,10)):
    write_hugeCTR_data(data_arr, filename='./data/hugeCTR/huge_ctr_data_%d.dat'%i)

mkdir: cannot create directory './data/hugeCTR': File exists


  0%|          | 0/3371107 [00:00<?, ?it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244903.83it/s]
  1%|          | 23168/3371107 [00:00<00:14, 231676.91it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244112.60it/s]
  1%|          | 23451/3371107 [00:00<00:14, 234502.96it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 243427.90it/s]
  1%|          | 23121/3371107 [00:00<00:14, 231202.50it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244990.91it/s]
  1%|          | 23334/3371107 [00:00<00:14, 233332.99it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 245433.58it/s]
  1%|          | 23192/3371107 [00:00<00:14, 231919.12it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244587.61it/s]
  1%|          | 23189/3371107 [00:00<00:14, 231887.46it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244548.72it/s]
  1%|          | 23399/3371107 [00:00<00:14, 233984.09it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 245846.81it/s]
  1%|          | 23228/3371107 [00:00<00:14, 232271.91it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 245625.61it/s]
  1%|          | 23385/3371107 [00:00<00:14, 233849.11it/s]

Writing 3371107 samples


100%|██████████| 3371107/3371107 [00:13<00:00, 244760.18it/s]


In [17]:
with open('./data/hugeCTR/filelist.txt', 'wt') as f:
    f.write('10\n');
    for i in range(10):
        f.write('./data/hugeCTR/huge_ctr_data_%d.dat\n'%i)

In [18]:
!cat ./data/hugeCTR/filelist.txt

10
./data/hugeCTR/huge_ctr_data_0.dat
./data/hugeCTR/huge_ctr_data_1.dat
./data/hugeCTR/huge_ctr_data_2.dat
./data/hugeCTR/huge_ctr_data_3.dat
./data/hugeCTR/huge_ctr_data_4.dat
./data/hugeCTR/huge_ctr_data_5.dat
./data/hugeCTR/huge_ctr_data_6.dat
./data/hugeCTR/huge_ctr_data_7.dat
./data/hugeCTR/huge_ctr_data_8.dat
./data/hugeCTR/huge_ctr_data_9.dat


## Test data


In [19]:
for i, data_arr in enumerate(np.array_split(test_data_np,10)):
    write_hugeCTR_data(data_arr, filename='./data/hugeCTR/test_huge_ctr_data_%d.dat'%i)

  0%|          | 0/1398780 [00:00<?, ?it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:05<00:00, 248849.94it/s]
  2%|▏         | 23610/1398780 [00:00<00:05, 236094.03it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:05<00:00, 250321.94it/s]
  2%|▏         | 23318/1398780 [00:00<00:05, 233170.77it/s]

Writing 1398780 samples


100%|██████████| 1398780/1398780 [00:05<00:00, 242331.92it/s]
  2%|▏         | 23157/1398779 [00:00<00:05, 231568.01it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 241467.76it/s]
  2%|▏         | 23154/1398779 [00:00<00:05, 231536.91it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 241830.59it/s]
  2%|▏         | 23278/1398779 [00:00<00:05, 232771.90it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 243302.29it/s]
  2%|▏         | 23057/1398779 [00:00<00:05, 230560.87it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 242021.28it/s]
  2%|▏         | 23229/1398779 [00:00<00:05, 232281.36it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 242198.67it/s]
  2%|▏         | 23195/1398779 [00:00<00:05, 231940.82it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 240561.84it/s]
  2%|▏         | 23395/1398779 [00:00<00:05, 233947.43it/s]

Writing 1398779 samples


100%|██████████| 1398779/1398779 [00:05<00:00, 241903.86it/s]


In [20]:
with open('./data/hugeCTR/test_filelist.txt', 'wt') as f:
    f.write('10\n');
    for i in range(10):
        f.write('./data/hugeCTR/test_huge_ctr_data_%d.dat\n'%i)