# Download data

In [6]:
%%bash
# Download and unpack the MovieLens 20M dataset into ./data.
set -e
# Install unzip only when it is missing; -y because a notebook cell cannot answer apt's prompt.
command -v unzip >/dev/null 2>&1 || apt-get install -y unzip
mkdir -p data
cd data
if [ ! -f "ml-20m.zip" ]; then
    echo "Downloading data"
    # Download under a temporary name and rename on success, so an interrupted
    # transfer is not mistaken for a finished archive on the next run.
    wget -O ml-20m.zip.part http://files.grouplens.org/datasets/movielens/ml-20m.zip
    mv ml-20m.zip.part ml-20m.zip
fi
# Extract whenever the folder is missing, not only right after a fresh download.
if [ ! -d "ml-20m" ]; then
    unzip ml-20m.zip
fi

Reading package lists...
Building dependency tree...
Reading state information...
unzip is already the newest version (6.0-21ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.






In [7]:
# List ./data to confirm the download cell left both the archive and the extracted folder.
!ls ./data

ml-20m	ml-20m.zip


In [8]:
from argparse import ArgumentParser
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [9]:
# Users with fewer than this many ratings are dropped before the train/test split.
MIN_RATINGS = 20
# Column names of the MovieLens ratings file used as user and item identifiers.
USER_COLUMN = 'userId'
ITEM_COLUMN = 'movieId'

In [10]:
def _has_enough_ratings(user_rows):
    """Group predicate: keep a user only if they have at least MIN_RATINGS rows."""
    return len(user_rows) >= MIN_RATINGS


# Load the raw ratings, then keep only the users that pass the predicate above.
df = pd.read_csv('./data/ml-20m/ratings.csv')
print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(_has_enough_ratings)

Filtering out users with less than 20 ratings


In [11]:
# Put the ratings in chronological order so that the final row of each user's
# group is that user's most recent rating.
df = df.sort_values(by='timestamp')

# Only the (user, movie) pairs are needed from here on.
df = df.drop(columns=['rating', 'timestamp'])
df = df.drop_duplicates()  # the surviving rows keep their relative (time) order


def _all_but_last(user_rows):
    """Return every row of one user's group except the final one."""
    return user_rows.iloc[:-1]


# Leave-one-out split: each user's last interaction goes to the test set,
# everything before it goes to the training set.
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
train_data = grouped_sorted.apply(_all_but_last)

In [12]:
# Every observed (user, movie) pair is a positive example, so both splits get label 1.
for frame in (train_data, test_data):
    frame['target'] = 1


In [13]:
# Preview the first rows of the training frame (userId, movieId, target).
train_data.head()

Unnamed: 0,userId,movieId,target
20,1,924,1
19,1,919,1
86,1,2683,1
61,1,1584,1
23,1,1079,1


In [14]:
from collections import defaultdict
import os
import pickle

# Use one path for the existence check, the write and the read, so that the
# cache this cell writes is the same file it looks for on the next run.
USER_ITEM_DICTS_PATH = './user_item_dicts.pickle'


def _build_interaction_dicts(frame):
    """Build the two lookup tables for the interactions in ``frame``.

    Args:
        frame: DataFrame with 'userId' and 'movieId' columns.

    Returns:
        (user_items, item_users): two defaultdict(set) objects mapping each
        user to the set of items they interacted with, and each item to the
        set of users who interacted with it.
    """
    user_items = defaultdict(set)
    item_users = defaultdict(set)
    # Iterating the two column arrays avoids building a Series per row, which
    # is what makes iterrows() slow on millions of rows.
    pairs = zip(frame['userId'].values, frame['movieId'].values)
    for user, item in tqdm(pairs, total=len(frame)):
        user_items[user].add(item)
        item_users[item].add(user)
    return user_items, item_users


if not os.path.exists(USER_ITEM_DICTS_PATH):
    print('building dict...')
    user_items_train, item_users_train = _build_interaction_dicts(train_data)
    user_items_test, item_users_test = _build_interaction_dicts(test_data)

    with open(USER_ITEM_DICTS_PATH, 'wb') as handle:
        pickle.dump({"user_items_train": user_items_train,
                     "item_users_train": item_users_train,
                     "user_items_test": user_items_test,
                     "item_users_test": item_users_test,
                    }, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load can execute code stored in the file; only load a cache
    # that this notebook wrote. The cache is not invalidated when train_data or
    # test_data change, so delete the file after changing the split.
    with open(USER_ITEM_DICTS_PATH, 'rb') as handle:
        dat = pickle.load(handle)
    user_items_train = dat["user_items_train"]
    item_users_train = dat["item_users_train"]
    user_items_test = dat["user_items_test"]
    item_users_test = dat["item_users_test"]

In [15]:
# Sorted lists of the distinct user ids and movie ids that survived filtering.
users, items = (list(np.unique(df[column])) for column in ('userId', 'movieId'))

In [16]:
def NegativeSampling(user_items, items, neg_sample_factor=5):
    """Sample items each user has NOT interacted with.

    For every user, up to ``neg_sample_factor * len(user_items[user])``
    negatives are collected. Candidates are drawn uniformly (with replacement)
    from ``items``; 20% more candidates than needed are drawn to make up for
    duplicates and for hits on items the user already has. No further draws are
    made, so a user can end up with fewer negatives than requested.

    Args:
        user_items: mapping user -> set of items the user interacted with.
        items: indexable sequence of all candidate items.
        neg_sample_factor: negatives requested per positive interaction.

    Returns:
        (neg_user_items, cnt): a defaultdict(set) mapping user -> sampled
        negative items, and the total number of sampled negatives.
    """
    neg_user_items = defaultdict(set)
    total_negatives = 0
    n_items = len(items)
    for user in tqdm(user_items):
        seen = user_items[user]
        wanted = neg_sample_factor * len(seen)

        # One batch of random positions into `items`, oversampled by 20%.
        candidate_positions = np.random.randint(0, n_items, (int(wanted * 1.2)))

        picked = neg_user_items[user]
        for position in candidate_positions:
            candidate = items[position]
            if candidate not in seen:
                picked.add(candidate)
            # Stop as soon as the quota is reached.
            if len(picked) == wanted:
                break

        total_negatives += len(picked)
    return neg_user_items, total_negatives

In [17]:
#train_negs, num_train_negs = NegativeSampling(user_items_train, items, 30)

In [18]:
#num_train_negs

In [19]:
# Request one negative per training interaction for every user (neg_sample_factor=1);
# these rows become the label-0 part of the test set.
# NOTE(review): candidates are only checked against user_items_train, so a user's
# held-out test item is not excluded and can also be drawn as a negative — confirm
# this is acceptable.
test_negs, num_test_negs = NegativeSampling(user_items_train, items, 1)

100%|██████████| 138493/138493 [00:26<00:00, 5302.37it/s]


In [20]:
# Flatten the per-user negative sets into a (num_test_negs, 3) integer array
# whose columns are user id, item id and label (always 0 for negatives).
test_data_neg = np.zeros((num_test_negs, 3), dtype=int)
idx = 0
for user, negative_items in tqdm(test_negs.items()):
    for item in negative_items:
        test_data_neg[idx] = (user, item, 0)
        idx += 1

100%|██████████| 138493/138493 [00:16<00:00, 8437.59it/s]


In [21]:
# Total number of sampled negative rows (the row count of test_data_neg).
num_test_negs

19842714

In [22]:
# Training rows as a plain array, shuffled in place along the first axis.
# NOTE(review): .values may share memory with train_data, in which case the
# in-place shuffle also reorders the frame — confirm nothing relies on its order.
train_data_np = train_data.values
np.random.shuffle(train_data_np)

# Test set: sampled negatives (label 0) stacked on top of the held-out
# positives (label 1), then shuffled so the two kinds of rows are mixed.
test_data_np = np.vstack((test_data_neg, test_data.values))
np.random.shuffle(test_data_np)

In [23]:
# Labels are 0 or 1, so summing the label column counts the positive test rows.
test_data_np[:, 2].sum()

138493

In [24]:
# Peek at the shuffled test array; columns are user id, item id, label.
test_data_np

array([[ 39604,   5185,      0],
       [120840,  80354,      0],
       [ 70874, 121797,      0],
       ...,
       [ 84480, 104809,      0],
       [ 86473,   5126,      0],
       [ 55153,   2378,      0]])

# Write to text file

In [25]:
# Random column of 100000 integers in [0, 100); not referenced by write_DLRM_data below.
rd = np.random.randint(0, 100, size=(100000,1))


def write_DLRM_data(data, filename='dlrm_data.tsv'):
    """Write interaction rows to a tab-separated text file.

    Args:
        data: 2-D array whose columns 0, 1 and 2 hold user id, item id and label.
        filename: path of the file to (over)write.

    Each output line is ``label<TAB>1<TAB>userID<TAB>itemID``; the constant 1 is
    a dummy numeric feature.
    """
    n_rows = data.shape[0]
    print("Writing %d samples"%n_rows, filename)
    with open(filename, 'wt') as out:
        for row in tqdm(range(n_rows)):
            label = data[row, 2]
            user_id = data[row, 0]
            item_id = data[row, 1]
            out.write('%d\t%d\t%d\t%d\n'%(label, 1, user_id, item_id))

In [26]:
!rm -rf /data/dlrm/
!mkdir -p /data/dlrm/criteo
for i, data_arr in enumerate(np.array_split(train_data_np,23)):
    write_DLRM_data(data_arr, filename='/data/dlrm/criteo/day_%d'%i)

  4%|▍         | 38281/863556 [00:00<00:02, 382803.06it/s]

Writing 863556 samples /data/dlrm/criteo/day_0


100%|██████████| 863556/863556 [00:01<00:00, 467697.86it/s]
  9%|▉         | 78331/863556 [00:00<00:02, 366386.48it/s]

Writing 863556 samples /data/dlrm/criteo/day_1


100%|██████████| 863556/863556 [00:01<00:00, 456689.05it/s]
 10%|▉         | 83362/863556 [00:00<00:01, 407061.26it/s]

Writing 863556 samples /data/dlrm/criteo/day_2


100%|██████████| 863556/863556 [00:01<00:00, 462938.49it/s]
 10%|█         | 86759/863556 [00:00<00:01, 431235.05it/s]

Writing 863556 samples /data/dlrm/criteo/day_3


100%|██████████| 863556/863556 [00:01<00:00, 465759.97it/s]
 10%|█         | 87613/863556 [00:00<00:01, 438688.32it/s]

Writing 863556 samples /data/dlrm/criteo/day_4


100%|██████████| 863556/863556 [00:01<00:00, 472516.04it/s]
  9%|▉         | 79911/863555 [00:00<00:02, 378971.23it/s]

Writing 863555 samples /data/dlrm/criteo/day_5


100%|██████████| 863555/863555 [00:01<00:00, 464754.39it/s]
  5%|▌         | 43991/863555 [00:00<00:01, 439902.03it/s]

Writing 863555 samples /data/dlrm/criteo/day_6


100%|██████████| 863555/863555 [00:01<00:00, 449781.00it/s]
 10%|█         | 88840/863555 [00:00<00:01, 440238.58it/s]

Writing 863555 samples /data/dlrm/criteo/day_7


100%|██████████| 863555/863555 [00:01<00:00, 475810.05it/s]
 10%|▉         | 84763/863555 [00:00<00:01, 411952.03it/s]

Writing 863555 samples /data/dlrm/criteo/day_8


100%|██████████| 863555/863555 [00:01<00:00, 465736.25it/s]
 10%|█         | 88812/863555 [00:00<00:01, 442132.80it/s]

Writing 863555 samples /data/dlrm/criteo/day_9


100%|██████████| 863555/863555 [00:01<00:00, 470090.82it/s]
 10%|█         | 86768/863555 [00:00<00:01, 431774.20it/s]

Writing 863555 samples /data/dlrm/criteo/day_10


100%|██████████| 863555/863555 [00:01<00:00, 464487.56it/s]
 10%|█         | 87359/863555 [00:00<00:01, 433599.63it/s]

Writing 863555 samples /data/dlrm/criteo/day_11


100%|██████████| 863555/863555 [00:01<00:00, 458602.26it/s]
 10%|▉         | 83642/863555 [00:00<00:01, 403317.45it/s]

Writing 863555 samples /data/dlrm/criteo/day_12


100%|██████████| 863555/863555 [00:01<00:00, 460339.88it/s]
 10%|█         | 87331/863555 [00:00<00:01, 436417.39it/s]

Writing 863555 samples /data/dlrm/criteo/day_13


100%|██████████| 863555/863555 [00:01<00:00, 469497.81it/s]
  5%|▌         | 43665/863555 [00:00<00:01, 436645.21it/s]

Writing 863555 samples /data/dlrm/criteo/day_14


100%|██████████| 863555/863555 [00:01<00:00, 462191.97it/s]
 10%|▉         | 82366/863555 [00:00<00:01, 395349.24it/s]

Writing 863555 samples /data/dlrm/criteo/day_15


100%|██████████| 863555/863555 [00:01<00:00, 455476.58it/s]
 10%|█         | 88785/863555 [00:00<00:01, 442652.97it/s]

Writing 863555 samples /data/dlrm/criteo/day_16


100%|██████████| 863555/863555 [00:01<00:00, 473528.54it/s]
 10%|█         | 87841/863555 [00:00<00:01, 436944.36it/s]

Writing 863555 samples /data/dlrm/criteo/day_17


100%|██████████| 863555/863555 [00:01<00:00, 469507.42it/s]
 10%|▉         | 82349/863555 [00:00<00:01, 394498.56it/s]

Writing 863555 samples /data/dlrm/criteo/day_18


100%|██████████| 863555/863555 [00:01<00:00, 471163.60it/s]
 10%|█         | 88495/863555 [00:00<00:01, 438725.38it/s]

Writing 863555 samples /data/dlrm/criteo/day_19


100%|██████████| 863555/863555 [00:01<00:00, 474987.90it/s]
  5%|▍         | 43167/863555 [00:00<00:01, 431654.97it/s]

Writing 863555 samples /data/dlrm/criteo/day_20


100%|██████████| 863555/863555 [00:01<00:00, 443510.48it/s]
  9%|▉         | 81957/863555 [00:00<00:01, 394580.67it/s]

Writing 863555 samples /data/dlrm/criteo/day_21


100%|██████████| 863555/863555 [00:01<00:00, 469065.44it/s]
 10%|█         | 88202/863555 [00:00<00:01, 439011.23it/s]

Writing 863555 samples /data/dlrm/criteo/day_22


100%|██████████| 863555/863555 [00:01<00:00, 476167.23it/s]


In [27]:
# Spot-check the first training shard: label, dummy numeric feature, user id, item id.
!head /data/dlrm/criteo/day_0

1	1	34938	517
1	1	116914	480
1	1	67709	457
1	1	119223	2394
1	1	22448	3617
1	1	49081	1572
1	1	45044	5538
1	1	6540	3101
1	1	47912	1193
1	1	34187	3863


In [28]:
# The mixed positive/negative test set is written as day_23, after the training shards day_0 .. day_22.
write_DLRM_data(test_data_np, filename='/data/dlrm/criteo/day_23')

  0%|          | 41556/19981207 [00:00<00:47, 415557.42it/s]

Writing 19981207 samples /data/dlrm/criteo/day_23


100%|██████████| 19981207/19981207 [00:42<00:00, 472135.02it/s]


In [29]:
# Show the head of the first training shard again (same command as the earlier check).
!head /data/dlrm/criteo/day_0

1	1	34938	517
1	1	116914	480
1	1	67709	457
1	1	119223	2394
1	1	22448	3617
1	1	49081	1572
1	1	45044	5538
1	1	6540	3101
1	1	47912	1193
1	1	34187	3863


In [30]:
# Spot-check the end of the test file; unlike the training shards it contains label-0 rows.
!tail /data/dlrm/criteo/day_23

0	1	137164	51486
0	1	93729	4563
0	1	48652	91208
0	1	84611	99960
0	1	2856	79496
0	1	58921	72913
0	1	74404	108780
0	1	84480	104809
0	1	86473	5126
0	1	55153	2378


# Preprocessing with DLRM

In [31]:
%%bash
# Run the DLRM preprocessing script on the files written to /data/dlrm/criteo.
# -y: the cell has no TTY, so apt must not wait for a confirmation prompt.
apt-get install -y psmisc
# Stop here if the directory is missing, instead of looking for the script in the wrong place.
cd ./preproc || exit 1
# Kill any running java processes before starting (presumably JVMs left over from an
# earlier preprocessing run — confirm). "no process found" is harmless.
killall -9 java
bash ./prepare_dataset.sh

Reading package lists...
Building dependency tree...
Reading state information...
The following NEW packages will be installed:
  psmisc
0 upgraded, 1 newly installed, 0 to remove and 32 not upgraded.
Need to get 52.5 kB of archives.
After this operation, 266 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 psmisc amd64 23.1-1ubuntu0.1 [52.5 kB]
Fetched 52.5 kB in 1s (40.4 kB/s)
Selecting previously unselected package psmisc.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ...



debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
java: no process found
+ ls -ltrash
+ download_dir=/data/dlrm/criteo
+ ./verify_criteo_downloaded.sh /data/dlrm/criteo
++ download_dir=/data/dlrm/criteo
++ cd /data/dlrm/criteo
+++ seq 0 23
++ for i in $(seq 0 23)
++ filename=day_0
++ '[' -f day_0 ']'
++ echo 'day_0 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_1
++ '[' -f day_1 ']'
++ echo 'day_1 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_2
++ '[' -f day_2 ']'
++ echo 'day_2 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_3
++ '[' -f day_3 ']'
++ echo 'day_3 exists, OK'
++ 

In [32]:
# Show the sizes recorded for the two categorical columns.
# "_c2" equals the number of users iterated during negative sampling (138493);
# "_c3" is presumably the number of distinct items — confirm.
!cat /data/dlrm/binary_dataset/model_size.json


{
    "_c2": 138493,
    "_c3": 26744
}

# Training

In [33]:
# Launch DLRM training on /data/dlrm/binary_dataset/; --num_numerical_features 1
# matches the single dummy numeric column written by write_DLRM_data.
# NOTE(review): the recorded output of this cell is a SyntaxError raised from main.py
# (line 317), i.e. the failure is inside the script rather than in these arguments —
# possibly a Python-version mismatch between the kernel and the script; confirm.
%run /workspace/dlrm/dlrm/scripts/main.py \
--mode train \
--dataset /data/dlrm/binary_dataset/ \
--num_numerical_features 1 \
--base_device cuda \
--lr 0.1 \
--embedding_type joint \
--epochs 10 \
--batch_size=8192 \
--save_checkpoint_path ./movie_lens_model

SyntaxError: invalid syntax (main.py, line 317)