# Download data

In [1]:
%%bash
apt install unzip
mkdir -p data
cd data
if [ ! -f "ml-25m.zip" ]; then
    echo "Downloading data"
    wget http://files.grouplens.org/datasets/movielens/ml-25m.zip
    unzip ml-25m.zip
fi

Reading package lists...
Building dependency tree...
Reading state information...
unzip is already the newest version (6.0-21ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 32 not upgraded.






In [2]:
!ls ./data

ml-20m	ml-20m.zip  ml-25m  ml-25m.zip


In [3]:
from argparse import ArgumentParser
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [4]:
MIN_RATINGS = 20
USER_COLUMN = 'userId'
ITEM_COLUMN = 'movieId'

In [5]:
#using IMDB ID
df = pd.read_csv('./data/ml-25m/ratings.csv')


In [6]:
links = pd.read_csv('./data/ml-25m/links.csv', dtype={'imdbId':str})

In [7]:
df = df.merge(links, how = 'left', on='movieId')

In [8]:
df['movieId'] = df['imdbId']
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,imdbId,tmdbId
0,1,110912,5.0,1147880044,110912,680.0
1,1,111495,3.5,1147868817,111495,110.0
2,1,108394,5.0,1147868828,108394,108.0
3,1,114787,5.0,1147878820,114787,11902.0
4,1,45152,3.5,1147868510,45152,872.0


In [9]:

print("Filtering out users with less than {} ratings".format(MIN_RATINGS))
grouped = df.groupby(USER_COLUMN)
df = grouped.filter(lambda x: len(x) >= MIN_RATINGS)

Filtering out users with less than 20 ratings


In [10]:
# Need to sort before popping to get last item
df.sort_values(by='timestamp', inplace=True)

# clean up data
del df['rating'], df['timestamp']
df = df.drop_duplicates() # assuming it keeps order

# now we have filtered and sorted by time data, we can split test data out
grouped_sorted = df.groupby(USER_COLUMN, group_keys=False)
test_data = grouped_sorted.tail(1).sort_values(by=USER_COLUMN)
# need to pop for each group
train_data = grouped_sorted.apply(lambda x: x.iloc[:-1])

In [11]:
train_data['target']=1
test_data['target']=1


In [12]:
users = np.unique(train_data['userId'])
items = np.unique(train_data['movieId'])

In [13]:
train_data = train_data[['userId','movieId', 'target']]
train_data.head()

Unnamed: 0,userId,movieId,target
36,1,167261,1
13,1,99088,1
12,1,96874,1
11,1,119177,1
9,1,50212,1


In [14]:
test_data = test_data[['userId','movieId', 'target']]
test_data.head()

Unnamed: 0,userId,movieId,target
48,1,338013,1
148,2,80801,1
307,3,118715,1
919,4,71853,1
1226,5,53271,1


# Write to text file

In [15]:
def write_DLRM_data(data, filename='dlrm_data.tsv'):
    print("Writing %d samples"%data.shape[0], filename)
    with open(filename, 'wt') as f:
        for i in tqdm(range(data.shape[0])):
            f.write('%d\t%d\t%d\t%s\n'%(data[i,2], 1, data[i,0], data[i,1])) #label, dummy numeric feat., userID, itemID

In [16]:
train_data_np= train_data.values
np.random.shuffle(train_data_np)


In [17]:
test_data_np= test_data.values
test_data_np_neg = np.zeros_like(test_data)


In [18]:
test_data_np_neg[:,0] = np.random.randint(1,len(users),test_data_np_neg.shape[0])
test_data_np_neg[:,1] = np.random.randint(2,len(items),test_data_np_neg.shape[0])

In [19]:
test_data_np_neg

array([[130896, 46173, 0],
       [106913, 53325, 0],
       [161987, 38842, 0],
       ...,
       [28231, 38531, 0],
       [87930, 39152, 0],
       [34217, 56868, 0]], dtype=object)

In [20]:
test_data_np = np.concatenate((test_data_np, test_data_np_neg), axis=0)

In [21]:
test_data_np

array([[1, '0338013', 1],
       [2, '0080801', 1],
       [3, '0118715', 1],
       ...,
       [28231, 38531, 0],
       [87930, 39152, 0],
       [34217, 56868, 0]], dtype=object)

In [22]:
!rm -rf /data/dlrm/
!mkdir -p /data/dlrm/criteo
for i, data_arr in enumerate(np.array_split(train_data_np,23)):
    write_DLRM_data(data_arr, filename='/data/dlrm/criteo/day_%d'%i)

  3%|▎         | 30425/1079894 [00:00<00:03, 304245.21it/s]

Writing 1079894 samples /data/dlrm/criteo/day_0


100%|██████████| 1079894/1079894 [00:02<00:00, 514984.05it/s]
  5%|▍         | 51572/1079894 [00:00<00:01, 515716.80it/s]

Writing 1079894 samples /data/dlrm/criteo/day_1


100%|██████████| 1079894/1079894 [00:01<00:00, 545000.69it/s]
 10%|▉         | 104804/1079894 [00:00<00:01, 521081.75it/s]

Writing 1079894 samples /data/dlrm/criteo/day_2


100%|██████████| 1079894/1079894 [00:01<00:00, 569235.82it/s]
  9%|▉         | 101472/1079894 [00:00<00:01, 506162.25it/s]

Writing 1079894 samples /data/dlrm/criteo/day_3


100%|██████████| 1079894/1079894 [00:01<00:00, 558184.29it/s]
  5%|▍         | 49493/1079894 [00:00<00:02, 494929.29it/s]

Writing 1079894 samples /data/dlrm/criteo/day_4


100%|██████████| 1079894/1079894 [00:01<00:00, 565891.18it/s]
  9%|▉         | 95087/1079894 [00:00<00:02, 455658.37it/s]

Writing 1079894 samples /data/dlrm/criteo/day_5


100%|██████████| 1079894/1079894 [00:01<00:00, 563203.79it/s]
  5%|▍         | 50193/1079894 [00:00<00:02, 501923.30it/s]

Writing 1079894 samples /data/dlrm/criteo/day_6


100%|██████████| 1079894/1079894 [00:01<00:00, 567715.82it/s]
  9%|▊         | 92577/1079894 [00:00<00:02, 476814.53it/s]

Writing 1079894 samples /data/dlrm/criteo/day_7


100%|██████████| 1079894/1079894 [00:01<00:00, 556783.78it/s]
  9%|▉         | 99848/1079894 [00:00<00:01, 496852.83it/s]

Writing 1079894 samples /data/dlrm/criteo/day_8


100%|██████████| 1079894/1079894 [00:01<00:00, 556339.80it/s]
 10%|▉         | 102780/1079894 [00:00<00:01, 514750.35it/s]

Writing 1079894 samples /data/dlrm/criteo/day_9


100%|██████████| 1079894/1079894 [00:01<00:00, 573466.66it/s]
  9%|▉         | 97369/1079894 [00:00<00:02, 488321.84it/s]

Writing 1079894 samples /data/dlrm/criteo/day_10


100%|██████████| 1079894/1079894 [00:01<00:00, 549007.85it/s]
  4%|▍         | 45790/1079894 [00:00<00:02, 457898.25it/s]

Writing 1079894 samples /data/dlrm/criteo/day_11


100%|██████████| 1079894/1079894 [00:01<00:00, 545596.91it/s]
  4%|▍         | 42002/1079894 [00:00<00:02, 420012.39it/s]

Writing 1079894 samples /data/dlrm/criteo/day_12


100%|██████████| 1079894/1079894 [00:01<00:00, 573398.20it/s]
  5%|▍         | 51251/1079894 [00:00<00:02, 512501.94it/s]

Writing 1079894 samples /data/dlrm/criteo/day_13


100%|██████████| 1079894/1079894 [00:01<00:00, 541691.42it/s]
  9%|▉         | 102259/1079894 [00:00<00:01, 501594.52it/s]

Writing 1079894 samples /data/dlrm/criteo/day_14


100%|██████████| 1079894/1079894 [00:01<00:00, 576798.58it/s]
  9%|▉         | 98572/1079893 [00:00<00:02, 474007.80it/s]

Writing 1079893 samples /data/dlrm/criteo/day_15


100%|██████████| 1079893/1079893 [00:01<00:00, 566253.93it/s]
  5%|▍         | 50488/1079893 [00:00<00:02, 504878.07it/s]

Writing 1079893 samples /data/dlrm/criteo/day_16


100%|██████████| 1079893/1079893 [00:01<00:00, 552395.50it/s]
  9%|▉         | 100602/1079893 [00:00<00:02, 479842.80it/s]

Writing 1079893 samples /data/dlrm/criteo/day_17


100%|██████████| 1079893/1079893 [00:01<00:00, 552608.54it/s]
 10%|▉         | 104194/1079893 [00:00<00:01, 519607.12it/s]

Writing 1079893 samples /data/dlrm/criteo/day_18


100%|██████████| 1079893/1079893 [00:01<00:00, 570774.82it/s]
 10%|▉         | 104238/1079893 [00:00<00:01, 521822.14it/s]

Writing 1079893 samples /data/dlrm/criteo/day_19


100%|██████████| 1079893/1079893 [00:01<00:00, 570095.92it/s]
  4%|▍         | 47340/1079893 [00:00<00:02, 473391.42it/s]

Writing 1079893 samples /data/dlrm/criteo/day_20


100%|██████████| 1079893/1079893 [00:01<00:00, 553342.24it/s]
 10%|▉         | 105260/1079893 [00:00<00:01, 516459.72it/s]

Writing 1079893 samples /data/dlrm/criteo/day_21


100%|██████████| 1079893/1079893 [00:01<00:00, 576518.70it/s]
  4%|▍         | 43970/1079893 [00:00<00:02, 439697.27it/s]

Writing 1079893 samples /data/dlrm/criteo/day_22


100%|██████████| 1079893/1079893 [00:01<00:00, 560094.83it/s]


In [23]:
!tail /data/dlrm/criteo/day_0

1	1	18544	0298203
1	1	125843	0080684
1	1	95203	0112573
1	1	73639	0258463
1	1	132651	0109781
1	1	29464	0083658
1	1	66219	0077405
1	1	44389	0134847
1	1	152508	0088794
1	1	9439	0267804


In [24]:
write_DLRM_data(test_data_np, filename='/data/dlrm/criteo/day_23')

  0%|          | 0/325082 [00:00<?, ?it/s]

Writing 325082 samples /data/dlrm/criteo/day_23


100%|██████████| 325082/325082 [00:00<00:00, 557972.00it/s]


In [25]:
!head /data/dlrm/criteo/day_0

1	1	94294	0053291
1	1	157552	1478338
1	1	53227	0190524
1	1	162488	0086190
1	1	53193	0120762
1	1	51552	0352248
1	1	132227	2140479
1	1	61050	0095776
1	1	119495	0046250
1	1	103464	0361748


In [26]:
!tail /data/dlrm/criteo/day_23

0	1	26437	22077
0	1	39502	45843
0	1	57945	47617
0	1	49756	39439
0	1	93145	47952
0	1	101697	22366
0	1	53529	55809
0	1	28231	38531
0	1	87930	39152
0	1	34217	56868


# Preprocessing with DLRM

In [27]:
%%bash
apt install psmisc
cd ./preproc
killall -9 java
bash ./prepare_dataset.sh

Reading package lists...
Building dependency tree...
Reading state information...
The following NEW packages will be installed:
  psmisc
0 upgraded, 1 newly installed, 0 to remove and 32 not upgraded.
Need to get 52.5 kB of archives.
After this operation, 266 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 psmisc amd64 23.1-1ubuntu0.1 [52.5 kB]
Fetched 52.5 kB in 2s (33.8 kB/s)
Selecting previously unselected package psmisc.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ...



debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
java: no process found
+ ls -ltrash
+ download_dir=/data/dlrm/criteo
+ ./verify_criteo_downloaded.sh /data/dlrm/criteo
++ download_dir=/data/dlrm/criteo
++ cd /data/dlrm/criteo
+++ seq 0 23
++ for i in $(seq 0 23)
++ filename=day_0
++ '[' -f day_0 ']'
++ echo 'day_0 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_1
++ '[' -f day_1 ']'
++ echo 'day_1 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_2
++ '[' -f day_2 ']'
++ echo 'day_2 exists, OK'
++ for i in $(seq 0 23)
++ filename=day_3
++ '[' -f day_3 ']'
++ echo 'day_3 exists, OK'
++ 

CalledProcessError: Command 'b'apt install psmisc\ncd ./preproc\nkillall -9 java\nbash ./prepare_dataset.sh\n'' returned non-zero exit status 1.

In [None]:
!cat /data/dlrm/binary_dataset/model_size.json


# Training

In [None]:
%run /workspace/dlrm/dlrm/scripts/main.py \
--mode train \
--dataset /data/dlrm/binary_dataset/ \
--num_numerical_features 1 \
--base_device cuda \
--lr 0.1 \
--embedding_type joint \
--epochs 10 \
--batch_size=8192 \
--save_checkpoint_path ./movie_lens_model

In [None]:
python  /workspace/dlrm/dlrm/scripts/main.py \
--mode train \
--dataset /data/dlrm/binary_dataset/ \
--num_numerical_features 1 \
--base_device cuda \
--lr 0.1 \
--embedding_type joint \
--epochs 10 \
--batch_size=8192 \
--save_checkpoint_path ./movie_lens_model