# Criteo Example 


In [1]:
# clear cache
!sync; echo 3 > /proc/sys/vm/drop_caches

In [2]:
import os
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

In [3]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel
import rmm
import cudf
import nvtabular as nvt
from nvtabular.ops import Normalize, FillMissing, Categorify, Moments, Median, Encoder, LogOp, ZeroFill
from nvtabular.torch_dataloader import FileItrDataset, DLCollator, DLDataLoader
import warnings

import matplotlib.pyplot as plt
%matplotlib inline

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [4]:
rmm.reinitialize(pool_allocator=True, initial_pool_size=0.8 * rmm.get_info().free)

0

# <h3> Dataset Gathering: Define files in the training and validation datasets. </h3>

In [5]:

# data_path = '/rapids/notebooks/jperez/Documents/ds-itr/examples/'
data_path = '/raid/criteo/tests/crit_int_pq/'
#df_test = 'test/'
df_valid = ''
df_train = ''
start = 0
split = 23
end = 24

train_days = [x for x in range(start, split)]
valid_days = [x for x in range(split, end)]
# print(train_days, valid_days)

train_set = [data_path + df_train + x for x in os.listdir(data_path + df_train) if  x.endswith('parquet') and int(x.split(".")[0].split('_')[1]) in train_days]
valid_set = [data_path + df_valid + x for x in os.listdir(data_path + df_valid) if  x.endswith('parquet') and int(x.split(".")[0].split('_')[1]) in valid_days]

In [6]:
train_set, valid_set

(['/raid/criteo/tests/crit_int_pq/day_5.parquet',
  '/raid/criteo/tests/crit_int_pq/day_19.parquet',
  '/raid/criteo/tests/crit_int_pq/day_2.parquet',
  '/raid/criteo/tests/crit_int_pq/day_22.parquet',
  '/raid/criteo/tests/crit_int_pq/day_16.parquet',
  '/raid/criteo/tests/crit_int_pq/day_3.parquet',
  '/raid/criteo/tests/crit_int_pq/day_8.parquet',
  '/raid/criteo/tests/crit_int_pq/day_17.parquet',
  '/raid/criteo/tests/crit_int_pq/day_7.parquet',
  '/raid/criteo/tests/crit_int_pq/day_12.parquet',
  '/raid/criteo/tests/crit_int_pq/day_13.parquet',
  '/raid/criteo/tests/crit_int_pq/day_10.parquet',
  '/raid/criteo/tests/crit_int_pq/day_15.parquet',
  '/raid/criteo/tests/crit_int_pq/day_9.parquet',
  '/raid/criteo/tests/crit_int_pq/day_18.parquet',
  '/raid/criteo/tests/crit_int_pq/day_14.parquet',
  '/raid/criteo/tests/crit_int_pq/day_20.parquet',
  '/raid/criteo/tests/crit_int_pq/day_1.parquet',
  '/raid/criteo/tests/crit_int_pq/day_11.parquet',
  '/raid/criteo/tests/crit_int_pq/day_

<h4>Grab column information</h4>

In [7]:
cont_names = ['I' + str(x) for x in range(1,14)]
cat_names =  ['C' + str(x) for x in range(1,27)]
cat_names, cont_names

(['C1',
  'C2',
  'C3',
  'C4',
  'C5',
  'C6',
  'C7',
  'C8',
  'C9',
  'C10',
  'C11',
  'C12',
  'C13',
  'C14',
  'C15',
  'C16',
  'C17',
  'C18',
  'C19',
  'C20',
  'C21',
  'C22',
  'C23',
  'C24',
  'C25',
  'C26'],
 ['I1',
  'I2',
  'I3',
  'I4',
  'I5',
  'I6',
  'I7',
  'I8',
  'I9',
  'I10',
  'I11',
  'I12',
  'I13'])

In [8]:
cols = ['label']  + cont_names + cat_names
cols

['label',
 'I1',
 'I2',
 'I3',
 'I4',
 'I5',
 'I6',
 'I7',
 'I8',
 'I9',
 'I10',
 'I11',
 'I12',
 'I13',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21',
 'C22',
 'C23',
 'C24',
 'C25',
 'C26']

<h3>Preprocessing:</h3> <p>Select operations to perform, create the Preprocessor object, create dataset iterator object and collect the stats on the training dataset</p>

In [9]:
%%time
proc = nvt.Workflow(cat_names=cat_names, cont_names=cont_names, label_name=['label'], to_cpu=False)

CPU times: user 29 µs, sys: 19 µs, total: 48 µs
Wall time: 58.9 µs


In [10]:
%%time
proc.add_cont_feature([ZeroFill(replace=True), LogOp(replace=True)])
proc.add_cont_preprocess(Normalize(replace=True))
proc.add_cat_preprocess(Categorify(replace=True, use_frequency=True, freq_threshold=15))

CPU times: user 63 µs, sys: 40 µs, total: 103 µs
Wall time: 114 µs


In [11]:
%%time
trains_itrs = nvt.dataset(train_set, engine='parquet', gpu_memory_frac=0.4)
valids_itrs = nvt.dataset(valid_set, engine='parquet', gpu_memory_frac=0.4)

CPU times: user 14 µs, sys: 9 µs, total: 23 µs
Wall time: 33.4 µs


In [12]:
out = '/raid/criteo/tests/demo_out'
output_train = os.path.join(out, 'train/')
output_valid = os.path.join(out, 'valid/')

In [13]:
%%time 
proc.apply(trains_itrs, apply_offline=True, record_stats=True, shuffle=True, output_path=output_train, num_out_files=35)

CPU times: user 17min 18s, sys: 9min 44s, total: 27min 3s
Wall time: 21min 51s


In [14]:
%%time
proc.apply(valids_itrs, apply_offline=True, record_stats=False, shuffle=True, output_path=output_valid, num_out_files=35)

CPU times: user 27.4 s, sys: 18.6 s, total: 46 s
Wall time: 33.9 s


<h4>Preprocessing Complete<4>
    
    
    
    

<br><br>

<h3>Model Setup<h3>

In [None]:
new_train_set = [os.path.join(output_train, x) for x in os.listdir(output_train) if x.endswith("parquet")]
new_valid_set = [os.path.join(output_valid, x) for x in os.listdir(output_valid) if x.endswith("parquet")]

In [None]:
rmm.reinitialize(pool_allocator=False)

<h5>Gather embeddings using statistics gathered in the Read phase.</h5>

In [None]:
embeddings = [x[1] for x in proc.df_ops['Categorify'].get_emb_sz(proc.stats["categories"], proc.columns_ctx['categorical']['base'])]

<h5>Create the file iterators using the FileItrDataset Class.</h5>

In [None]:
%%time
t_batch_sets = [FileItrDataset(x, names=cols, engine='parquet', batch_size=1600000, sep="\t") for x in new_train_set]
v_batch_sets = [FileItrDataset(x, names=cols, engine='parquet', batch_size=1600000, sep="\t") for x in new_valid_set]

In [None]:
%%time
t_chain = torch.utils.data.ChainDataset(t_batch_sets)
v_chain = torch.utils.data.ChainDataset(v_batch_sets)

<h5>Use the Deep Learning Collator to create a collate function to pass to the dataloader.</h5>

In [None]:
%%time
dlc = DLCollator(preproc=proc, apply_ops=False)

In [None]:
%%time
t_data = DLDataLoader(t_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)
v_data = DLDataLoader(v_chain, collate_fn=dlc.gdf_col, pin_memory=False, num_workers=0)

<h4>After creating the Dataloaders you can leverage fastai framework to create Machine Learning models</h4>

In [None]:
databunch = DataBunch(t_data, v_data, collate_fn=dlc.gdf_col, device="cuda")

In [None]:
%%time
model = TabularModel(emb_szs = embeddings, n_cont=len(cont_names), out_sz=2, layers=[512,256])

learn =  Learner(databunch, model, metrics=[accuracy])
learn.loss_func = torch.nn.CrossEntropyLoss()


In [None]:
learn.lr_find()

In [None]:
learn.recorder.plot(show_moms=True, suggestion=True)

In [None]:
learning_rate = 1.32e-2
epochs = 1

In [None]:
start = time()
learn.fit_one_cycle(epochs,learning_rate)
t_final = time() - start 

In [None]:
t_final

#### 