In [1]:
import os
# Index of the GPU this kernel is allowed to see. The variable is exported
# in this first cell, i.e. before the cell that imports torch.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'
import warnings
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings(action="ignore")
%matplotlib inline


In [2]:
import torch
import pandas as pd
import numpy as np
from time import time 

from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel

import cudf

from preproc import *
from batchloader import *
from helpers import get_mean_reciprocal_rank, roc_auc_score

- In this notebook we benchmark the processing and training time of three different models:

- The first two models use our CuDF processing workflow <a href=#cudf_workflow> section I </a>:
     1.  <a href=#first_model> Model 1 </a> : CuDF processing with a CPU copy
     2.  <a href=#second_model> Model 2 </a> : CuDF processing in-memory without copy    

           
 - <a href=#third_model> Model 3 </a> : In <a href=#fastai_workflow> section II </a>, we use the Fastai processing workflow to get the scores of the best model found in section I. We process the data_pair_all.pkl dataframe and create the databunch from it directly.

**N.B** : For each model, you need to re-start the kernel to free the GPU memory and be able to run all the experiments 

In [3]:
%load_ext snakeviz
# load snakeviz if you want to run profiling 

<h1> <center> <a id=batchdatabunch>New Data Bunch </a></center> </h1> 

### Define a custom fastai DataBunch that takes a TensorBatchDataLoader instead of the usual torch DataLoader

In [4]:
class BatchDataBunch(DataBunch):
    """DataBunch variant whose loaders are `BatchDataLoader`s over `TensorBatchDataset`s.

    All state (`tfms`, `device`, `path`, `dls`, `train_dl`, `valid_dl`,
    `test_dl`, `empty_val`) is stored on the class itself and `create`
    returns the class, not an instance. Calling `create` again therefore
    overwrites the configuration left by the previous call.
    """

    @classmethod
    def remove_tfm(cls, tfm:Callable)->None:
        "Remove `tfm` from `cls.tfms` if present (`cls.tfms` is set by `create`)."
        if tfm in cls.tfms: cls.tfms.remove(tfm)

    @classmethod
    def add_tfm(cls,tfm:Callable)->None:
        "Append `tfm` to `cls.tfms` (`cls.tfms` is set by `create`)."
        cls.tfms.append(tfm)

    @classmethod
    def create(cls, train_ds, valid_ds, test_ds=None, path:PathOrStr='.', bs:int=64, val_bs=None,
               num_workers:int=defaults.cpus, device:torch.device=None,
               collate_fn:Callable=data_collate, tfms: List[Callable]=None,
               size:int=None, **kwargs)->'BatchDataBunch':
        """Configure the class with batch loaders for the given datasets and return it.

        Parameters
        ----------
        train_ds, valid_ds, test_ds
            Inputs handed to `TensorBatchDataset`. `valid_ds` and `test_ds`
            may be None, in which case the matching loader is None.
        path
            Stored on the class as `Path(path)`.
        bs
            Batch size of the training dataset.
        val_bs
            Batch size of the validation and test datasets; falls back to `bs`.
        device
            Device passed to every `BatchDataLoader`; falls back to
            `defaults.device`.
        tfms
            Stored as a list in `cls.tfms`.
        num_workers, collate_fn, size, **kwargs
            Accepted but not used by this implementation.

        Returns
        -------
        The class itself (`cls`), with `train_dl`, `valid_dl`, `test_dl`,
        `dls`, `device`, `path`, `tfms` and `empty_val` set as class attributes.
        """
        cls.tfms = listify(tfms)
        val_bs = ifnone(val_bs, bs)
        cls.device = defaults.device if device is None else device
        cls.path = Path(path)
        cls.empty_val = valid_ds is None

        def _loader(ds, batch_size, shuffle):
            # Wrap the raw tensors in a batch dataset and a loader on the chosen device.
            return BatchDataLoader(TensorBatchDataset(ds, batch_size=batch_size), shuffle=shuffle,
                                   pin_memory=False, drop_last=False, device=cls.device)

        # Only the training loader is shuffled. Validation and test honour
        # `val_bs` (previously it was computed but `bs` was used everywhere).
        cls.train_dl = _loader(train_ds, bs, True)
        # A missing split gets no loader at all, rather than a dataset wrapped
        # around None; the attribute is reset to None so that no loader from an
        # earlier `create` call survives.
        cls.valid_dl = None if valid_ds is None else _loader(valid_ds, val_bs, False)
        cls.test_dl = None if test_ds is None else _loader(test_ds, val_bs, False)

        cls.dls = [dl for dl in (cls.train_dl, cls.valid_dl, cls.test_dl) if dl is not None]

        assert not isinstance(cls.dls[0],DeviceDataLoader)

        return cls
    


- To use the new BatchDataBunch class, we have to build the following processed tensors (using cudf):
    - train : cat_tensor, cont_tensor, label_tensor 
    
    - valid : cat_tensor, cont_tensor, label_tensor 
    
    - test : cat_tensor, cont_tensor, label_tensor 
    
- The vocabulary size of each categorical variable needs to be known

<h1> <center>  <a id=fastai_workflow> Test of Tabular Learner with Fastai workflow </a></center> </h1>

- As the processing takes more than 6 minutes and our purpose is to benchmark the best model found with our proposed workflow against the Fastai workflow, we directly compute the scores of the Tabular model with a batch size of 204800 and a learning rate of 0.09.

<h3> <a id=third_model> Fastai model </a> </h3> 

In [5]:
# 204800 rows per batch, later passed as `bs` when the databunch is built.
batch_size = 50 * 4096

In [6]:
from fastai import *
from fastai.basic_data import *
from fastai.basic_data import *
from fastai.tabular import *
from fastai.basic_data import DataBunch
from batchloader import *

In [7]:
# Directory containing the input data; joined with 'data_pair_all.pkl' in the
# loading cell (the file read from it is a pickle, despite the folder name).
data_path = './parquet_data/'

In [8]:
%%time
import pandas as pd
# Load the full pair-level dataframe from its pickle file.
# NOTE(review): unpickling executes whatever the file contains, so this file
# must come from a trusted source.
path = os.path.join(data_path,'data_pair_all.pkl' )
ds = pd.read_pickle(path)

CPU times: user 11.4 s, sys: 16 s, total: 27.4 s
Wall time: 27.4 s


In [9]:
# Display the first two rows of the loaded frame.
ds.head(2)

Unnamed: 0,row_id,candidate_order,item_id,price,row_id_count,item_count,user_id,session_id,timestamp,step,...,delta_last_viewed_item_step_interaction item deals,delta_last_viewed_item_timestamp_interaction item deals,price_rank,price_rank_norm,item_count_rank,item_count_rank_norm,count_item_user_id_session_id_rank,count_item_user_id_session_id_rank_norm,count_item_user_id_rank,count_item_user_id_rank_norm
0,461,2,1812701,44,25,177,9Z8H0R5BPH3H,b0d46e23f4544,1541072329,2,...,,,5,0.2,3,0.12,0,0.0,0,0.0
1,461,3,5164712,70,25,43,9Z8H0R5BPH3H,b0d46e23f4544,1541072329,2,...,,,14,0.56,15,0.6,1,0.04,1,0.04


<h3> Create pre-processed databunch </h3> 

In [10]:
%%time
# Split on the `clickout_missing` flag: rows where it is 0 form the training
# set, rows where it is positive form the test set.
train = ds.loc[ds['clickout_missing'] == 0]
test = ds.loc[ds['clickout_missing'] > 0]
print(train.shape,test.shape)

# Names of the categorical and continuous variables.
# Categorical: a fixed list plus every column whose name starts with 'is_'.
cat_names = ['user_id', 'item_id', 'platform', 'city', 'device', 'current_filters']
cat_names.extend(col for col in train.columns if col.startswith('is_'))

# Continuous: price, candidate order and every count / rank / delta feature.
cont_names = ['price', 'candidate_order']
cont_names.extend(
    col for col in train.columns
    if col.startswith(('count', 'delta_')) or 'rank' in col
)

# Validation rows: every row whose row_id is a multiple of 5. The column is
# added only after `cat_names` was built, so the 'is_' prefix filter above
# does not pick it up.
train['is_va'] = (train['row_id'] % 5) == 0

# The full frame is no longer referenced after the split.
del ds

(42756036, 46) (5762533, 46)
CPU times: user 6.98 s, sys: 6.93 s, total: 13.9 s
Wall time: 13.9 s


In [11]:
%%time
# Preprocessing steps attached to the training TabularList.
procs = [FillMissing, Categorify, Normalize]

# Arguments shared by the train and test lists.
list_kwargs = dict(path='./', cat_names=cat_names, cont_names=cont_names)

test_list = TabularList.from_df(test, **list_kwargs)

# Train list -> split on the boolean 'is_va' column -> label with 'target'
# -> attach the test list -> build the databunch on the GPU.
data = (
    TabularList.from_df(train, procs=procs, **list_kwargs)
    .split_from_df('is_va')
    .label_from_df(cols='target')
    .add_test(test_list)
    .databunch(num_workers=8, bs=batch_size, device='cuda')
)

CPU times: user 3min 31s, sys: 2min 34s, total: 6min 5s
Wall time: 5min 52s


<h3> Compute average validation scores of the best model  </h3> 

In [12]:
## Mean / std of scores : 5 runs
# One entry per run is appended to each list; the next cell summarises them.
aucs, mrrs, times = [], [], []
best_bs = 4096*50
best_lr = 9e-2

# Embedding sizes passed to TabularModel as `emb_szs`.
# NOTE(review): presumably one (cardinality, width) pair per entry of
# `cat_names`, in the same order - confirm against the processed data.
emb_sz = [
    (938604, 16), (903867, 16), (56, 4), (32763, 8), (4, 1), (27842, 8),
    (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3), (3, 3),
]

for i in range(5):
    # A fresh model and learner are built on every iteration.
    model = TabularModel(emb_szs=emb_sz, n_cont=len(cont_names), out_sz=2, layers=[64, 32]).cuda()
    learn = Learner(data, model, metrics=None)
    learn.loss_func = torch.nn.CrossEntropyLoss()

    # Only the fit_one_cycle call is timed.
    start = time()
    learn.fit_one_cycle(1, best_lr)
    tf = time() - start

    # Validation predictions; column 1 is used as the score of each row.
    yp, y_valid = learn.get_preds()
    prob_pos = yp.numpy()[:, 1]

    # Attach the scores to the validation rows and order them by row_id,
    # then by score, both descending.
    cv = train.loc[train['is_va'] > 0, ['row_id', 'reference', 'item_id', 'target']].copy()
    cv['prob'] = prob_pos
    cv = cv.sort_values(by=['row_id', 'prob'], ascending=False)

    auc = roc_auc_score(y_valid.numpy().ravel(), prob_pos)
    mean_reciprocal_rank = get_mean_reciprocal_rank(cv)

    aucs.append(auc)
    mrrs.append(mean_reciprocal_rank)
    times.append(tf)
    

In [13]:
# Report mean +/- standard deviation over the runs for each collected metric.
for template, values in (
    ("the mrr of the best mdodel is: %s +/- %s", mrrs),
    ("the auc of the best mdodel is: %s +/- %s", aucs),
    ("the best mdodel's training time is %s +/- %s", times),
):
    print(template % (np.mean(values), np.std(values)))

the mrr of the best mdodel is: 0.6113445716562136 +/- 0.0004404710098267076
the auc of the best mdodel is: 0.8788439595236509 +/- 0.003043091013816776
the best mdodel's training time is 366.2606031894684 +/- 3.395267576536019


In [None]:
cp /s