In [1]:
import os
# Expose a single physical GPU to this process. This is set here, in the
# first cell, ahead of the cell that imports cudf.
GPU_id = 3
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(GPU_id)

# This notebook must be run with cudf 0.7

In [2]:
import warnings
warnings.filterwarnings("ignore")
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Global

In [3]:
# Elapsed wall-clock seconds of each benchmark step, keyed by step name.
GPU_RUN_TIME = dict()
# Step names, appended as each step is announced.
STEPS = list()

### Functions

In [4]:
def on_gpu(words,func,arg=None,dtype=np.int64):
    """Call a method of ``words`` that writes its result into a device buffer.

    A device array with ``words.size()`` elements is allocated through
    ``librmm.device_array``; the method named ``func`` is then called on
    ``words`` with the raw device pointer of that array as its last
    positional argument.

    Parameters
    ----------
    words : object
        Must expose ``size()`` and the method named by ``func``
        (presumably an nvstrings instance -- confirm against callers).
    func : str
        Name of the method to call on ``words``. It is looked up with
        ``getattr``; it is never evaluated as source code.
    arg : object, optional
        Extra positional argument passed before the device pointer.
        ``None`` means the method is called with the pointer only.
    dtype : numpy dtype, optional
        Element type of the output buffer.

    Returns
    -------
    The device array that the method was asked to fill.

    Raises
    ------
    AttributeError
        If ``words`` has no attribute named ``func``.
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    # Plain attribute lookup instead of building and eval()-ing a source
    # string: same call, but ``func`` can no longer inject arbitrary code.
    method = getattr(words, func)
    out_ptr = res.device_ctypes_pointer.value
    if arg is None:
        method(out_ptr)
    else:
        method(arg, out_ptr)
    return res

### Read data

In [5]:
# Make sure the local cache directory exists. exist_ok avoids the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs('cache', exist_ok=True)

In [6]:
# Directory (relative to the working directory) that holds train.csv,
# test.csv and submission_popular.csv. The read cell formats file names as
# '%s/<file>', so the trailing slash here yields a doubled separator.
path = '../../../../data/'

In [7]:
step = 'read csv'
# Register the step only once so that re-running this cell does not add a
# duplicate entry to STEPS.
if step not in STEPS:
    STEPS.append(step)

### cudf read csv

In [8]:
%%time
start = time.time()
# Read the three CSV files with cudf's reader.
train_gd = gd.read_csv('%s/train.csv'%path)
test_gd = gd.read_csv('%s/test.csv'%path)
submission_gd = gd.read_csv('%s/submission_popular.csv'%path)
print("train & test",train_gd.shape,test_gd.shape)
# Concatenate train and test so later transforms run once over both.
data_gd = gd.concat([train_gd,test_gd])
print('combined',data_gd.shape)
# Elapsed wall-clock seconds for this step; covers the reads, the concat and
# the prints above.
GPU_RUN_TIME[step] = time.time() - start

train & test (15932992, 12) (3782335, 12)
combined (19715327, 12)
CPU times: user 3.25 s, sys: 1.66 s, total: 4.91 s
Wall time: 4.89 s


In [9]:
# Both frames now live on inside data_gd; drop the separate references.
del train_gd
del test_gd

### Only keep click out rows

In [10]:
step = 'string comparsion and masking'
# Register the step only once so that re-running this cell does not add a
# duplicate entry to STEPS.
if step not in STEPS:
    STEPS.append(step)

### cudf string comparison and masking.

In [11]:
%%time
start = time.time()
# Wrap the action_type column's string data in an nvstrings object so it can
# be compared against a literal.
action_type = nvstrings.from_strings(data_gd['action_type'].data)
# Store the per-row comparison result as a temporary column.
data_gd['is_click_out'] = action_type.compare('clickout item')
del action_type
data_gd['is_click_out'] = data_gd['is_click_out']==0 # 0 means string match
data_gd['is_click_out'] = data_gd['is_click_out'].astype('bool')
# Keep only the rows whose action_type matched 'clickout item'.
data_gd = data_gd[data_gd['is_click_out']]

# The helper column has served its purpose.
data_gd.drop_column('is_click_out')
print("# of clickouts:",data_gd.shape[0])
# Flag click-outs whose reference is null.
data_gd['clickout_missing'] = data_gd['reference'].isnull()

print('true test',data_gd[data_gd['clickout_missing']].shape)
print(submission_gd.shape[0])
# The number of flagged rows must equal the number of rows in the submission
# file.
assert submission_gd.shape[0] == data_gd[data_gd['clickout_missing']].shape[0]
print('true test shape match submission shape')
# Elapsed wall-clock seconds for this step, including the prints and the
# assert above.
GPU_RUN_TIME[step] = time.time() - start

# of clickouts: 2115365
true test (253573, 13)
253573
true test shape match submission shape
CPU times: user 5.75 s, sys: 2.13 s, total: 7.87 s
Wall time: 7.87 s


In [12]:
%%time
# Give every remaining click-out row a dense 0..n-1 identifier; later cells
# use it as the join key.
data_gd['row_id'] = np.arange(len(data_gd))

CPU times: user 7.43 ms, sys: 3.76 ms, total: 11.2 ms
Wall time: 9.58 ms


### Create recommendation list from `impressions`

In [13]:
step = 'string column split & expand'
# Register the step only once so that re-running this cell does not add a
# duplicate entry to STEPS.
if step not in STEPS:
    STEPS.append(step)

### cudf string column split and expand

In [14]:
%%time
start = time.time()
# Split both pipe-delimited string columns into per-position columns
# (the original note says this gives 25 columns each).
candidates_gd, prices_gd = (
    data_gd[name].data.split('|') for name in ('impressions', 'prices')
)
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 657 ms, sys: 489 ms, total: 1.15 s
Wall time: 1.15 s


In [15]:
%%time
# The pipe-delimited source columns were split in the previous step; remove
# them from the frame.
for raw_col in ('impressions', 'prices'):
    data_gd.drop_column(raw_col)

CPU times: user 3.6 ms, sys: 687 µs, total: 4.29 ms
Wall time: 2.86 ms


### Assign string columns to dataframe

In [16]:
step = 'assign string columns to dataframe'
# Register the step only once so that re-running this cell does not add a
# duplicate entry to STEPS.
if step not in STEPS:
    STEPS.append(step)

In [17]:
%%time
start = time.time()
# Start from a host-side frame that carries only row_id, then copy each split
# column back from the device as item_<k> / price_<k>.
data_gd_rec_list = data_gd[['row_id']].to_pandas()
for slot, slot_items in enumerate(candidates_gd):
    data_gd_rec_list['item_%d'%slot] = slot_items.to_host()
    data_gd_rec_list['price_%d'%slot] = prices_gd[slot].to_host()
data_gd_rec_list = data_gd_rec_list.set_index('row_id')
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 21.4 s, sys: 3 s, total: 24.4 s
Wall time: 24.4 s


### Create data pair

In [18]:
step = 'create data pair'
# Register the step only once so that re-running this cell does not add a
# duplicate entry to STEPS.
if step not in STEPS:
    STEPS.append(step)

### cudf create data pair
For functionality that cudf does not support yet, such as `stack`, we fall back to pandas.

In [19]:
%%time
start = time.time()

# Long format for the candidates: one row per (row_id, item_<k> column).
cols = [i for i in data_gd_rec_list.columns if i.startswith('item_')]
items = data_gd_rec_list[cols].stack().reset_index()
items.columns = ['row_id','candidate_order','item_id']
# Resolve each distinct 'item_<k>' label to k once, so the conversion below is
# a lookup rather than a Python-level split on every row.
order_of_label = {label: int(label.split('_')[1]) for label in cols}

# Same reshaping for the price_<k> columns.
cols = [i for i in data_gd_rec_list.columns if i.startswith('price_')]
prices = data_gd_rec_list[cols].stack().reset_index()
prices.columns = ['row_id','candidate_order','price']

# Prices are attached to items purely by position, so both long frames must
# have the same number of rows; fail loudly instead of misaligning silently.
assert len(items) == len(prices), 'item and price candidates are misaligned'
items['price'] = prices['price'].astype(int)
items['candidate_order'] = items['candidate_order'].map(order_of_label).astype(int)

# Keep only click-outs that offer more than one candidate.
count = items['row_id'].value_counts()
items['row_id_count'] = items['row_id'].map(count)
items = items[items['row_id_count']>1]
items_gd = gd.from_pandas(items)

data_gd['clickout_missing'] = data_gd['clickout_missing'].astype(int)

# Record the time spent so far so this step has an entry even if the merge
# cell is not run; that cell reuses `start` and overwrites this value.
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 1min 2s, sys: 7.95 s, total: 1min 10s
Wall time: 1min 10s


In [20]:
# Sanity check: shape of the click-out frame at this point in the notebook.
data_gd.shape

(2115365, 12)

In [21]:
# Sanity check: number of click-outs that are flagged as missing a reference.
data_gd[data_gd['clickout_missing']==1].shape[0]

253573

In [None]:
# Attach the click-out columns to every candidate row via row_id.
data_pair_gd = items_gd.merge(data_gd, how='left', on='row_id')

# Cast both ids to integers so they can be compared directly.
# NOTE(review): rows flagged clickout_missing have a null reference; confirm
# how astype(int) treats them.
data_pair_gd['item_id'] = data_pair_gd['item_id'].astype(int)
data_pair_gd['reference'] = data_pair_gd['reference'].astype(int)
# target is 1 when the candidate is the referenced item, otherwise 0.
data_pair_gd['target'] = (data_pair_gd['reference'] == data_pair_gd['item_id']).astype(int)

# `start` was set in the previous timed cell, so this interval spans
# everything executed since then.
GPU_RUN_TIME[step] = time.time() - start

In [None]:
%%time
# Rows with clickout_missing == 0 go to training/validation; the remaining
# rows form the test set.
train_pair_gd = data_pair_gd[data_pair_gd.clickout_missing==0]
test_pair_gd = data_pair_gd[data_pair_gd.clickout_missing>0]
# Drop the reference to the combined frame once both subsets exist.
del data_pair_gd
# Hold out rows whose row_id is a multiple of 5 for validation. Rows sharing
# a row_id therefore always land in the same split.
train_pair_gd['is_va'] = train_pair_gd.row_id%5 == 0
train_pair = train_pair_gd[train_pair_gd.is_va==0]
valid_pair = train_pair_gd[train_pair_gd.is_va>0]
del train_pair_gd
# is_va was only needed to make the split.
train_pair = train_pair.drop(['is_va'])
valid_pair = valid_pair.drop(['is_va'])
# NOTE(review): these outputs are written under '../cache/', whereas the
# setup cell near the top creates 'cache' in the working directory -- confirm
# which location is intended.
valid_pair.to_parquet('../cache/valid/')
test_pair_gd.to_parquet('../cache/test/')
# NOTE(review): the [:] slice looks redundant -- confirm before removing it.
train_pair[:].to_parquet('../cache/train/')

### Visualize the timing

In [None]:
# Display the elapsed seconds recorded for each step.
GPU_RUN_TIME