In [1]:
import os
GPU_id = 6
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

In [2]:
import warnings
warnings.filterwarnings("ignore")

import cudf as gd
import pandas as pd
import numpy as np
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Global

In [3]:
GPU_RUN_TIME = {}
CPU_RUN_TIME = {}
STEPS = []

### Functions

In [4]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    res = librmm.device_array(words.size(), dtype=dtype)
    if arg is None:
        cmd = 'words.%s(devptr=res.device_ctypes_pointer.value)'%(func)
    else:
        cmd = 'words.%s(arg,devptr=res.device_ctypes_pointer.value)'%(func)
    eval(cmd)
    return res

def count_items(data,cols):
    dg = data.groupby(cols+['item_id'],
            as_index=False).agg({'step':['count']})
    if len(cols) == 0:
        tag = 'global'
    else:
        tag = '_'.join(cols)
    dg.columns = cols + ['item_id', 'count_item_%s'%tag]

    if len(cols):
        df = data.groupby(cols,
            as_index=False).agg({'step':['count']})
        df.columns = cols + ['count_item_%s_all'%tag]
    
        dg = dg.merge(df,on=cols,how='left')
        dg['count_item_%s_norm'%tag] = dg['count_item_%s'%tag] / dg['count_item_%s_all'%tag]
    
        dg = dg.drop('count_item_%s_all'%tag,axis=1)
        del df
    return dg

### Read csv data

In [5]:
if os.path.exists('cache')==False:
    os.mkdir('cache')

In [6]:
path = '/datasets/trivago/data/'

### cudf read csv

In [7]:
%%time
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'reference']

train_gd = gd.read_csv('%s/train.csv'%path,usecols=cols)
test_gd = gd.read_csv('%s/test.csv'%path,usecols=cols)
data_gd = gd.concat([train_gd,test_gd])


data_gd = data_gd[cols]
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'item_id']
data_gd.columns = cols

print('combined',data_gd.shape)

combined (19715327, 6)
CPU times: user 2.67 s, sys: 1.68 s, total: 4.35 s
Wall time: 6.27 s


### pandas read csv

In [8]:
%%time
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'reference']

train_pd = pd.read_csv('%s/train.csv'%path,usecols=cols)
test_pd = pd.read_csv('%s/test.csv'%path,usecols=cols)
submission_pd = pd.read_csv('%s/submission_popular.csv'%path)
data_pd = pd.concat([train_pd,test_pd])

data_pd = data_pd[cols]
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'item_id']
data_pd.columns = cols

print('combined',data_pd.shape)

combined (19715327, 6)
CPU times: user 22.6 s, sys: 5.28 s, total: 27.9 s
Wall time: 27.9 s


In [9]:
%%time
cols = ['user_id','session_id','item_id','city','device']
data_pair_pd = pd.read_parquet('cache/data_pair.parquet')

CPU times: user 21 s, sys: 10.4 s, total: 31.4 s
Wall time: 11.3 s


In [10]:
%%time
cols = ['user_id','session_id','item_id','city','device']
data_pair_gd = gd.read_parquet('cache/data_pair.parquet')

CPU times: user 376 ms, sys: 776 ms, total: 1.15 s
Wall time: 1.89 s


In [11]:
del train_gd,test_gd
del train_pd,test_pd

### Only keep interaction rows

In [12]:
step = 'contrain string'
STEPS.append(step)

### cudf find string within string

In [13]:
%%time
start = time.time()

data_gd['is_interaction'] = on_gpu(data_gd['action_type'].data,'contains',
                                          arg='interaction',dtype=np.bool)
data_interaction_gd = data_gd[data_gd['is_interaction']]
data_interaction_gd['item_id'] = data_interaction_gd['item_id'].astype(int)
print(data_interaction_gd['item_id'].unique().shape)
GPU_RUN_TIME[step] = time.time() - start

(268470,)
CPU times: user 1.8 s, sys: 792 ms, total: 2.59 s
Wall time: 3.3 s


### pandas find string within string

In [14]:
%%time
start = time.time()

data_pd['is_interaction'] = data_pd['action_type'].apply(lambda x: x.startswith('interaction'))
data_interaction_pd = data_pd[data_pd['is_interaction']]
bad = np.array(['unknown','Miyako Airport','Shinjuku Station','Lower Manhattan','Estació de Sants'])
mask = data_interaction_pd['item_id'].isin(bad)
data_interaction_pd = data_interaction_pd.loc[~mask]
data_interaction_pd['item_id'] = data_interaction_pd['item_id'].fillna(-1).astype(int)
print(data_interaction_pd['item_id'].unique().shape)
CPU_RUN_TIME[step] = time.time() - start

(271344,)
CPU times: user 11.1 s, sys: 2.04 s, total: 13.1 s
Wall time: 13.1 s


### count items with/without other columns

In [15]:
step = 'count items'
STEPS.append(step)

In [16]:
%%time
start = time.time()
count_user_session_gd = count_items(data_interaction_gd,cols=['user_id','session_id'])
count_user_session_city_gd = count_items(data_interaction_gd,cols=['user_id','session_id','city'])
count_user_city_gd = count_items(data_interaction_gd,cols=['user_id','city'])
count_city_gd = count_items(data_interaction_gd,cols=['city'])
count_global_gd = count_items(data_interaction_gd,cols=[])
GPU_RUN_TIME[step] = time.time() - start

CPU times: user 1.14 s, sys: 1.4 s, total: 2.53 s
Wall time: 5.29 s


In [17]:
%%time
start = time.time()
count_user_session_pd = count_items(data_interaction_pd,cols=['user_id','session_id'])
count_user_session_city_pd = count_items(data_interaction_pd,cols=['user_id','session_id','city'])
count_user_city_pd = count_items(data_interaction_pd,cols=['user_id','city'])
count_city_pd = count_items(data_interaction_pd,cols=['city'])
count_global_pd = count_items(data_interaction_pd,cols=[])
CPU_RUN_TIME[step] = time.time() - start

CPU times: user 34.9 s, sys: 10.3 s, total: 45.2 s
Wall time: 45.2 s


### Merge the encoding to pair

In [18]:
step = 'merge'
STEPS.append(step)

### cudf merge

In [None]:
%%time
start = time.time()
data_pair_gd = data_pair_gd.merge(count_user_session_gd,on=['user_id','session_id','item_id'],how='left')
del count_user_session_gd
data_pair_gd = data_pair_gd.merge(count_user_session_city_gd,on=['user_id','session_id','city','item_id'],how='left')
del count_user_session_city_gd
data_pair_gd = data_pair_gd.merge(count_user_city_gd,on=['user_id','city','item_id'],how='left')
del count_user_city_gd
data_pair_gd = data_pair_gd.merge(count_city_gd,on=['city','item_id'],how='left')
del count_city_gd
GPU_RUN_TIME[step] = time.time() - start

### pandas merge

In [20]:
%%time
start = time.time()
data_pair_pd = data_pair_pd.merge(count_user_session_pd,on=['user_id','session_id','item_id'],how='left')
data_pair_pd = data_pair_pd.merge(count_user_session_city_pd,on=['user_id','session_id','city','item_id'],how='left')
data_pair_pd = data_pair_pd.merge(count_user_city_pd,on=['user_id','city','item_id'],how='left')
data_pair_pd = data_pair_pd.merge(count_city_pd,on=['city','item_id'],how='left')
CPU_RUN_TIME[step] = time.time() - start

CPU times: user 1min 38s, sys: 55.1 s, total: 2min 33s
Wall time: 2min 33s


#### Save to Parquet for Next Step
If you wish to add more features please continue to STEPS 3 & 4

In [21]:
data_pair_pd.to_parquet('cache/data_pair_step_2.parquet')

#### Save to Parquet for Model
The current data frame is model ready and can be exported. Will export dataframes to top level directory for use with model.

In [22]:
train_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']==0]
test_pair_pd = data_pair_pd[data_pair_pd['clickout_missing']>0]
train_pair_pd['is_va'] = train_pair_pd.row_id%5 == 0
valid_pair_pd = train_pair_pd[train_pair_pd['is_va']==0]
train_pair_pd = train_pair_pd[train_pair_pd['is_va']>0]
valid_pair_pd.drop(columns=['is_va'])
train_pair_pd.drop(columns=['is_va'])
valid_pair_pd.to_parquet('../../cache/valid.parquet')
train_pair_pd.to_parquet('../../cache/train.parquet')
test_pair_pd.to_parquet('../../cache/test.parquet')

### Visualize the timing

In [23]:
GPU_RUN_TIME['Overall'] = sum([GPU_RUN_TIME[i] for i in STEPS])
CPU_RUN_TIME['Overall'] = sum([CPU_RUN_TIME[i] for i in STEPS])
STEPS.append('Overall')

timing = pd.DataFrame()
timing['step'] = STEPS
timing['GPU'] = [GPU_RUN_TIME[i] for i in STEPS]
timing['CPU'] = [CPU_RUN_TIME[i] for i in STEPS]
timing

KeyError: 'merge'

In [None]:
speedup = [CPU_RUN_TIME[i]/GPU_RUN_TIME[i] for i in STEPS]
df = pd.DataFrame({'steps':STEPS, 'speedup':speedup})
ax = df.plot.bar(x='steps', y='speedup', figsize=(20,5), fontsize=15, title='GPU Speedup',grid=True)

In [None]:
timing.set_index('step').plot.bar(figsize=(20,5),grid=True,fontsize=15, title='Running time: seconds')