In [1]:
import os
# Restrict this process to a single GPU. This is set before the GPU libraries
# are imported in the next cell so they only see the selected device.
# NOTE(review): device index 6 is specific to the original machine -- adjust as needed.
GPU_id = 6
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

In [2]:
import warnings
warnings.filterwarnings("ignore")

import cudf as gd
import cupy as cp
import pandas as pd
import numpy as np
import os
import time
import nvstrings
import matplotlib.pyplot as plt
%matplotlib inline

### Global

In [3]:
# Elapsed seconds per benchmark step, keyed by step name.
# Only CPU_RUN_TIME is filled in by the cells of this notebook.
GPU_RUN_TIME = {}
CPU_RUN_TIME = {}
# Step names in the order they are registered; used to build the timing table.
STEPS = []

### Functions

In [4]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call a method of ``words`` that writes its result into a device buffer.

    Parameters
    ----------
    words : object
        Must expose ``size()`` and the method named by ``func``
        (presumably an ``nvstrings`` instance -- TODO confirm).
    func : str
        Name of the method to invoke on ``words``.
    arg : optional
        Single positional argument forwarded to the method; it is omitted
        from the call when ``None``.
    dtype : numpy dtype
        Element type of the output buffer.

    Returns
    -------
    cupy.ndarray
        One-dimensional array of ``words.size()`` elements whose device
        address was handed to the method as ``devptr``.
    """
    # cp.array(n) would create a 0-d array holding the value n; the method
    # needs a buffer with n elements to write into.
    res = cp.empty(words.size(), dtype=dtype)
    # CuPy exposes the raw device address as ndarray.data.ptr.
    devptr = res.data.ptr
    # Look the method up by name instead of eval()-ing a built string.
    method = getattr(words, func)
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def count_items(data,cols):
    """Count rows per ``item_id`` within each group defined by ``cols``.

    Parameters
    ----------
    data : DataFrame with ``item_id``, ``step`` and every column in ``cols``.
    cols : list of str
        Grouping columns; an empty list produces global per-item counts.

    Returns
    -------
    DataFrame with ``cols + ['item_id', 'count_item_<tag>']`` where ``<tag>``
    is the grouping columns joined by ``_`` (``global`` for an empty list).
    When ``cols`` is non-empty a ``count_item_<tag>_norm`` column is added:
    the item count divided by the total count of its group.
    """
    tag = '_'.join(cols) if len(cols) != 0 else 'global'
    count_col = 'count_item_%s'%tag

    # Occurrences of each item inside each group.
    per_item = data.groupby(cols+['item_id'],
            as_index=False).agg({'step':['count']})
    per_item.columns = cols + ['item_id', count_col]

    # Without grouping columns there is no group total to normalise by.
    if not len(cols):
        return per_item

    total_col = 'count_item_%s_all'%tag
    norm_col = 'count_item_%s_norm'%tag

    # Total occurrences per group, used as the denominator.
    group_total = data.groupby(cols,
            as_index=False).agg({'step':['count']})
    group_total.columns = cols + [total_col]

    per_item = per_item.merge(group_total,on=cols,how='left')
    per_item[norm_col] = per_item[count_col] / per_item[total_col]

    # The raw group total is only a helper column; drop it from the result.
    return per_item.drop(total_col,axis=1)

### Read csv data

In [5]:
# Create the cache directory if needed; exist_ok avoids the separate
# exists() check and makes the cell safe to re-run.
os.makedirs('cache', exist_ok=True)

In [6]:
# Directory containing train.csv, test.csv and submission_popular.csv.
# NOTE(review): absolute path from the original machine -- adjust for your setup.
path = '/datasets/trivago/data/'

### pandas read csv

In [8]:
%%time
# Columns to load from the raw CSVs; `reference` is renamed to `item_id` below.
cols = ['city','user_id', 'session_id', 'step', 'action_type', 'reference']

# os.path.join avoids the doubled slash that '%s/train.csv' % path produced
# when `path` already ends with a separator.
train_pd = pd.read_csv(os.path.join(path, 'train.csv'),usecols=cols)
test_pd = pd.read_csv(os.path.join(path, 'test.csv'),usecols=cols)
submission_pd = pd.read_csv(os.path.join(path, 'submission_popular.csv'))
# ignore_index gives the combined frame a unique index instead of repeating
# the row labels of train and test.
data_pd = pd.concat([train_pd,test_pd], ignore_index=True)

# Fix the column order, then rename by name rather than by position.
data_pd = data_pd[cols].rename(columns={'reference': 'item_id'})
cols = list(data_pd.columns)

print('combined',data_pd.shape)

combined (19715327, 6)
CPU times: user 22.7 s, sys: 5.53 s, total: 28.2 s
Wall time: 28.2 s


In [9]:
%%time
cols = ['user_id','session_id','item_id','city','device']
# NOTE(review): nothing in this notebook writes cache/data_pair.parquet --
# presumably it is produced by an earlier notebook; confirm it exists first.
# Ask the reader for only the needed columns instead of loading the whole
# file and discarding the rest; the trailing [cols] pins the column order.
data_pair_pd = pd.read_parquet('cache/data_pair.parquet', columns=cols)[cols]

CPU times: user 22.8 s, sys: 11.3 s, total: 34.1 s
Wall time: 13.5 s


In [11]:
# Drop the per-split frames; data_pd already holds their concatenation.
del train_pd,test_pd

### Only keep interaction rows

In [12]:
# NOTE(review): the label is spelled 'contrain'; it is kept unchanged because
# it is the key under which this step's timing is stored and displayed.
step = 'contrain string'
# Register the step only once so re-running this cell does not duplicate it
# in STEPS (duplicates would be counted twice in the overall timing).
if step not in STEPS:
    STEPS.append(step)

### pandas find string within string

In [14]:
%%time
start = time.time()

# Vectorised prefix test; na=False treats a missing action_type as
# "not an interaction" instead of raising as the row-wise lambda would.
data_pd['is_interaction'] = data_pd['action_type'].str.startswith('interaction', na=False)
data_interaction_pd = data_pd[data_pd['is_interaction']]
# Non-numeric references that must be removed before casting item_id to int.
bad = np.array(['unknown','Miyako Airport','Shinjuku Station','Lower Manhattan','Estació de Sants'])
mask = data_interaction_pd['item_id'].isin(bad)
# Take an explicit copy so the assignment below writes to an independent
# frame rather than to a slice of data_pd (chained assignment).
data_interaction_pd = data_interaction_pd.loc[~mask].copy()
# Missing references become -1 so the column can be stored as integers.
data_interaction_pd['item_id'] = data_interaction_pd['item_id'].fillna(-1).astype(int)
print(data_interaction_pd['item_id'].unique().shape)
CPU_RUN_TIME[step] = time.time() - start

(271344,)
CPU times: user 11.5 s, sys: 2.03 s, total: 13.6 s
Wall time: 13.6 s


### count items with/without other columns

In [15]:
step = 'count items'
# Register the step only once so re-running this cell does not duplicate it
# in STEPS (duplicates would be counted twice in the overall timing).
if step not in STEPS:
    STEPS.append(step)

In [17]:
%%time
start = time.time()
# One count table per grouping-key combination, computed in this order;
# the empty key list yields the global per-item counts.
(count_user_session_pd,
 count_user_session_city_pd,
 count_user_city_pd,
 count_city_pd,
 count_global_pd) = (
    count_items(data_interaction_pd, cols=group_cols)
    for group_cols in (
        ['user_id','session_id'],
        ['user_id','session_id','city'],
        ['user_id','city'],
        ['city'],
        [],
    )
)
CPU_RUN_TIME[step] = time.time() - start

CPU times: user 33.7 s, sys: 9.5 s, total: 43.2 s
Wall time: 43.2 s


### Merge the encoding to pair

In [18]:
step = 'merge'
# Register the step only once so re-running this cell does not duplicate it
# in STEPS (duplicates would be counted twice in the overall timing).
if step not in STEPS:
    STEPS.append(step)

### pandas merge

In [20]:
%%time
start = time.time()
# (count table, join keys) pairs, applied in order as left joins so every
# row of the pair frame is kept.
_merge_plan = (
    (count_user_session_pd, ['user_id','session_id','item_id']),
    (count_user_session_city_pd, ['user_id','session_id','city','item_id']),
    (count_user_city_pd, ['user_id','city','item_id']),
    (count_city_pd, ['city','item_id']),
)
for _counts_pd, _join_keys in _merge_plan:
    data_pair_pd = data_pair_pd.merge(_counts_pd,on=_join_keys,how='left')
CPU_RUN_TIME[step] = time.time() - start
# Remove the loop helpers so the cell leaves no extra names behind.
del _merge_plan, _counts_pd, _join_keys

CPU times: user 1min 15s, sys: 35.5 s, total: 1min 51s
Wall time: 1min 51s


#### Save to Parquet
Save the merged count features if you wish to use them as part of a bigger feature set.

In [22]:
# Persist the pair frame, now carrying the merged count columns, in the cache directory.
data_pair_pd.to_parquet('cache/data_pair_count.parquet')

### Visualize the timing

In [None]:
# Sum only the measured steps. Excluding 'Overall' and de-duplicating keeps
# the total correct when this cell (or a step cell) is run more than once.
measured_steps = [s for s in dict.fromkeys(STEPS) if s != 'Overall']
CPU_RUN_TIME['Overall'] = sum(CPU_RUN_TIME[s] for s in measured_steps)
# Update STEPS in place so it ends with exactly one 'Overall' entry.
STEPS[:] = measured_steps + ['Overall']

# One row per step, with the overall total last.
timing = pd.DataFrame({
    'step': STEPS,
    'CPU': [CPU_RUN_TIME[s] for s in STEPS],
})
timing