# NVTabular

In [1]:
# External Dependencies
import cupy as cp
import cudf
import dask
import dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import rmm

from nvtabular.utils import device_mem_size

import dask as dask, dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

import numpy as np

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [2]:
# Root directory for the raw training TSV and the preprocessed output.
# File names are appended by plain string concatenation, so the trailing
# slash is significant.
BASE_DIR = "/raid/"

In [3]:
# Start a local Dask-CUDA cluster that talks over TCP and is created with
# rmm_pool_size="31GB", then connect a distributed client to it.
cluster = LocalCUDACluster(
    protocol="tcp",
    rmm_pool_size="31GB",
)
client = Client(cluster)
client  # last expression of the cell: displays the client/cluster summary

0,1
Client  Scheduler: tcp://127.0.0.1:33471  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 429.50 GB


In [4]:
def _rmm_pool():
    """Re-initialize RMM in the calling process with the pool allocator on.

    ``initial_pool_size=None`` is passed so that RMM picks its default
    pool size.
    """
    rmm.reinitialize(pool_allocator=True, initial_pool_size=None)


# Initialize RMM pool on ALL workers.
# NOTE(review): the cluster above was created with rmm_pool_size="31GB";
# this re-initialization presumably replaces that pool with a default-sized
# one -- confirm this is intended.
client.run(_rmm_pool)

{'tcp://127.0.0.1:33537': None,
 'tcp://127.0.0.1:34189': None,
 'tcp://127.0.0.1:35929': None,
 'tcp://127.0.0.1:36359': None,
 'tcp://127.0.0.1:37795': None,
 'tcp://127.0.0.1:43307': None,
 'tcp://127.0.0.1:44655': None,
 'tcp://127.0.0.1:46699': None}

In [5]:
import nvtabular as nvt

In [6]:
# Names assigned, in order, to the columns of the header-less training file.
features = [
    # --- Tweet features ---
    "text_tokens",
    "hashtags",
    "tweet_id",
    "media",
    "links",
    "domains",
    "tweet_type",
    "language",
    "timestamp",
    # --- Engaged-with user features (prefix "a_") ---
    "a_user_id",
    "a_follower_count",
    "a_following_count",
    "a_is_verified",
    "a_account_creation",
    # --- Engaging user features (prefix "b_") ---
    "b_user_id",
    "b_follower_count",
    "b_following_count",
    "b_is_verified",
    "b_account_creation",
    # --- Engagement features ---
    "b_follows_a",
    "reply",            # target: reply
    "retweet",          # target: retweet
    "retweet_comment",  # target: retweet with comment
    "like",             # target: like
]

# Columns handed to the workflow as categorical inputs.
cat_names = [
    # tweet-level
    "hashtags", "tweet_id", "media", "links", "domains",
    "tweet_type", "language",
    # engaged-with user ("a_")
    "a_user_id", "a_is_verified",
    # engaging user ("b_")
    "b_user_id", "b_is_verified",
    # relationship between the two users
    "b_follows_a",
]

# Columns handed to the workflow as continuous inputs.
cont_names = [
    "timestamp",
    # engaged-with user ("a_")
    "a_follower_count", "a_following_count", "a_account_creation",
    # engaging user ("b_")
    "b_follower_count", "b_following_count", "b_account_creation",
]

# Target columns, one per engagement type.
label_name = ["reply", "retweet", "retweet_comment", "like"]

In [7]:
# Build the NVTabular workflow from the categorical, continuous and label
# column lists defined above.
proc = nvt.Workflow(
    cat_names=cat_names,
    cont_names=cont_names,
    label_name=label_name,
)

In [8]:
def splitmedia2(col):
    """Join the first two tab-separated entries of each value with ``'_'``.

    Parameters
    ----------
    col : string column
        Must expose ``.shape`` and a ``.str.split`` whose result can be
        indexed with ``0`` and ``1`` to obtain the first and second pieces
        (presumably a cuDF string Series whose split yields one column per
        piece -- TODO confirm against the cuDF version in use).

    Returns
    -------
    ``col`` itself when it has no rows; otherwise ``first + '_' + second``,
    where a missing piece is replaced by the empty string.
    """
    # Empty input is passed through untouched.
    if col.shape[0] == 0:
        return col
    # Split once and reuse the result instead of splitting the column twice.
    pieces = col.str.split('\t')
    return pieces[0].fillna('') + '_' + pieces[1].fillna('')
    
def count_token(col, token):
    """Count the ``token``-separated entries in each value of ``col``.

    A non-null value with ``k`` matches of ``token`` counts as ``k + 1``
    entries; a null value counts as ``0``. ``token`` is handed straight to
    ``col.str.count``.
    """
    has_value = col.isnull() == 0          # True where the row is not null
    n_entries = col.str.count(token) + 1   # separators + 1 = entries
    # Multiply by the mask, then turn any remaining nulls into 0.
    return (n_entries * has_value).fillna(0)

In [9]:
# Training data: a header-less CSV-engine dataset whose columns are named from
# `features`, with fields separated by the \x01 control character and a
# partition size of 1GB.
trains_itrs = nvt.Dataset(
    BASE_DIR + 'training.tsv',
    engine='csv',
    sep='\x01',
    header=None,
    names=features,
    part_size='1GB',
)

In [10]:
# Column lists used by more than one op below. Defining each once keeps the
# ops from drifting apart; every op still receives its own list copy.
token_list_cols = ['hashtags', 'domains', 'links']
categorify_cols = [
    'media_splitmedia', 'language', 'tweet_type', 'tweet_id',
    'a_user_id', 'b_user_id', 'hashtags', 'domains', 'links',
]

proc.add_feature([
    # Number of tab-separated entries per row. replace=False keeps the source
    # columns (presumably the result lands in '<column>_count_t' -- confirm).
    nvt.ops.LambdaOp(
        op_name='count_t',
        f=lambda col, gdf: count_token(col, '\t'),
        columns=list(token_list_cols),
        replace=False,
    ),
    nvt.ops.FillMissing(columns=label_name + token_list_cols),
    # Cast the targets and every continuous column in place.
    # NOTE: despite the op name 'astypeint32', the cast is to unsigned 32-bit.
    nvt.ops.LambdaOp(
        op_name='astypeint32',
        f=lambda col, gdf: col.astype(np.uint32),
        columns=label_name + cont_names,
        replace=True,
    ),
    # Derive 'media_splitmedia' (consumed by Categorify below) from 'media'.
    nvt.ops.LambdaOp(
        op_name='splitmedia',
        f=lambda col, gdf: splitmedia2(col),
        columns=['media'],
        replace=False,
    ),
    nvt.ops.Categorify(columns=list(categorify_cols)),
    # Cast the categorified columns to unsigned 32-bit in place.
    nvt.ops.LambdaOp(
        op_name='astypeint32_2',
        f=lambda col, gdf: col.astype(np.uint32),
        columns=list(categorify_cols),
        replace=True,
    ),
])

In [11]:
# Run the workflow over the training dataset with statistics recording on,
# writing the result under BASE_DIR + 'preprocess/'.
proc.apply(
    trains_itrs,
    record_stats=True,
    output_path=BASE_DIR + 'preprocess/',
)



In [12]:
# List the contents of the preprocess output directory (the output_path
# given to proc.apply) via a shell escape.
!ls $BASE_DIR/preprocess

_metadata	 part.22.parquet  part.37.parquet  part.51.parquet
part.0.parquet	 part.23.parquet  part.38.parquet  part.52.parquet
part.1.parquet	 part.24.parquet  part.39.parquet  part.53.parquet
part.10.parquet  part.25.parquet  part.4.parquet   part.54.parquet
part.11.parquet  part.26.parquet  part.40.parquet  part.55.parquet
part.12.parquet  part.27.parquet  part.41.parquet  part.56.parquet
part.13.parquet  part.28.parquet  part.42.parquet  part.57.parquet
part.14.parquet  part.29.parquet  part.43.parquet  part.58.parquet
part.15.parquet  part.3.parquet   part.44.parquet  part.59.parquet
part.16.parquet  part.30.parquet  part.45.parquet  part.6.parquet
part.17.parquet  part.31.parquet  part.46.parquet  part.60.parquet
part.18.parquet  part.32.parquet  part.47.parquet  part.61.parquet
part.19.parquet  part.33.parquet  part.48.parquet  part.7.parquet
part.2.parquet	 part.34.parquet  part.49.parquet  part.8.parquet
part.20.parquet  part.35.parquet  part.5.parquet   part.9.pa