In [1]:
import os, time
# Make device indices 0-7 visible to this process. This is set here, ahead of
# the GPU library imports in the following cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4,5,6,7"

In [2]:
# External Dependencies
import cupy as cp
import cudf
import dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask.utils import parse_bytes
from dask.delayed import delayed
import rmm

from nvtabular.utils import device_mem_size

Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_NVVM=/usr/local/cuda/nvvm/lib64/libnvvm.so.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')
Environment variables with the 'NUMBAPRO' prefix are deprecated and consequently ignored, found use of NUMBAPRO_LIBDEVICE=/usr/local/cuda/nvvm/libdevice/.

For more information about alternatives visit: ('http://numba.pydata.org/numba-doc/latest/cuda/overview.html', '#cudatoolkit-lookup')


In [4]:
# Root directory and naming prefixes used to build the input/output paths.
BASE_DIR = "/raid/"
PREFIX_in = "nv"    # input prefix; the original note lists: org, dask, nv
PREFIX_out = "nv3"  # output prefix; the original note lists: nv

In [5]:
# Deploy a Single-Machine Multi-GPU Cluster
#
# Tunables:
#   protocol          - communication protocol, "tcp" or "ucx"
#   visible_devices   - devices on which workers are placed
#   device_spill_frac - fraction of `capacity` at which GPU-worker memory is
#                       spilled to host; reduce it if spilling fails, to
#                       prevent device memory errors
#   cluster           - (optional) an existing scheduler to reuse; when left
#                       as None a new LocalCUDACluster is started below
protocol = "tcp"
visible_devices = "0,1,2,3"
device_spill_frac = 0.9
cluster = None

# Value reported by device_mem_size(kind="total"); the spill threshold below
# is derived from it.
capacity = device_mem_size(kind="total")

if cluster is None:
    cluster = LocalCUDACluster(
        CUDA_VISIBLE_DEVICES=visible_devices,
        protocol=protocol,
        device_memory_limit=capacity * device_spill_frac,
        local_directory=BASE_DIR + 'dask/',
    )

# Create the distributed client; the bare name on the last line makes the
# notebook display the client summary.
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:34697  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 429.50 GB


In [6]:
# Initialize RMM pool on ALL workers
def _rmm_pool():
    """Re-initialize RMM in the calling process with the pool allocator on.

    ``initial_pool_size=None`` requests the default pool size.
    Returns None.
    """
    rmm.reinitialize(pool_allocator=True, initial_pool_size=None)


# Run the initializer through the Dask client; the displayed result maps each
# worker address to the function's return value (None).
client.run(_rmm_pool)

{'tcp://127.0.0.1:35195': None,
 'tcp://127.0.0.1:39241': None,
 'tcp://127.0.0.1:41755': None,
 'tcp://127.0.0.1:44467': None}

In [7]:
import nvtabular as nvt

In [8]:
# Column roles passed to the NVTabular workflow: categorical features,
# continuous features, and prediction targets.
CATEGORICAL_COLUMNS = [
    'hashtags',
    'tweet_id',
    'media',
    'links',
    'domains',
    'tweet_type',
    'language',
    'a_user_id',
    'a_is_verified',
    'b_user_id',
    'b_is_verified',
    'b_follows_a',
    'dt_dow',
]

CONTINUOUS_COLUMNS = [
    'timestamp',
    'a_follower_count',
    'a_following_count',
    'b_follower_count',
    'b_following_count',
    'hashtags_count_t',
    'domains_count_t',
    'links_count_t',
]

LABEL_COLUMNS = ['reply', 'retweet', 'retweet_comment', 'like']


In [9]:
# Build the workflow object, declaring which columns are labels, continuous
# features and categorical features.
proc = nvt.Workflow(
    label_name=LABEL_COLUMNS,
    cont_names=CONTINUOUS_COLUMNS,
    cat_names=CATEGORICAL_COLUMNS,
)

In [10]:
# Register the feature-engineering ops on the workflow. The list order matters:
# several ops read columns that earlier ops in the list create (for example
# 'timestamp_to_datetime'). Ops with replace=False appear to emit a new column
# named '<column>_<op_name>' -- that is the naming the later ops rely on.
proc.add_feature([
    # Binarize every label column: 1 where the value is > 0, else 0 (int8),
    # overwriting the original column (replace=True).
    nvt.ops.LambdaOp(
        op_name = 'change',
        f = lambda col, gdf: (col>0).astype('int8'),
        columns = LABEL_COLUMNS,
        replace=True
    ),
    # Target-encode five single columns plus one six-column combination
    # against all label columns, using kfold=5 and p_smooth=20.
    nvt.ops.TargetEncoding(
        cat_groups = ['media', 
                      'tweet_type', 
                      'language', 
                      'a_user_id', 
                      'b_user_id', 
                      ['domains','language','b_follows_a','tweet_type','media','a_is_verified']],
        cont_target = LABEL_COLUMNS,
        kfold = 5,
        p_smooth = 20
    ),
    # Group-by statistics joined back for each listed column. No statistics or
    # continuous columns are specified here, so the op's defaults apply.
    nvt.ops.JoinGroupby(
        columns = [
            'media', 
            'tweet_type', 
            'language', 
            'a_user_id', 
            'b_user_id'
        ]
    ),
    # following/follower ratio for the a_* counts. The lambda ignores `col` and
    # reads both operands from the full frame `gdf`.
    # NOTE(review): a zero a_follower_count is not guarded against -- confirm
    # how the resulting division by zero should be handled downstream.
    nvt.ops.LambdaOp(
        op_name = 'a_ff_rate',
        f = lambda col, gdf: gdf['a_following_count']/gdf['a_follower_count'],
        columns = ['a_following_count'],
        replace=False
    ),
    # Same ratio for the b_* counts (same zero-denominator caveat).
    nvt.ops.LambdaOp(
        op_name = 'b_ff_rate',
        f = lambda col, gdf: gdf['b_following_count']/gdf['b_follower_count'],
        columns = ['b_following_count'],
        replace=False
    ),
    # Convert 'timestamp' (interpreted in seconds, unit='s') to a datetime
    # column; the three ops below read it as 'timestamp_to_datetime'.
    nvt.ops.LambdaOp(
        op_name = 'to_datetime',
        f = lambda col, gdf: cudf.to_datetime(col, unit='s'),
        columns = ['timestamp'],
        replace=False
    ),
    # Extract the hour component of the datetime column.
    nvt.ops.LambdaOp(
        op_name = 'to_hour',
        f = lambda col, gdf: col.dt.hour,
        columns = ['timestamp_to_datetime'],
        replace=False
    ),
    # Extract the minute component of the datetime column.
    nvt.ops.LambdaOp(
        op_name = 'to_minute',
        f = lambda col, gdf: col.dt.minute,
        columns = ['timestamp_to_datetime'],
        replace=False
    ),
    # Extract the second component of the datetime column.
    nvt.ops.LambdaOp(
        op_name = 'to_second',
        f = lambda col, gdf: col.dt.second,
        columns = ['timestamp_to_datetime'],
        replace=False
    ),
    # Cast the three columns consumed by DifferenceLag below to float32,
    # overwriting the originals (replace=True).
    nvt.ops.LambdaOp(
        op_name = 'asfloat',
        f = lambda col, gdf: col.astype('float32'),
        columns = ['b_follower_count','b_following_count','language'],
        replace=True
    ),
    # Lagged difference (shift=1) of the listed columns, keyed on 'b_user_id'.
    # NOTE(review): presumably this compares adjacent rows of the same
    # b_user_id, which is only meaningful if rows are ordered by
    # b_user_id/timestamp. The shuffle/sort step in the dataset-loading cells
    # is commented out -- confirm the input is already sorted.
    nvt.ops.DifferenceLag(
        'b_user_id', 
        columns=['b_follower_count','b_following_count','language'],
        shift = 1
    ),
    # Replace missing values with 0. No column list is given here, so the op's
    # default column selection applies -- TODO confirm which columns that covers.
    nvt.ops.FillMissing(fill_val=0)
])

In [11]:
# Training split: parquet input under BASE_DIR, opened with part_size="200MB".
train_dataset = nvt.Dataset(
    BASE_DIR + PREFIX_in + '_train-1-train.parquet',
    part_size="200MB",
    engine='parquet',
)

# get a nvt dataset and convert to a dask dataframe
#ddf = train_dataset.to_ddf()
# partition the dask dataframe by userid, then sort by userid/timestamp
#ddf = ddf.shuffle("b_user_id").sort_values(["b_user_id", "timestamp"]).reset_index()
# create a new nvtabular dataset on the partitioned/sorted values
#train_dataset = nvt.Dataset(ddf)

In [12]:
# Validation split: parquet input under BASE_DIR, opened with part_size="200MB".
valid_dataset = nvt.Dataset(
    BASE_DIR + PREFIX_in + '_train-1-valid.parquet',
    part_size="200MB",
    engine='parquet',
)
#
# get a nvt dataset and convert to a dask dataframe
#ddf = valid_dataset.to_ddf()
# partition the dask dataframe by userid, then sort by userid/timestamp
#ddf = ddf.shuffle("b_user_id").sort_values(["b_user_id", "timestamp"]).reset_index()
# create a new nvtabular dataset on the partitioned/sorted values
#valid_dataset = nvt.Dataset(ddf)

In [13]:
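# Optional cleanup of earlier results; these are the directories written by proc.apply below.
#!rm -r {BASE_DIR}{PREFIX_in}_{PREFIX_out}_out_train/
#!rm -r {BASE_DIR}{PREFIX_in}_{PREFIX_out}_out_valid/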

# Run the workflow over the training split with record_stats=True and write the
# result under <BASE_DIR><PREFIX_in>_<PREFIX_out>_out_train/.
proc.apply(
    train_dataset,
    output_path=BASE_DIR + PREFIX_in + '_' + PREFIX_out + '_out_train/',
    record_stats=True,
)
# Run the same workflow over the validation split with record_stats=False
# (presumably reusing the statistics recorded from the training pass).
proc.apply(
    valid_dataset,
    output_path=BASE_DIR + PREFIX_in + '_' + PREFIX_out + '_out_valid/',
    record_stats=False,
)

In [13]:
#!ls $BASE_DIR/nv_out_train/

_metadata	 part.23.parquet  part.39.parquet  part.54.parquet
part.0.parquet	 part.24.parquet  part.4.parquet   part.55.parquet
part.1.parquet	 part.25.parquet  part.40.parquet  part.56.parquet
part.10.parquet  part.26.parquet  part.41.parquet  part.57.parquet
part.11.parquet  part.27.parquet  part.42.parquet  part.58.parquet
part.12.parquet  part.28.parquet  part.43.parquet  part.59.parquet
part.13.parquet  part.29.parquet  part.44.parquet  part.6.parquet
part.14.parquet  part.3.parquet   part.45.parquet  part.60.parquet
part.15.parquet  part.30.parquet  part.46.parquet  part.61.parquet
part.16.parquet  part.31.parquet  part.47.parquet  part.62.parquet
part.17.parquet  part.32.parquet  part.48.parquet  part.63.parquet
part.18.parquet  part.33.parquet  part.49.parquet  part.64.parquet
part.19.parquet  part.34.parquet  part.5.parquet   part.7.parquet
part.2.parquet	 part.35.parquet  part.50.parquet  part.8.parquet
part.20.parquet  part.36.parquet  part.51.parquet  part.9.pa

In [14]:
#!ls $BASE_DIR/nv_out_valid/

_metadata	 part.14.parquet  part.20.parquet  part.5.parquet
part.0.parquet	 part.15.parquet  part.21.parquet  part.6.parquet
part.1.parquet	 part.16.parquet  part.22.parquet  part.7.parquet
part.10.parquet  part.17.parquet  part.23.parquet  part.8.parquet
part.11.parquet  part.18.parquet  part.24.parquet  part.9.parquet
part.12.parquet  part.19.parquet  part.3.parquet
part.13.parquet  part.2.parquet   part.4.parquet
