In [1]:
############################################################################
##
## Copyright (C) 2021 NVIDIA Corporation.  All rights reserved.
##
## NVIDIA Sample Code
##
## Please refer to the NVIDIA end user license agreement (EULA) associated
## with this source code for terms and conditions that govern your use of
## this software. Any use, reproduction, disclosure, or distribution of
## this software and related documentation outside the terms of the EULA
## is strictly prohibited.
##
############################################################################

In this notebook we augment our feature set with GNN embeddings, which are passed as additional features into XGBoost for the fraud detection task.

In [1]:
import os
import subprocess

import numpy as np
import cudf
import dask_cudf
import cupy as cp
from dask_cuda import LocalCUDACluster
from dask.distributed import Client, wait
from utils import device_mem_size, get_rmm_size

import torch
# CUDA device handle; passed as `map_location` when the saved GNN embeddings are loaded below.
device = torch.device('cuda')

## Set up paths

In [2]:
# Root directory under which every artifact of this workflow lives.
BASE_DIR = "./basedir"

# Processed data (features, embeddings, id maps) read and written by this notebook.
processed_path = os.path.join(BASE_DIR, "processed_data_1gpu")

# Directory handed to the Dask cluster as its local working directory.
dask_space = os.path.join(BASE_DIR, "dask_space")

## Set up Dask Cluster

In [3]:
# --- Memory sizing ---
# Fraction of each GPU's total memory used to size the RMM pool (80%).
device_pool_frac = 0.8
device_size = device_mem_size(kind="total")
device_pool_size = int(device_pool_frac * device_size)

# Communication protocol for the cluster. Any value other than "ucx" skips
# the UCX-specific options further down.
protocol = "ucx"

# Select GPUs to place workers. CUDA_VISIBLE_DEVICES is honored when set;
# otherwise the 1st and 2nd GPU are used.
visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0,1")

# --- Scheduler address ---
# Ask the host for its IP addresses and bind the cluster to the first one.
# check_output raises CalledProcessError if `hostname` exits non-zero, and we
# fail with an explicit message (instead of an IndexError) if it prints nothing.
cmd = "hostname --all-ip-addresses"
output = subprocess.check_output(cmd.split())
addresses = output.decode().split()
if not addresses:
    raise RuntimeError(
        f"`{cmd}` returned no IP addresses; cannot choose an address for the Dask cluster."
    )
IPADDR = addresses[0]

# --- Cluster ---
# Arguments shared by every protocol are declared once so the UCX and non-UCX
# configurations cannot drift apart.
cluster_kwargs = dict(
    ip=IPADDR,
    protocol=protocol,
    CUDA_VISIBLE_DEVICES=visible_devices,
    # get_rmm_size is a project helper; presumably it formats/aligns the byte
    # count for RMM — TODO confirm against utils.py.
    rmm_pool_size=get_rmm_size(device_pool_size),
    local_directory=dask_space,
    # A float here is interpreted by dask-cuda as a fraction of device memory.
    device_memory_limit=0.8,
)
if protocol == 'ucx':
    # Extra transports are only meaningful when communicating over UCX.
    cluster_kwargs.update(enable_tcp_over_ucx=True, enable_nvlink=True)

cluster = LocalCUDACluster(**cluster_kwargs)

# Create the distributed client; the bare expression renders the cluster summary.
client = Client(cluster)
client

distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize


0,1
Connection method: Cluster object,Cluster type: LocalCUDACluster
Dashboard: http://172.17.0.3:8787/status,

0,1
Status: running,Using processes: True
Dashboard: http://172.17.0.3:8787/status,Workers: 2
Total threads:  2,Total memory:  251.82 GiB

0,1
Comm: ucx://172.17.0.3:59531,Workers: 2
Dashboard: http://172.17.0.3:8787/status,Total threads:  2
Started:  Just now,Total memory:  251.82 GiB

0,1
Comm: ucx://172.17.0.3:34307,Total threads: 1
Dashboard: http://172.17.0.3:42299/status,Memory: 125.91 GiB
Nanny: ucx://172.17.0.3:59717,
Local directory: /workspace/basedir/dask_space/dask-worker-space/worker-4lhplzi5,Local directory: /workspace/basedir/dask_space/dask-worker-space/worker-4lhplzi5
GPU: Tesla V100-DGXS-32GB,GPU memory: 31.75 GiB

0,1
Comm: ucx://172.17.0.3:42185,Total threads: 1
Dashboard: http://172.17.0.3:35287/status,Memory: 125.91 GiB
Nanny: ucx://172.17.0.3:36511,
Local directory: /workspace/basedir/dask_space/dask-worker-space/worker-yl3m86hy,Local directory: /workspace/basedir/dask_space/dask-worker-space/worker-yl3m86hy
GPU: Tesla V100-DGXS-32GB,GPU memory: 31.75 GiB


Let's load in the train and test datasets.

In [4]:
# Lazily open the processed train and test feature sets as dask_cudf DataFrames.
X_train, X_test = (
    dask_cudf.read_parquet(os.path.join(processed_path, fname))
    for fname in ('X_train.parquet', 'X_test.parquet')
)

Now let's load back in our GNN embeddings for cards and merchants and convert those to dataframes.

In [5]:
# Load the saved card embeddings, mapping their storage onto the CUDA device.
# NOTE(review): torch.load unpickles the file, so it should only be pointed at
# files from a trusted source (presumably written by the GNN training notebook — confirm).
card_embeds = torch.load(os.path.join(processed_path, 'gnn_card_embeds.pt'), map_location=device)

In [6]:
# Load the saved merchant embeddings, mapping their storage onto the CUDA device.
# NOTE(review): torch.load unpickles the file, so it should only be pointed at
# files from a trusted source (presumably written by the GNN training notebook — confirm).
merch_embeds = torch.load(os.path.join(processed_path, 'gnn_merch_embeds.pt'), map_location=device)

In [7]:
# Convert each embedding object to a CuPy array, wrap it in a cuDF DataFrame,
# and split that DataFrame into two Dask partitions.
# Note: this rebinds `card_embeds` / `merch_embeds` to DataFrames, so the cell
# is meant to be run once after the load cells above.
card_embeds = dask_cudf.from_cudf(
    cudf.DataFrame(cp.asarray(card_embeds)),
    npartitions=2,
)
merch_embeds = dask_cudf.from_cudf(
    cudf.DataFrame(cp.asarray(merch_embeds)),
    npartitions=2,
)

We also read in the `card_id` and `merchant_id` mappings we created when we renumbered the IDs for DGL, and perform some remapping so that it becomes easier to merge these embeddings with the rest of the dataset features.

In [8]:
# Read the card id mapping saved alongside the processed data.
# Presumably the row position corresponds to the renumbered id used for the graph — TODO confirm.
card_id_map = cudf.read_parquet(os.path.join(processed_path, 'uniq_card_id.parquet'))

In [9]:
# Read the merchant id mapping saved alongside the processed data.
# Presumably the row position corresponds to the renumbered id used for the graph — TODO confirm.
merch_id_map = cudf.read_parquet(os.path.join(processed_path, 'uniq_merch_id.parquet'))

In [10]:
# Expose the row index of both frames as an 'index' column to join on.
card_embeds = card_embeds.reset_index()
card_id_map = card_id_map.reset_index()

# Integer-named embedding columns get a 'c_' prefix; every other column
# (such as 'index') keeps its name.
card_embeds.columns = [
    col if not isinstance(col, int) else f'c_{col}'
    for col in card_embeds.columns
]

# Attach the id mapping columns to each embedding row.
card_embeds = card_embeds.merge(card_id_map, on='index', how='left')

In [11]:
# Expose the row index of both frames as an 'index' column to join on.
merch_embeds = merch_embeds.reset_index()
merch_id_map = merch_id_map.reset_index()

# Integer-named embedding columns get an 'm_' prefix; every other column
# (such as 'index') keeps its name.
merch_embeds.columns = [
    col if not isinstance(col, int) else f'm_{col}'
    for col in merch_embeds.columns
]

# Attach the id mapping columns to each embedding row.
merch_embeds = merch_embeds.merge(merch_id_map, on='index', how='left')

Now we go ahead and merge the card embeddings into the training set based on the `card_id` column, and similarly the merchant embeddings based on the `merchant_id` column.

In [12]:
# Cast the join keys before merging.
# NOTE(review): int16 / int32 presumably mirror the key dtypes in X_train —
# confirm; ids outside the int16 range would not survive the card_id cast.
card_embeds = card_embeds.assign(card_id=card_embeds['card_id'].astype(np.int16))
merch_embeds = merch_embeds.assign(merchant_id=merch_embeds['merchant_id'].astype(np.int32))

# Left joins keep every training row, with or without a matching embedding.
X_train = (
    X_train
    .merge(card_embeds, on='card_id', how='left')
    .merge(merch_embeds, on='merchant_id', how='left')
)

Now we go ahead and merge the card embeddings into the test set based on the `card_id` column, and similarly the merchant embeddings based on the `merchant_id` column.

In [13]:
# Same two left joins as for the training set: card embeddings first, then merchant embeddings.
X_test = (
    X_test
    .merge(card_embeds, on='card_id', how='left')
    .merge(merch_embeds, on='merchant_id', how='left')
)

In [14]:
# drop unnecessary index columns
X_train = X_train.drop(columns=['index_x', 'index_y'])
X_test = X_test.drop(columns=['index_x', 'index_y'])

Finally, we save the train and test data augmented with the GNN embeddings.

In [15]:
# Write the augmented training features next to the other processed data.
X_train.to_parquet(os.path.join(processed_path, 'X_train_wGNN.parquet'))

In [16]:
# Write the augmented test features next to the other processed data.
X_test.to_parquet(os.path.join(processed_path, 'X_test_wGNN.parquet'))

## Important Cleanup

Run the code below to shut down the running Dask cluster and free GPU resources.

In [17]:
# Close this notebook's client connection to the Dask cluster.
client.close()

In [18]:
# Ask for the cluster itself to be shut down so the GPUs are released.
# NOTE(review): this runs after client.close() in the previous cell — confirm
# that shutdown() on an already-closed client still tears the cluster down.
client.shutdown()

**NOTE: Also please restart the kernel before moving on to the next notebook.**