# Define global variables and imports

In [1]:
import os

In [2]:
# --- Google Cloud settings: change these to match your environment ---
PROJECT_ID = 'renatoleite-mldemos'  # your project Id
REGION = 'us-central1'  # your region
BUCKET = 'renatoleite-nvtabular'  # your bucket name, without the gs:// prefix

# --- Container image settings ---
# DOCKERNAME and IMAGE_URI are handed to Cloud Build as substitutions.
DOCKERNAME = 'nvtabular'
IMAGE_NAME = 'nvt_preprocessing'
IMAGE_URI = f'gcr.io/{PROJECT_ID}/{IMAGE_NAME}'

In [3]:
# Shared run settings for the preprocessing steps.
shuffle = 'None'  # note: the literal string 'None', not the None object
train_split, valid_split = 'train', 'valid'  # split names passed via --split
n_workers = 1
recursive = False

# Host folder that the docker run cells mount into the container as /tests.
local_dev_folder = '/home/renatoleite/nvidia-merlin-on-vertex-ai/tests'

# Build docker image

In [None]:
# Build docker image with Cloud Build
! gcloud builds submit --config ../src/cloudbuild.yaml --substitutions _DOCKERNAME=$DOCKERNAME,_IMAGE_URI=$IMAGE_URI

In [4]:
# Pull image locally to test
! docker pull $IMAGE_URI

Using default tag: latest
latest: Pulling from renatoleite-mldemos/nvt_preprocessing
Digest: sha256:707c4d589b9f8b6af2948924a07d33fe891d75bfb073b528ca63b488e9f26ab4
Status: Image is up to date for gcr.io/renatoleite-mldemos/nvt_preprocessing:latest
gcr.io/renatoleite-mldemos/nvt_preprocessing:latest


# Create local directory to read/write files

This local folder emulates a Cloud Storage FUSE (gcsfuse) mount point for the bucket.

In [5]:
# Local directory that stands in for the bucket; it is bind-mounted into the
# container at the same path by the docker run cells.
BASE_DIR = f'/gcs/{BUCKET}'
print(BASE_DIR)

/gcs/renatoleite-nvtabular


In [6]:
# Ensure BASE_DIR is present on the local filesystem and report what happened.
if os.path.exists(BASE_DIR):
    print(f'Directory \"{BASE_DIR}\" already exists')
else:
    # Creates any missing parent directories as well.
    os.makedirs(BASE_DIR)
    print(f'Directory \"{BASE_DIR}\" created')

Directory "/gcs/renatoleite-nvtabular" already exists


# Convert CSV to Parquet

In [None]:
# Source CSV locations on GCS, one per split.
train_paths = 'gs://workshop-datasets/criteo/day_1'  # training CSV input
valid_paths = 'gs://workshop-datasets/criteo/day_0'  # validation CSV input

# Values passed as --num-output-files for each split.
num_output_files_train, num_output_files_valid = 1, 1

# Destination of the converted files, located under BASE_DIR.
convert_path = os.path.join(BASE_DIR, 'convert-csv-parquet')

In [None]:
# Convert Training files from CSV to Parquet
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call convert_csv_to_parquet_op \
--output-path $convert_path \
--data-paths $train_paths \
--split $train_split \
--num-output-files $num_output_files_train \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
# Convert Validation files from CSV to Parquet
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call convert_csv_to_parquet_op \
--output-path $convert_path \
--data-paths $valid_paths \
--split $valid_split \
--num-output-files $num_output_files_valid \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
# Copy converted files back to GCS
! gsutil -m cp -r $convert_path gs://renatoleite-nvtabular

# Fit (analyze) dataset

In [None]:
# Paths for the fit step.
# workflow_path lives under the local /gcs/<bucket> folder; it is passed as --workflow-path.
workflow_path = f'/gcs/{BUCKET}/workflow'
# parquet_path points at the converted files on GCS; it is passed as --output-path.
parquet_path = f'gs://{BUCKET}/convert-csv-parquet'

In [None]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call analyze_dataset_op \
--output-path $parquet_path \
--workflow-path $workflow_path \
--split $train_split \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
! gsutil -m cp -r $workflow_path gs://renatoleite-nvtabular

# Transform Dataset

In [None]:
# Paths for the transform step.
# Local /gcs/<bucket> locations for the workflow and for the transformed output.
workflow_path = f'/gcs/{BUCKET}/workflow'
transformed_dataset = f'/gcs/{BUCKET}/transformed'
# Converted Parquet files on GCS; passed to the script as --output-path.
parquet_path = f'gs://{BUCKET}/convert-csv-parquet'

In [None]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call transform_dataset_op \
--output-path $parquet_path \
--workflow-path $workflow_path \
--transformed-dataset $transformed_dataset \
--split $train_split \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
! gsutil -m cp -r $transformed_dataset gs://renatoleite-nvtabular

# Export Parquet files from BigQuery to GCS

In [15]:
# BigQuery source table: project, location, dataset and table name.
bq_project = PROJECT_ID
bq_location = 'us'
bq_dataset_name, bq_table_name = 'criteo_small', 'train'

# GCS prefix passed to the export step as --output-path.
output_path = f'gs://{BUCKET}/bq_export_parquet'

In [16]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call export_parquet_from_bq_op \
--output-path $output_path \
--bq-project $bq_project \
--bq-location $bq_location \
--bq-dataset-name $bq_dataset_name \
--bq-table-name $bq_table_name \
--split $train_split

18-11-21 10:14:53 - Args: Namespace(bq_dataset_name='criteo_small', bq_location='us', bq_project='renatoleite-mldemos', bq_table_name='train', data_paths='', device_limit_frac=0.8, device_pool_frac=0.9, method_to_call='export_parquet_from_bq_op', n_workers=1, num_output_files=1, output_path='gs://renatoleite-nvtabular/bq_export_parquet', part_mem_frac=0.125, recursive=False, sep='\t', shuffle='None', split='train', transformed_dataset='', workflow_path='')
18-11-21 10:14:53 - Starting job.
18-11-21 10:14:56 - Extracting train table to gs://renatoleite-nvtabular/bq_export_parquet/train path.
18-11-21 10:14:58 - Finished exporting to GCS.
