# Define Global variables and imports

In [1]:
import os

In [2]:
# Google Cloud settings -- edit these three values for your own environment.
PROJECT_ID = 'renatoleite-mldemos'  # Change to your project Id.
REGION = 'us-central1'  # Change to your region.
BUCKET = 'renatoleite-nvtabular'  # Change to your bucket.

# Name and tag of the preprocessing container image, combined into a gcr.io URI.
IMAGE_NAME = 'nvt_preprocessing'
IMAGE_VERSION = '21.11'
IMAGE_URI = 'gcr.io/{}/{}:{}'.format(PROJECT_ID, IMAGE_NAME, IMAGE_VERSION)

In [3]:
# Values interpolated into the `docker run` commands in later cells.
shuffle = 'None'  # NOTE(review): this is the string 'None', not the None object -- confirm intended.
train_split = 'train'
valid_split = 'valid'
n_workers = 1
recursive = False  # NOTE(review): reaches the command line as the text 'False' -- confirm the script parses it as a boolean.

# Host folder that the `docker run` cells mount at /tests inside the container.
# Set the LOCAL_DEV_FOLDER environment variable to point at your own checkout;
# the original author's path is kept as the default for backward compatibility.
local_dev_folder = os.environ.get(
    'LOCAL_DEV_FOLDER',
    '/home/renatoleite/nvidia-merlin-on-vertex-ai/tests',
)

# Build docker image

In [4]:
! docker build -t $IMAGE_URI -f ../src/Dockerfile.nvtabular ../src/


Step 1/10 : FROM nvcr.io/nvidia/merlin/merlin-training:21.11
 ---> 8499f10347db
Step 2/10 : WORKDIR /src
 ---> Using cache
 ---> b9308b55a8c8
Step 3/10 : RUN pip install -U pip
 ---> Using cache
 ---> 7dd5a85a4cbf
Step 4/10 : RUN pip install google-cloud-bigquery gcsfs
 ---> Using cache
 ---> 0f4d0d6917f9
Step 5/10 : RUN pip install google-cloud-aiplatform==1.7.0 kfp==1.8.9
 ---> Using cache
 ---> 1d11165fdbc6
Step 6/10 : COPY ./preprocessing ./preprocessing
 ---> Using cache
 ---> 488c732fc27a
Step 7/10 : COPY ./serving ./serving
 ---> Using cache
 ---> fbe776a023e3
Step 8/10 : COPY setup.py .
 ---> Using cache
 ---> abe269ebf12a
Step 9/10 : COPY feature_utils.py .
 ---> Using cache
 ---> 9b11f7aae284
Step 10/10 : RUN pip install -e .
 ---> Using cache
 ---> e82fb12c4cbc
Successfully built e82fb12c4cbc
Successfully tagged gcr.io/renatoleite-mldemos/nvt_preprocessing:21.11


# Create local directory to read/write files

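This local folder stands in for a Cloud Storage FUSE (gcsfuse) mount point: files are read and written under `/gcs/<bucket>` and copied to the bucket afterwards.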

In [None]:
# Local folder named after the bucket, rooted at /gcs.
BASE_DIR = '/gcs/{}'.format(BUCKET)
print(BASE_DIR)

In [None]:
# Make sure BASE_DIR exists on the local filesystem, reporting which case applied.
if os.path.exists(BASE_DIR):
    print(f'Directory \"{BASE_DIR}\" already exists')
else:
    # makedirs also creates any missing parent directories.
    os.makedirs(BASE_DIR)
    print(f'Directory \"{BASE_DIR}\" created')

# Convert CSV to Parquet

In [None]:
# Local destination for the converted files, under BASE_DIR.
convert_path = os.path.join(BASE_DIR, 'convert-csv-parquet')

# Number of output files requested for each split.
num_output_files_train = num_output_files_valid = 1

# Source CSV data on GCS.
train_paths = 'gs://workshop-datasets/criteo/day_1'  # Training CSV file to be preprocessed.
valid_paths = 'gs://workshop-datasets/criteo/day_0'  # Validation CSV file to be preprocessed.

In [None]:
# Convert Training files from CSV to Parquet
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call convert_csv_to_parquet_op \
--output-path $convert_path \
--data-paths $train_paths \
--split $train_split \
--num-output-files $num_output_files_train \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
# Convert Validation files from CSV to Parquet
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call convert_csv_to_parquet_op \
--output-path $convert_path \
--data-paths $valid_paths \
--split $valid_split \
--num-output-files $num_output_files_valid \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
# Copy converted files back to GCS
! gsutil -m cp -r $convert_path gs://renatoleite-nvtabular

# Fit (analyze) dataset

In [None]:
# GCS URI of the convert-csv-parquet folder and local path for the workflow.
parquet_path = 'gs://{}/convert-csv-parquet'.format(BUCKET)
workflow_path = '/gcs/{}/workflow'.format(BUCKET)

In [None]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call analyze_dataset_op \
--output-path $parquet_path \
--workflow-path $workflow_path \
--split $train_split \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
! gsutil -m cp -r $workflow_path gs://renatoleite-nvtabular

# Transform Dataset

In [None]:
# GCS URI of the convert-csv-parquet folder, plus local paths for the workflow
# and for the transformed dataset.
parquet_path = 'gs://{}/convert-csv-parquet'.format(BUCKET)
workflow_path = '/gcs/{}/workflow'.format(BUCKET)
transformed_dataset = '/gcs/{}/transformed'.format(BUCKET)

In [None]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call transform_dataset_op \
--output-path $parquet_path \
--workflow-path $workflow_path \
--transformed-dataset $transformed_dataset \
--split $train_split \
--n-workers $n_workers \
--recursive $recursive \
--device-limit-frac 0.8 \
--device-pool-frac 0.9 \
--part-mem-frac 0.125

In [None]:
! gsutil -m cp -r $transformed_dataset gs://renatoleite-nvtabular

# Export Parquet files from BigQuery to GCS

In [None]:
# BigQuery source table (project / location / dataset / table).
bq_project = PROJECT_ID
bq_location = 'us'
bq_dataset_name = 'criteo_small'
bq_table_name = 'train'

# GCS destination for the exported Parquet files.
output_path = 'gs://{}/bq_export_parquet'.format(BUCKET)

In [None]:
! docker run -it --rm --gpus all \
-v $local_dev_folder:/tests \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/test_preprocessing.py \
--method-to-call export_parquet_from_bq_op \
--output-path $output_path \
--bq-project $bq_project \
--bq-location $bq_location \
--bq-dataset-name $bq_dataset_name \
--bq-table-name $bq_table_name \
--split $train_split