# Define global variables and imports

In [1]:
import os

In [4]:
# Project and bucket identifiers; the image URI and the /gcs paths in later
# cells are derived from these two values.
PROJECT_ID = 'renatoleite-mldemos'
BUCKET = 'renatoleite-nvtabular'

# Tag and fully qualified registry name of the HugeCTR training image.
IMAGE_VERSION = 'nightly'
IMAGE_URI = 'gcr.io/{}/hugectr-training:{}'.format(PROJECT_ID, IMAGE_VERSION)

# Build docker image

In [None]:
# Build the training image and tag it with IMAGE_URI, the same variable the
# `docker run` cell uses, so the built tag and the executed tag cannot diverge.
! docker build -t $IMAGE_URI -f ../src/Dockerfile.hugectr ../src/

# Create local directory to read/write files

This folder will work like a Cloud Storage FUSE (gcsfuse) mount point.

In [6]:
# Local path under /gcs named after the bucket; shown so the reader can verify it.
BASE_DIR = '/gcs/' + BUCKET
print(BASE_DIR)

/gcs/renatoleite-nvtabular


In [7]:
# Ensure BASE_DIR is present on the local filesystem and report which case applied.
if os.path.exists(BASE_DIR):
    print(f'Directory \"{BASE_DIR}\" already exists')
else:
    os.makedirs(BASE_DIR)
    print(f'Directory \"{BASE_DIR}\" created')

Directory "/gcs/renatoleite-nvtabular" already exists


# Train the model

In [8]:
# Host directories bind-mounted into the training container by the
# `docker run` cell below.
# Output locations (mounted at the same path inside the container).
LOCAL_MODEL_DIR = '/tmp/saved_model'
LOCAL_CHECKPOINT_DIR = '/tmp/checkpoints'
# Training source code, mounted as /tests in the container.
# NOTE(review): absolute, user-specific path -- adjust for your own checkout.
LOCAL_DEV_FOLDER = '/home/renatoleite/nvidia-merlin-on-vertex-ai/src/training/hugectr'

In [9]:
# Name of the model, passed to the training task as --model_name.
MODEL_NAME = 'deepfm'

# Locations of the transformed dataset under the bucket's /gcs path:
# the train/valid file lists and the schema stored with the training split.
DATA_ROOT = '/gcs/{}/transformed'.format(BUCKET)
SCHEMA_PATH = os.path.join(DATA_ROOT, 'train/schema.pbtxt')
TRAIN_DATA = os.path.join(DATA_ROOT, 'train/_file_list.txt')
VALID_DATA = os.path.join(DATA_ROOT, 'valid/_file_list.txt')

In [10]:
# Device layout handed to the task as --gpus (kept as a string so it can be
# interpolated into the shell command unchanged).
gpus = '[[0,1,2,3,4,5,6,7]]'

# Training schedule.
# NOTE(review): NUM_EPOCHS = 0 presumably means the run is bounded by
# MAX_ITERATIONS instead -- confirm against task.py.
NUM_EPOCHS = 0
MAX_ITERATIONS = 50000

# Evaluation, logging and snapshot cadence.
EVAL_INTERVAL = 1000
EVAL_BATCHES = 500
EVAL_BATCHES_FINAL = 2500
DISPLAY_INTERVAL = 200
SNAPSHOT_INTERVAL = 0

# Optimisation and data-loading settings.
PER_GPU_BATCH_SIZE = 2048
LR = 0.001
DROPOUT_RATE = 0.5
NUM_WORKERS = 12

In [None]:
# Run the training task (task.py) locally inside the training container.
# Mounts: training source -> /tests; model, checkpoint and BASE_DIR folders
# are mounted at identical paths inside the container.
# --valid_data points at VALID_DATA so evaluation uses the validation split
# rather than re-reading the training file list.
! docker run -it --rm --gpus all \
-v $LOCAL_DEV_FOLDER:/tests \
-v $LOCAL_MODEL_DIR:$LOCAL_MODEL_DIR \
-v $LOCAL_CHECKPOINT_DIR:$LOCAL_CHECKPOINT_DIR \
-v $BASE_DIR:$BASE_DIR \
$IMAGE_URI \
python /tests/task.py \
--per_gpu_batch_size=$PER_GPU_BATCH_SIZE \
--model_name=$MODEL_NAME \
--train_data=$TRAIN_DATA \
--valid_data=$VALID_DATA \
--schema=$SCHEMA_PATH \
--max_iter=$MAX_ITERATIONS \
--max_eval_batches=$EVAL_BATCHES \
--eval_batches=$EVAL_BATCHES_FINAL \
--dropout_rate=$DROPOUT_RATE \
--lr=$LR \
--num_workers=$NUM_WORKERS \
--num_epochs=$NUM_EPOCHS \
--eval_interval=$EVAL_INTERVAL \
--snapshot=$SNAPSHOT_INTERVAL \
--display_interval=$DISPLAY_INTERVAL \
--gpus=$gpus