In [1]:
# Copyright 2022 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ================================

In [2]:
import os
import numpy as np

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ["TF_MEMORY_ALLOCATION"] = "0.3"  # fraction of free memory

from nvtabular.loader.tf_utils import configure_tensorflow

# NOTE(review): presumably this applies TF_MEMORY_ALLOCATION to TensorFlow's GPU
# configuration, which would explain why it runs before `import tensorflow` — confirm
configure_tensorflow()

import nvtabular as nvt
# The star import is what brings Categorify, TagAsUserID, TagAsItemID, TargetEncoding,
# Normalize, AddMetadata and Filter into scope for the workflow cell; none of them is
# imported by name anywhere else in this notebook.
from nvtabular.ops import *
from merlin.models.utils.example_utils import workflow_fit_transform, save_results

from merlin.schema.tags import Tags

import merlin.models.tf as mm
from merlin.io.dataset import Dataset

import tensorflow as tf

2022-10-20 07:45:04.202902: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-10-20 07:45:05.609926: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-20 07:45:07.669736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14742 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6
2022-10-20 07:45:07.670334: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created d

In [3]:
from merlin.datasets.synthetic import generate_data

# Root folder for the raw and processed data; override with the DATA_FOLDER env var
DATA_FOLDER = os.environ.get("DATA_FOLDER", "/raid/data/")

# Environment variables always arrive as strings, so normalize to int right away
# instead of carrying a value that is sometimes str and sometimes int.
NUM_ROWS = int(os.environ.get("NUM_ROWS", 10000000))

# Parse the flag explicitly rather than eval()-ing the environment: eval would run
# arbitrary code and raises on common spellings such as "true" or an empty string.
# Anything that is not a recognized "off" value enables synthetic data (the default).
SYNTHETIC_DATA = os.environ.get("SYNTHETIC_DATA", "True").strip().lower() not in {
    "",
    "0",
    "false",
    "no",
    "n",
    "off",
    "none",
}

if SYNTHETIC_DATA:
    # Generate the "aliccp-raw" synthetic dataset, split with set_sizes=(0.7, 0.3)
    train, valid = generate_data("aliccp-raw", NUM_ROWS, set_sizes=(0.7, 0.3))
    # save the datasets as parquet files
    train.to_ddf().to_parquet(os.path.join(DATA_FOLDER, "train"))
    valid.to_ddf().to_parquet(os.path.join(DATA_FOLDER, "valid"))



In [4]:
# Glob patterns for the raw train/valid parquet files, built the same way per split
train_path, valid_path = (
    os.path.join(DATA_FOLDER, split, "*.parquet") for split in ("train", "valid")
)
# Folder handed to workflow_fit_transform as its output location
output_path = os.path.join(DATA_FOLDER, "processed")

In [5]:
# ID columns: Categorify with freq_threshold=5, then tag as the user ID / item ID
user_id = ["user_id"] >> Categorify(freq_threshold=5) >> TagAsUserID()
item_id = ["item_id"] >> Categorify(freq_threshold=5) >> TagAsItemID()
# Remaining user-item and item columns go through Categorify with default settings
add_feat = [
    "user_item_categories",
    "user_item_shops",
    "user_item_brands",
    "user_item_intentions",
    "item_category",
    "item_shop",
    "item_brand",
] >> Categorify()

# Target-encode the user_id/item_id columns together with add_feat against "click",
# then normalize the result.
# `+` binds tighter than `>>`, so the column list is combined with add_feat first
# and the combined selection is what flows into TargetEncoding.
te_feat = (
    ["user_id", "item_id"] + add_feat
    >> TargetEncoding(["click"], kfold=1, p_smooth=20)
    >> Normalize()
)

# "click" is the label column: tagged as a binary-classification target
targets = ["click"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, "target"])

outputs = user_id + item_id + targets + add_feat + te_feat

# Keep only rows where BOTH item_id and user_id are non-zero,
# i.e. a row is dropped as soon as either ID equals 0.
# NOTE(review): presumably 0 is the index Categorify gives to IDs that fall below
# freq_threshold — confirm against the Categorify documentation
outputs = outputs >> Filter(f=lambda df: (df["item_id"] != 0) & (df["user_id"] != 0))

# NOTE(review): presumably fits the workflow on train_path and writes the transformed
# train/valid sets under output_path (a later cell reads output_path/train) — confirm
workflow_fit_transform(outputs, train_path, valid_path, output_path)

In [6]:
%%writefile './train.py'

import argparse
import os
import random
from pathlib import Path

# we can control how much memory to give tensorflow with this environment variable
# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise
# TF will have claimed all free GPU memory
os.environ["TF_MEMORY_ALLOCATION"] = "0.3"  # fraction of free memory

import cupy
import horovod.tensorflow as hvd
import numpy as np
import tensorflow as tf

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
from merlin.schema.tags import Tags
#from merlin.models.tf.distributed.backend import hvd

# Initialize Horovod first: everything below (and seed_fn/train) queries
# hvd.rank() / hvd.size()
hvd.init()

# Seed with system randomness (or a static seed)
# As written: Python, NumPy and TensorFlow get the same static seed (42) in every
# process, while CuPy is seeded with None.
# NOTE(review): TF_CUDNN_DETERMINISTIC is set to the worker's rank ("0", "1", ...),
# so its value differs between workers, and it is set after `import tensorflow`
# — confirm this is intended
os.environ["TF_CUDNN_DETERMINISTIC"] = str(hvd.rank())
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
cupy.random.seed(None)


def seed_fn():
    """
    Produce a dataloader shuffle seed that is identical on every worker.

    Each worker draws its own random fragment; the fragments are summed across
    all Horovod workers, so every worker ends up with the same value. Calling
    this again yields a fresh shuffle seed that is still consistent across
    workers.
    """
    # Cap each fragment at int32_max // world size so that the sum of all
    # fragments stays below the int32 maximum
    _, int32_max = tf.int32.limits
    fragment_cap = int32_max // hvd.size()

    # Draw this worker's fragment on the GPU and copy it back to the host
    local_fragment = cupy.random.randint(0, fragment_cap).get()

    # Sum the fragments from all Horovod workers
    combined = hvd.allreduce(
        tf.constant(local_fragment),
        name="shuffle_seed",
        op=hvd.mpi_ops.Sum,
    )

    return combined % fragment_cap


def train():
    """Fit a binary-classification DLRM model on the training set; rank 0 saves it.

    Takes no parameters: ``base_dir``, ``batch_size``, ``embedding_dim`` and
    ``learning_rate`` are read from the module-level ``args`` namespace.
    """
    base_dir = Path(args.base_dir)

    # Load the training parquet files, then rebuild the dataset with one dask
    # partition per Horovod process while keeping the original schema
    source = Dataset(base_dir / "train" / "*.parquet")
    partitioned_ddf = source.to_ddf().repartition(npartitions=hvd.size())
    dataset = Dataset(partitioned_ddf, schema=source.schema)

    loader_options = dict(
        schema=dataset.schema,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True,
        seed_fn=seed_fn,
    )
    train_loader = mm.Loader(dataset, **loader_options)

    # The first column tagged as a target is the label to predict
    label_name = dataset.schema.select_by_tag(Tags.TARGET).column_names[0]

    model = mm.DLRMModel(
        dataset.schema,
        embedding_dim=args.embedding_dim,
        bottom_block=mm.MLPBlock([128, 64]),
        top_block=mm.MLPBlock([128, 64, 32]),
        prediction_tasks=mm.BinaryClassificationTask(label_name),
    )

    optimizer = tf.keras.optimizers.Adagrad(learning_rate=args.learning_rate)
    model.compile(optimizer=optimizer, run_eagerly=False, metrics=[tf.keras.metrics.AUC()])

    model.fit(train_loader, batch_size=args.batch_size)

    # Every process runs fit, but only rank 0 writes the model to disk
    if hvd.rank() != 0:
        return
    model.save(base_dir)
    print(f"Training complete. Model saved to {base_dir}")


def parse_args(argv=None):
    """Parse the training script's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the script entry point
            relies on. Passing an explicit list allows calling this function
            from tests or other code without touching ``sys.argv``.

    Returns:
        argparse.Namespace with ``base_dir`` (str), ``batch_size`` (int),
        ``learning_rate`` (float) and ``embedding_dim`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_dir", type=str, default="./data", help="Input directory")
    parser.add_argument(
        "--batch_size", type=int, default=16 * 1024, help="Batch size for the data loader"
    )
    parser.add_argument(
        "--learning_rate", type=float, default=0.03, help="Optimizer learning rate"
    )
    parser.add_argument(
        "--embedding_dim", type=int, default=64, help="Embedding dimension of the model"
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    # `args` must stay a module-level name: train() takes no parameters and
    # reads `args` as a global.
    args = parse_args()
    train()

Overwriting ./train.py


In [7]:
%%writefile './hvd_wrapper.sh'

#!/bin/bash

# Pin the wrapped command to a single GPU by exporting CUDA_VISIBLE_DEVICES from
# the process's local rank, then replace this shell with that command.
# An already-set, non-empty CUDA_VISIBLE_DEVICES is left untouched.
if [ -z "${CUDA_VISIBLE_DEVICES:-}" ]; then
    # Get local process ID from OpenMPI, alternatively from SLURM, and finally
    # from the variable Horovod's own launcher sets for each worker
    if [ -n "${OMPI_COMM_WORLD_LOCAL_RANK:-}" ]; then
        LOCAL_RANK="${OMPI_COMM_WORLD_LOCAL_RANK}"
    elif [ -n "${SLURM_LOCALID:-}" ]; then
        LOCAL_RANK="${SLURM_LOCALID}"
    elif [ -n "${HOROVOD_LOCAL_RANK:-}" ]; then
        LOCAL_RANK="${HOROVOD_LOCAL_RANK}"
    fi

    # Only export when a rank was actually found: exporting an empty
    # CUDA_VISIBLE_DEVICES would hide every GPU from the wrapped command
    if [ -n "${LOCAL_RANK:-}" ]; then
        export CUDA_VISIBLE_DEVICES="${LOCAL_RANK}"
    fi
fi

exec "$@"

Overwriting ./hvd_wrapper.sh


In [8]:
# Launch 2 Horovod processes; each one runs train.py through hvd_wrapper.sh, with
# the notebook's output_path interpolated as --base_dir
! horovodrun -np 2 sh hvd_wrapper.sh python train.py --base_dir {output_path}

2022-10-20 07:45:24.934467: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[1,1]<stderr>:2022-10-20 07:45:27.343749: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[1,0]<stderr>:2022-10-20 07:45:27.343747: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
[1,1]<stderr>:2022-10-20 07:45:30.072011: I tensorflow/core/platform/cpu_feature_guard.cc:

In [9]:
# train.py saves the model directly into base_dir (= output_path), so the saved-model
# files are expected to appear next to the train/ and valid/ folders
print(os.listdir(output_path))

['saved_model.pb', 'keras_metadata.pb', 'assets', 'variables', 'valid', 'train']
