To run this POC:
1. Download and uncompress the preprocessed TenRec dataset: https://drive.google.com/file/d/1OW6zIk2jUOyYiugr4fNDOCiYF86Nz4-O/view?usp=sharing

2. Run a Merlin TensorFlow Docker container (adjust the host paths in the `-v` mounts to match your machine)
```bash
docker run --runtime=nvidia --rm -it --ipc=host --cap-add SYS_NICE -v /home/gmoreira/projects/nvidia/nvidia_merlin/:/merlin_dev/ -v /mnt/nvme0n1/datasets:/data -p 8888:8888 nvcr.io/nvidia/merlin/merlin-tensorflow:23.06 /bin/bash
```

3. Pull the latest code from the `main` branch of the models repository and install it with pip
```bash
cd /models
git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && git fetch && git checkout main && pip install . --no-deps
```

4. Start Jupyter notebook
```bash
jupyter notebook --no-browser --ip 0.0.0.0 --allow-root
```

In [1]:
import os
from tensorflow.keras import regularizers

2023-07-06 03:30:22.183795: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-06 03:30:22.220282: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Restrict CUDA to device index 0 for this process.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

In [3]:
import tensorflow as tf

In [4]:
import merlin.models.tf as mm
from merlin.dataloader.tensorflow import Loader
from merlin.io.dataset import Dataset
from merlin.schema import ColumnSchema



  warn(f"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}")


[INFO]: sparse_operation_kit is imported
[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.2.0-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so
[SOK INFO] Import /usr/local/lib/python3.8/dist-packages/merlin_sok-1.2.0-py3.8-linux-x86_64.egg/sparse_operation_kit/lib/libsok_experiment.so
[SOK INFO] Initialize finished, communication tool: horovod


2023-07-06 03:30:27.999511: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-07-06 03:30:27.999552: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16254 MB memory:  -> device: 0, name: Quadro GV100, pci bus id: 0000:15:00.0, compute capability: 7.0
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
import wandb

In [6]:
# Hyperparameters for this run; the same mapping is passed to wandb.init().
config = {
    "epochs": 1,
    "batch_size": 16384,
    "embedding_dim": 64,
    "bottom_mlp": [64],
    "top_mlp": [256, 128, 64],
    "dropout": 0.18033334331720113,
    "l2_reg": 3.5665386015190466e-05,
    "LR": 0.001,
    # "LR_decay_factor": 0.98,
    # "LR_decay_steps": 100,
    "positive_class_weight": 9.0,
}

In [7]:
# Start a Weights & Biases run and record the hyperparameters in `config` with it.
wandb.init(config=config)

[34m[1mwandb[0m: Currently logged in as: [33mgspmoreira[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [8]:
# Keras callback that reports training metrics to the active W&B run.
# log_batch_frequency=50 asks for batch-level logging every 50 batches.
wandb_callback = wandb.keras.WandbCallback(
    log_batch_frequency=50,
)



In [9]:
# Root folder of the preprocessed TenRec ranking data. The default is the path
# used inside the container; set TENREC_RANKING_DIR to read from another location.
TENREC_RANKING_DIR = os.environ.get(
    "TENREC_RANKING_DIR", "/data/tenrec/tenrec_preproc_v01/ranking"
).rstrip("/")

train_dataset = Dataset(f"{TENREC_RANKING_DIR}/train/*.parquet")
valid_dataset = Dataset(f"{TENREC_RANKING_DIR}/valid/*.parquet")

# Drop the columns this click model does not use.
# NOTE(review): these look like the other engagement targets — confirm against the preprocessing.
schema = train_dataset.schema.excluding_by_name(
    ["follow", "like", "share", "watching_times"]
)

In [10]:
def _regularized_mlp(hidden_units):
    """Return an MLPBlock using the run's L2 penalty (kernel and bias) and dropout rate."""
    l2_strength = config["l2_reg"]
    return mm.MLPBlock(
        hidden_units,
        kernel_regularizer=regularizers.l2(l2_strength),
        bias_regularizer=regularizers.l2(l2_strength),
        dropout=config["dropout"],
    )


# Per-sample weights are derived from the "click" column itself.
# NOTE(review): binary_class_weights is presumably (negative, positive) — confirm.
click_sample_weight = mm.ColumnBasedSampleWeight(
    weight_column_name="click",
    binary_class_weights=(1.0, config["positive_class_weight"]),
)
output_block = mm.BinaryOutput("click", post=click_sample_weight)

# DLRM with identically regularized bottom and top MLPs and a single binary "click" output.
model = mm.DLRMModel(
    schema,
    embedding_dim=config["embedding_dim"],
    bottom_block=_regularized_mlp(config["bottom_mlp"]),
    top_block=_regularized_mlp(config["top_mlp"]),
    prediction_tasks=output_block,
)

In [11]:
# Display the model's repr to inspect the nested block structure.
model

Model(
  (_should_compute_train_metrics_for_batch): <tf.Variable 'should_compute_train_metrics_for_batch:0' shape=() dtype=bool, numpy=True>
  (blocks): _TupleWrapper((SequentialBlock(
    (layers): List(
      (0): SequentialBlock(
        (layers): List(
          (0): ParallelBlock(
            (parallel_layers): Dict(
              (user_id): EmbeddingTable(
                (features): Dict(
                  (user_id): ColumnSchema(name='user_id', tags={<Tags.USER: 'user'>, <Tags.CATEGORICAL: 'categorical'>, <Tags.ID: 'id'>}, properties={'freq_threshold': 0.0, 'max_size': 0.0, 'num_buckets': None, 'start_index': 1.0, 'cat_path': './/categories/unique.user_id.parquet', 'embedding_sizes': {'cardinality': 2633851.0, 'dimension': 512.0}, 'domain': {'min': 0, 'max': 2633851, 'name': 'user_id'}}, dtype=DType(name='int32', element_type=<ElementType.Int: 'int'>, element_size=32, element_unit=None, signed=True, shape=Shape(dims=None)), is_list=False, is_ragged=False)
                )
    

In [12]:
# Adam with the configured learning rate; eager execution is explicitly disabled.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=config["LR"],
)
model.compile(optimizer=optimizer, run_eagerly=False)

In [13]:
%%time
with Loader(train_dataset, batch_size=config["batch_size"]) as train_loader, \
     Loader(valid_dataset, batch_size=config["batch_size"]) as valid_loader:
     
    model.fit(
            train_dataset,            
            epochs=config["epochs"],
            #steps_per_epoch=1000,
            batch_size=config["batch_size"],            
            callbacks=[wandb_callback],      
            train_metrics_steps=50
            #validation_steps=100,
            #validation_data=valid_loader,
        )

2023-07-06 03:30:34.789777: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]
2023-07-06 03:30:40.543084: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x4a1694a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-06 03:30:40.543125: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): Quadro GV100, Compute Capability 7.0
2023-07-06 03:30:40.551028: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-06 03:30:40.926684: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900
2023-07-06 03:30:41.068496: I ./tensor

CPU times: user 7min 41s, sys: 38.6 s, total: 8min 20s
Wall time: 5min 34s


In [14]:
%%time
metrics = model.evaluate(valid_loader, return_dict=True)
metrics

2023-07-06 03:36:07.089212: I tensorflow/core/common_runtime/executor.cc:1209] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


CPU times: user 2.51 s, sys: 178 ms, total: 2.68 s
Wall time: 1.71 s


{'loss': 1.4807631969451904,
 'precision': 0.3457164764404297,
 'recall': 0.9655006527900696,
 'binary_accuracy': 0.4441780745983124,
 'auc': 0.752277672290802,
 'regularization_loss': 0.0008068884490057826,
 'loss_batch': 1.4292196035385132}