In [1]:
import os

import torch
import nvtabular as nvt
from nvtabular import dataset as ds
from merlin_models import tf as mtf

In [2]:
# Tag the raw id columns as user / item features.
user_id = nvt.ColumnGroup(["userId"], tags=nvt.Tag.USER)
item_id = nvt.ColumnGroup(["movieId"], tags=nvt.Tag.ITEM)

# Concatenate both column groups first, then pipe the combined group through
# Categorify (`+` binds tighter than `>>`; the parentheses just make it explicit).
cat_features = (user_id + item_id) >> nvt.ops.Categorify()

# Show the resulting columns together with the tags attached to each.
cat_features.columns

[Column(name='userId', tags=['user', 'categorical'], properties={}),
 Column(name='movieId', tags=['item', 'categorical'], properties={})]

In [3]:
# Display the tags attached to each column as a {column name: [tags]} mapping.
cat_features.tags_by_column()

{'userId': ['user', 'categorical'], 'movieId': ['item', 'categorical']}

In [4]:
# Root directory of the MovieLens data. Set the MOVIELENS_DATA_DIR environment
# variable to point somewhere else; the fallback is the path this notebook was
# originally authored with, so behaviour is unchanged when the variable is unset.
MOVIELENS_DATA_DIR = os.environ.get("MOVIELENS_DATA_DIR", "/romeyn/data")

dataset = ds.MovieLens(MOVIELENS_DATA_DIR)

# `data` is used by later cells (`data.get_dir("train")` / `data.get_dir("eval")`)
# to locate the per-split directories.
data = dataset.transform()

# NOTE(review): statistics are requested with transformed=False, i.e. presumably
# for the raw (pre-transform) data, and split_names="splits" is passed as a plain
# string — confirm both against the dataset API.
stats = dataset.statistics(transformed=False, split_names="splits")

stats.display_overview()

In [3]:
# Display the schema reported for the transformed dataset.
# NOTE(review): the captured output shows train=None and eval=None — confirm
# whether a schema is expected to be populated at this point.
dataset.schema(transformed=True)

namespace(train=None, eval=None)

In [3]:
# Loader settings shared by the train and eval splits so both are batched the
# same way. `targets` names the label column; presumably `separate_labels`
# makes the loader yield it apart from the features — TODO confirm.
dataloader_kwargs = {
    "batch_size": 20000,
    "separate_labels": True,
    "targets": ["rating_binary"],
}

# One loader per split, each reading from the directory reported by `data`.
train_data = mtf.DataLoader.from_directory(data.get_dir("train"), **dataloader_kwargs)
eval_data = mtf.DataLoader.from_directory(data.get_dir("eval"), **dataloader_kwargs)

# Display the per-column batch shapes produced by the training loader.
train_data.output_shapes

{'movieId': TensorShape([20000, 1]),
 'rating': TensorShape([20000, 1]),
 'userId': TensorShape([20000, 1]),
 'genres': TensorShape([20000, 10]),
 'rating_binary': TensorShape([20000, 1])}

In [4]:
# Input block built from the training loader's columns; per-feature outputs are
# merged with the "concat" aggregation.
inputs = mtf.TabularFeatures.from_column_group(train_data.columns, aggregation="concat")

# MLP with layer sizes 512 and 256, chained after the input block.
mlp = mtf.MLPBlock([512, 256])
block = inputs >> mlp

# Prediction head derived from the same column group, then the full model,
# compiled with the Adam optimizer.
head = mtf.Head.from_column_group(train_data.columns)
model = block.to_model(head, optimizer="adam")

# Display the assembled model structure.
model

SequentialBlockWithHead(
  (block): SequentialBlock(
    (layers): List(
      (0): TabularFeatures(
        (to_merge): List(
          (0): EmbeddingFeatures(
            (convert_to_sparse): AsSparseFeatures()
            (embeddings): Dict(
              (movieId): TableConfig(vocabulary_size=56585, dim=512, initializer=None, optimizer=None, combiner='mean', name='movieId')
              (userId): TableConfig(vocabulary_size=162541, dim=512, initializer=None, optimizer=None, combiner='mean', name='userId')
              (genres): TableConfig(vocabulary_size=20, dim=16, initializer=None, optimizer=None, combiner='mean', name='genres')
            )
          )
        )
        (aggregation): ConcatFeatures()
      )
      (1): Dense(512, activation=relu, use_bias=True)
      (2): Dense(256, activation=relu, use_bias=True)
    )
  )
  (head): Head(
    (tasks): Dict(
      (rating_binary): PredictionTask(
        (pre): Dense(1, activation=sigmoid, use_bias=True)
        (eval_metri

In [5]:
# Train on the training loader for 3 epochs.
# NOTE(review): `eval_data` is built in an earlier cell but is not used in the
# visible cells, so this run reports no validation metrics — consider passing it.
model.fit(train_data, epochs=3)

Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f33e9403790>