In [1]:
# Standard library; `os` is used further down to set an environment variable.
import os

# Configure the root logger at INFO level so that library progress messages
# (e.g. the "INFO:nvtabular:..." lines) appear in the cell output.
import logging
logging.basicConfig(level=logging.INFO)

# Third-party data stack.
# NOTE(review): `cudf`, `dask_cudf` and `nvt` are not referenced by name in the
# cells visible here — confirm whether these imports are still required.
import cudf
import dask_cudf
import nvtabular as nvt
from nvtabular import dataset as ds

In [2]:
# Root directory for the Clothing Reviews data. The previously hard-coded path is
# kept as the default so existing runs behave exactly as before; set the
# CLOTHING_REVIEWS_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
data_dir = os.environ.get("CLOTHING_REVIEWS_DATA_DIR", "/romeyn/data")
dataset = ds.ClothingReviews(data_dir)

# Transformed dataset; later cells read its `train` and `eval` attributes.
data = dataset.transform(for_training=True)
# Statistics requested with transformed=False
# (presumably computed on the raw, untransformed columns — TODO confirm).
stats = dataset.statistics(transformed=False)

# Render the statistics overview as this cell's output.
stats.display_overview()

INFO:nvtabular:Preparing data...
INFO:numba.cuda.cudadrv.driver:init
INFO:nvtabular:Transforming dataset...
INFO:nvtabular:Fitting dataset...


['Division Name', 'Department Name', 'Class Name', 'Clothing ID']
['Positive Feedback Count', 'Age']
fit
['Positive Feedback Count', 'Age']
Department Name            object
Title                      object
Review Text                object
Class Name                 object
Age                         int64
Division Name              object
Recommended IND             int64
Rating                      int64
Positive Feedback Count     int64
Clothing ID                 int64
dtype: object
['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']


INFO:nvtabular:Transforming train...
INFO:nvtabular:Transforming eval...
INFO:nvtabular:Saving to /romeyn/data/clothing_reviews/transformed/ce06e0a35c2d346230f566b5402cc4a4_60ed76f95bf5241d36dcbabe253501c1
INFO:nvtabular:Saving dataset to parquet in: /romeyn/data/clothing_reviews/transformed/ce06e0a35c2d346230f566b5402cc4a4_60ed76f95bf5241d36dcbabe253501c1


['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']
['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']


INFO:nvtabular:Saving to /romeyn/data/clothing_reviews/transformed/fe3f7737b84fe8ff3216954796bc0408_60ed76f95bf5241d36dcbabe253501c1
INFO:nvtabular:Saving dataset to parquet in: /romeyn/data/clothing_reviews/transformed/fe3f7737b84fe8ff3216954796bc0408_60ed76f95bf5241d36dcbabe253501c1


['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']
Department Name              int64
Title                       object
Age                        float32
Class Name                   int64
Review Text                 object
Division Name                int64
Rating                       int64
Positive Feedback Count    float32
Clothing ID                  int64
Recommended                  int64
dtype: object
['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']
['Recommended', 'Rating', 'Positive Feedback Count', 'Age', 'Division Name', 'Department Name', 'Class Name', 'Clothing ID', 'Title', 'Review Text']
Department Name              int64
Title                       object
Review Text                 object
Age                        float32
Class Name                   int64
Division Name 

INFO:nvtabular:Preparing data...


['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']
Department Name            object
Title                      object
Review Text                object
Age                         int64
Class Name                 object
Division Name              object
Recommended IND             int64
Rating                      int64
Positive Feedback Count     int64
Unnamed: 0                  int64
Clothing ID                 int64
dtype: object
['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']
['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating', 'Recommended IND', 'Positive Feedback Count', 'Division Name', 'Department Name', 'Class Name']
Department Name            object
Title                      object
Review Text                object
Age      

In [3]:
# Set before TensorFlow / merlin_models are imported.
# NOTE(review): presumably read during import or loader initialisation to limit the
# fraction of GPU memory TensorFlow may claim (0.7 = 70%) — TODO confirm against the
# library docs. Keep this assignment above the imports below.
os.environ["TF_MEMORY_ALLOCATION"] = "0.7"

# These imports appear to intentionally follow the environment tweak above rather
# than living in the top import cell.
import tensorflow as tf
from merlin_models import tf as mtf

INFO:numba.cuda.cudadrv.driver:init


In [4]:
# Rows per batch, shared by the training and evaluation loaders.
batch_size = 10_000

# Build one loader per split from the transformed dataset's train/eval locations,
# in that order, using identical settings for both.
train_data, eval_data = (
    mtf.DataLoader.from_directory(split_location, batch_size=batch_size)
    for split_location in (data.train, data.eval)
)

In [5]:
# Display the schema attached to the training loader: one entry per feature with
# its name, type, value domain (min/max) and annotation tags.
train_data.schema

feature {
  name: "Recommended"
  type: INT
  int_domain {
    name: "Recommended"
    min: 0
    max: 1
    is_categorical: false
  }
  annotation {
    tag: "binary"
    tag: "target"
  }
}
feature {
  name: "Rating"
  type: INT
  int_domain {
    name: "Rating"
    min: 1
    max: 5
    is_categorical: false
  }
  annotation {
    tag: "regression"
    tag: "target"
  }
}
feature {
  name: "Positive Feedback Count"
  type: FLOAT
  float_domain {
    name: "Positive Feedback Count"
    min: -0.4472583
    max: 20.985716
  }
  annotation {
    tag: "continuous"
  }
}
feature {
  name: "Age"
  type: FLOAT
  float_domain {
    name: "Age"
    min: -2.0518227
    max: 4.54904
  }
  annotation {
    tag: "continuous"
  }
}
feature {
  name: "Division Name"
  type: INT
  int_domain {
    name: "Division Name"
    min: 0
    max: 3
    is_categorical: true
  }
  annotation {
    tag: "categorical"
  }
}
feature {
  name: "Department Name"
  type: INT
  int_domain {
    name: "Department Nam

In [6]:
# Display the embedding sizes reported for the loader's columns: a mapping from
# categorical column name to a pair of ints
# (presumably (cardinality, embedding dimension) — TODO confirm).
train_data.columns.embedding_sizes()

{'Division Name': (3, 16),
 'Department Name': (6, 16),
 'Class Name': (20, 16),
 'Clothing ID': (1150, 83)}

In [7]:
# Two MLP blocks handed to the DLRM block below; the lists are presumably the
# per-layer output widths — TODO confirm against MLPBlock.
bottom_mlp = mtf.MLPBlock([64, 16])
top_mlp = mtf.MLPBlock([256, 128, 64])
# Build the DLRM block from the training loader's column group.
dlrm_block = mtf.DLRMBlock.from_column_group(train_data.columns, bottom_mlp, top_mlp=top_mlp)

# Prediction head derived from the same column group, then wrap block + head
# into a trainable model using the Adam optimizer.
# NOTE(review): run_eagerly=True is presumably forwarded to Keras compile(); eager
# execution is normally a debugging setting and slower than graph mode — confirm
# it is intended to stay on.
head = mtf.Head.from_column_group(train_data.columns)
model = dlrm_block.to_model(head=head, optimizer="adam", run_eagerly=True)

# Bare expression: displays the model's repr as the cell output.
model

DLRMBlockWithHead(
  (block): DLRMBlock(
    (stack_features): MergeTabular(
      (to_merge): List(
        (0): EmbeddingFeatures(
          (convert_to_sparse): AsSparseFeatures()
          (embeddings): Dict(
            (Division Name): TableConfig(vocabulary_size=3, dim=16, initializer=None, optimizer=None, combiner='mean', name='Division Name')
            (Department Name): TableConfig(vocabulary_size=6, dim=16, initializer=None, optimizer=None, combiner='mean', name='Department Name')
            (Class Name): TableConfig(vocabulary_size=20, dim=16, initializer=None, optimizer=None, combiner='mean', name='Class Name')
            (Clothing ID): TableConfig(vocabulary_size=1150, dim=16, initializer=None, optimizer=None, combiner='mean', name='Clothing ID')
          )
        )
        (1): ContinuousEmbedding(
          (layers): List(
            (0): ContinuousFeatures(
              Age, Positive Feedback Count
              (aggregation): ConcatFeatures()
            )
   

In [8]:
# Train for 10 epochs on the training loader.
# NOTE(review): the return value (a Keras History object, per the displayed repr)
# is not bound to a name, so per-epoch metrics cannot be inspected afterwards.
# NOTE(review): `eval_data` is built in an earlier cell but is not passed here, so
# no validation metrics are produced — confirm whether that is intended.
model.fit(train_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f97929a2460>

In [7]:
# Take the first element of what `train_data.head()` returns and keep it as the
# model input for the next cell
# (presumably head() yields a (features, targets) pair for one batch — TODO confirm).
features = train_data.head()[0]

In [12]:
# Call the model directly on the sampled features and display the result: a dict
# with one prediction tensor per target ('Recommended' and 'Rating').
model(features)

{'Recommended': <tf.Tensor: shape=(10000, 1), dtype=float32, numpy=
 array([[0.49565056],
        [0.48492843],
        [0.5033961 ],
        ...,
        [0.49643537],
        [0.49919438],
        [0.50095326]], dtype=float32)>,
 'Rating': <tf.Tensor: shape=(10000, 1), dtype=float32, numpy=
 array([[0.00747207],
        [0.01251686],
        [0.01540815],
        ...,
        [0.01885306],
        [0.0091423 ],
        [0.02178757]], dtype=float32)>}