In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [4]:
train_filename = "./data/samples/train_transactions_gt2018.tfrecord"
train = tf.data.TFRecordDataset(train_filename)

test_filename = "./data/samples/test_transactions_gt2018.tfrecord"
test = tf.data.TFRecordDataset(test_filename)

feature_description = {
    'context_item_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),
    'context_item_quantity': tf.io.FixedLenFeature([10], tf.float32, default_value=np.repeat(0, 10)),
    'context_item_price': tf.io.FixedLenFeature([10], tf.float32, default_value=np.repeat(0, 10)),
    'context_department_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),
    'context_discount_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),
    'context_return_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),   
    'label_item_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
}

In [5]:
def _parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, feature_description)

train_ds = train.map(_parse_function).map(lambda x: {
    "context_item_id": tf.strings.as_string(x["context_item_id"]),
    "label_item_id": tf.strings.as_string(x["label_item_id"])
})

test_ds = test.map(_parse_function).map(lambda x: {
    "context_item_id": tf.strings.as_string(x["context_item_id"]),
    "label_item_id": tf.strings.as_string(x["label_item_id"])
})

In [7]:
for x in train_ds.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'context_item_id': array([b'51484', b'14856', b'30961', b'30961', b'114343', b'51484',
       b'88698', b'96369', b'96382', b'51484'], dtype=object),
 'label_item_id': array([b'31287'], dtype=object)}


In [8]:
items_filename = "./web/data/samples/items.tfrecord"
items_tf = tf.data.TFRecordDataset(items_filename)
item_feature_description = {
    'item_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
    'item_description': tf.io.FixedLenFeature([1], tf.string, default_value='None')}
def item_parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, item_feature_description)

items_ds = items_tf.map(item_parse_function).map(lambda x: {
    "item_id": tf.strings.as_string(x["item_id"]),
    "item_description": x["item_description"],
})
items = items_ds.map(lambda x: x["item_id"])
item_ids = items.batch(1_000)
unique_item_ids = np.unique(np.concatenate(list(item_ids)))

In [9]:
embedding_dimension = 32

query_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
      vocabulary=unique_item_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_item_ids) + 1, embedding_dimension), 
    tf.keras.layers.GRU(embedding_dimension),
])

candidate_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_item_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_item_ids) + 1, embedding_dimension)
])

In [10]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=items_tf.batch(128).map(candidate_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

class MyModel(tfrs.Model):

    def __init__(self, query_model, candidate_model):
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model

        self._task = task

    def compute_loss(self, features, training=False):
        watch_history = features["context_item_id"]
        watch_next_label = features["label_item_id"]

        query_embedding = self._query_model(watch_history)       
        candidate_embedding = self._candidate_model(watch_next_label)

        return self._task(query_embedding, candidate_embedding, compute_metrics=not training)

In [11]:
model = MyModel(query_model, candidate_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [12]:
cached_train = train_ds.shuffle(10_000).batch(12800).cache()
cached_test = test_ds.batch(2560).cache()

In [13]:
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x235f22daca0>

In [14]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.8845945000648499,
 'factorized_top_k/top_5_categorical_accuracy': 0.8845945000648499,
 'factorized_top_k/top_10_categorical_accuracy': 0.8845945000648499,
 'factorized_top_k/top_50_categorical_accuracy': 0.8845945000648499,
 'factorized_top_k/top_100_categorical_accuracy': 0.8845945000648499,
 'loss': 412.1374206542969,
 'regularization_loss': 0,
 'total_loss': 412.1374206542969}

In [16]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model._query_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((items.batch(100), items.batch(100).map(model._candidate_model)))
)
path = './web/data/samples/saved_model'
tf.saved_model.save(index, path)






INFO:tensorflow:Assets written to: ./web/data/samples/saved_model\assets


INFO:tensorflow:Assets written to: ./web/data/samples/saved_model\assets


In [18]:
foo, titles = index(tf.constant(['139716','35287','142953','132041','','','','','','',],shape=(1,10,1)))

In [19]:
print(foo)
print(titles)

tf.Tensor(
[[3.259811  3.1711054 3.159664  3.1514008 3.115191  3.0871491 3.0802324
  3.0752077 3.0517814 3.0282812]], shape=(1, 10), dtype=float32)
tf.Tensor(
[[[b'29822']
  [b'145973']
  [b'132418']
  [b'114617']
  [b'73940']
  [b'99887']
  [b'84706']
  [b'126377']
  [b'31124']
  [b'151992']]], shape=(1, 10, 1), dtype=string)


In [116]:
foo, titles = index(tf.constant(['95801','91520','37133','57218','','','','','','',],shape=(1,10,1)))
print(foo)
print(titles)

tf.Tensor(
[[3.3212721 3.2121694 3.134917  3.1190243 3.1052196 3.0620832 3.0236504
  3.0063984 2.9986353 2.9633884]], shape=(1, 10), dtype=float32)
tf.Tensor(
[[[b'73343']
  [b'151931']
  [b'133325']
  [b'49788']
  [b'76912']
  [b'145911']
  [b'107808']
  [b'28549']
  [b'128225']
  [b'133326']]], shape=(1, 10, 1), dtype=string)


In [40]:
titles.numpy().flatten().tolist()[:5]

[b'29822', b'145973', b'132418', b'114617', b'73940']

In [125]:
tf.keras.models.save_model(model, 'data/save2')





ValueError: Model <__main__.MyModel object at 0x000002A7D0578280> cannot be saved either because the input shape is not available or because the forward pass of the model is not defined.To define a forward pass, please override `Model.call()`. To specify an input shape, either call `build(input_shape)` directly, or call the model on actual data using `Model()`, `Model.fit()`, or `Model.predict()`. If you have a custom training step, please make sure to invoke the forward pass in train step through `Model.__call__`, i.e. `model(inputs)`, as opposed to `model.call()`.