In [16]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [17]:
train_filename = "../data/samples/train_transactions_rich.tfrecord"
train = tf.data.TFRecordDataset(train_filename)

test_filename = "../data/samples/test_transactions_rich.tfrecord"
test = tf.data.TFRecordDataset(test_filename)

feature_description = {
    'context_item_id': tf.io.FixedLenFeature([10], tf.int64, default_value=np.repeat(0, 10)),    
    'context_item_price': tf.io.FixedLenFeature([10], tf.float32, default_value=np.repeat(0, 10)),
    'context_item_discount': tf.io.FixedLenFeature([10], tf.float32, default_value=np.repeat(0, 10)),
    'context_item_description': tf.io.FixedLenFeature([10], tf.string, default_value=np.repeat('None', 10)), 
    'label_item_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
}

def _parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, feature_description)
def _map_function(x):
  return {
    "context_item_id": tf.strings.as_string(x["context_item_id"]),
    "context_item_description": x["context_item_description"],
    "context_item_price": x["context_item_price"],
    "context_item_discount": x["context_item_discount"],
    "label_item_id": tf.strings.as_string(x["label_item_id"])}
    
train_ds = train.map(_parse_function).map(_map_function)
test_ds = test.map(_parse_function).map(_map_function)

In [18]:
items_filename = "../data/samples/items.tfrecord"
items_tf = tf.data.TFRecordDataset(items_filename)
item_feature_description = {
    'item_id': tf.io.FixedLenFeature([1], tf.int64, default_value=0),
    'item_fullprice' : tf.io.FixedLenFeature([1], tf.float32, default_value=0),
    'item_description': tf.io.FixedLenFeature([1], tf.string, default_value='None')}
def item_parse_function(example_proto):
  return tf.io.parse_single_example(example_proto, item_feature_description)

items_ds = items_tf.map(item_parse_function).map(lambda x: {
    "item_id": tf.strings.as_string(x["item_id"]),
    "item_description": x["item_description"],
})

item_ids = items_ds.map(lambda x: x["item_id"]).batch(1_000)
unique_item_ids = np.unique(np.concatenate(list(item_ids)))
item_descriptions = items_ds.map(lambda x: x["item_description"]).batch(1_000)
unique_item_descriptions = np.unique(np.concatenate(list(item_descriptions)))

In [19]:
class QueryItemModel(tf.keras.Model):
  embedding_dimension = 32
  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.item_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(vocabulary=unique_item_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_item_ids) + 1, self.embedding_dimension),
      tf.keras.layers.GRU(self.embedding_dimension),
    ])

    self.description_text_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(max_tokens=max_tokens),
      tf.keras.layers.Embedding(max_tokens, self.embedding_dimension, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])
    self.description_vectorizer.adapt(unique_item_descriptions)

  def call(self, features):
    return tf.concat([
        self.item_embedding(features["context_item_id"]),
        self.description_text_embedding(features["context_item_description"]),
    ], axis=1)

In [20]:

class RetrievalModel(tfrs.Model):
    embedding_dimension = 32
    def __init__(self):
        super().__init__()
        self._query_model = QueryItemModel()
        self._candidate_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_item_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_item_ids) + 1, self.embedding_dimension)
            ])
        metrics = tfrs.metrics.FactorizedTopK(candidates=items_tf.batch(128).map(self._candidate_model))
        self._task = tfrs.tasks.Retrieval(metrics=metrics)

    def compute_loss(self, features, training=False):
        item_history = {
            "context_item_id": features["context_item_id"],
            "context_item_description": features["context_item_description"]}   
        next_item_label = features["label_item_id"]

        query_embedding = self._query_model(item_history)       
        candidate_embedding = self._candidate_model(next_item_label)

        return self._task(query_embedding, candidate_embedding, compute_metrics=not training)

In [21]:
model = RetrievalModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [22]:
cached_train = train_ds.shuffle(10_000).batch(12800).cache()
cached_test = test_ds.batch(2560).cache()

In [23]:
model.fit(cached_train, epochs=3)

Epoch 1/3


ValueError: in user code:

    File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\tensorflow_recommenders\models\base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "C:\Users\anand.HOME\AppData\Local\Temp\ipykernel_13424\2786634238.py", line 19, in compute_loss
        query_embedding = self._query_model(item_history)
    File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ANAND~1.HOM\AppData\Local\Temp\__autograph_generated_file_a217p3d.py", line 12, in tf__call
        retval_ = ag__.converted_call(ag__.ld(tf).concat, ([ag__.converted_call(ag__.ld(self).item_embedding, (ag__.ld(features)['context_item_id'],), None, fscope), ag__.converted_call(ag__.ld(self).description_text_embedding, (ag__.ld(features)['context_item_description'],), None, fscope)],), dict(axis=1), fscope)

    ValueError: Exception encountered when calling layer "query_item_model_2" (type QueryItemModel).
    
    in user code:
    
        File "C:\Users\anand.HOME\AppData\Local\Temp\ipykernel_13424\1242551685.py", line 27, in call  *
            self.description_text_embedding(features["context_item_description"]),
        File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "c:\Users\anand.HOME\Anaconda3\envs\tf\lib\site-packages\keras\layers\preprocessing\text_vectorization.py", line 521, in _preprocess
            raise ValueError(
    
        ValueError: Exception encountered when calling layer "text_vectorization_2" (type TextVectorization).
        
        When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, 10) with rank=2
        
        Call arguments received by layer "text_vectorization_2" (type TextVectorization):
          • inputs=tf.Tensor(shape=(None, 10), dtype=string)
    
    
    Call arguments received by layer "query_item_model_2" (type QueryItemModel):
      • features={'context_item_id': 'tf.Tensor(shape=(None, 10), dtype=string)', 'context_item_description': 'tf.Tensor(shape=(None, 10), dtype=string)'}


In [None]:
model.evaluate(cached_test, return_dict=True)

In [None]:
for x in items_ds.take(5).as_numpy_iterator():
  pprint.pprint(x)

In [None]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model._query_model)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((items.batch(100), items.batch(100).map(model._candidate_model)))
)
_ = index(tf.constant(['139716','35287','142953','132041','','','','','','',],shape=(1,10,1)))



In [None]:
foo, titles = index(tf.constant(['139716','35287','142953','132041','','','','','','',],shape=(1,10,1)))
print(foo)
print(titles)

In [None]:
path = '../data/retrieval_model'
tf.saved_model.save(index, path)

In [None]:
foo, titles = loaded(tf.constant(['139716','35287','142953','132041','','','','','','',],shape=(1,10,1)))

In [None]:
print (titles)