In [1]:
# Load the TensorBoard notebook extension so the %tensorboard magic used later is available.
%load_ext tensorboard

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import datetime

In [3]:
# Paths of the serialized user/item transaction records (train and test splits).
train_filename = "../data/samples/train_user_item_transactions.tfrecord"
test_filename = "../data/samples/test_user_item_transactions.tfrecord"

# Raw (still serialized) record streams; parsing happens in a later map step.
train = tf.data.TFRecordDataset(filenames=train_filename)
test = tf.data.TFRecordDataset(filenames=test_filename)
def _scalar(dtype, default):
  """Return a scalar (shape `[]`) FixedLenFeature spec with the given dtype and default."""
  return tf.io.FixedLenFeature([], dtype, default_value=default)


# Parsing schema for one serialized transaction record. Every field is a
# scalar; integer fields default to 0 and float fields default to 0.0.
feature_description = {
    'user_id': _scalar(tf.int64, 0),
    'item_id': _scalar(tf.int64, 0),
    'item_trantime': _scalar(tf.int64, 0),
    'item_price': _scalar(tf.float32, 0.0),
    'item_fullprice': _scalar(tf.float32, 0.0),
    'item_quantity': _scalar(tf.float32, 0.0),
    'department_id': _scalar(tf.int64, 0),
    'return_id': _scalar(tf.int64, 0),
    'discount_id': _scalar(tf.int64, 0),
}

In [4]:
def _parse_function(example_proto):
  """Decode one serialized transaction record into a dict of feature tensors.

  Args:
    example_proto: A serialized record, as yielded by the TFRecord datasets.

  Returns:
    A dict keyed by the names in `feature_description`.
  """
  parsed_record = tf.io.parse_single_example(
      serialized=example_proto,
      features=feature_description,
  )
  return parsed_record

def _select_interaction_ids(record):
  """Keep only the `user_id` and `item_id` fields of a parsed transaction."""
  return {
      'user_id': record['user_id'],
      'item_id': record['item_id'],
  }


# Both splits go through the same pipeline: parse, then drop every field
# except the two ids.
train_ds = train.map(_parse_function).map(_select_interaction_ids)
test_ds = test.map(_parse_function).map(_select_interaction_ids)

In [5]:
# Serialized item records; only the scalar `item_id` field is parsed from them.
items_filename = '../data/samples/items.tfrecord'
items_tf = tf.data.TFRecordDataset(filenames=items_filename)
item_feature_description = dict(
    item_id=tf.io.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
)

def item_parse_function(example_proto):
  """Decode one serialized item record using `item_feature_description`.

  Args:
    example_proto: A serialized record from the items TFRecord dataset.

  Returns:
    A dict keyed by the names in `item_feature_description`.
  """
  parsed_item = tf.io.parse_single_example(
      serialized=example_proto,
      features=item_feature_description,
  )
  return parsed_item

# Parsed item records, reduced to a dict holding just the id.
items_ds = items_tf.map(item_parse_function).map(
    lambda item: {'item_id': item['item_id']}
)

# Flat stream of item ids.
item_ids = items_ds.map(lambda item: item['item_id'])

# Materialize the ids in batches of 1000, then de-duplicate them into a
# sorted NumPy array.
item_id_batches = list(item_ids.batch(1000))
unique_item_ids = np.unique(np.concatenate(item_id_batches))

customers_filename = '../data/samples/customers.tfrecord'
customers_tf = tf.data.TFRecordDataset(filenames=customers_filename)

# (name, dtype, default) for every scalar field of a customer record.
_customer_fields = (
    ('user_id', tf.int64, 0),
    ('zip_code', tf.string, 'None'),
    ('total_visits', tf.int64, 0),
    ('total_sales', tf.float32, 0.0),
    ('total_savings', tf.float32, 0.0),
)
customer_feature_description = {
    field_name: tf.io.FixedLenFeature([], field_dtype, default_value=field_default)
    for field_name, field_dtype, field_default in _customer_fields
}

def customer_parse_function(example_proto):
  """Decode one serialized customer record using `customer_feature_description`.

  Args:
    example_proto: A serialized record from the customers TFRecord dataset.

  Returns:
    A dict keyed by the names in `customer_feature_description`.
  """
  parsed_customer = tf.io.parse_single_example(
      serialized=example_proto,
      features=customer_feature_description,
  )
  return parsed_customer

# Parsed customer records, reduced to a dict holding just the id.
customers_ds = customers_tf.map(customer_parse_function).map(
    lambda customer: {'user_id': customer['user_id']}
)

# Flat stream of customer ids.
customer_ids = customers_ds.map(lambda customer: customer['user_id'])

# Materialize the ids in batches of 1000, then de-duplicate them into a
# sorted NumPy array.
customer_id_batches = list(customer_ids.batch(1000))
unique_customer_ids = np.unique(np.concatenate(customer_id_batches))

In [6]:
embedding_dimension = 32

# User tower: map a raw user id to a vocabulary index, then to an embedding.
user_id_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=unique_customer_ids, mask_token=None)
# The embedding table has one more row than the vocabulary so that the index
# used for unknown ids has an embedding too.
user_id_embedding = tf.keras.layers.Embedding(
    len(unique_customer_ids) + 1, embedding_dimension)

user_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

# Item tower: map a raw item id to a vocabulary index, then to an embedding.
item_id_lookup = tf.keras.layers.IntegerLookup(
    vocabulary=unique_item_ids, mask_token=None)
# One extra embedding row beyond the vocabulary size, for unknown ids.
item_id_embedding = tf.keras.layers.Embedding(
    len(unique_item_ids) + 1, embedding_dimension)

item_model = tf.keras.Sequential([item_id_lookup, item_id_embedding])

# Candidate set for the top-K metric: every item id, embedded by the item
# tower in batches of 12800.
candidate_embeddings = item_ids.batch(12800).map(item_model)

metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
task = tfrs.tasks.Retrieval(metrics=metrics)

class MovielensModel(tfrs.Model):
  """Retrieval model that pairs a user embedding model with an item embedding model.

  Args:
    user_model: Model applied to `features['user_id']` to produce user embeddings.
    item_model: Model applied to `features['item_id']` to produce item embeddings.
    retrieval_task: Optional task layer that is called with the user and item
      embeddings and returns the loss. When omitted, the notebook-level `task`
      object is used, which keeps existing two-argument calls working unchanged.
  """

  def __init__(self, user_model, item_model, retrieval_task=None):
    super().__init__()
    self.item_model: tf.keras.Model = item_model
    self.user_model: tf.keras.Model = user_model
    # Prefer an explicitly injected task so the model does not depend on which
    # notebook cells have been run; the global `task` is only looked up when
    # no task was passed in.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch of user/item interactions.

    Args:
      features: Dict containing at least the `user_id` and `item_id` tensors.
      training: Accepted for interface compatibility; it is not forwarded to
        the task, so the task runs the same way in training and evaluation.

    Returns:
      The value returned by the task for this batch.
    """
    # Embed the users of the batch with the user model.
    user_embeddings = self.user_model(features['user_id'])
    # Embed the items those users interacted with; these are the positives.
    positive_item_embeddings = self.item_model(features['item_id'])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_item_embeddings)

In [7]:
# Build the retrieval model from the two towers and train it with Adagrad.
model = MovielensModel(user_model, item_model)
adagrad_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad_optimizer)
# Each run gets its own timestamped sub-directory under ../logs/fit. The path
# components are joined explicitly: plain string concatenation without a
# separator would create sibling directories named "fit<timestamp>" instead.
log_dir = os.path.join("../logs/fit", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# Keras callback that writes TensorBoard logs to `log_dir`, with
# histogram_freq=1.
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [8]:
# Cache the parsed records BEFORE shuffling. A cache placed after shuffle()
# replays the first epoch's order and batch composition on every later epoch,
# so the data would effectively be shuffled only once. With the cache first,
# each epoch is reshuffled and re-batched from the cached records.
cached_train = train_ds.cache().shuffle(10_000).batch(6400)
# Evaluation data is not shuffled, so caching the finished batches is fine.
cached_test = test_ds.batch(2560).cache()

In [10]:
# Train for 36 epochs, validating on the test batches and logging to TensorBoard.
model.fit(
    cached_train,
    validation_data=cached_test,
    epochs=36,
    callbacks=[tensorboard_callback],
)

Epoch 1/36
Epoch 2/36
Epoch 3/36
Epoch 4/36
Epoch 5/36
Epoch 6/36
Epoch 7/36
Epoch 8/36
Epoch 9/36
Epoch 10/36
Epoch 11/36
Epoch 12/36
Epoch 13/36
Epoch 14/36
Epoch 15/36
Epoch 16/36
Epoch 17/36
Epoch 18/36
Epoch 19/36
Epoch 20/36
Epoch 21/36
Epoch 22/36
Epoch 23/36
Epoch 24/36
Epoch 25/36
Epoch 26/36

KeyboardInterrupt: 

In [11]:
%tensorboard --logdir logs/fit

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 140028.

In [None]:
# Evaluate on the test batches; return_dict=True yields a {metric name: value} dict.
model.evaluate(
    x=cached_test,
    return_dict=True,
)