In [None]:
# recommendation_trainer.py
import csv
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# minimum rating a (user, book) row needs in order to be kept for training
rating_threshold = 5
# how many isbns make up one input of the query tower
n_isbns_in = 3

# every file this notebook reads or writes lives under base_dir
base_dir = '/data/book_recommender'
# the '{}' placeholder is filled in later through str.format
weights_dir = f'{base_dir}/{{}}-weights'
index_model_dir = f'{base_dir}/model'
tfjs_model_dir = f'{base_dir}/tfjs-model'

# create a list of all distinct isbn values (first column of books.csv) and,
# at the same time, a lookup dictionary from isbn to its index in that list
#
# Duplicated isbns are collapsed onto their first occurrence so the indexes
# stay dense (0 .. n_isbns-1). Without this, a file with repeated isbns would
# produce indexes larger than n_isbns, which is later used to size the
# embedding table.
isbns = []
isbn_idxs = {}
with open(base_dir + '/books.csv', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row; tolerate an empty file
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines
            continue
        isbn = line[0]
        if isbn in isbn_idxs:
            continue
        isbn_idxs[isbn] = len(isbns)
        isbns.append(isbn)

# persist the lookup; csv.writer quotes values that contain a comma or a quote
with open(base_dir + '/isbn_idxs.csv', 'w', newline='') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(['isbn', 'index'])
    writer.writerows(isbn_idxs.items())
n_isbns = len(isbn_idxs)

# create a dictionary of all isbns that a user rates >= the rating_threshold
# maps user_id (as read from the csv) -> list of isbn indexes (values of isbn_idxs)
# columns read from ratings.csv: 0 = user id, 1 = isbn, 2 = rating
user_isbns = {}
with open(base_dir + '/ratings.csv', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row; tolerate an empty file
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines
            continue
        rating = int(line[2])
        if rating < rating_threshold:
            continue
        user_id = line[0]
        isbn = line[1]
        # ratings for isbns that are missing from the lookup are ignored;
        # an explicit lookup avoids a bare except that would hide other errors
        isbn_idx = isbn_idxs.get(isbn)
        if isbn_idx is None:
            continue
        user_isbns.setdefault(user_id, []).append(isbn_idx)

# keep only the users that have at least n_isbns_in qualifying isbns
user_isbns = {
    user_id: liked_idxs
    for user_id, liked_idxs in user_isbns.items()
    if len(liked_idxs) >= n_isbns_in
}
user_idxs = {user_id: idx for idx, user_id in enumerate(user_isbns)}
# persist the lookup; csv.writer quotes values that contain a comma or a quote
with open(base_dir + '/user_idxs.csv', 'w', newline='') as file:
    writer = csv.writer(file, lineterminator='\n')
    writer.writerow(['user_id', 'index'])
    writer.writerows(user_idxs.items())
n_users = len(user_idxs)

# create a dictionary of inputs ('isbns') and outputs ('user')
samples = {'isbns': [], 'user': []}
# unseeded on purpose: every run draws a fresh set of samples
rng = np.random.default_rng()
# the loop variable is deliberately NOT called `isbns`, so the global list of
# all isbn values is left untouched
for user_id, liked_isbn_idxs in user_isbns.items():
    # convert once per user instead of once per draw
    pool = np.asarray(liked_isbn_idxs)
    # use 5x the number of isbns gathered for the user
    # this ensures a larger amount of training data
    for _ in range(len(pool) * 5):
        # randomly select n_isbns_in entries from the user's isbns;
        # replace=False keeps one entry from being drawn twice for the same
        # query (every user kept in user_isbns has at least n_isbns_in entries)
        selected_isbns = rng.choice(pool, size=n_isbns_in, replace=False)
        # add them to the inputs
        samples['isbns'].append(selected_isbns)
        # add the user to the output
        samples['user'].append(user_idxs[user_id])

# create a permutation to randomly shuffle all of the above data
# a permutation is created first so the same order can be applied
# to the inputs and the outputs
permutation = np.random.permutation(len(samples['isbns']))
# apply the above permutation to the inputs and the outputs,
# then convert the result to a TF Dataset
dataset = tf.data.Dataset.from_tensor_slices({
    'isbns': np.asarray(samples['isbns'])[permutation],
    'user': np.asarray(samples['user'])[permutation]
})

# print an example of the data in the dataset
for d in dataset.take(1):
    print(d)

In [None]:
# create the query and candidate models
n_embedding_dimensions = 24

## QUERY
# input: n_isbns_in isbn indexes per example
# (shape must be a tuple: `(n,)` - `(n)` is just the integer n)
isbns_in_in = tf.keras.Input(shape=(n_isbns_in,))
isbns_in_emb = tf.keras.layers.Embedding(n_isbns+1, n_embedding_dimensions)(isbns_in_in)
# pool over all n_isbns_in embeddings so each example is reduced to one averaged
# vector; tying pool_size to n_isbns_in keeps this true if n_isbns_in changes
isbns_in_emb_avg = tf.keras.layers.AveragePooling1D(pool_size=n_isbns_in)(isbns_in_emb)
query = tf.keras.layers.Flatten()(isbns_in_emb_avg)
query_model = tf.keras.Model(inputs=isbns_in_in, outputs=query)

## CANDIDATE
# input: one user index per example (despite the `isbns_out_` prefix of the
# variable names, the embedding table below is sized from n_users)
isbns_out_in = tf.keras.Input(shape=(1,))
isbns_out_emb = tf.keras.layers.Embedding(n_users+1, n_embedding_dimensions)(isbns_out_in)
candidate = tf.keras.layers.Flatten()(isbns_out_emb)
candidate_model = tf.keras.Model(inputs=isbns_out_in, outputs=candidate)

In [None]:
# TFRS TASK SETUP
# The metric candidates are the embeddings of every user index exactly once.
# The training dataset holds many rows per user, so mapping over it would
# embed the same user repeatedly and put duplicate candidates in the top-K
# computation.
candidates = (
    tf.data.Dataset.range(n_users)
    .batch(128)
    .map(lambda user_batch: candidate_model(user_batch))
)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidates)
task = tfrs.tasks.Retrieval(metrics=metrics)


## TFRS MODEL CLASS
class Model(tfrs.Model):
    """Pairs a query tower with a candidate tower and scores them with a task.

    The task is not a constructor argument: it is read from the module-level
    ``task`` variable at the moment an instance is created.
    """

    def __init__(self, query_model, candidate_model):
        """Keep references to the two towers and to the module-level ``task``."""
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model
        self._task = task

    def compute_loss(self, features, training=False):
        """Return what the task produces for one batch of features.

        ``features`` is indexed with the keys 'isbns' (fed to the query tower)
        and 'user' (fed to the candidate tower). ``training`` is accepted but
        not used by this implementation.
        """
        return self._task(
            self._query_model(features['isbns']),
            self._candidate_model(features['user']),
        )

## COMPILE AND TRAIN MODEL
model = Model(query_model, candidate_model)

# To resume an earlier run, restore both towers here before compiling:
#   model._query_model.load_weights(weights_dir.format('query'))
#   model._candidate_model.load_weights(weights_dir.format('candidate'))

optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=optimizer)

# endless stream: repeat -> shuffle -> batch; fit() stops each epoch after
# steps_per_epoch batches
train_batches = dataset.repeat().shuffle(300_000).batch(4096)
model.fit(train_batches, steps_per_epoch=50, epochs=30, verbose=1)

In [None]:
# write the weights of each tower to its own path (query first, then candidate)
for tower_name, tower in (
    ('query', model._query_model),
    ('candidate', model._candidate_model),
):
    tower.save_weights(weights_dir.format(tower_name))

In [None]:
# create the index model to lookup the best candidate match for a query
index = tfrs.layers.factorized_top_k.BruteForce(model._query_model)
# Index every user exactly once as (identifier, embedding) pairs.
# The training dataset holds many rows per user, so indexing it would fill the
# index - and therefore the top-k results - with repeated copies of the same
# user.
user_batches = tf.data.Dataset.range(n_users).batch(100)
index.index_from_dataset(
    user_batches.map(
        lambda user_batch: (user_batch, model._candidate_model(user_batch))
    )
)
# smoke test: run one random training example through the index
for features in dataset.shuffle(2000).batch(1).take(1):
    print('isbns', features['isbns'])
    scores, users = index(features['isbns'])
    print('recommended users', users)

In [None]:
# write the index model to index_model_dir
index.save(filepath=index_model_dir)

In [None]:
## CONVERT TO TFJS MODEL
import subprocess

# the command is passed as an argument list, so no shell is involved;
# it reads the saved model from index_model_dir and writes to tfjs_model_dir
cmd = [
    'tensorflowjs_converter',
    '--input_format=tf_saved_model',
    '--output_format=tfjs_graph_model',
    index_model_dir,
    tfjs_model_dir
]
# check=True raises subprocess.CalledProcessError when the converter exits
# with a non-zero status, instead of letting a failed conversion pass silently
subprocess.run(cmd, check=True)