In [None]:
# recommendation_trainer.py
import csv
import subprocess

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

# how many of a user's songs are sampled into each training example
n_songs_in = 3

# every artefact read or written by this notebook lives under base_dir
base_dir = '/data/song-recommender'
# the '{}' placeholder is filled later via .format('query') / .format('candidate')
weights_dir = f'{base_dir}/{{}}-weights'
index_model_dir = f'{base_dir}/model'
tfjs_model_dir = f'{base_dir}/tfjs-model'

# map each user id (column 0) to the list of song ids (column 1) found in
# users_songs.csv
users_songs = {}
# newline='' lets the csv module do its own line-ending handling
with open(base_dir + '/users_songs.csv', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # discard the first row (presumably a header)
    for line in reader:
        user = int(line[0])
        song = int(line[1])
        # setdefault creates the list the first time a user is seen, so no
        # catch-all try/except is needed that could hide unrelated errors
        users_songs.setdefault(user, []).append(song)

def _load_idx_map(path):
    """Read an index CSV and return {int(column 1): column 0}.

    The first row of the file is skipped. Column 0 is kept as the raw string
    read from the file; column 1 is converted to int and used as the key.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(path, newline='') as file:
        reader = csv.reader(file)
        next(reader)  # discard the first row (presumably a header)
        return {int(row[1]): row[0] for row in reader}


# both index files share the same layout, so one loader serves both
user_idxs = _load_idx_map(base_dir + '/user_idxs.csv')
song_idxs = _load_idx_map(base_dir + '/song_idxs.csv')

# the counts size the embedding tables of the models built later
n_users = len(user_idxs)
n_songs = len(song_idxs)
print('n_users', n_users)
print('n_songs', n_songs)
# build the raw training examples: a sample of songs is the input and the
# user those songs belong to is the output
song_samples = []
user_labels = []
for user, songs in users_songs.items():
    # every user contributes 5 examples per song they have
    n_examples = len(songs) * 5
    for _ in range(n_examples):
        # draw n_songs_in songs from this user's songs
        # (np.random.choice samples with replacement by default)
        song_samples.append(np.random.choice(songs, n_songs_in))
        user_labels.append(user)
dataset = {'songs': song_samples, 'user': user_labels}
        
# shuffle inputs and outputs together: one random ordering is drawn up front
# and used to index both arrays, so each song sample stays paired with its user
permutation = np.random.permutation(len(dataset['songs']))
shuffled = {
    key: np.asarray(values)[permutation]
    for key, values in dataset.items()
}
# wrap the shuffled arrays in a TF Dataset, one element per example
dataset = tf.data.Dataset.from_tensor_slices(shuffled)

print('len_dataset', len(permutation))
# show a single element as a sanity check
for example in dataset.take(1):
    print(example)



In [None]:
# create the query and candidate models
n_embedding_dimensions = 24

## QUERY
# Input: n_songs_in song ids per example. The shape is written as a real
# 1-tuple; `(n_songs_in)` without the comma is just an int, which newer Keras
# versions refuse to interpret as a shape.
songs_in_in = tf.keras.Input(shape=(n_songs_in,))
# input_dim is n_songs+1, so ids 0..n_songs are all valid lookups
songs_in_emb = tf.keras.layers.Embedding(n_songs+1, n_embedding_dimensions)(songs_in_in)
# Average the n_songs_in song embeddings into a single vector. pool_size is
# tied to n_songs_in so the pooling still spans the whole input if that
# constant is changed.
songs_in_emb_avg = tf.keras.layers.AveragePooling1D(pool_size=n_songs_in)(songs_in_emb)
# drop the length-1 pooled axis so the output is one flat vector per example
query = tf.keras.layers.Flatten()(songs_in_emb_avg)
query_model = tf.keras.Model(inputs=songs_in_in, outputs=query)

## CANDIDATE
# Input: one user id per example. The shape is written as a real 1-tuple;
# `(1)` without the comma is just the int 1, which newer Keras versions refuse
# to interpret as a shape.
user_in = tf.keras.Input(shape=(1,))
# input_dim is n_users+1, so ids 0..n_users are all valid lookups
user_emb = tf.keras.layers.Embedding(n_users+1, n_embedding_dimensions)(user_in)
# drop the length-1 axis so the output is one flat vector per user
candidate = tf.keras.layers.Flatten()(user_emb)
candidate_model = tf.keras.Model(inputs=user_in, outputs=candidate)

In [None]:
# TFRS TASK SETUP
# The top-K metric ranks the true user against this candidate set, so the set
# should contain every user exactly once. `dataset` holds one row per sampled
# example (each user repeated many times), which would both distort the
# ranking and make every metric pass far more expensive, so the ids are taken
# from the keys of users_songs instead.
unique_users = tf.data.Dataset.from_tensor_slices(np.asarray(list(users_songs)))
candidates = unique_users.batch(128).map(lambda user_ids: candidate_model(user_ids))
metrics = tfrs.metrics.FactorizedTopK(candidates=candidates)
task = tfrs.tasks.Retrieval(metrics=metrics)


## TFRS MODEL CLASS
class Model(tfrs.Model):
    """Two-tower retrieval model: a query tower, a candidate tower and a task."""

    def __init__(self, query_model, candidate_model, retrieval_task=None):
        """Store the two towers and the task used to compute the loss.

        Args:
            query_model: called on features['songs'] to produce query embeddings.
            candidate_model: called on features['user'] to produce candidate
                embeddings.
            retrieval_task: callable taking (query_embedding,
                candidate_embedding) and returning the loss. When omitted, the
                notebook-level `task` is used, so existing two-argument calls
                behave exactly as before.
        """
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model
        # fall back to the module-level task only when none is injected
        self._task = task if retrieval_task is None else retrieval_task

    def compute_loss(self, features, training=False):
        """Return the task loss for one batch.

        Args:
            features: mapping with a 'songs' entry (query input) and a 'user'
                entry (candidate input).
            training: accepted for the tfrs.Model interface; not used here.
        """
        query_embedding = self._query_model(features['songs'])
        candidate_embedding = self._candidate_model(features['user'])
        return self._task(query_embedding, candidate_embedding)

## COMPILE AND TRAIN MODEL
model = Model(query_model=query_model, candidate_model=candidate_model)
# load model weights - this is to resume training
# model._query_model.load_weights(weights_dir.format('query'))
# model._candidate_model.load_weights(weights_dir.format('candidate'))

optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=optimizer)

# endless, shuffled stream of 4096-example batches; because the stream never
# ends, an "epoch" is defined by steps_per_epoch rather than by dataset size
train_batches = dataset.repeat().shuffle(300_000).batch(4096)
model.fit(train_batches, steps_per_epoch=50, epochs=30, verbose=1)

In [None]:
# run another 30 epochs on the already-fitted model (same pipeline settings
# as the first training run)
model.fit(
    dataset.repeat().shuffle(300_000).batch(4096),
    steps_per_epoch=50,
    epochs=30,
    verbose=1,
)

In [None]:
# save the weights of each tower to its own path: the tower's name is
# substituted into the '{}' placeholder of weights_dir
for tower_name, tower in (
    ('query', model._query_model),
    ('candidate', model._candidate_model),
):
    tower.save_weights(weights_dir.format(tower_name))

In [None]:
# create the index model to lookup the best candidate match for a query
index = tfrs.layers.factorized_top_k.BruteForce(model._query_model)
# Index every user exactly once as (user id, user embedding) pairs. `dataset`
# holds one row per sampled training example, so indexing it directly would
# store each user many times and a top-K lookup would return the same user
# repeated; the unique ids come from the keys of users_songs instead.
unique_users = tf.data.Dataset.from_tensor_slices(np.asarray(list(users_songs)))
index.index_from_dataset(
    unique_users.batch(100).map(
        lambda user_ids: (user_ids, model._candidate_model(user_ids))
    )
)
# sanity check: query the index with one randomly picked training example
for features in dataset.shuffle(2000).batch(1).take(1):
    print('songs', features['songs'])
    scores, users = index(features['songs'])
    print('recommended users', users)

In [None]:
# write the index model (query tower + indexed candidates) to index_model_dir
index.save(filepath=index_model_dir)

In [None]:
## CONVERT TO TFJS MODEL
# command line for the tensorflowjs converter: reads the saved index model
# from index_model_dir and writes a TFJS graph model to tfjs_model_dir
cmd = [
    'tensorflowjs_converter',
    '--input_format=tf_saved_model',
    '--output_format=tfjs_graph_model',
    index_model_dir,
    tfjs_model_dir
]
# check=True raises CalledProcessError when the converter exits non-zero, so
# a failed conversion cannot pass unnoticed
subprocess.run(cmd, check=True)