In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs
import pandas as pd

In [2]:
projects = pd.read_csv('./test.csv', usecols=['user_id', 'project_name'])
print(len(projects))

# projects = projects.head(100000)

train_num = int(0.8 * len(projects))

unique_movie_titles = np.asarray(projects['project_name'].drop_duplicates())
print('movie_title num: ', len(unique_movie_titles))
unique_user_ids = np.asarray(projects['user_id'].drop_duplicates())
print('user_num: ', len(unique_user_ids))

ratings = projects
movies = unique_movie_titles

ratings = tf.data.Dataset.from_tensor_slices(ratings)
movies = tf.data.Dataset.from_tensor_slices(movies)

tf.random.set_seed(42)
shuffled = ratings.shuffle(train_num, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_num)
test = shuffled.skip(train_num).take(len(projects) - train_num)

embedding_dimension = 32

user_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_user_ids, mask_token=None
    ),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])


movie_model = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None
    ),
    tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])

metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies.batch(128).map(movie_model)
)

task = tfrs.tasks.Retrieval(
    metrics=metrics
)


class MovielensModel(tfrs.Model):
    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features[:, 0])
        positive_movie_embeddings = self.movie_model(features[:, 1])

        return self.task(user_embeddings, positive_movie_embeddings)


model = MovielensModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()


model.fit(cached_train, epochs=3)

model.evaluate(cached_test, return_dict=True)

89942
movie_title num:  16322
user_num:  12692
Epoch 1/3
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


{'factorized_top_k/top_1_categorical_accuracy': 0.007226638495922089,
 'factorized_top_k/top_5_categorical_accuracy': 0.040802713483572006,
 'factorized_top_k/top_10_categorical_accuracy': 0.06976485997438431,
 'factorized_top_k/top_50_categorical_accuracy': 0.1764967441558838,
 'factorized_top_k/top_100_categorical_accuracy': 0.23864583671092987,
 'loss': 11234.388671875,
 'regularization_loss': 0,
 'total_loss': 11234.388671875}

In [3]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index(movies.batch(100).map(model.movie_model), movies)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f973d7a4908>

In [4]:
_, titles = index(tf.constant(["XianxinMao"]))
print(f"Recommendations for user XianxinMao: {titles[0, :10]}")

Recommendations for user XianxinMao: [b'help-docs' b'mirrors-settings' b'rcore-v3-course'
 b'rcore-coding-course' b'china-open-source-blue-book'
 b'DevopsDemo-course' b'git-learning-course' b'operation-work'
 b'enterprise_technology' b'course_test_repo']


In [5]:
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
scann_index.index(movies.batch(100).map(model.movie_model), movies)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x7f973d7a4cc0>

In [6]:
_, titles = scann_index(tf.constant(["XianxinMao"]))
print(f"Recommendations for user XianxinMao: {titles[0, :10]}")

Recommendations for user XianxinMao: [b'rcore-v3-course' b'help-docs' b'DevopsDemo-course'
 b'rcore-coding-course' b'mirrors-settings' b'china-open-source-blue-book'
 b'cve-2021-21300' b'enterprise_technology' b'operation-work'
 b'course_test_repo']


In [7]:
export_path = './scann_recommend/1/'
tf.keras.models.save_model(
    scann_index,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
)
print("Saved model")



INFO:tensorflow:Assets written to: ./scann_recommend/1/assets


INFO:tensorflow:Assets written to: ./scann_recommend/1/assets


Saved model


In [8]:
for dict_batch in cached_test.take(1):
    print(dict_batch)

tf.Tensor(
[[b'qq_18741387' b'subdomainsbrute']
 [b'weixin_45003648' b'05-week-homework']
 [b'qq656394906' b'luyten']
 ...
 [b'chengxulu1125' b'pytorch_Realtime_Multi-Person_Pose_Estimation']
 [b'qq_41817408' b'maskrcnn-benchmark']
 [b'xiongjiamu' b'learngit']], shape=(4096, 2), dtype=string)


In [9]:
user_id_np = [user.decode("utf-8") for user in dict_batch[:, 0][0:3].numpy()]

In [10]:
import json
data = json.dumps({"signature_name": "serving_default", "instances": user_id_np})
print('Data: {} ... {}'.format(data[:50], data[len(data)-52:]))

Data: {"signature_name": "serving_default", "instances": ... : ["qq_18741387", "weixin_45003648", "qq656394906"]}


In [11]:
import requests
headers = {"content-type": "application/json"}
json_response = requests.post('http://localhost:8501/v1/models/scann_recommend:predict', data=data, headers=headers)
predictions = json.loads(json_response.text)['predictions']

In [12]:
predictions

[{'output_1': [1.17841959,
   1.05265713,
   1.04639745,
   1.03273988,
   1.02363491,
   1.0206,
   0.956865132,
   0.868850291,
   0.857705,
   0.821807921],
  'output_2': ['antsword',
   'html5_rtsp_player',
   'sqli-labs',
   'githack',
   'dirsearch',
   'hackbar2.1.3',
   'tplmap',
   'pikachu',
   'DevopsDemo-course',
   'behinder']},
 {'output_1': [0.713201821,
   0.631738186,
   0.58721745,
   0.585470796,
   0.550274611,
   0.540802121,
   0.529583,
   0.526315808,
   0.504465461,
   0.50101757],
  'output_2': ['pytorch-pretrained-bert',
   'bert-bilstm-crf-ner',
   'chinese-bert-wwm',
   'vue-element-admin',
   'chinesenre',
   'keras-bert',
   'webpack',
   'html5_rtsp_player',
   'qtquickexamples',
   'bert-as-service']},
 {'output_1': [1.63155508,
   1.28092718,
   1.25521684,
   1.23969436,
   1.0687865,
   1.00340772,
   0.998073,
   0.993408561,
   0.970987201,
   0.963739097],
  'output_2': ['matecloud',
   'rocketmq-externals',
   'mall',
   'jeecg-boot',
   'springb

In [13]:
import requests
headers = {"content-type": "application/json"}
json_response = requests.post('http://localhost:8501/v1/models/scann_recommend:predict', data=data, headers=headers)
predictions = json.loads(json_response.text)['predictions']

In [14]:
predictions

[{'output_2': ['antsword',
   'html5_rtsp_player',
   'sqli-labs',
   'githack',
   'dirsearch',
   'hackbar2.1.3',
   'tplmap',
   'pikachu',
   'DevopsDemo-course',
   'behinder'],
  'output_1': [1.17841959,
   1.05265713,
   1.04639745,
   1.03273988,
   1.02363491,
   1.0206,
   0.956865132,
   0.868850291,
   0.857705,
   0.821807921]},
 {'output_2': ['pytorch-pretrained-bert',
   'bert-bilstm-crf-ner',
   'chinese-bert-wwm',
   'vue-element-admin',
   'chinesenre',
   'keras-bert',
   'webpack',
   'html5_rtsp_player',
   'qtquickexamples',
   'bert-as-service'],
  'output_1': [0.713201821,
   0.631738186,
   0.58721745,
   0.585470796,
   0.550274611,
   0.540802121,
   0.529583,
   0.526315808,
   0.504465461,
   0.50101757]},
 {'output_2': ['matecloud',
   'rocketmq-externals',
   'mall',
   'jeecg-boot',
   'springbootdemo',
   'myblog',
   'pylof',
   'java',
   'JavaGuide',
   'springboot'],
  'output_1': [1.63155508,
   1.28092718,
   1.25521684,
   1.23969436,
   1.068786