# 双塔模型

Sampling-bias-corrected neural modeling for large corpus item recommendations

https://dl.acm.org/doi/abs/10.1145/3298689.3346996

此实验完全参照 Tensorflow_recommenders/docs/examples/deep_recommenders.ipynb

In [None]:
import sys
sys.path.append("..")
sys.dont_write_bytecode = True

import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
%matplotlib inline
import matplotlib.pyplot as plt

from deep_recommenders import models as dprs_models
from deep_recommenders import tasks as dprs_tasks
from deep_recommenders import metrics as dprs_metrics

## 数据准备与处理

### Movielens/100k-ratings
- user_id
- movie_title
- timestamp
### Movielens/100k-movies
- movie_title


In [None]:
# Load the train split of the MovieLens 100k ratings and movie catalogue.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")


def _select_rating_features(example):
    """Keep only the three rating fields the models consume."""
    return {
        "movie_title": example["movie_title"],
        "user_id": example["user_id"],
        "timestamp": example["timestamp"],
    }


ratings = ratings.map(_select_rating_features)
movies = movies.map(lambda movie: movie["movie_title"])

# Pull every rating timestamp into a single NumPy array.
timestamp_batches = ratings.map(lambda rating: rating["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

# 1000 evenly spaced boundaries spanning the observed timestamp range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Vocabularies: the distinct movie titles and the distinct user ids.
movie_title_batches = movies.batch(1000)
unique_movie_titles = np.unique(np.concatenate(list(movie_title_batches)))

user_id_batches = ratings.batch(1_000).map(lambda rating: rating["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_id_batches)))

## 模型构建

### 用户模型

处理用户特征，并生成embedding。

In [None]:
class UserModel(tf.keras.Model):
    """Turn raw user features into one concatenated feature vector.

    Three views of the input are combined: a learned embedding of the user
    id, a learned embedding of the bucketized timestamp, and the normalized
    timestamp value itself. The layers are configured from the notebook-level
    ``unique_user_ids``, ``timestamp_buckets`` and ``timestamps`` arrays.
    """

    def __init__(self, **kwargs):
        """Create the feature layers and adapt the timestamp normalizer.

        Args:
            **kwargs: Forwarded unchanged to ``tf.keras.Model.__init__``.
        """
        super().__init__(**kwargs)

        # user_id string -> integer index -> 32-d embedding. The embedding
        # table has one row more than the vocabulary.
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
        ])
        # timestamp -> bucket index -> 32-d embedding. N boundaries give
        # N + 1 buckets, hence the table size.
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.Discretization(timestamp_buckets.tolist()),
            tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
        ])
        self.normalized_timestamp = tf.keras.layers.experimental.preprocessing.Normalization()

        # Learn the normalization statistics from all observed timestamps.
        self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        """Embed the user features and concatenate them along axis 1.

        Args:
            inputs: Dict-like object providing "user_id" and "timestamp".

        Returns:
            The user-id embedding, the timestamp-bucket embedding and the
            normalized timestamp, concatenated along axis 1.
        """
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            # Force a (batch, 1) column: if the Normalization layer returns a
            # rank-1 tensor for rank-1 input, concatenating it with the rank-2
            # embeddings on axis 1 would fail. When the layer already returns
            # (batch, 1) this reshape changes nothing.
            tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
        ], axis=1)

### Query塔

将UserModel生成的embedding，输入多层dnn网络进行特征交叉，生成Query向量。

In [None]:
class QueryModel(tf.keras.Model):
    """Query tower: user feature embeddings followed by a stack of dense layers."""

    def __init__(self, layer_sizes):
        """Build the user feature model and the dense stack on top of it.

        Args:
            layer_sizes: Sequence of output sizes, one per dense layer. Every
                layer except the last uses ReLU; the last has no activation.
        """
        super().__init__()

        # Raw user features are embedded first.
        self.embedding_model = UserModel()

        # The dense stack is filled in layer by layer below.
        self.dense_layers = tf.keras.Sequential()

        final_position = len(layer_sizes) - 1
        for position, units in enumerate(layer_sizes):
            # ReLU everywhere except on the final (output) layer.
            activation = "relu" if position < final_position else None
            self.dense_layers.add(
                tf.keras.layers.Dense(units, activation=activation))

    def call(self, inputs):
        """Return the query vector for ``inputs``.

        Args:
            inputs: Features passed straight to the user feature model.

        Returns:
            Output of the dense stack applied to the user feature embedding.
        """
        return self.dense_layers(self.embedding_model(inputs))

### Movie模型

处理Movie特征，并生成embedding。


In [None]:
class MovieModel(tf.keras.Model):
    """Turn movie titles into one concatenated feature vector.

    Two views of the title are combined: an embedding of the whole title
    looked up as a single vocabulary entry, and the average of the embeddings
    of the tokens produced by a text vectorizer. The layers are configured
    from the notebook-level ``unique_movie_titles`` and ``movies``.
    """

    def __init__(self):
        """Create the title layers and adapt the text vectorizer on ``movies``."""
        super().__init__()

        preprocessing = tf.keras.layers.experimental.preprocessing
        vocabulary_limit = 10_000

        # Whole title -> integer index -> 32-d embedding.
        title_lookup = preprocessing.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None)
        title_table = tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
        self.title_embedding = tf.keras.Sequential([title_lookup, title_table])

        self.title_vectorizer = preprocessing.TextVectorization(
            max_tokens=vocabulary_limit)

        # Title text -> token ids -> masked 32-d token embeddings -> average.
        token_table = tf.keras.layers.Embedding(
            vocabulary_limit, 32, mask_zero=True)
        self.title_text_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            token_table,
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

        # Learn the token vocabulary from the movie titles.
        self.title_vectorizer.adapt(movies)

    def call(self, titles):
        """Embed ``titles`` both ways and concatenate the results on axis 1.

        Args:
            titles: Movie titles to embed.

        Returns:
            The whole-title embedding and the averaged token embedding,
            concatenated along axis 1.
        """
        whole_title = self.title_embedding(titles)
        averaged_tokens = self.title_text_embedding(titles)
        return tf.concat([whole_title, averaged_tokens], axis=1)

### Candidate模型

将MovieModel生成的embedding，输入多层dnn网络进行特征交叉，生成Candidate向量。


In [None]:
class CandidateModel(tf.keras.Model):
    """Candidate tower: movie feature embeddings followed by a stack of dense layers."""

    def __init__(self, layer_sizes):
        """Build the movie feature model and the dense stack on top of it.

        Args:
            layer_sizes: Sequence of output sizes, one per dense layer. Every
                layer except the last uses ReLU; the last has no activation.
        """
        super().__init__()

        self.embedding_model = MovieModel()

        # The dense stack is filled in layer by layer below.
        self.dense_layers = tf.keras.Sequential()

        final_position = len(layer_sizes) - 1
        for position, units in enumerate(layer_sizes):
            # ReLU everywhere except on the final (output) layer.
            activation = "relu" if position < final_position else None
            self.dense_layers.add(
                tf.keras.layers.Dense(units, activation=activation))

    def call(self, inputs):
        """Return the candidate vector for ``inputs``.

        Args:
            inputs: Features passed straight to the movie feature model.

        Returns:
            Output of the dense stack applied to the movie feature embedding.
        """
        return self.dense_layers(self.embedding_model(inputs))

### Movielens检索模型

将QueryModel与CandidateModel进行联合训练。


In [None]:
class MovielensModel(dprs_models.Model):
    """Retrieval model that trains the query and candidate towers jointly."""

    def __init__(self, layer_sizes):
        """Create both towers and the retrieval task.

        Args:
            layer_sizes: Dense layer sizes, used for both towers.
        """
        super().__init__()
        self.query_model = QueryModel(layer_sizes)
        self.candidate_model = CandidateModel(layer_sizes)

        # Every movie title, batched and passed through the candidate tower,
        # is the candidate set the top-K metric is computed against.
        candidate_embeddings = movies.batch(128).map(self.candidate_model)
        top_k_metrics = dprs_metrics.FactorizedTopK(
            candidates=candidate_embeddings)
        self.task = dprs_tasks.Retrieval(metrics=top_k_metrics)

    def compute_loss(self, features, training=False):
        """Run both towers on ``features`` and hand the results to the task.

        Args:
            features: Dict-like batch providing "user_id", "timestamp" and
                "movie_title".
            training: When true, the task is asked not to compute metrics.

        Returns:
            Whatever the retrieval task returns for the two embedding batches.
        """
        query_inputs = {
            "user_id": features["user_id"],
            "timestamp": features["timestamp"],
        }
        query_embeddings = self.query_model(query_inputs)
        movie_embeddings = self.candidate_model(features["movie_title"])

        # Metrics are only computed outside of training.
        compute_metrics = not training
        return self.task(
            query_embeddings, movie_embeddings, compute_metrics=compute_metrics)

## 数据集划分

将数据集分割为训练集和测试集。

In [None]:
tf.random.set_seed(42)

train_size = 80_000
test_size = 20_000
shuffle_buffer = 100_000

# Shuffle once with a fixed seed and without reshuffling per iteration, so
# the take/skip calls below operate on the same ordering.
shuffled = ratings.shuffle(
    shuffle_buffer, seed=42, reshuffle_each_iteration=False)

# The first 80k examples are for training, the following 20k for testing.
train = shuffled.take(train_size)
test = shuffled.skip(train_size).take(test_size)

# Training batches are shuffled again; test batches are cached.
cached_train = train.shuffle(shuffle_buffer).batch(2048)
cached_test = test.batch(4096).cache()

## 模型训练

优化器采用Adagrad，学习率0.1。

### 训练单层dnn的Movielens检索模型

Embedding => 32

In [None]:
num_epochs = 300

# One dense layer of size 32 in each tower, trained with Adagrad (lr 0.1).
model = MovielensModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Validate on the test batches every 5th epoch, without progress output.
fit_options = {
    "validation_data": cached_test,
    "validation_freq": 5,
    "epochs": num_epochs,
    "verbose": 0,
}
one_layer_history = model.fit(cached_train, **fit_options)

# Report the top-100 accuracy from the last validation run.
top_100_key = "val_factorized_top_k/top_100_categorical_accuracy"
accuracy = one_layer_history.history[top_100_key][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

# Top-100 accuracy = 0.27

### 训练2层dnn的Movielens检索模型

Embedding => 64 => Relu => 32

In [None]:
# Two dense layers (64 with ReLU, then 32) in each tower, Adagrad (lr 0.1).
model = MovielensModel([64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Validate on the test batches every 5th epoch, without progress output.
fit_options = {
    "validation_data": cached_test,
    "validation_freq": 5,
    "epochs": num_epochs,
    "verbose": 0,
}
two_layer_history = model.fit(cached_train, **fit_options)

# Report the top-100 accuracy from the last validation run.
top_100_key = "val_factorized_top_k/top_100_categorical_accuracy"
accuracy = two_layer_history.history[top_100_key][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

# Top-100 accuracy = 0.28

### 训练3层dnn的Movielens检索模型

Embedding => 128 => Relu => 64 => Relu => 32

In [None]:
# Three dense layers (128 and 64 with ReLU, then 32) in each tower,
# trained with Adagrad (lr 0.1).
model = MovielensModel([128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Validate on the test batches every 5th epoch, without progress output.
fit_options = {
    "validation_data": cached_test,
    "validation_freq": 5,
    "epochs": num_epochs,
    "verbose": 0,
}
three_layer_history = model.fit(cached_train, **fit_options)

# Report the top-100 accuracy from the last validation run.
top_100_key = "val_factorized_top_k/top_100_categorical_accuracy"
accuracy = three_layer_history.history[top_100_key][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

# Top-100 accuracy = 0.22

## 训练可视化

In [None]:
# Directory the accuracy plots are written to.
png_export_dir = os.path.join("assets", "deep_retrieval")

# exist_ok=True replaces the separate existence check: it does nothing when
# the directory is already there and cannot race with a concurrent creation.
os.makedirs(png_export_dir, exist_ok=True)

# Legend label paired with the training history it describes.
labelled_histories = [
    ("1 layer", one_layer_history),
    ("2 layers", two_layer_history),
    ("3 layers", three_layer_history),
]

for k in [1, 5, 10, 50, 100]:
    metric_key = f"val_factorized_top_k/top_{k}_categorical_accuracy"

    # Validation ran every 5th epoch (validation_freq=5 in the fit calls), so
    # the i-th recorded value belongs to epoch (i + 1) * 5.
    num_validation_runs = len(two_layer_history.history[metric_key])
    epochs = [(x + 1) * 5 for x in range(num_validation_runs)]

    # One curve per model depth, all on the same axes.
    for label, history in labelled_histories:
        plt.plot(epochs, history.history[metric_key], label=label)

    plt.title("Accuracy vs epoch")
    plt.xlabel("epoch")
    plt.ylabel(f"Top-{k} accuracy")
    plt.legend()
    plt.savefig(os.path.join(png_export_dir, f"top_{k}_accuracy.png"))
    plt.show()