# DeepFM

DeepFM: A Factorization-Machine based Neural Network for CTR Prediction

https://arxiv.org/abs/1703.04247

https://www.tensorflow.org/datasets/catalog/movielens



In [None]:
import sys
sys.path.append("..")
sys.dont_write_bytecode = True

import os
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
%matplotlib inline
import matplotlib.pyplot as plt

from deep_recommenders import models as dprs_models
from deep_recommenders import tasks as dprs_tasks
from deep_recommenders import metrics as dprs_metrics
from deep_recommenders.layers.fm import FM


## 数据准备


使用Movielens-100k作为训练数据，具体介绍可以参考

https://www.tensorflow.org/datasets/catalog/movielens

### 特征


4个用户特征，分别是：
* `user_id`
* `user_gender`
* `bucketized_user_age`
* `user_occupation_label`

2个电影特征，分别是：
* `movie_id`
* `movie_title`

由于`user_rating`特征为1-5星评价，为了适用二分类任务，

将大于3星的（即4星和5星）评价转化为`1.0`，将少于或等于3星的评价转化为`0.0`

In [None]:
# Load the MovieLens-100k ratings and keep only the features the model uses.
ratings = tfds.load("movielens/100k-ratings", split="train")


def _select_features(x):
    """Keep the model's input features and turn the rating into a 0/1 label.

    Gender, age bucket and occupation label are cast to int32; ratings
    strictly greater than 3 become 1.0 and all others become 0.0.
    """
    return {
        "user_id": x["user_id"],
        "user_gender": tf.cast(x["user_gender"], tf.int32),
        "bucketized_user_age": tf.cast(x["bucketized_user_age"], tf.int32),
        "user_occupation_label": tf.cast(x["user_occupation_label"], tf.int32),
        "movie_id": x["movie_id"],
        "movie_title": x["movie_title"],
        "user_rating": tf.cond(
            x["user_rating"] > 3,
            true_fn=lambda: 1.0,
            false_fn=lambda: 0.0),
    }


ratings = ratings.map(_select_features)


def _unique_feature_values(dataset, key):
    """Return the unique values of feature `key` found in `dataset`.

    The dataset is read in batches of 1,000 examples, the batches are
    concatenated into a single array and de-duplicated with `np.unique`.
    """
    batches = dataset.batch(1_000).map(lambda x: x[key])
    return np.unique(np.concatenate(list(batches)))


# Vocabularies used to build the categorical feature columns.
unique_user_ids = _unique_feature_values(ratings, "user_id")
unique_user_ages = _unique_feature_values(ratings, "bucketized_user_age")
unique_user_occupation_labels = _unique_feature_values(
    ratings, "user_occupation_label")
unique_movie_ids = _unique_feature_values(ratings, "movie_id")

# Dataset of movie titles only.
movies = tfds.load("movielens/100k-movies", split="train")
movies = movies.map(lambda x: x["movie_title"])


## 模型构建

### 构建embedding层

注意：DeepFM模型既需要稠密的`Dense Embedding`用于`FM`的2-order交互和`DNN`的high-order交互.

同时，也需要稀疏的`One-Hot Embedding`用于`FM`的1-order交互.

In [None]:
def _vocab_list_embedding(key: str, vocab_list: list, embedding_dim: int):
    """Build the dense and one-hot feature layers for one categorical feature.

    Args:
        key: Name of the feature in the input dictionary.
        vocab_list: Vocabulary of values the feature may take.
        embedding_dim: Dimension of the dense embedding.

    Returns:
        A `(dense_embedding_layer, sparse_embedding_layer)` tuple of
        `tf.keras.layers.DenseFeatures` layers. The first wraps an embedding
        column (combiner "sum") and the second wraps an indicator column;
        both are defined over the same vocabulary column.
    """
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        key, vocab_list)

    embedding_col = tf.feature_column.embedding_column(
        vocab_column, dimension=embedding_dim, combiner="sum")
    indicator_col = tf.feature_column.indicator_column(vocab_column)

    return (
        tf.keras.layers.DenseFeatures(embedding_col),
        tf.keras.layers.DenseFeatures(indicator_col),
    )

### 构建用户Embedding模型

In [None]:
class UserModel(tf.keras.Model):
    """Embeds the user-side features of a rating example.

    Each of `user_id`, `user_gender`, `bucketized_user_age` and
    `user_occupation_label` gets a 32-dimensional dense embedding layer and
    a one-hot (indicator) layer, both built by `_vocab_list_embedding`.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        (self.user_dense_embedding,
         self.user_sparse_embedding) = _vocab_list_embedding(
            "user_id", unique_user_ids, 32)
        (self.gender_dense_embedding,
         self.gender_sparse_embedding) = _vocab_list_embedding(
            "user_gender", [0, 1], 32)
        (self.age_dense_embedding,
         self.age_sparse_embedding) = _vocab_list_embedding(
            "bucketized_user_age", unique_user_ages, 32)
        (self.occupation_dense_embedding,
         self.occupation_sparse_embedding) = _vocab_list_embedding(
            "user_occupation_label", unique_user_occupation_labels, 32)

    def call(self, inputs):
        """Compute the user embeddings.

        Args:
            inputs: Dictionary holding the keys `user_id`, `user_gender`,
                `bucketized_user_age` and `user_occupation_label`.

        Returns:
            A `(dense_embeddings, sparse_embeddings)` tuple: the four dense
            embeddings stacked on a new axis 1, and the four one-hot
            encodings concatenated along axis 1.
        """

        def pick(name):
            # Every feature layer takes a dictionary holding only its own key.
            return {name: inputs[name]}

        dense_parts = [
            self.user_dense_embedding(pick("user_id")),
            self.gender_dense_embedding(pick("user_gender")),
            self.age_dense_embedding(pick("bucketized_user_age")),
            self.occupation_dense_embedding(pick("user_occupation_label")),
        ]
        sparse_parts = [
            self.user_sparse_embedding(pick("user_id")),
            self.gender_sparse_embedding(pick("user_gender")),
            self.age_sparse_embedding(pick("bucketized_user_age")),
            self.occupation_sparse_embedding(pick("user_occupation_label")),
        ]

        dense_embeddings = tf.stack(dense_parts, axis=1)
        sparse_embeddings = tf.concat(sparse_parts, axis=1)
        return dense_embeddings, sparse_embeddings

### 构建电影Embedding模型

In [None]:
class MovieModel(tf.keras.Model):
    """Embeds the movie-side features of a rating example.

    `movie_id` gets a 32-dimensional dense embedding layer and a one-hot
    (indicator) layer from `_vocab_list_embedding`. `movie_title` is
    tokenised by a text vectorizer, embedded into 32 dimensions and
    averaged over the token axis.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        (self.movie_dense_embedding,
         self.movie_sparse_embedding) = _vocab_list_embedding(
            "movie_id", unique_movie_ids, 32)

        # Shared by the vectorizer (vocabulary cap) and the embedding table
        # (number of rows).
        vocab_size = 10_000

        self.title_vectorizer = (
            tf.keras.layers.experimental.preprocessing.TextVectorization(
                max_tokens=vocab_size))

        self.title_embedding = tf.keras.Sequential([
            self.title_vectorizer,
            tf.keras.layers.Embedding(vocab_size, 32, mask_zero=True),
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

        # Build the vectorizer's vocabulary from the movie-title dataset.
        self.title_vectorizer.adapt(movies)

    def call(self, inputs):
        """Compute the movie embeddings.

        Args:
            inputs: Dictionary holding the keys `movie_id` and `movie_title`.

        Returns:
            A `(dense_embeddings, sparse_embeddings)` tuple: the id and
            title embeddings stacked on a new axis 1, and the one-hot
            encoding of `movie_id`.
        """
        id_embedding = self.movie_dense_embedding(
            {"movie_id": inputs["movie_id"]})
        title_embedding = self.title_embedding(inputs["movie_title"])
        dense_embeddings = tf.stack([id_embedding, title_embedding], axis=1)

        sparse_embeddings = self.movie_sparse_embedding(
            {"movie_id": inputs["movie_id"]})
        return dense_embeddings, sparse_embeddings

### 构建 DeepFM 模型

In [None]:
class DeepFM(dprs_models.Model):
    """DeepFM model combining a linear term, an FM term and a DNN term.

    The linear layer consumes the concatenated one-hot encodings from the
    user and movie models; the FM layer and the DNN both consume the
    stacked dense embeddings. The three outputs are added and passed
    through a sigmoid.
    """

    def __init__(self, layer_sizes):
        """Create the sub-models and layers.

        Args:
            layer_sizes: Iterable with the number of units of each hidden
                ReLU layer of the DNN. A final `Dense(1)` layer is always
                appended.
        """
        super().__init__()

        self.user_model = UserModel()
        self.movie_model = MovieModel()

        # First-order (linear) term over the one-hot features.
        self.linear = tf.keras.layers.Dense(1)

        # The FM factor embeddings are shared with the user and movie
        # embeddings.
        self.fm = FM(factors=None)

        # Deep component: flatten the stacked embeddings, apply the hidden
        # layers, then project to a single output.
        self.dnn = tf.keras.Sequential([
            tf.keras.layers.Flatten(),
            *(tf.keras.layers.Dense(units, activation="relu")
              for units in layer_sizes),
            tf.keras.layers.Dense(1),
        ])

        # Adds the three component outputs and applies a sigmoid.
        self.logits_layer = tf.keras.Sequential([
            tf.keras.layers.Add(),
            tf.keras.layers.Activation("sigmoid")
        ])

        self.task = dprs_tasks.Ranking(
            metrics=[tf.keras.metrics.AUC(name="auc")]
        )

    def compute_loss(self, features, training=False):
        """Run the forward pass and hand the result to the ranking task.

        Args:
            features: Dictionary holding the keys `user_id`, `user_gender`,
                `bucketized_user_age`, `user_occupation_label`, `movie_id`,
                `movie_title` and `user_rating` (the label).
            training: Not used by this method.

        Returns:
            Whatever `self.task` returns when called with the labels and
            the predictions (presumably the loss tensor; confirm against
            `dprs_tasks.Ranking`).
        """
        user_keys = (
            "user_id",
            "user_gender",
            "bucketized_user_age",
            "user_occupation_label",
        )
        movie_keys = ("movie_id", "movie_title")

        user_dense, user_sparse = self.user_model(
            {key: features[key] for key in user_keys})
        movie_dense, movie_sparse = self.movie_model(
            {key: features[key] for key in movie_keys})

        # First-order term over all one-hot features.
        one_hot_features = tf.concat([user_sparse, movie_sparse], axis=1)
        linear_outputs = self.linear(one_hot_features)

        # Second-order (FM) and high-order (DNN) terms share the same
        # dense embeddings.
        field_embeddings = tf.concat([user_dense, movie_dense], axis=1)
        fm_outputs = self.fm(field_embeddings)
        dnn_outputs = self.dnn(field_embeddings)

        # Note: `logits_layer` ends with a sigmoid, so these values have
        # already passed through it.
        predictions = self.logits_layer(
            [linear_outputs, fm_outputs, dnn_outputs])

        return self.task(features["user_rating"], predictions)

## 数据shuffle和split

Train: Test = 8: 2

In [None]:
# Fix the global seed so that the split and the training run are repeatable.
tf.random.set_seed(42)

# Shuffle once with a fixed order (no reshuffling between iterations) so
# that `take` and `skip` below always select the same disjoint examples.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

# 80/20 train/test split.
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Cache the training examples BEFORE the per-epoch shuffle: the upstream
# pipeline (read + map + fixed shuffle) is then evaluated only once instead
# of on every epoch, while `shuffle` still yields a fresh order each epoch.
cached_train = train.cache().shuffle(100_000).batch(2048)
# The test batches never change, so they are cached after batching.
cached_test = test.batch(4096).cache()

## 模型训练

In [None]:
# Total number of passes over the training set.
num_epochs = 100

# DeepFM with two hidden DNN layers of 64 and 32 units.
model = DeepFM([64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Train silently, evaluating on the test split every 5th epoch.
history = model.fit(
    cached_train,
    epochs=num_epochs,
    validation_data=cached_test,
    validation_freq=5,
    verbose=0,
)

# Report the AUC of the last validation run.
accuracy = history.history["val_auc"][-1]
print(f"val-auc accuracy: {accuracy:.2f}.")

## 可视化训练Metrics

In [None]:
# Number of epochs between two validation runs. Must be kept equal to the
# `validation_freq` passed to `model.fit`, otherwise the validation curve
# is drawn at the wrong epochs.
val_interval = 5

# `plt.savefig` does not create missing directories, so make sure the
# output directory exists before saving into it.
os.makedirs("assets", exist_ok=True)

for metric in ["auc", "loss"]:
    # Training metrics are recorded once per epoch: epochs 1, 2, 3, ...
    num_train_runs = len(history.history[metric])
    train_epochs = [(x + 1) for x in range(num_train_runs)]

    # Validation metrics are recorded every `val_interval` epochs.
    num_validation_runs = len(history.history[f"val_{metric}"])
    test_epochs = [(x + 1) * val_interval for x in range(num_validation_runs)]

    plt.plot(train_epochs, history.history[metric], label="train")
    plt.plot(test_epochs, history.history[f"val_{metric}"], label="test")
    plt.title(f"{metric} vs epoch")
    plt.xlabel("epoch")
    plt.ylabel(f"{metric}")
    plt.legend()
    # Save before `show`, which finishes the current figure.
    plt.savefig(os.path.join("assets", f"deepfm_{metric}.png"))
    plt.show()