In [24]:
from typing import Dict, Text
import os
import numpy as np
import tensorflow as tf
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

import warnings
warnings.filterwarnings("ignore")
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import pickle
def save_pickle(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
def load_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In this notebook, we will primarily focus on building a simple matrix factorization retrieval model powered by neuralCF. 

In [2]:
# Load final data from preprocessing_2_feature_engineering
data = load_pickle('data.pickle')

In [29]:
# Load ratings and movies
ratings =  data[['userId', 'movieId']]
movies = data[['movieId']]
ratings['userId'] = ratings.userId.astype(str)
ratings['movieId'] = ratings.movieId.astype(str)
movies['movieId'] = movies.movieId.astype(str)

In [30]:
# Load into tf datasets
movies_ds = tf.data.Dataset.from_tensor_slices(dict(movies))
ratings_ds = tf.data.Dataset.from_tensor_slices(dict(ratings))

In [57]:
# get training and testing dataset
# shuffle and batch
shuffled = ratings_ds.shuffle(100000, seed=32)

train = shuffled.take(80000)
test = shuffled.skip(80000).take(20000)

cached_train = train.batch(128).cache()
cached_test = test.batch(128).cache()

In [32]:
# Build userId string lookup layer
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings_ds.map(lambda x: x["userId"]))

In [34]:
# Build movieId string lookup layer
movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_ids_vocabulary.adapt(movies_ds.map(lambda x: x["movieId"]))

In [51]:
# Define MovieLensModel model
class MovieLensModel(tfrs.Model):

    def __init__(
                self,
                user_model: tf.keras.Model,
                movie_model: tf.keras.Model,
                task: tfrs.tasks.Retrieval):
        super().__init__()

        self.user_model = user_model
        self.movie_model = movie_model
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features["userId"])
        movie_embeddings = self.movie_model(features["movieId"])

        return self.task(user_embeddings, movie_embeddings)

In [54]:
# define user and movie models
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)


])
movie_model = tf.keras.Sequential([
    movie_ids_vocabulary,
    tf.keras.layers.Embedding(movie_ids_vocabulary.vocabulary_size(), 64)
])


In [58]:
# Build a neural collaborative filtering model
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu")

])
movie_model = tf.keras.Sequential([
    movie_ids_vocabulary,
    tf.keras.layers.Embedding(movie_ids_vocabulary.vocabulary_size(), 64),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu")
])


In [59]:
# define metrics and task
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies_ds.map(lambda x: x["movieId"]).batch(128).map(movie_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [60]:
# compile and train
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 3 epochs.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a590f22d60>

In [61]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.8727785348892212,
 'factorized_top_k/top_5_categorical_accuracy': 0.8727785348892212,
 'factorized_top_k/top_10_categorical_accuracy': 0.8728289008140564,
 'factorized_top_k/top_50_categorical_accuracy': 1.0,
 'factorized_top_k/top_100_categorical_accuracy': 1.0,
 'loss': 72.11636352539062,
 'regularization_loss': 0,
 'total_loss': 72.11636352539062}