In [163]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
from sentence_transformers import SentenceTransformer, util
import torch

def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

import pickle
def save_pickle(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
def load_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

The notebook demonstrates how to build a simple recommender system using retrieval model and ranking model trained before, plus plot embedding in retrieval as well.

The input is a tuple (userId, timestamp) and the output is top 5 movie recommendations.

It solves such question: which movies would you recommend to user X at timestamp Y after user X has watched a list of movies Z? 

#### Load dataset and models, and do preprocessing

In [382]:
# Load data
data = load_pickle('data.pickle')

# Load sentence transformer model
st_model = SentenceTransformer('all-MiniLM-L6-v2')

# Load retrieval model
neuralCF = tf.keras.models.load_model('neuralCF')
neuralCF.compile(
    loss=rmse,
    optimizer='adam'
)

# Load ranking model 
deepfm = tf.keras.models.load_model('deepfm')
deepfm.compile(
    loss=rmse,
    optimizer='adam'
)

# Load movie dataset
movie = pd.read_csv("u.item", sep='|', header = None, encoding='latin-1', 
                    names = [ "movieId" , "title" , "release_date" , 'video_release_date' ,
              'IMDbURL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy',
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western' ])





In [168]:
# Apply the same preprocessing rules in model_ranking
data.fillna({'cast_1':'unknown', 
             'cast_2':'unknown', 
             'movie_genre_1': "unknown", 
             "movie_genre_2":"unknown", 
             'user_fav_genre': "unknown",
             'release_year': data.release_year.mean()}, inplace=True)
# Preprocessing numerical columns
numerical_cols = ['age','IMDb_rating', 'release_year', 'user_avg_rating', 'user_std_rating',  'user_rating_count',
                 'movie_avg_rating', 'movie_std_rating', 'movie_rating_count',]
# Apply normalization techniques to all numerical columns
for column in numerical_cols:
    data[column] = (data[column] - data[column].min()) / (data[column].max() - data[column].min())   

#### Sentence embedding similarity retrieval utility functions

In [105]:
# Store movie embedding in a dedicated df
movie_embedding = data.iloc[data.movieId.drop_duplicates().index][['movieId', 'plot embedding']].reset_index(drop=True)

In [126]:
# Function to return the top 5 movieId that have the most similar plot embedding with input movieId
def get_top5_by_embed(movieId):  
    # Calculate cosine similarity
    cos_sim = util.cos_sim(movie_embedding[movie_embedding.movieId == movieId]['plot embedding'].to_numpy()[0],list(movie_embedding['plot embedding']))
    # Get list of tuple (movieId, similarity) from previous result
    sim_lists = [(idx, sim) for idx, sim in enumerate(cos_sim.numpy()[0])]
    # Sort list by similary descending
    sim_lists.sort(key = lambda x: x[1], reverse = True)
    # Return the top 5 movieId, note I discard the first one since that is the input moiveId
    return [movieId for movieId, sim in sim_lists[1:6]]

In [None]:
# Define features and genre_cols for future uses
features = ['userId', 'age', 'gender', 'occupation', 'zipcode_bucket',                    # User Features
                 'movieId', 'movie_genre_1', 'movie_genre_2', 'IMDb_rating',              # Movie Features
                 'director', 'cast_1', 'cast_2', 'plot embedding', 'release_year',        # Movie Features
                 'rating', 'user_avg_rating', 'user_std_rating', 'user_rating_count',     # Rating Features
                 'movie_avg_rating', 'movie_std_rating', 'movie_rating_count',            # Rating Features
                 'user_fav_genre', 'user_fav_movieId', 'timestamp'                        # Rating Features        
               ]
genre_cols = ['unknown', 'Action', 'Adventure',
       'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
       'Sci-Fi', 'Thriller', 'War', 'Western']

#### Demo

In [189]:
# Random select a userId and timestamp for demo
target_userId, target_timestamp = data[['userId', 'timestamp']].sample(1, random_state = 30).userId.values[0], data[['userId', 'timestamp']].sample(1, random_state = 30).timestamp.values[0]

In [190]:
target_userId, target_timestamp

(454, 888266955)

We want to recommend movies to user **454** at timestamp **888266955**

#### Prepare dataset for retrieval
Since there are specific features that are timestamp dependent, the process of generating dataset for retrieval needs special consideration.

In [317]:
# These features will change as timestamp changes
timestamp_sensitive_features_user = ['user_avg_rating', 'user_std_rating', 'user_rating_count','user_fav_genre', 'user_fav_movieId']
timestamp_sensitive_features_movie = [ 'movie_avg_rating', 'movie_std_rating', 'movie_rating_count']

# These features will not change as timestamp changes
timestamp_insensitive_features_user = ['userId', 'age', 'gender', 'occupation', 'zipcode_bucket']
timestamp_insensitive_features_movie = ['movieId', 'movie_genre_1', 'movie_genre_2', 'IMDb_rating',             
                 'director', 'cast_1', 'cast_2', 'release_year']

In [318]:
# Prepare movie timestamp insensitive features
movie_retrieval_candidates = data[data.timestamp <=target_timestamp][timestamp_insensitive_features_movie].drop_duplicates()

In [319]:
# Prepare user timestamp insensitive features
user_ts_insensitive_records = data[data.userId == target_userId][timestamp_insensitive_features_user].drop_duplicates().to_dict('records')[0]

for f in timestamp_insensitive_features_user:
    movie_retrieval_candidates[f] = user_ts_insensitive_records[f]

In [320]:
# Prepare user timestamp sensitive features
user_ts_sensitive_records = data[(data.userId == target_userId) & (data.timestamp == target_timestamp)][timestamp_sensitive_features_user].iloc[0].to_dict()

for f in timestamp_sensitive_features_user:
    movie_retrieval_candidates[f] = user_ts_sensitive_records[f]

In [321]:
# Prepare movie timestamp sensitive features
def get_ts_sensitive_features_movie(movieId):
    return data[(data.movieId == movieId) & (data.timestamp <= target_timestamp)].sort_values('timestamp', ascending= False)[timestamp_sensitive_features_movie].iloc[0]

movie_retrieval_candidates[timestamp_sensitive_features_movie] = movie_retrieval_candidates.movieId.apply(get_ts_sensitive_features_movie)

In [322]:
# Get user history
user_history = data[(data.userId == target_userId) & (data.timestamp <= target_timestamp)]

In [323]:
# Filter movies that this user has already watched from movie candidates
movie_retrieval_candidates = movie_retrieval_candidates[~movie_retrieval_candidates.movieId.isin(user_history.movieId)]

#### Retrieval channel 1
via neuralCF

In [325]:
# Pass movie candidates for retrieval into tf dataset
retrieval_ds = tf.data.Dataset.from_tensor_slices((dict(movie_retrieval_candidates))).batch(32)

In [326]:
# Predict the rating
retrieval_result = np.concatenate(neuralCF.predict(retrieval_ds))



In [333]:
# Get indexes of top 200 candidates 
top200_indexes = np.argpartition(retrieval_result, -200)[-200:]

#### Retrieval channel 2
via plot embedding

In [330]:
# Get most recent 5 movies from user history
user_recent_5_movies = user_history.sort_values('timestamp', ascending = False).movieId[:5]

# From these 5 movies, for each movie, get top 5 most similar movies
# That is 25 movieIds in total
most_similar_movies_retrieval = set()
for movieId in user_recent_5_movies:
    most_similar_movies_retrieval.update(get_top5_by_embed(movieId))

#### Prepare dataset for ranking

In [354]:
# Prepare ranking candidates from retrieval candidates using the top200_indexes result from neuralCF
movie_ranking_candidates = movie_retrieval_candidates.iloc[top200_indexes]

In [357]:
# Add retrieval result from plot embedding approach
# First find entries in movie_retrieval_candidates where movieIds are in most_similar_movies_retrieval
# Then make sure these movieIds are not in movie_ranking_candidates
movie_retrieval_via_embedding = movie_retrieval_candidates[(movie_retrieval_candidates.movieId.isin(most_similar_movies_retrieval)) 
                                & 
                                (~(movie_retrieval_candidates.movieId.isin(movie_ranking_candidates.movieId)))]

In [358]:
# Concatenate two results from retrieval 1 and retrieval 2
movie_ranking_candidates = pd.concat([movie_retrieval_via_embedding, movie_ranking_candidates])

#### Ranking

In [360]:
# Pass movie candidates for ranking into tf dataset
ranking_ds = tf.data.Dataset.from_tensor_slices((dict(movie_ranking_candidates))).batch(32)

In [361]:
# Predict the rating
ranking_result = np.concatenate(deepfm.predict(ranking_ds))



In [378]:
# Get top 5 indexes
top5_indexes = np.argpartition(ranking_result, -5)[-5:]

In [380]:
# Get top 5 movieIds
top5_movieId = movie_ranking_candidates.iloc[top5_indexes].movieId

In [390]:
# Get top 5 movie titles
print(f"The top 5 movies for userId {target_userId} at timestamp {target_timestamp} are: ")
[movieTitle for movieTitle in movie[movie.movieId.isin(top5_movieId)]['title'].values]

The top 5 movies for userId 454 at timestamp 888266955 are: 


['Breaking the Waves (1996)',
 'In the Line of Fire (1993)',
 'American President, The (1995)',
 'Safe (1995)',
 'Golden Earrings (1947)']