In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
import tqdm

pd.set_option('display.max_rows', 20)
tf.get_logger().setLevel('WARNING')

In [2]:
# Load the data
rating_df = pd.read_csv('../data/ml-100k/u.data', names=['userId', 'movieId', 'rating', 'timestamp'], sep='\t')
user_df = pd.read_csv('../data/ml-100k/u.user', names=['userId', 'age', 'sex', 'occupation', 'zip_code'], sep='|')
genre_df = pd.read_csv('../data/ml-100k/u.genre', names=['genre', 'genreId'], sep='|')
movie_df = pd.read_csv('../data/ml-100k/u.item', names=['movieId', 'title', 'release_date', 'video_release_date', 'imdb_url'] + list(genre_df['genre'].values), sep='|', encoding='utf-8')

In [3]:
# Map IDs to 0-based indexing
user_le = LabelEncoder()
user_le.fit(rating_df['userId'].unique())
rating_df['userId'] = user_le.transform(rating_df['userId'])
user_df['userId'] = user_le.transform(user_df['userId'])
movie_le = LabelEncoder()
movie_le.fit(rating_df['movieId'].unique())
rating_df['movieId'] = movie_le.transform(rating_df['movieId'])
movie_df['movieId'] = movie_le.transform(movie_df['movieId'])

In [4]:
# Build factorization machine model
embedding_dim = 64
n_users = len(user_le.classes_)
n_movies = len(movie_le.classes_)

inputs = tf.keras.Input(shape=(2,))
user_emb = tf.keras.layers.Embedding(
               input_dim=n_users, output_dim=embedding_dim, input_length=1,
               embeddings_initializer=tf.keras.initializers.GlorotNormal()
           )
movie_emb = tf.keras.layers.Embedding(
               input_dim=n_movies, output_dim=embedding_dim, input_length=1,
               embeddings_initializer=tf.keras.initializers.GlorotNormal()
           )
output = tf.keras.layers.Dot(axes=1)([user_emb(inputs[:, 0]), movie_emb(inputs[:, 1])])
fm_model = tf.keras.Model(inputs=inputs, outputs=output)
fm_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2)]          0                                            
__________________________________________________________________________________________________
tf.__operators__.getitem (Slici (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
tf.__operators__.getitem_1 (Sli (None,)              0           input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 64)           60352       tf.__operators__.getitem[0][0]   
______________________________________________________________________________________________

In [5]:
# Set up inputs and target labels
X = rating_df[['userId', 'movieId']].values
Y = rating_df['rating'].values
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=1, shuffle=True)

In [6]:
# Train FM model
learning_rate = 0.005
regularization = 0.01
gravity_coef = 0.1
num_epochs = 100
batch_size = 2**17

optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
mse = tf.keras.losses.MeanSquaredError()

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        y_pred = fm_model(x) 
        U = user_emb.weights[0]
        V = movie_emb.weights[0]
        loss_mse = mse(y, y_pred)
        loss_reg = tf.math.reduce_sum(U * U) / U.shape[0] + tf.math.reduce_sum(V * V) / V.shape[0]
        loss_grv = tf.math.reduce_sum(tf.square(tf.matmul(U, V, transpose_b=True))) / (U.shape[0] * V.shape[0])
        loss = loss_mse + regularization * loss_reg + gravity_coef * loss_grv
    grads = tape.gradient(loss, fm_model.trainable_variables)
    optimizer.apply_gradients(zip(grads, fm_model.trainable_variables))
    return loss

for epoch in range(num_epochs):
    loss = train_step(X_train, Y_train)
    
    U = user_emb.weights[0]
    V = movie_emb.weights[0]
    Y_val_pred = fm_model(X_val)
    val_mse = mse(Y_val, Y_val_pred)
    val_loss = val_mse + regularization * tf.math.reduce_sum(U * U) / U.shape[0] + tf.math.reduce_sum(V * V) / V.shape[0] + gravity_coef * tf.math.reduce_sum(tf.square(tf.matmul(U, V, transpose_b=True))) / (U.shape[0] * V.shape[0])
    print('Epoch {}: train_loss - {:.4f}, val_loss - {:.4f}, val_mse - {:.4f}'.format(epoch + 1, loss, val_loss, val_mse))

Epoch 1: train_loss - 13.7246, val_loss - 13.8557, val_mse - 13.7702
Epoch 2: train_loss - 13.6804, val_loss - 13.8051, val_mse - 13.6964
Epoch 3: train_loss - 13.5791, val_loss - 13.6393, val_mse - 13.4990
Epoch 4: train_loss - 13.3648, val_loss - 13.3637, val_mse - 13.1810
Epoch 5: train_loss - 13.0363, val_loss - 12.9995, val_mse - 12.7640
Epoch 6: train_loss - 12.6130, val_loss - 12.5702, val_mse - 12.2721
Epoch 7: train_loss - 12.1181, val_loss - 12.0962, val_mse - 11.7272
Epoch 8: train_loss - 11.5728, val_loss - 11.5942, val_mse - 11.1472
Epoch 9: train_loss - 10.9948, val_loss - 11.0775, val_mse - 10.5465
Epoch 10: train_loss - 10.3980, val_loss - 10.5564, val_mse - 9.9362
Epoch 11: train_loss - 9.7938, val_loss - 10.0390, val_mse - 9.3254
Epoch 12: train_loss - 9.1909, val_loss - 9.5319, val_mse - 8.7212
Epoch 13: train_loss - 8.5963, val_loss - 9.0401, val_mse - 8.1293
Epoch 14: train_loss - 8.0156, val_loss - 8.5674, val_mse - 7.5544
Epoch 15: train_loss - 7.4532, val_loss -

In [7]:
# Retrieve embedding table from the model
user_emb_table = user_emb.weights[0].numpy()
movie_emb_table = movie_emb.weights[0].numpy()

In [None]:
# Sanity check: print movies that are similar in the latent space
# We can use three measures to check similarity: Euclidean distance, cosine, and inner product
def similar_movies(movieId, measure='distance'):
    emb = movie_

In [None]:
train_df

In [None]:
train_df

In [None]:
train_df