<a href="https://colab.research.google.com/github/JNishimura/Deep-Learning-Recommenders/blob/main/Notebooks/MovieLens_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installs

In [1]:
!pip install pandas
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann
!pip install ipywidgets

[K     |████████████████████████████████| 61kB 3.2MB/s 
[K     |████████████████████████████████| 394.7MB 41kB/s 
[K     |████████████████████████████████| 3.7MB 6.2MB/s 
[K     |████████████████████████████████| 11.1MB 5.4MB/s 


### Imports

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import math
import heapq
import matplotlib.pyplot as plt

### Data Set-up

In [3]:
# Ratings data.
ratings = tfds.load("movielens/100k-ratings", split="train")

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=100000.0, style=Progre…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=100000.0, sty…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


In [4]:
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [5]:
dataframe = tfds.as_dataframe(ratings)
print(dataframe)

       bucketized_user_age     movie_genres  ... user_rating user_zip_code
0                     45.0              [7]  ...         4.0      b'53211'
1                     25.0          [4, 14]  ...         2.0      b'80525'
2                     18.0              [4]  ...         4.0      b'55439'
3                     50.0           [5, 7]  ...         4.0      b'06472'
4                     50.0         [10, 16]  ...         3.0      b'75094'
...                    ...              ...  ...         ...           ...
99995                 25.0       [0, 1, 15]  ...         4.0      b'80027'
99996                 35.0         [13, 16]  ...         4.0      b'60035'
99997                 18.0             [10]  ...         1.0      b'78264'
99998                 35.0  [0, 10, 15, 16]  ...         4.0      b'53210'
99999                 18.0              [4]  ...         2.0      b'95064'

[100000 rows x 12 columns]


In [6]:
# Pull out relevant columns and convert id columns to ints
relevant_cols = dataframe.loc[:, ['user_id', 'movie_id', 'user_rating', 'timestamp']]
relevant_cols['user_id'] = relevant_cols['user_id'].astype(int)
relevant_cols['movie_id'] = relevant_cols['movie_id'].astype(int)
print(relevant_cols)
print(relevant_cols.dtypes)

       user_id  movie_id  user_rating  timestamp
0          138       357          4.0  879024327
1           92       709          2.0  875654590
2          301       412          4.0  882075110
3           60        56          4.0  883326919
4          197       895          3.0  891409199
...        ...       ...          ...        ...
99995      774       228          4.0  888557237
99996      313       333          4.0  891012877
99997      262       567          1.0  879795430
99998      911       183          4.0  892839492
99999      276      1140          2.0  874791894

[100000 rows x 4 columns]
user_id          int64
movie_id         int64
user_rating    float64
timestamp        int64
dtype: object


In [24]:
relevant_cols['latest'] = relevant_cols.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)

train_ratings = relevant_cols[relevant_cols['latest'] != 1]
test_ratings = relevant_cols[relevant_cols['latest'] == 1]

train_ratings = train_ratings[['user_id', 'movie_id', 'user_rating']][:10000]
test_ratings = test_ratings[['user_id', 'movie_id', 'user_rating']][:1000]

print(train_ratings.shape)
print(test_ratings.shape)

num_users = len(relevant_cols.user_id.unique()) + 1
num_items = len(relevant_cols.movie_id.unique()) + 1

print('Num Users:', num_users)
print('Num Items:', num_items)

(10000, 3)
(943, 3)
Num Users: 944
Num Items: 1683


## Convert to Implicit Feedback Dataset

In [25]:
# Convert rating to 1 for everything to mark that the user has watched this item
train_ratings.loc[:, 'rating'] = 1
print(train_ratings)


       user_id  movie_id  user_rating  rating
0          138       357          4.0       1
1           92       709          2.0       1
2          301       412          4.0       1
3           60        56          4.0       1
4          197       895          3.0       1
...        ...       ...          ...     ...
10194      435       122          3.0       1
10195      343       260          1.0       1
10196      735        25          4.0       1
10197      617       218          2.0       1
10198      256      1033          4.0       1

[10000 rows x 4 columns]


In [26]:
# Add negative samples
all_movies = relevant_cols['movie_id'].unique()

users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['user_id'], train_ratings['movie_id']))
num_negatives = 4

for (u, i) in user_item_set:
  users.append([u])
  items.append([i])
  labels.append([1])
  for _ in range(num_negatives):
    negative_item = np.random.choice(all_movies)
    while (u, negative_item) in user_item_set:
      negative_item = np.random.choice(all_movies)
    users.append([u])
    items.append([negative_item])
    labels.append([0])



## Create Dataset

In [27]:
train_ds = tf.data.Dataset.from_tensor_slices(({'user': users, 'item': items}, labels))

In [28]:
train_ds = train_ds.shuffle(1000)
for elem in train_ds.take(10):
  print(elem)


({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([397], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1356], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([347], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([405], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([796], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([525], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([654], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1675], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([344], dtype=int32)

In [12]:
BATCH_SIZE = 256

## Define Model

In [71]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, Reshape, Multiply, Flatten, Lambda, Concatenate, Layer, Dropout, Average
from keras import initializers, regularizers
import sys

def get_GMFmodel(num_users, num_items, latent_dim):
  user_input = Input(shape = (1,), dtype = 'int32', name = 'user')
  item_input = Input(shape = (1,), dtype = 'int32', name = 'item')

  user_embedding = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01), 
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)
  item_embedding = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01), 
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)

  user_latent = Flatten()(user_embedding(user_input))
  item_latent = Flatten()(item_embedding(item_input))

  prediction_vec = Multiply()([user_latent, item_latent])

  prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(prediction_vec)

  return Model(inputs=[user_input, item_input], outputs=prediction)

In [69]:
from keras.optimizers import Adam
from keras.regularizers import *


modelMLP = get_MLPmodel(num_users, num_items, 8)
modelMLPGMF =  get_MLPGMFmodel(num_users, num_items, 8)
modelGMF = get_GMFmodel(num_users, num_items, 8)
modelMLP.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy()])
modelMLPGMF.compile(optimizer=Adam(0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy()])
modelGMF.compile(optimizer=Adam(0.01), loss='binary_crossentropy', metrics=[tf.keras.metrics.BinaryAccuracy()])
modelMLP.summary()


Model: "model_46"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
item (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embed (Embedding)          (None, 1, 8)         7552        user[0][0]                       
__________________________________________________________________________________________________
item_embed (Embedding)          (None, 1, 8)         13464       item[0][0]                       
___________________________________________________________________________________________

In [16]:
class Combine(Layer):
  def __init__(self):
        super(Combine, self).__init__()
        random_alpha = tf.random.uniform(shape=[1])
        self.alpha = tf.Variable(initial_value=random_alpha, trainable=True)
  
  def call(self, inputs):
    return (1 - self.alpha)*inputs[0] + self.alpha*inputs[1]

In [68]:
def get_MLPGMFmodel(num_users, num_items, latent_dim):
  user_input = Input(shape = (1,), dtype = 'int32', name = 'user')
  movie_input = Input(shape = (1,), dtype = 'int32', name = 'item')

  user_embedding = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01), 
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)
  movie_embedding = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01), 
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)
  
  user_latent = Flatten()(user_embedding(user_input))
  item_latent = Flatten()(movie_embedding(movie_input))

  #GMF 
  prediction_vec = Multiply()([user_latent, item_latent])
  prediction_GMF = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(prediction_vec)

  #MLF
  inputs = Concatenate()([user_latent, item_latent])
  layer = Dense(32, activation='relu', name='Layer1', kernel_initializer='glorot_uniform', kernel_regularizer=l2())(inputs)
  layer = Dense(16, activation='relu', name='Layer5', kernel_regularizer=l2())(layer)
  layer = Dense(8, activation='relu', name='Layer3')(layer)
  prediction_MLP = Dense(1, activation='sigmoid', name='Layer4')(layer)

  predictions = Combine()([prediction_GMF, prediction_MLP])

  return Model(inputs=[user_input, movie_input], outputs=predictions)


In [18]:
def evaluateNCDG(ranked_list, target_item):
  for i in range(len(ranked_list)):
    if ranked_list[i] == target_item:
      return math.log(2) / math.log(i + 2)
  
  return 0

In [19]:
def hitRate(item, ranklist):
  for rank in ranklist:
    if item == rank:
      return 1
  return 0

In [20]:
all_movies = relevant_cols['movie_id'].unique()

users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['user_id'], train_ratings['movie_id']))
user_set = set(train_ratings['user_id'])
num_test_negatives = 100
test_negatives = {}

for u in user_set:
  negatives = []

  for _ in range(num_test_negatives):
    negative_item = np.random.choice(all_movies)
    while (u, negative_item) in user_item_set:
      negative_item = np.random.choice(all_movies)
    
    negatives.append(negative_item)
  
  test_negatives[u] = negatives

print('Number of users:', len(test_negatives))
print('Number of negative points:', len(test_negatives[138]))

def evaluate_model(model, N=10):
  hits = []
  ncdgs = []
  completed = 0

  for (u, item) in zip(test_ratings['user_id'], test_ratings['movie_id']):
    if u in test_negatives:
      negs = test_negatives[u]
      test_negs = np.array([])
      test_users = np.array([])
      items = {}

      for neg in negs:
        test_negs = np.append(test_negs, tf.constant([neg]))
        test_users = np.append(test_users, tf.constant([u]))
      test_negs = np.append(test_negs, tf.constant([item]))
      test_users = np.append(test_users, tf.constant([u]))

      predictions = model.predict({'user': test_users, 'item': test_negs})
      completed += 1

      for i in range(len(predictions)):
        if i == len(predictions) - 1:
          items[item] = predictions[i]
        else:
          items[negs[i]] = predictions[i]

      rankedList = heapq.nlargest(N, items, items.get)

      ncdg = evaluateNCDG(rankedList, item)
      hit = hitRate(item, rankedList)
      
      ncdgs.append(ncdg)
      hits.append(hit)

    
    hit_rate = np.mean(hits)
    mean_ncdg = np.mean(ncdgs)

  return hit_rate, mean_ncdg

Number of users: 943
Number of negative points: 100


In [70]:
NUM_EPOCHS = 10
best_hr = 0
best_ncdg = 0
best_epoch = -1
model_path = "OP_model.h5"

for epoch in range(1, NUM_EPOCHS + 1):
  b = modelMLPGMF.fit(train_ds, batch_size=256, epochs=1)

  hitrate2, ncdg2 = evaluate_model(modelMLPGMF,10)
  print(hitrate2, ncdg2)


0.37327677624602335 0.2060466714378147
0.3944856839872747 0.21462400091145198
0.39660657476139977 0.21413246942135877
0.3987274655355249 0.21312526899195383
0.39236479321314954 0.21612349066285655
0.3955461293743372 0.21569086643145882
0.38918345705196183 0.21096652340919825
0.38812301166489926 0.20642274253121878
0.37327677624602335 0.19984529302014117
0.35737009544008486 0.1936274148479047


In [None]:
x = np.arange(1, 11)
plt.plot(x,hist.history['binary_accuracy'], '.g-')
plt.plot(x,hist.history['loss'], '-o')
plt.plot(x,hist2.history['binary_accuracy'], '-o')
plt.plot(x,hist2.history['loss'], '-o')
plt.plot(x,hist3.history['binary_accuracy'], 'xb-')
plt.plot(x,hist3.history['loss'], 'xr-')
plt.xticks(x)
plt.title('Loss and accuracy for the NeuMF models')
plt.legend(['acc MLP', 'loss MLP', 'acc NeuMF', 'loss NeuMF', 'acc GMF', 'loss GMF'])



In [None]:
x = [4, 8, 16, 32]
fig = plt.figure(1, figsize=(7,7))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212)

ax1.axes.set_xlabel('Embedding dimensional factors')
ax1.axes.set_ylabel('Hit@10')
ax1.axes.set_xticks([0,1,2,3])
ax1.axes.set_xticklabels(x)
ax1.plot(y)
ax2.axes.set_xlabel('Embedding dimensional factors')
ax2.axes.set_ylabel('NCDG@10')
ax2.axes.set_xticks([0,1,2,3])
ax2.axes.set_xticklabels(x)
ax2.plot(y)



In [None]:
8 latent factors
MLP model
495285/495285 [==============================] - 915s 2ms/step - loss: 0.3694 - binary_accuracy: 0.8401
Epoch 1 Hit Rate: 0.44750795334040294 NCDG: 0.2517541511227904
495285/495285 [==============================] - 917s 2ms/step - loss: 0.3667 - binary_accuracy: 0.8448
Epoch 2 Hit Rate: 0.45705196182396607 NCDG: 0.26865844132840344
495285/495285 [==============================] - 916s 2ms/step - loss: 0.3670 - binary_accuracy: 0.8477
Epoch 3 Hit Rate: 0.46447507953340406 NCDG: 0.2702547249683689
495285/495285 [==============================] - 912s 2ms/step - loss: 0.3687 - binary_accuracy: 0.8484
Epoch 4 Hit Rate: 0.4931071049840933 NCDG: 0.27538216075397726
495285/495285 [==============================] - 920s 2ms/step - loss: 0.3677 - binary_accuracy: 0.8481
Epoch 1 Hit Rate: 0.46977730646871685 NCDG: 0.2718355382886976
495285/495285 [==============================] - 916s 2ms/step - loss: 0.3710 - binary_accuracy: 0.8488
Epoch 2 Hit Rate: 0.4634146341463415 NCDG: 0.27170201199025257
495285/495285 [==============================] - 913s 2ms/step - loss: 0.3728 - binary_accuracy: 0.8489
Epoch 3 Hit Rate: 0.4517497348886532 NCDG: 0.2710410939976013
495285/495285 [==============================] - 911s 2ms/step - loss: 0.3729 - binary_accuracy: 0.8489
Epoch 4 Hit Rate: 0.4623541887592789 NCDG: 0.276299381160501

50000/50000 [==============================] - 120s 2ms/step - loss: 0.5206 - binary_accuracy: 0.8003
0.36267232237539765 0.19889218640242384
50000/50000 [==============================] - 118s 2ms/step - loss: 0.4116 - binary_accuracy: 0.8156
0.39236479321314954 0.21022428298585605
50000/50000 [==============================] - 119s 2ms/step - loss: 0.3942 - binary_accuracy: 0.8195
0.38812301166489926 0.21168376817058512
50000/50000 [==============================] - 113s 2ms/step - loss: 0.3895 - binary_accuracy: 0.8207
0.383881230116649 0.21076808699462052
50000/50000 [==============================] - 115s 2ms/step - loss: 0.3872 - binary_accuracy: 0.8200
0.3902439024390244 0.21293803688699683
50000/50000 [==============================] - 116s 2ms/step - loss: 0.3845 - binary_accuracy: 0.8194
0.38918345705196183 0.21137988297239213
50000/50000 [==============================] - 115s 2ms/step - loss: 0.3811 - binary_accuracy: 0.8212
0.39236479321314954 0.2103711129883063
50000/50000 [==============================] - 117s 2ms/step - loss: 0.3772 - binary_accuracy: 0.8215
0.3806998939554613 0.20553678808874357
50000/50000 [==============================] - 118s 2ms/step - loss: 0.3728 - binary_accuracy: 0.8234
0.383881230116649 0.20756561552456584
50000/50000 [==============================] - 122s 2ms/step - loss: 0.3688 - binary_accuracy: 0.8252
0.3743372216330859 0.20568002829461596

50000/50000 [==============================] - 73s 1ms/step - loss: 0.5083 - binary_accuracy: 0.7997
0.11770943796394485 0.054666174473342065
50000/50000 [==============================] - 74s 1ms/step - loss: 0.5135 - binary_accuracy: 0.7956
0.10392364793213149 0.05069516237190652
50000/50000 [==============================] - 73s 1ms/step - loss: 0.5120 - binary_accuracy: 0.7906
0.10816542948038176 0.04886419271532115
50000/50000 [==============================] - 74s 1ms/step - loss: 0.5042 - binary_accuracy: 0.7913
0.12301166489925769 0.05522351192349724
50000/50000 [==============================] - 73s 1ms/step - loss: 0.4808 - binary_accuracy: 0.7971
0.21527041357370094 0.10962105689055417
50000/50000 [==============================] - 72s 1ms/step - loss: 0.4581 - binary_accuracy: 0.8103
0.2757158006362672 0.1439957939213699
50000/50000 [==============================] - 74s 1ms/step - loss: 0.4572 - binary_accuracy: 0.8163
0.23860021208907742 0.11946548595632371
50000/50000 [==============================] - 73s 1ms/step - loss: 0.4617 - binary_accuracy: 0.8241
0.30116648992576883 0.14679456267269783
50000/50000 [==============================] - 74s 1ms/step - loss: 0.4841 - binary_accuracy: 0.8241
0.31919406150583246 0.15650349492849633
50000/50000 [==============================] - 74s 1ms/step - loss: 0.5004 - binary_accuracy: 0.8269
0.3244962884411453 0.16890658225169475