<a href="https://colab.research.google.com/github/JNishimura/Deep-Learning-Recommenders/blob/main/Notebooks/MovieLens_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installs

In [1]:
!pip install pandas
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann
!pip install ipywidgets

[K     |████████████████████████████████| 61kB 3.3MB/s 
[K     |████████████████████████████████| 394.7MB 44kB/s 
[K     |████████████████████████████████| 3.7MB 9.8MB/s 
[K     |████████████████████████████████| 11.1MB 5.1MB/s 


### Imports

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

import math
import heapq

### Data Set-up

In [3]:
# Ratings data: the "train" split of the MovieLens 100K ratings dataset.
ratings = tfds.load(
    name="movielens/100k-ratings",
    split="train",
)

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=100000.0, style=Progre…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=100000.0, sty…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


In [4]:
# Pretty-print a single record to see which features each rating carries.
first_record_ds = ratings.take(1)
for x in first_record_ds.as_numpy_iterator():
  pprint.pprint(x)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


In [5]:
# Materialise the tf.data ratings as a pandas DataFrame for the wrangling below.
dataframe = tfds.as_dataframe(ds=ratings)
print(dataframe)

       bucketized_user_age     movie_genres  ... user_rating user_zip_code
0                     45.0              [7]  ...         4.0      b'53211'
1                     25.0          [4, 14]  ...         2.0      b'80525'
2                     18.0              [4]  ...         4.0      b'55439'
3                     50.0           [5, 7]  ...         4.0      b'06472'
4                     50.0         [10, 16]  ...         3.0      b'75094'
...                    ...              ...  ...         ...           ...
99995                 25.0       [0, 1, 15]  ...         4.0      b'80027'
99996                 35.0         [13, 16]  ...         4.0      b'60035'
99997                 18.0             [10]  ...         1.0      b'78264'
99998                 35.0  [0, 10, 15, 16]  ...         4.0      b'53210'
99999                 18.0              [4]  ...         2.0      b'95064'

[100000 rows x 12 columns]


In [6]:
# Keep only the interaction columns. The ids arrive as byte strings
# (e.g. b'138'), so cast both id columns to integers in one astype call.
interaction_columns = ['user_id', 'movie_id', 'user_rating', 'timestamp']
relevant_cols = dataframe[interaction_columns].astype({'user_id': int, 'movie_id': int})
print(relevant_cols)
print(relevant_cols.dtypes)

       user_id  movie_id  user_rating  timestamp
0          138       357          4.0  879024327
1           92       709          2.0  875654590
2          301       412          4.0  882075110
3           60        56          4.0  883326919
4          197       895          3.0  891409199
...        ...       ...          ...        ...
99995      774       228          4.0  888557237
99996      313       333          4.0  891012877
99997      262       567          1.0  879795430
99998      911       183          4.0  892839492
99999      276      1140          2.0  874791894

[100000 rows x 4 columns]
user_id          int64
movie_id         int64
user_rating    float64
timestamp        int64
dtype: object


In [7]:
# Leave-one-out split: rank every user's ratings from newest (rank 1) to oldest.
# method='first' breaks timestamp ties, so exactly one row per user gets rank 1.
relevant_cols['latest'] = relevant_cols.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)

split_columns = ['user_id', 'movie_id', 'user_rating']
is_latest = relevant_cols['latest'] == 1

# .copy() makes each split an independent frame, so adding columns to it later
# does not write into a slice of relevant_cols (SettingWithCopyWarning).
train_ratings = relevant_cols.loc[~is_latest, split_columns].copy()
test_ratings = relevant_cols.loc[is_latest, split_columns].copy()

print(train_ratings.shape)
print(test_ratings.shape)

# The raw ids are fed to the embedding layers as indices, so each table needs
# (largest id + 1) rows. Counting distinct ids only gives that number when the
# ids happen to be contiguous starting at 1.
num_users = int(relevant_cols['user_id'].max()) + 1
num_items = int(relevant_cols['movie_id'].max()) + 1

print('Num Users:', num_users)
print('Num Items:', num_items)

(99057, 3)
(943, 3)
Num Users: 944
Num Items: 1683


## Convert to Implicit Feedback Dataset

In [8]:
# Convert rating to 1 for everything to mark that the user has watched this item.
# assign() returns a new frame, which avoids writing a column into a frame that
# was derived from a filtered slice (pandas' SettingWithCopyWarning).
train_ratings = train_ratings.assign(rating=1)
print(train_ratings)


       user_id  movie_id  user_rating  rating
0          138       357          4.0       1
1           92       709          2.0       1
2          301       412          4.0       1
3           60        56          4.0       1
4          197       895          3.0       1
...        ...       ...          ...     ...
99995      774       228          4.0       1
99996      313       333          4.0       1
99997      262       567          1.0       1
99998      911       183          4.0       1
99999      276      1140          2.0       1

[99057 rows x 4 columns]


In [25]:
# Add negative samples: for every observed (user, movie) pair emit one positive
# row (label 1) followed by `num_negatives` movies the user has no training
# interaction with (label 0).
all_movies = relevant_cols['movie_id'].unique()

users, items, labels = [], [], []
user_item_set = set(zip(train_ratings['user_id'], train_ratings['movie_id']))
num_negatives = 4

# Seeded generator + sorted iteration so that Restart & Run All reproduces
# exactly the same training samples.
negative_rng = np.random.default_rng(42)

for (u, i) in sorted(user_item_set):
  users.append([u])
  items.append([i])
  labels.append([1])
  for _ in range(num_negatives):
    # Rejection sampling: redraw until the movie is unseen by this user.
    # (Would never terminate for a user who interacted with every movie.)
    # int() keeps the lists homogeneous (plain Python ints, like the positives).
    negative_item = int(negative_rng.choice(all_movies))
    while (u, negative_item) in user_item_set:
      negative_item = int(negative_rng.choice(all_movies))
    users.append([u])
    items.append([negative_item])
    labels.append([0])



## Create Dataset

In [26]:
# The feature keys match the names of the models' Input layers ('user', 'item');
# each dataset element is one (features, label) pair.
model_features = {'user': users, 'item': items}
train_ds = tf.data.Dataset.from_tensor_slices((model_features, labels))

In [27]:
# The samples were generated as one positive followed by its negatives, and a
# shuffle buffer smaller than the dataset only mixes neighbouring elements.
# Buffering the whole dataset gives a uniform shuffle.
train_ds = train_ds.shuffle(buffer_size=train_ds.cardinality())
for elem in train_ds.take(10):
  print(elem)

({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([311], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([453], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([450], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([21], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([793], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([499], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([833], dtype=int32)>, 'item': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([973], dtype=int32)>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>)
({'user': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([339], dtype=int32)>, 

## Define Model

In [36]:
from keras.models import Model
from keras.layers import Embedding, Input, Dense, Reshape, Multiply, Flatten, Lambda, Concatenate
from keras import initializers, regularizers
import sys

def get_GMFmodel(num_users, num_items, latent_dim):
  """Build the GMF model: a sigmoid unit over the element-wise product of the
  user and item embeddings.

  Args:
    num_users: number of rows of the user embedding table.
    num_items: number of rows of the item embedding table.
    latent_dim: length of every embedding vector.

  Returns:
    An uncompiled Keras Model with int32 inputs named 'user' and 'item'
    (one id each) and a single sigmoid output named 'prediction'.
  """
  def _id_embedding(vocab_size, layer_name):
    # Every table gets its own initializer and regularizer objects.
    return Embedding(input_dim = vocab_size, output_dim = latent_dim, name = layer_name,
                     embeddings_initializer = initializers.RandomNormal(stddev = 0.01),
                     embeddings_regularizer = regularizers.l2(0), input_length = 1)

  user_ids = Input(shape = (1,), dtype = 'int32', name = 'user')
  item_ids = Input(shape = (1,), dtype = 'int32', name = 'item')

  user_table = _id_embedding(num_users, 'user_embed')
  item_table = _id_embedding(num_items, 'item_embed')

  # Drop the length-1 sequence axis: (batch, 1, latent_dim) -> (batch, latent_dim).
  user_vec = Flatten()(user_table(user_ids))
  item_vec = Flatten()(item_table(item_ids))

  interaction = Multiply()([user_vec, item_vec])
  scorer = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')

  return Model(inputs=[user_ids, item_ids], outputs=scorer(interaction))

In [49]:
def get_MLPmodel(num_users, num_items, latent_dim):
  """Build the MLP model: concatenated user/item embeddings pushed through a
  256-128-32 ReLU tower and a final sigmoid unit.

  Args:
    num_users: number of rows of the user embedding table.
    num_items: number of rows of the item embedding table.
    latent_dim: length of every embedding vector.

  Returns:
    An uncompiled Keras Model with int32 inputs named 'user' and 'item'
    (one id each) and a single sigmoid output (layer 'Layer4').
  """
  user_ids = Input(shape = (1,), dtype = 'int32', name = 'user')
  item_ids = Input(shape = (1,), dtype = 'int32', name = 'item')

  user_table = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embed',
                         embeddings_initializer = initializers.RandomNormal(stddev = 0.01),
                         embeddings_regularizer = regularizers.l2(0), input_length = 1)
  item_table = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embed',
                         embeddings_initializer = initializers.RandomNormal(stddev = 0.01),
                         embeddings_regularizer = regularizers.l2(0), input_length = 1)

  # Drop the length-1 sequence axis before joining the two vectors.
  user_vec = Flatten()(user_table(user_ids))
  item_vec = Flatten()(item_table(item_ids))
  hidden = Concatenate()([user_vec, item_vec])

  # (units, layer name, extra Dense arguments) for each hidden layer, in order.
  tower = [
      (256, 'Layer1', {'kernel_initializer': 'glorot_uniform'}),
      (128, 'Layer2', {}),
      (32, 'Layer3', {}),
  ]
  for units, layer_name, extra in tower:
    hidden = Dense(units, activation='relu', name=layer_name, **extra)(hidden)

  score = Dense(1, activation='sigmoid', name='Layer4')(hidden)

  return Model(inputs=[user_ids, item_ids], outputs=score)
    

In [75]:
from keras.optimizers import Adam

# NOTE(review): presumably the cut-off for the top-N ranking metrics; it is not
# referenced by any other visible cell — confirm.
topK = 10

# NOTE(review): get_MLPGMFmodel is defined in a LATER cell of this notebook, so
# on a fresh kernel that cell has to be executed before this one.
model = get_MLPGMFmodel(num_users, num_items, latent_dim = 12)
model.compile(
    loss = 'binary_crossentropy',
    optimizer = Adam(0.001),
)
model.summary()

Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
item (InputLayer)               [(None, 1)]          0                                            
__________________________________________________________________________________________________
user_embed (Embedding)          (None, 1, 12)        11328       user[0][0]                       
__________________________________________________________________________________________________
item_embed (Embedding)          (None, 1, 12)        20196       item[0][0]                       
___________________________________________________________________________________________

In [None]:
# Keras does not accept `batch_size` when the input is a tf.data.Dataset (the
# dataset is expected to yield batches itself), and train_ds yields single
# samples. Batch the dataset instead of passing batch_size.
hist = model.fit(train_ds.batch(256), epochs=1)



In [74]:
def get_MLPGMFmodel(num_users, num_items, latent_dim):
  """Build a combined GMF + MLP model that emits ONE score per (user, item) pair.

  Both branches read the same user/item embeddings:
    * GMF branch: element-wise product of the two embedding vectors.
    * MLP branch: concatenated embeddings through a 256-128-32 ReLU tower.
  The GMF vector and the last MLP hidden layer are concatenated and fed to a
  single sigmoid unit. (Concatenating two separate sigmoid heads instead would
  give a two-column output with no single ranking score, and both columns
  would be fitted to the same label.)

  Args:
    num_users: number of rows of the user embedding table.
    num_items: number of rows of the item embedding table.
    latent_dim: length of every embedding vector.

  Returns:
    An uncompiled Keras Model with int32 inputs named 'user' and 'item'
    (one id each) and a single sigmoid output named 'prediction'.
  """
  user_input = Input(shape = (1,), dtype = 'int32', name = 'user')
  movie_input = Input(shape = (1,), dtype = 'int32', name = 'item')

  user_embedding = Embedding(input_dim = num_users, output_dim = latent_dim, name = 'user_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01),
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)
  movie_embedding = Embedding(input_dim = num_items, output_dim = latent_dim, name = 'item_embed',
                             embeddings_initializer = initializers.RandomNormal(stddev = 0.01),
                             embeddings_regularizer = regularizers.l2(0), input_length = 1)

  # Drop the length-1 sequence axis: (batch, 1, latent_dim) -> (batch, latent_dim).
  user_latent = Flatten()(user_embedding(user_input))
  item_latent = Flatten()(movie_embedding(movie_input))

  # GMF branch
  gmf_vector = Multiply()([user_latent, item_latent])

  # MLP branch
  mlp_hidden = Concatenate()([user_latent, item_latent])
  mlp_hidden = Dense(256, activation='relu', name='Layer1', kernel_initializer='glorot_uniform')(mlp_hidden)
  mlp_hidden = Dense(128, activation='relu', name='Layer2')(mlp_hidden)
  mlp_hidden = Dense(32, activation='relu', name='Layer3')(mlp_hidden)

  # Fuse both branches and score them with one sigmoid unit.
  fused = Concatenate()([gmf_vector, mlp_hidden])
  prediction = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name = 'prediction')(fused)

  return Model(inputs=[user_input, movie_input], outputs=prediction)


In [16]:
def evaluateNCDG(rankedlist, predictedlist):
  """Return the ratio of two position-discounted gain sums.

  Both sequences are walked in parallel (stopping at the shorter one); the
  value at 1-based position p is divided by log2(p + 1).

  Args:
    rankedlist: gains whose discounted sum is the denominator (the reference
      ordering).
    predictedlist: gains whose discounted sum is the numerator.

  Returns:
    sum(predicted discounted gains) / sum(reference discounted gains), or 0.0
    when the reference sum is zero (e.g. empty input), instead of raising
    ZeroDivisionError.
  """
  sum_rank, sum_pred = 0, 0
  for position, (rank, pred) in enumerate(zip(rankedlist, predictedlist), start=1):
    discount = math.log2(position + 1)
    sum_rank += rank / discount
    sum_pred += pred / discount

  # Nothing to normalise by: define the score as 0 rather than dividing by zero.
  if sum_rank == 0:
    return 0.0
  return sum_pred / sum_rank

In [32]:
def hitRate(item, ranklist):
  """Return 1 if `item` equals any entry of `ranklist`, otherwise 0."""
  found = any(item == candidate for candidate in ranklist)
  return 1 if found else 0

In [31]:
# This method calculates all the evaluation metrics. Individual methods are called from here.
def evaluate(model, testPosRatings, testNegRatings, N=10):
  """Run evaluate_one for every test case and collect the per-case metrics.

  Args:
    model: model handed through to evaluate_one.
    testPosRatings: sequence of positive test entries; entry i is passed to
      evaluate_one as `posRating`.
    testNegRatings: sequence parallel to testPosRatings; entry i is passed to
      evaluate_one as `negRating`.
    N: ranking cut-off forwarded to evaluate_one (defaults to 10).

  Returns:
    (hits, ncdgs): two lists with one entry per test case, in input order.
    Both are empty when there are no test cases.
  """
  hits = []
  ncdgs = []
  for i in range(len(testPosRatings)):
    # evaluate_one requires the cut-off N as its fourth argument.
    hit, ncdg = evaluate_one(model, testPosRatings[i], testNegRatings[i], N)
    hits.append(hit)
    ncdgs.append(ncdg)

  # Return the full list of scores, not just the last one computed.
  return hits, ncdgs

In [33]:
def evaluate_one(model, posRating, negRating, N):
  """Rank one held-out positive item against a list of negative items.

  Args:
    model: object whose predict([user_ids, item_ids]) returns one row of
      score(s) per (user, item) pair.
    posRating: indexable whose element 0 is the user id and element 1 the
      held-out item id.
    negRating: iterable of item ids to rank the held-out item against. It is
      not modified.
    N: number of top-scored candidates to keep.

  Returns:
    (hit, ncdg): hit is 1 if the held-out item is among the top N candidates,
    else 0; ncdg is evaluateNCDG of the top-N list against the ideal ordering
    (held-out item first), i.e. 1 / log2(position + 1) for a hit at 1-based
    `position`, and 0 for a miss.
  """
  user = posRating[0]
  movie = posRating[1]

  # Candidates = negatives + the held-out item. Build a new list so that the
  # caller's negRating is left untouched.
  candidates = list(negRating) + [movie]

  # One user id per candidate, so both model inputs have the same length.
  user_input = np.full(len(candidates), user)
  item_input = np.array(candidates)

  predictions = model.predict([user_input, item_input])

  # Reduce to one scalar per candidate. A single-output model yields one
  # column; if a model emits several columns per pair they are averaged so the
  # ranking is still well defined.
  scores = np.asarray(predictions, dtype=float).reshape(len(candidates), -1).mean(axis=1)

  # associate item with predictions
  items = {}
  for candidate, score in zip(candidates, scores):
    items[candidate] = score

  # heapq.nlargest returns a plain list of item ids, best score first.
  rankedList = heapq.nlargest(N, items, key=items.get)

  # Binary gains: 1 at the position of the held-out item, 0 elsewhere. The
  # ideal ordering has the held-out item in first place.
  gains = [1 if candidate == movie else 0 for candidate in rankedList]
  ideal = [1 if position == 0 else 0 for position in range(len(rankedList))]
  ncdg = evaluateNCDG(ideal, gains) if rankedList else 0.0
  hit = hitRate(movie, rankedList)

  return hit, ncdg