### Installs

In [1]:
!pip install pandas
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann
!pip install ipywidgets

[K     |████████████████████████████████| 61kB 5.8MB/s 
[K     |████████████████████████████████| 394.7MB 37kB/s 
[K     |████████████████████████████████| 3.7MB 16.6MB/s 
[K     |████████████████████████████████| 11.1MB 9.8MB/s 


### Imports

In [2]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

### Data Set-up

In [3]:
# MovieLens 100K rating events (one record per user/movie rating).
ratings = tfds.load(
    name="movielens/100k-ratings",
    split="train",
)
# MovieLens 100K movie catalogue: features of all the available movies.
movies = tfds.load(
    name="movielens/100k-movies",
    split="train",
)


[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=100000.0, style=Progre…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=100000.0, sty…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m
[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 150.35 KiB, total: 4.84 MiB) to /root/tensorflow_datasets/movielens/100k-movies/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=1682.0, style=Progress…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=1682.0, style…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-movies/0.1.0. Subsequent calls will reuse this data.[0m


In [4]:
# Pretty-print one raw example from each dataset (ratings first, then movies)
# to see which features are available.
for dataset in (ratings, movies):
  for example in dataset.take(1).as_numpy_iterator():
    pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}
{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [5]:
# Materialise the ratings dataset as a pandas DataFrame so the following cells
# can select columns and group rows with pandas.
dataframe = tfds.as_dataframe(ratings)
# Plain-text preview of the frame (pandas elides the middle rows/columns).
print(dataframe)


       bucketized_user_age     movie_genres  ... user_rating user_zip_code
0                     45.0              [7]  ...         4.0      b'53211'
1                     25.0          [4, 14]  ...         2.0      b'80525'
2                     18.0              [4]  ...         4.0      b'55439'
3                     50.0           [5, 7]  ...         4.0      b'06472'
4                     50.0         [10, 16]  ...         3.0      b'75094'
...                    ...              ...  ...         ...           ...
99995                 25.0       [0, 1, 15]  ...         4.0      b'80027'
99996                 35.0         [13, 16]  ...         4.0      b'60035'
99997                 18.0             [10]  ...         1.0      b'78264'
99998                 35.0  [0, 10, 15, 16]  ...         4.0      b'53210'
99999                 18.0              [4]  ...         2.0      b'95064'

[100000 rows x 12 columns]


In [6]:
# Keep only the columns needed for the train/test split. The explicit .copy()
# makes relevant_cols an independent frame rather than a slice of `dataframe`,
# so adding a column to it later does not raise SettingWithCopyWarning.
relevant_cols = dataframe[['user_id', 'movie_id', 'user_rating', 'timestamp']].copy()
# Show the selected columns and their dtypes (ids are stored as bytes objects).
print(relevant_cols)
print(relevant_cols.dtypes)

      user_id movie_id  user_rating  timestamp
0      b'138'   b'357'          4.0  879024327
1       b'92'   b'709'          2.0  875654590
2      b'301'   b'412'          4.0  882075110
3       b'60'    b'56'          4.0  883326919
4      b'197'   b'895'          3.0  891409199
...       ...      ...          ...        ...
99995  b'774'   b'228'          4.0  888557237
99996  b'313'   b'333'          4.0  891012877
99997  b'262'   b'567'          1.0  879795430
99998  b'911'   b'183'          4.0  892839492
99999  b'276'  b'1140'          2.0  874791894

[100000 rows x 4 columns]
user_id         object
movie_id        object
user_rating    float64
timestamp        int64
dtype: object


In [8]:
# Leave-one-out split: rank each user's ratings from newest (rank 1) to oldest.
# method='first' breaks timestamp ties by row order, so exactly one row per
# user receives rank 1.
latest_rank = relevant_cols.groupby('user_id')['timestamp'].rank(method='first', ascending=False)
# .assign builds a new frame instead of writing a column into a slice of
# `dataframe`, which is what raised SettingWithCopyWarning before.
relevant_cols = relevant_cols.assign(latest=latest_rank)

is_latest = relevant_cols['latest'] == 1
split_cols = ['user_id', 'movie_id', 'user_rating']

# Each user's most recent rating is held out for testing; everything else is
# training data. .copy() makes both frames independent of relevant_cols so
# later column assignments on them are unambiguous.
train_ratings = relevant_cols.loc[~is_latest, split_cols].copy()
test_ratings = relevant_cols.loc[is_latest, split_cols].copy()

print(train_ratings.shape)
print(test_ratings.shape)

(99057, 3)
(943, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Convert to Implicit Feedback Dataset

In [11]:
# Convert rating to 1 for everything to mark that the user has watched this item
# (implicit feedback: every observed interaction becomes a positive label).
# The original 'user_rating' column is left in place; 'rating' is added next to it.
train_ratings.loc[:, 'rating'] = 1
# Print the frame to confirm the new constant column.
print(train_ratings)

      user_id movie_id  user_rating  rating
0      b'138'   b'357'          4.0       1
1       b'92'   b'709'          2.0       1
2      b'301'   b'412'          4.0       1
3       b'60'    b'56'          4.0       1
4      b'197'   b'895'          3.0       1
...       ...      ...          ...     ...
99995  b'774'   b'228'          4.0       1
99996  b'313'   b'333'          4.0       1
99997  b'262'   b'567'          1.0       1
99998  b'911'   b'183'          4.0       1
99999  b'276'  b'1140'          2.0       1

[99057 rows x 4 columns]


In [14]:
# Add negative samples
# Candidate pool for negatives: every movie id that appears in relevant_cols.
all_movies = relevant_cols['movie_id'].unique()

users, items, labels = [], [], []
# Observed (user, movie) interactions in the training split. The second
# column must be movie_id: zipping user_id with itself produced (user, user)
# pairs, so the "positive" items were user ids and the rejection test below
# compared candidates against user ids instead of watched movies.
user_item_set = set(zip(train_ratings['user_id'], train_ratings['movie_id']))
num_negatives = 4

# NOTE(review): no random seed is set in the visible cells, so the sampled
# negatives differ from run to run.
for (u, i) in user_item_set:
  # One positive example per observed interaction ...
  users.append(u)
  items.append(i)
  labels.append(1)
  # ... followed by num_negatives movies this user has no training interaction with.
  for _ in range(num_negatives):
    negative_item = np.random.choice(all_movies)
    # Rejection sampling: redraw until the pair is not an observed interaction.
    while (u, negative_item) in user_item_set:
      negative_item = np.random.choice(all_movies)
    users.append(u)
    items.append(negative_item)
    labels.append(0)

## Create Dataset

In [16]:
# Build the training dataset: each element is one (user, item, label) triple
# sliced from the three parallel lists.
train_ds = tf.data.Dataset.from_tensor_slices(
    tensors=(users, items, labels),
)

In [21]:
# Shuffle with a 1000-element buffer, then print ten elements as a sanity check.
# NOTE(review): train_ds is rebound to its shuffled version, so re-running this
# cell stacks another shuffle on top; the buffer also looks smaller than the
# dataset, which would make the shuffle only approximate. TODO confirm intent.
train_ds = train_ds.shuffle(buffer_size=1000)
for sample in train_ds.take(count=10):
  print(sample)

(<tf.Tensor: shape=(), dtype=string, numpy=b'870'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1465'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'187'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1161'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'755'>, <tf.Tensor: shape=(), dtype=string, numpy=b'1379'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'115'>, <tf.Tensor: shape=(), dtype=string, numpy=b'126'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'471'>, <tf.Tensor: shape=(), dtype=string, numpy=b'942'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'661'>, <tf.Tensor: shape=(), dtype=string, numpy=b'333'>, <tf.Tensor: shape=(), dtype=int32, numpy=0>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'429'>, <tf.Tensor: shape=(), dtype=string, numpy=b'488'>, <tf.Te