### Installs

In [6]:
!pip install pandas
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann
!pip install ipywidgets



### Imports

In [7]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

### Data Set-up

In [8]:
# Ratings data.
ratings = tfds.load("movielens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")


In [9]:
for x in ratings.take(1).as_numpy_iterator():
  pprint.pprint(x)

for x in movies.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}
{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}


In [11]:
dataframe = tfds.as_dataframe(ratings)
print(dataframe)


       bucketized_user_age     movie_genres  ... user_rating user_zip_code
0                     45.0              [7]  ...         4.0      b'53211'
1                     25.0          [4, 14]  ...         2.0      b'80525'
2                     18.0              [4]  ...         4.0      b'55439'
3                     50.0           [5, 7]  ...         4.0      b'06472'
4                     50.0         [10, 16]  ...         3.0      b'75094'
...                    ...              ...  ...         ...           ...
99995                 25.0       [0, 1, 15]  ...         4.0      b'80027'
99996                 35.0         [13, 16]  ...         4.0      b'60035'
99997                 18.0             [10]  ...         1.0      b'78264'
99998                 35.0  [0, 10, 15, 16]  ...         4.0      b'53210'
99999                 18.0              [4]  ...         2.0      b'95064'

[100000 rows x 12 columns]


In [13]:
relevant_cols = dataframe[['user_id', 'movie_id', 'user_rating', 'timestamp']]
print(relevant_cols)
print(relevant_cols.dtypes)

      user_id movie_id  user_rating  timestamp
0      b'138'   b'357'          4.0  879024327
1       b'92'   b'709'          2.0  875654590
2      b'301'   b'412'          4.0  882075110
3       b'60'    b'56'          4.0  883326919
4      b'197'   b'895'          3.0  891409199
...       ...      ...          ...        ...
99995  b'774'   b'228'          4.0  888557237
99996  b'313'   b'333'          4.0  891012877
99997  b'262'   b'567'          1.0  879795430
99998  b'911'   b'183'          4.0  892839492
99999  b'276'  b'1140'          2.0  874791894

[100000 rows x 4 columns]
user_id         object
movie_id        object
user_rating    float64
timestamp        int64
dtype: object


In [20]:
relevant_cols['latest'] = relevant_cols.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
print(relevant_cols)

train_ratings = relevant_cols[relevant_cols['latest'] != 1]
test_ratings = relevant_cols[relevant_cols['latest'] == 1]

train_ratings = train_ratings[['user_id', 'movie_id', 'user_rating']]
test_ratings = test_ratings[['user_id', 'movie_id', 'user_rating']]

print(train_ratings.shape)
print(test_ratings.shape)

      user_id movie_id  user_rating  timestamp  latest
0      b'138'   b'357'          4.0  879024327     4.0
1       b'92'   b'709'          2.0  875654590   240.0
2      b'301'   b'412'          4.0  882075110   224.0
3       b'60'    b'56'          4.0  883326919    94.0
4      b'197'   b'895'          3.0  891409199   104.0
...       ...      ...          ...        ...     ...
99995  b'774'   b'228'          4.0  888557237   119.0
99996  b'313'   b'333'          4.0  891012877   261.0
99997  b'262'   b'567'          1.0  879795430    36.0
99998  b'911'   b'183'          4.0  892839492    59.0
99999  b'276'  b'1140'          2.0  874791894   322.0

[100000 rows x 5 columns]
(99057, 3)
(943, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
