In [None]:
! pip install tensorflow_recommenders

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfr
import pprint
import numpy as np

In [None]:
# Open the "train" split of the MovieLens-100k ratings and movies tables via TFDS.
ratings = tfds.load(name="movielens/100k-ratings", split="train")
movies = tfds.load(name="movielens/100k-movies", split="train")

In [4]:
# Display the dataset object to inspect its element_spec (feature names, shapes, dtypes).
ratings

<PrefetchDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_occupation_text': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None), 'user_zip_code': TensorSpec(shape=(), dtype=tf.string, name=None)}>

In [5]:
# Peek at the first two raw rating records as plain Python/NumPy values.
for rating_example in ratings.take(2).as_numpy_iterator():
    pprint.pprint(rating_example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}
{'bucketized_user_age': 25.0,
 'movie_genres': array([ 4, 14]),
 'movie_id': b'709',
 'movie_title': b'Strictly Ballroom (1992)',
 'raw_user_age': 32.0,
 'timestamp': 875654590,
 'user_gender': True,
 'user_id': b'92',
 'user_occupation_label': 5,
 'user_occupation_text': b'entertainment',
 'user_rating': 2.0,
 'user_zip_code': b'80525'}


In [6]:
# Peek at the first three raw movie records as plain Python/NumPy values.
for movie_example in movies.take(3).as_numpy_iterator():
    pprint.pprint(movie_example)

{'movie_genres': array([4]),
 'movie_id': b'1681',
 'movie_title': b'You So Crazy (1994)'}
{'movie_genres': array([4, 7]),
 'movie_id': b'1457',
 'movie_title': b'Love Is All There Is (1996)'}
{'movie_genres': array([1, 3]),
 'movie_id': b'500',
 'movie_title': b'Fly Away Home (1996)'}


In [7]:
## Keep three features from ratings (movie title, user id, timestamp)
## and reduce each movie record to just its title.

def _select_rating_features(example):
    """Return only the rating fields that the cells below featurize."""
    return {
        "movie_title": example["movie_title"],
        "user_id": example["user_id"],
        "timestamp": example["timestamp"],
    }


ratings = ratings.map(_select_rating_features)
movies = movies.map(lambda movie: movie["movie_title"])
                              

In [8]:
# Check the projected ratings: each record should now hold only three keys.
for rating_example in ratings.take(5).as_numpy_iterator():
    pprint.pprint(rating_example)

{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'timestamp': 879024327,
 'user_id': b'138'}
{'movie_title': b'Strictly Ballroom (1992)',
 'timestamp': 875654590,
 'user_id': b'92'}
{'movie_title': b'Very Brady Sequel, A (1996)',
 'timestamp': 882075110,
 'user_id': b'301'}
{'movie_title': b'Pulp Fiction (1994)',
 'timestamp': 883326919,
 'user_id': b'60'}
{'movie_title': b'Scream 2 (1997)', 'timestamp': 891409199, 'user_id': b'197'}


In [None]:
### Categorical features are turned into embeddings; continuous features are normalized

In [13]:
### Rating-model feature preprocessing.
# Each rating record holds three features: user_id, movie_title and timestamp.

# user_id: a StringLookup layer learns the id vocabulary from the data and
# maps each raw id string to an integer index.
user_id_vocabuary = tf.keras.layers.StringLookup()
user_id_batches = ratings.map(lambda rating: rating["user_id"]).batch(100)
user_id_vocabuary.adapt(user_id_batches)

In [14]:
## movie_title can be handled in two ways: as one opaque category per title,
## or as text split into individual tokens.
# Whole-title variant: one vocabulary entry per distinct title string.
movie_title_vocabulary = tf.keras.layers.StringLookup()
whole_title_batches = ratings.map(lambda rating: rating["movie_title"]).batch(100)
movie_title_vocabulary.adapt(whole_title_batches)

In [16]:
## Text variant: TextVectorization tokenizes each title and learns a token vocabulary.
movie_title_text_vector = tf.keras.layers.TextVectorization()
movie_title_text_vector.adapt(
    ratings.map(lambda rating: rating["movie_title"]).batch(100)
)

In [17]:
# A timestamp can be discretized into buckets or normalized as a continuous feature.
# Either way, first materialize every timestamp into a single NumPy array.
timestamp_batches = ratings.map(lambda rating: rating["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

In [25]:
# Discretization: 1000 evenly spaced boundaries spanning the observed time range.
min_time, max_time = timestamps.min(), timestamps.max()
timestamps_bucket = np.linspace(min_time, max_time, num=1000)
discrete_timestamp = tf.keras.layers.Discretization(
    bin_boundaries=timestamps_bucket.tolist()
)

In [26]:
# Bucket index assigned to every timestamp.
discrete_timestamp(timestamps)

<tf.Tensor: shape=(100000,), dtype=int64, numpy=array([232,  51, 396, ..., 273, 975,   4])>

In [69]:
### Normalization of timestamps
# axis=None makes the layer learn one scalar mean/variance over all values.
normalize_timestamps = tf.keras.layers.Normalization(axis=None)
normalize_timestamps.adapt(timestamps.astype("float32"))


In [63]:
# NOTE(review): this cell used to evaluate a bare `value`, a name that no visible
# cell defines, so it raised NameError on Restart & Run All. The stray debugging
# expression has been removed.

In [59]:
# Show the materialized timestamp array (NumPy abbreviates the repr of large arrays).
timestamps

array([879024327, 875654590, 882075110, ..., 879795430, 892839492,
       874791894])

In [33]:
# Inspect the batched user-id dataset (batches of 100 id strings), the same
# pipeline that is passed to StringLookup.adapt.
ratings.map(lambda x:x['user_id']).batch(100)

<BatchDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

In [36]:
## Convert user ids to embeddings
# Size the table with vocabulary_size() (the non-deprecated replacement for
# vocab_size()); the count includes the out-of-vocabulary slot.
user_id_embeddings_layer = tf.keras.layers.Embedding(user_id_vocabuary.vocabulary_size(), 32)
# Integer index of every rating's user id, in dataset order.
lookup = user_id_vocabuary(np.concatenate(list(ratings.map(lambda x: x['user_id']).batch(100))))
# One 32-dimensional vector per rating.
user_id_embeddings = user_id_embeddings_layer(lookup)
user_id_embeddings



<tf.Tensor: shape=(100000, 32), dtype=float32, numpy=
array([[-0.04741148,  0.02330904,  0.03542682, ..., -0.02041677,
         0.00529487, -0.02664511],
       [ 0.03803145,  0.02068267, -0.02545168, ...,  0.00787871,
        -0.00931245,  0.00760555],
       [-0.01932745, -0.03018264, -0.03894828, ...,  0.0285219 ,
        -0.0217639 ,  0.00942328],
       ...,
       [ 0.00747423,  0.01938125,  0.02604466, ...,  0.01582741,
         0.03754489, -0.01717795],
       [-0.01697094, -0.0251235 , -0.00992846, ...,  0.00490737,
        -0.04428206, -0.02114056],
       [ 0.0117141 ,  0.02431152,  0.02682875, ...,  0.01206579,
        -0.03181015,  0.04185197]], dtype=float32)>

In [39]:
# Movie title as StringLookup: one embedding row per distinct title
# Size the table with vocabulary_size() (the non-deprecated replacement for
# vocab_size()); the count includes the out-of-vocabulary slot.
movie_title_embeddings_layer = tf.keras.layers.Embedding(movie_title_vocabulary.vocabulary_size(), 32)
# Integer index of every rating's movie title, in dataset order.
lookup = movie_title_vocabulary(np.concatenate(list(ratings.map(lambda x: x['movie_title']).batch(100))))
# One 32-dimensional vector per rating.
movie_title_embeddings = movie_title_embeddings_layer(lookup)
movie_title_embeddings



<tf.Tensor: shape=(100000, 32), dtype=float32, numpy=
array([[-0.01630978, -0.02569931,  0.03665093, ..., -0.03104436,
        -0.03821566, -0.02537227],
       [ 0.03667306, -0.00716151,  0.04938321, ...,  0.02294308,
         0.04402134, -0.03611802],
       [-0.00269121,  0.02585173, -0.0040849 , ...,  0.00499278,
         0.02188036, -0.03835566],
       ...,
       [-0.04608719, -0.01552929, -0.01678654, ...,  0.00850805,
        -0.01397078,  0.03189094],
       [ 0.04676677,  0.04201912, -0.02155346, ..., -0.03681235,
        -0.03263997,  0.01325144],
       [ 0.02812256,  0.03993626, -0.01516765, ...,  0.0149873 ,
         0.0392216 , -0.04773338]], dtype=float32)>

In [45]:
## Movie title as TextVectorization: embed every token, then average per title
movie_title_text_embedding_layer = tf.keras.layers.Embedding(movie_title_text_vector.vocabulary_size(), 32)
# Token ids for every rated title; shorter titles are padded with id 0 so that
# all rows have the same length.
lookup = movie_title_text_vector(np.concatenate(list(ratings.map(lambda x: x['movie_title']).batch(100))))
movie_title_text_embeddings = movie_title_text_embedding_layer(lookup)
# Average only over real tokens: without a mask the padding positions (id 0) are
# averaged in too, which dilutes the representation of short titles.
averageMovietitle_embeddings = tf.keras.layers.GlobalAveragePooling1D()(
    movie_title_text_embeddings, mask=tf.not_equal(lookup, 0)
)
averageMovietitle_embeddings

<tf.Tensor: shape=(100000, 32), dtype=float32, numpy=
array([[-0.0271943 , -0.01294007, -0.02463014, ...,  0.03660198,
         0.01925104, -0.01043711],
       [-0.03194948, -0.01799601, -0.02133049, ...,  0.03864413,
         0.03078798, -0.00572893],
       [-0.02928905, -0.01709648, -0.01279188, ...,  0.02684232,
         0.0194418 , -0.0043711 ],
       ...,
       [-0.02833093, -0.01832613, -0.02111807, ...,  0.03139702,
         0.02557224,  0.00266598],
       [-0.03918093, -0.02035939, -0.02216072, ...,  0.04086189,
         0.02871422, -0.00659649],
       [-0.03090936, -0.0152417 , -0.018055  , ...,  0.03150586,
         0.01717355, -0.000103  ]], dtype=float32)>

In [46]:
### Timestamp, discretized form
# Discretization with N boundaries produces N + 1 bucket ids, hence the + 1.
timestamps_embedding_layer = tf.keras.layers.Embedding(len(timestamps_bucket) + 1, 32)
# Reuse the `timestamps` array that was already materialized from `ratings`
# instead of making another full pass over the dataset to rebuild it.
lookup = discrete_timestamp(timestamps)
# One 32-dimensional vector per rating, selected by its time bucket.
timestamps_embeddings = timestamps_embedding_layer(lookup)
timestamps_embeddings

<tf.Tensor: shape=(100000, 32), dtype=float32, numpy=
array([[-0.01324006,  0.03092024, -0.00539099, ..., -0.00962311,
         0.04171616, -0.01374578],
       [-0.04368169,  0.03692111,  0.01683   , ...,  0.01508811,
        -0.0214709 ,  0.01247606],
       [ 0.04076887, -0.00775604,  0.0480671 , ...,  0.02259675,
         0.00748792, -0.02217144],
       ...,
       [ 0.0049464 , -0.00583805, -0.00048954, ...,  0.03865978,
        -0.00950354,  0.02396666],
       [ 0.04102074,  0.03638947,  0.0165466 , ...,  0.04592978,
        -0.01944245,  0.01336234],
       [-0.01417219,  0.00638954,  0.01791376, ...,  0.01943837,
        -0.0446508 , -0.04673837]], dtype=float32)>

In [72]:
# Standardize the timestamps with the adapted Normalization layer. Reuse the
# `timestamps` array that was already materialized from `ratings` instead of
# making another full pass over the dataset to rebuild it.
norm_timestamps = normalize_timestamps(timestamps)
# Reshape to a column so it can be concatenated with the 2-D embedding tensors.
norm_timestamps = tf.reshape(norm_timestamps, (-1, 1))
norm_timestamps

<tf.Tensor: shape=(100000, 1), dtype=float32, numpy=
array([[-0.8428199],
       [-1.4734049],
       [-0.2719137],
       ...,
       [-0.6985155],
       [ 1.7424442],
       [-1.6348476]], dtype=float32)>

In [73]:
# Join all per-rating feature blocks side by side along the feature axis.
feature_blocks = [
    user_id_embeddings,
    movie_title_embeddings,
    averageMovietitle_embeddings,
    timestamps_embeddings,
    norm_timestamps,
]
user_embeddings = tf.concat(feature_blocks, axis=1)

In [74]:
# Display the concatenated feature tensor (four 32-wide embeddings plus one normalized column).
user_embeddings

<tf.Tensor: shape=(100000, 129), dtype=float32, numpy=
array([[-0.04741148,  0.02330904,  0.03542682, ...,  0.04171616,
        -0.01374578, -0.8428199 ],
       [ 0.03803145,  0.02068267, -0.02545168, ..., -0.0214709 ,
         0.01247606, -1.4734049 ],
       [-0.01932745, -0.03018264, -0.03894828, ...,  0.00748792,
        -0.02217144, -0.2719137 ],
       ...,
       [ 0.00747423,  0.01938125,  0.02604466, ..., -0.00950354,
         0.02396666, -0.6985155 ],
       [-0.01697094, -0.0251235 , -0.00992846, ..., -0.01944245,
         0.01336234,  1.7424442 ],
       [ 0.0117141 ,  0.02431152,  0.02682875, ..., -0.0446508 ,
        -0.04673837, -1.6348476 ]], dtype=float32)>