# Deep Neural Network Model

**Resources**
- Baseline code from [tensorflow github](https://github.com/tensorflow/recommenders/blob/main/docs/examples/context_features.ipynb)
    - ❓ Are we sure that we train each entry with the specific movie and not the listo of movies watched from the user

In [1]:
!pip install -q tensorflow-recommenders

In [2]:
import os
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

## TODO: remove this cells
- Explore the example dataset

In [28]:
# TODO remove this cell
!pip install -q --upgrade tensorflow-datasets
import tensorflow_datasets as tfds

ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "timestamp": x["timestamp"],
})
movies = movies.map(lambda x: x["movie_title"]) # Target?



timestamps = np.concatenate(list(ratings.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
    lambda x: x["user_id"]))))


type(movies), type(ratings)

(tensorflow.python.data.ops.dataset_ops.MapDataset,
 tensorflow.python.data.ops.dataset_ops.MapDataset)

**TODO**
- ❓ why we have both movies and ratings? we can extract movies directly from ratings, or not?

In [4]:
type(unique_user_ids), unique_user_ids[0]

(numpy.ndarray, b'1')

In [5]:
type(unique_movie_titles), unique_movie_titles[0]

(numpy.ndarray, b"'Til There Was You (1997)")

In [6]:
movies.as_numpy_iterator().next()

b'You So Crazy (1994)'

In [7]:
ratings.as_numpy_iterator().next()

{'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'user_id': b'138',
 'timestamp': 879024327}

## Load data

**TODO**
- Store dataset info like all the product id with the dataset

In [30]:
df_train["User_ID"] = df_train["User_ID"].astype(str)
df_train["Gender"] = df_train["Gender"].astype(str)
df_train["Product_ID"] = df_train["Product_ID"].astype(str)

In [31]:
import pandas as pd
from os.path import join

dataset_parsed_path = "/home/jupyter/mlspec-blackfriday/dataset/parsed/202104130952/"
train_path = join(dataset_parsed_path, "train.csv")
test_path = join(dataset_parsed_path, "eval.csv")

df_train=pd.read_csv(train_path)
df_test=pd.read_csv(test_path)

print(f"df_train shape:{df_train.shape}")
print(f"df_test shape:{df_test.shape}")

df_train shape:(78366, 12)
df_test shape:(156733, 12)


In [32]:
df_train.head(3)

Unnamed: 0.1,Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,537521,1004732,P00235842,M,26-35,14,B,2,1,11,15.0,16.0
1,531961,1003903,P00180442,M,26-35,0,C,1,0,11,,
2,93286,1002340,P00303842,M,36-45,2,A,2,0,5,8.0,14.0


**TODO**
- TODO calculate and store those info with all the data, with the data

In [33]:
unique_gender = df_train["Gender"].unique()
type(unique_gender), len(unique_gender), unique_gender[0]

(numpy.ndarray, 2, 'M')

In [34]:
unique_user_id = df_train["User_ID"].unique()
type(unique_user_id), len(unique_user_id), unique_user_id[0]

(numpy.ndarray, 5728, 1004732)

In [35]:
unique_product_id = df_train["Product_ID"].unique()
type(unique_product_id), len(unique_product_id), unique_product_id[0]

(numpy.ndarray, 3250, 'P00235842')

## Model definition

### User model

- Require:
    - All the values of features that need `StringLookup` in `numpy.ndarray` format
    
**TODO** 
- extract embeddings dimension as parameter

In [36]:
class UserModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_user_id, mask_token=None),
            tf.keras.layers.Embedding(len(unique_user_id) + 1, 32),
        ])

        self.gender_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=unique_gender, mask_token=None),
            tf.keras.layers.Embedding(len(unique_gender) + 1, 32),
        ])


    def call(self, inputs):
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.gender_embedding(inputs["Gender"]),
        ], axis=1)

## Product model
- Simple start: use only product id

**TODO**
- Find out what `max_tokens` we should use / or if we can remove it
- Parametrize the embeddings dimension

In [37]:
class ProductModel(tf.keras.Model):
  
    def __init__(self):
        super().__init__()

        max_tokens = 10_000

        self.product_embedding = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
              vocabulary=unique_product_id, mask_token=None),
          tf.keras.layers.Embedding(len(unique_product_id) + 1, 32)
        ])


    def call(self, products_id):
        return tf.concat([
            self.product_embedding(products_id)
        ], axis=1)

## Combined model

**Info**
-  `FactorizedTopK` take the list of candidates for build the metric [link](https://www.tensorflow.org/recommenders/api_docs/python/tfrs/metrics/FactorizedTopK)

**TODO**
- Parametrize the embedding dimension
- ❓ Explore the model: we have created two layers of 32 neurons?
- ❓ Ensure `FactorizedTopK` is well understood

In [38]:
# TODO refactory and move this code
tf_products = tf.data.Dataset.from_tensor_slices(df_train["Product_ID"].astype(str).values)

In [39]:
class BlackFridayModel(tfrs.models.Model):

    def __init__(self):
        super().__init__()
        self.query_model = tf.keras.Sequential([
          UserModel(),
          tf.keras.layers.Dense(32)
        ])
        self.candidate_model = tf.keras.Sequential([
          ProductModel(),
          tf.keras.layers.Dense(32)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=tf_products.batch(128).map(self.candidate_model), # dataset of candidate embeddings from which candidates should be retrieved (embedded)
            ),
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model({
            "User_ID": features["User_ID"],
            "Gender": features["Gender"],
        })
        product_embeddings = self.candidate_model(features["movie_title"])

        return self.task(query_embeddings, product_embeddings)

## Prepare the data

In [40]:
df_train_parsed = df_train[["User_ID", "Gender", "Product_ID"]]
df_train_parsed.head()

Unnamed: 0,User_ID,Gender,Product_ID
0,1004732,M,P00235842
1,1003903,M,P00180442
2,1002340,M,P00303842
3,1005795,M,P00201342
4,1002092,M,P00178442


In [41]:
df_train["User_ID"] = df_train["User_ID"].astype(str)
df_train["Gender"] = df_train["Gender"].astype(str)
df_train["Product_ID"] = df_train["Product_ID"].astype(str)

In [42]:
df_train_parsed = df_train[["User_ID", "Gender", "Product_ID"]]
df_train_parsed = df_train_parsed.reset_index(drop=True)
df_train_parsed.dtypes

User_ID       object
Gender        object
Product_ID    object
dtype: object

In [43]:
# TODO refactory and move this code
tf_dataset = tf.data.Dataset.from_tensor_slices(df_train_parsed.to_numpy())
tf_dataset

<TensorSliceDataset shapes: (3,), types: tf.string>

In [44]:
tf.random.set_seed(42)
shuffled = tf_dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [46]:
model = BlackFridayModel()

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [22]:
model = BlackFridayModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

model.fit(cached_train, epochs=3)

train_accuracy = model.evaluate(
    cached_train, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
test_accuracy = model.evaluate(
    cached_test, return_dict=True)["factorized_top_k/top_100_categorical_accuracy"]
    
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")

  has_oov = vocab[oov_start:oov_end] == expected_oov
  if self.oov_token in tokens:


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).