In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dfply import *
import tensorflow_recommenders as tfrs

%matplotlib inline

In [2]:
df = pd.read_csv("./data/ratings_Electronics.csv", names=['userId', 'productId','rating','timestamp'])

In [3]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [4]:
from datetime import datetime
start_date = datetime.fromtimestamp(df["timestamp"].min())
end_date = datetime.fromtimestamp(df["timestamp"].max())
print(f"start date: {start_date}.")
print(f"end date: {end_date}.")

start date: 1998-12-04 00:00:00.
end date: 2014-07-23 01:00:00.


In [5]:
df_sub = df >> mask(
    X.timestamp <= (X.timestamp.min() + X.timestamp.max())/2
)
df_sub

Unnamed: 0,userId,productId,rating,timestamp
165,A2R4GEWPLORVSO,0899336795,2.0,1103328000
166,A1KKUYTDUZDZSA,0899336795,4.0,1104192000
170,A28K8QC9C4WPGE,0899336795,1.0,1141084800
171,A266DODBJYK0X,0899336795,1.0,1116806400
172,A17RBVZX3VTNBW,0899336795,1.0,1111449600
...,...,...,...,...
7341070,A2RWJ3QJRXGD96,B00D12U1IK,5.0,1155340800
7341075,AY8YVQBZ074SB,B00D12U1IK,5.0,1108425600
7438092,ALBE8E3807XTN,B00DR0PDNE,5.0,1044835200
7491311,A3Q6RSWV4M4052,B00E3QH61S,4.0,1135123200


In [6]:
ds = tf.data.Dataset.from_tensor_slices(dict(df_sub))

2021-10-21 09:28:33.671209: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
rating = ds.map(lambda x: {
    "productId": x["productId"],
    "userId": x["userId"],
    "timestamp": x["timestamp"],
    "rating": x["rating"]
})

In [8]:
tf.random.set_seed(42)
shuffled = rating.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [9]:
products = rating.batch(1_000_000).map(lambda x: x["productId"])
user_ids = rating.batch(1_000_000).map(lambda x: x["userId"])

unique_products = np.unique(np.concatenate(list(products)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

2021-10-21 09:28:38.723667: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [10]:
timestamps = np.concatenate(list(rating.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [18]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for products.
    self.product_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_products, mask_token=None),
      tf.keras.layers.Embedding(len(unique_products) + 1, embedding_dimension)
    ])

    #Compute embeddings for time of rating.
    self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
      ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    user_id, timestamp, products = inputs

    user_embedding = self.user_embeddings(user_id)
    timestamp_embedding = self.timestamp_embedding(timestamp)
    product_embedding = self.product_embeddings(products)


    return self.ratings(tf.concat([user_embedding, timestamp_embedding, product_embedding], axis=1))

In [35]:
RankingModel()((["AT6CZDCP4TRGA"],[df_sub.timestamp.max()],["0439886341"]))

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.02492321]], dtype=float32)>

In [22]:
from typing import Dict, Text
class ProductRankModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["userId"], features["timestamp"], features["productId"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("rating")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [23]:
rank_model = ProductRankModel()
rank_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [24]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [25]:
rank_model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a58e1070>

In [26]:
rank_model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 1.5124731063842773,
 'loss': 2.240142822265625,
 'regularization_loss': 0,
 'total_loss': 2.240142822265625}

In [34]:
test_ratings = {}
test_products = ["B00005A1K1", "0439886341","6301977173","7805717443","B00005RKO5"]
for product in test_products:
  test_ratings[product] = rank_model({
      "userId": np.array(["AGHZXQL9F94T9"]),
      "timestamp": np.array([df_sub.timestamp.max()]),
      "productId": np.array([product])
  })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

Ratings:
6301977173: [[3.707836]]
B00005A1K1: [[3.6616886]]
7805717443: [[3.638921]]
0439886341: [[3.593733]]
B00005RKO5: [[3.5905035]]


In [28]:
(df_sub >> group_by(
    X.userId
) >> summarize(
    Count = n(X.userId)
) >> ungroup() >> arrange(
    X.Count,ascending=False
)).head(10)

Unnamed: 0,userId,Count
126264,A5JLAU2ARJ0BO,374
46877,A231WM2Z2JL0U3,249
49805,A25HBO5V8S8SEA,164
144274,AKT8TGIT6VVZ5,112
154132,AT6CZDCP4TRGA,94
1647,A11D1KHM7DVOQK,90
56487,A2B7BUH8834Y6M,87
55605,A2AEZQ3DGBBLPR,80
139230,AGHZXQL9F94T9,73
2774,A12DLJESJKM1OQ,64


In [33]:
(df_sub >> mask(
    X.userId == "AGHZXQL9F94T9"    
)).head(10)

Unnamed: 0,userId,productId,rating,timestamp
31428,AGHZXQL9F94T9,B00002EQD2,5.0,1017187200
61812,AGHZXQL9F94T9,B00004XRED,5.0,1055635200
99076,AGHZXQL9F94T9,B00005ICE3,5.0,1013817600
111658,AGHZXQL9F94T9,B00005NVPW,2.0,1021507200
113379,AGHZXQL9F94T9,B00005OQMO,5.0,1006214400
115958,AGHZXQL9F94T9,B00005QWZ7,1.0,1007078400
116953,AGHZXQL9F94T9,B00005RKO5,1.0,1039046400
131960,AGHZXQL9F94T9,B0000630TO,5.0,1052611200
134768,AGHZXQL9F94T9,B000063BGY,2.0,1022371200
137987,AGHZXQL9F94T9,B0000645C8,5.0,1021507200
