In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dfply import *
import tensorflow_recommenders as tfrs

%matplotlib inline

In [2]:
df = pd.read_csv("./data/ratings_Electronics.csv", names=['userId', 'productId','rating','timestamp'])

In [3]:
df.head()

Unnamed: 0,userId,productId,rating,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [4]:
from datetime import datetime
start_date = datetime.fromtimestamp(df["timestamp"].min())
end_date = datetime.fromtimestamp(df["timestamp"].max())
print(f"start date: {start_date}.")
print(f"end date: {end_date}.")

start date: 1998-12-04 00:00:00.
end date: 2014-07-23 01:00:00.


In [5]:
df_sub = df >> mask(
    X.timestamp <= (X.timestamp.min() + X.timestamp.max())/2
)
df_after = df >> mask(
    X.timestamp > (X.timestamp.min() + X.timestamp.max())/2
)
df_sub.head()

Unnamed: 0,userId,productId,rating,timestamp
165,A2R4GEWPLORVSO,899336795,2.0,1103328000
166,A1KKUYTDUZDZSA,899336795,4.0,1104192000
170,A28K8QC9C4WPGE,899336795,1.0,1141084800
171,A266DODBJYK0X,899336795,1.0,1116806400
172,A17RBVZX3VTNBW,899336795,1.0,1111449600


In [6]:
product_lst = (df_sub >> group_by(
    X.productId
) >> summarize(
    Count = n(X.userId)
) >> ungroup() >> arrange(
    X.Count
)>> mask(
    X.Count <=10
))['productId'].values

In [7]:
df_sub=(df_sub >> mask(
    ~X.productId.isin(product_lst)
))

In [8]:
ds = tf.data.Dataset.from_tensor_slices(dict(df_sub))

2021-10-21 16:29:10.101220: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
rating = ds.map(lambda x: {
    "productId": x["productId"],
    "userId": x["userId"],
    "timestamp": x["timestamp"],
    "rating": x["rating"]
})

In [10]:
tf.random.set_seed(42)
shuffled = rating.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [11]:
products = rating.batch(100_000).map(lambda x: x["productId"])
user_ids = rating.batch(100_000).map(lambda x: x["userId"])

unique_products = np.unique(np.concatenate(list(products)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

2021-10-21 16:30:06.708379: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
timestamps = np.concatenate(list(rating.map(lambda x: x["timestamp"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [13]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for products.
    self.product_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_products, mask_token=None),
      tf.keras.layers.Embedding(len(unique_products) + 1, embedding_dimension)
    ])

    #Compute embeddings for time of rating.
    self.timestamp_embedding = tf.keras.Sequential([
          tf.keras.layers.Discretization(timestamp_buckets.tolist()),
          tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
      ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    user_id, timestamp, products = inputs

    user_embedding = self.user_embeddings(user_id)
    timestamp_embedding = self.timestamp_embedding(timestamp)
    product_embedding = self.product_embeddings(products)


    return self.ratings(tf.concat([user_embedding, timestamp_embedding, product_embedding], axis=1))

In [14]:
RankingModel()((["AT6CZDCP4TRGA"],[df_sub.timestamp.max()],["0439886341"]))

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.03232504]], dtype=float32)>

In [15]:
from typing import Dict, Text
class ProductRankModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["userId"], features["timestamp"], features["productId"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("rating")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [16]:
rank_model = ProductRankModel()
rank_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [17]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [18]:
rank_model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1aca81100>

In [19]:
rank_model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 1.4738773107528687,
 'loss': 2.1955771446228027,
 'regularization_loss': 0,
 'total_loss': 2.1955771446228027}

In [23]:
test_ratings = {}
test_products = ["B00002EQD2","B000063BGY"]
for product in test_products:
  test_ratings[product] = rank_model({
      "userId": np.array(["AGHZXQL9F94T9"]),
      "timestamp": np.array([df_sub.timestamp.max()]),
      "productId": np.array([product])
  })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

Ratings:
B000063BGY: [[3.7539911]]
B00002EQD2: [[3.72746]]


In [21]:
(df_sub >> group_by(
    X.userId
) >> summarize(
    Count = n(X.userId)
) >> ungroup() >> arrange(
    X.Count,ascending=False
)).head(10)

Unnamed: 0,userId,Count
94498,A5JLAU2ARJ0BO,228
35085,A231WM2Z2JL0U3,173
104141,AGHZXQL9F94T9,61
99819,ABH9D1TEOJ56E,49
115345,AT2J7H5TRZM8Z,46
2098,A12DLJESJKM1OQ,46
115439,AT6CZDCP4TRGA,41
3765,A149RNR5RH19YY,41
41666,A2AEZQ3DGBBLPR,40
20292,A1MJMYLRTZ76ZX,38


In [22]:
(df_sub >> mask(
    X.userId == "AGHZXQL9F94T9"    
)).head(10)

Unnamed: 0,userId,productId,rating,timestamp
31428,AGHZXQL9F94T9,B00002EQD2,5.0,1017187200
61812,AGHZXQL9F94T9,B00004XRED,5.0,1055635200
99076,AGHZXQL9F94T9,B00005ICE3,5.0,1013817600
111658,AGHZXQL9F94T9,B00005NVPW,2.0,1021507200
113379,AGHZXQL9F94T9,B00005OQMO,5.0,1006214400
131960,AGHZXQL9F94T9,B0000630TO,5.0,1052611200
134768,AGHZXQL9F94T9,B000063BGY,2.0,1022371200
137987,AGHZXQL9F94T9,B0000645C8,5.0,1021507200
138098,AGHZXQL9F94T9,B0000645C9,2.0,1020556800
144455,AGHZXQL9F94T9,B000065UTC,4.0,1059436800
