In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import shuffle
import pickle

from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from keras.layers import Dropout, BatchNormalization, Activation,  Dot, Add
from keras.regularizers import l2
from keras.optimizers import SGD, Adam

In [None]:
# Colab-only: mount Google Drive at /drive so files under /drive/MyDrive can be read.
from google.colab import  drive

drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Load the cleaned Amazon Fashion review table from the mounted Drive.
df = pd.read_csv('/drive/MyDrive/Datasets/amazon_fashion/cleaned_amazon_fashion.csv')

In [None]:
!pip install -q tensorflow-recommenders

In [None]:
import tensorflow_recommenders as tfrs
import tensorflow as tf
from typing import Dict, Text

In [None]:
# Turn the two ID columns into a tf.data.Dataset of {"asin", "reviewerID"} records.
# NOTE(review): this rebinds `df` from a pandas DataFrame to a Dataset, so this cell
# cannot be re-run on its own; the read_csv cell has to be re-run first.
df = tf.data.Dataset.from_tensor_slices(df[['asin', 'reviewerID']].to_dict(orient='list'))

In [None]:
def _select_interaction_ids(row):
  """Return only the item and reviewer identifiers of one review record."""
  return {"asin": row["asin"], "reviewerID": row["reviewerID"]}


def _select_item_id(row):
  """Return the item identifier of one review record."""
  return row["asin"]


# (item, reviewer) interaction records used to train the retrieval model.
ratings = df.map(_select_interaction_ids)
# Item identifiers only, one entry per review row.
asin = df.map(_select_item_id)

In [None]:
# Reproducible 80/20 split: shuffle once with a fixed seed, then slice the result.
tf.random.set_seed(42)
n_interactions = len(df)
shuffled = ratings.shuffle(
    n_interactions,
    seed=42,
    reshuffle_each_iteration=False,  # keep the same order so train/test never overlap
)
cutoff = int(0.8 * n_interactions)
train = shuffled.take(cutoff)
test = shuffled.skip(cutoff).take(n_interactions - cutoff)

In [None]:
# Pull every ID into a single batch so the vocabularies can be built with NumPy.
n_rows = len(df)
asin_titles = asin.batch(n_rows)
reviewerID = ratings.batch(n_rows).map(lambda batch: batch["reviewerID"])

# Sorted, de-duplicated vocabularies for the two lookup layers.
unique_asin = np.unique(np.concatenate(list(asin_titles)))
unique_reviewerID = np.unique(np.concatenate(list(reviewerID)))

unique_asin[:10]

array([b'B000YFSR4W', b'B000YFSR5G', b'B0014F7B98', b'B001IKJOLW',
       b'B0058YEJ5K', b'B005AGO4LU', b'B0092UF54A', b'B009MA34NY',
       b'B010RRWKT4', b'B014IBJKNO'], dtype=object)

In [None]:
# Output width of the user and item Embedding layers defined in the next two cells.
embedding_dimension = 32

In [None]:
# User tower: raw reviewer ID string -> vocabulary index -> embedding vector.
user_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_reviewerID, mask_token=None)
# One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
user_id_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_reviewerID) + 1,
    output_dim=embedding_dimension,
)
user_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

In [None]:
# Item tower: raw asin string -> vocabulary index -> embedding vector.
item_id_lookup = tf.keras.layers.StringLookup(
    vocabulary=unique_asin, mask_token=None)
# One extra embedding row accounts for unknown (out-of-vocabulary) tokens.
item_id_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_asin) + 1,
    output_dim=embedding_dimension,
)
fashion_model = tf.keras.Sequential([item_id_lookup, item_id_embedding])

In [None]:
# Top-K retrieval metrics, computed against the item-tower embedding of every
# candidate (candidates are embedded in batches of 128).
candidate_embeddings = asin.batch(128).map(fashion_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

In [None]:
# Retrieval task: turns (user embedding, item embedding) pairs into a loss and
# updates the top-K metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

In [None]:
class FashionModel(tfrs.Model):
  """Two-tower retrieval model pairing reviewers with fashion items.

  Args:
    user_model: Keras model mapping raw `reviewerID` strings to embeddings.
    fashion_model: Keras model mapping raw `asin` strings to embeddings.
    retrieval_task: Optional task layer that converts the two embeddings into
      a loss. When omitted, the notebook-level `task` that exists at
      construction time is used (the original behaviour). Passing it
      explicitly avoids picking up a `task` that a later cell has rebound.
  """

  def __init__(self, user_model, fashion_model, retrieval_task=None):
    super().__init__()
    self.fashion_model: tf.keras.Model = fashion_model
    self.user_model: tf.keras.Model = user_model
    # `task` here is the notebook global; it is only consulted as a fallback.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Batch dict with string tensors under "reviewerID" and "asin".
      training: Accepted for API compatibility; not used by this method.

    Returns:
      The loss tensor produced by `self.task`.
    """
    # Embed the reviewers with the user tower.
    user_embeddings = self.user_model(features["reviewerID"])
    # Embed the items each reviewer interacted with using the item tower.
    positive_fashion_embeddings = self.fashion_model(features["asin"])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_fashion_embeddings)


In [None]:
# Smoke check: the model can be constructed from the two towers (the instance is discarded).
FashionModel(user_model,fashion_model)

<__main__.FashionModel at 0x7faeb10b5ca0>

In [None]:
# Display the user tower object.
user_model

<keras.engine.sequential.Sequential at 0x7faeb16ebfa0>

In [None]:
class NoBaseClassFashionModel(tf.keras.Model):
  """Retrieval model with hand-written train/test steps on plain `tf.keras.Model`.

  Args:
    user_model: Keras model mapping raw `reviewerID` strings to embeddings.
    fashion_model: Keras model mapping raw `asin` strings to embeddings.
    retrieval_task: Optional task layer that converts the two embeddings into
      a loss. When omitted, the notebook-level `task` that exists at
      construction time is used (the original behaviour).
  """

  def __init__(self, user_model, fashion_model, retrieval_task=None):
    super().__init__()
    self.fashion_model: tf.keras.Model = fashion_model
    self.user_model: tf.keras.Model = user_model
    # `task` here is the notebook global; it is only consulted as a fallback.
    self.task: tf.keras.layers.Layer = (
        task if retrieval_task is None else retrieval_task)

  def _loss_terms(self, features: Dict[Text, tf.Tensor]):
    """Return (task loss, regularization loss, total loss) for one batch."""
    user_embeddings = self.user_model(features["reviewerID"])
    positive_fashion_embeddings = self.fashion_model(features["asin"])
    loss = self.task(user_embeddings, positive_fashion_embeddings)

    # Handle regularization losses as well.
    regularization_loss = sum(self.losses)

    return loss, regularization_loss, loss + regularization_loss

  def _build_logs(self, loss, regularization_loss, total_loss):
    """Collect the current metric results plus the three loss values."""
    logs = {metric.name: metric.result() for metric in self.metrics}
    logs["loss"] = loss
    logs["regularization_loss"] = regularization_loss
    logs["total_loss"] = total_loss
    return logs

  def train_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """Run one optimisation step and return the metric/loss logs."""
    # Record the forward pass so gradients of the total loss can be taken.
    with tf.GradientTape() as tape:
      loss, regularization_loss, total_loss = self._loss_terms(features)

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    return self._build_logs(loss, regularization_loss, total_loss)

  def test_step(self, features: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
    """Evaluate one batch (no gradient update) and return the metric/loss logs."""
    loss, regularization_loss, total_loss = self._loss_terms(features)
    return self._build_logs(loss, regularization_loss, total_loss)

In [None]:
# Display the FactorizedTopK metrics object.
metrics

<tensorflow_recommenders.metrics.factorized_top_k.FactorizedTopK at 0x7faeb10bc220>

In [None]:
# Smoke check: the hand-written variant can be constructed (the instance is discarded).
NoBaseClassFashionModel(user_model,fashion_model)

<__main__.NoBaseClassFashionModel at 0x7faeb16d2550>

In [None]:
# Inspect the variables tracked by the metrics object.
metrics.variables

[<tf.Variable 'counter:0' shape=() dtype=int32, numpy=0>]

In [None]:
# Assemble the retrieval model from the two towers and attach an Adagrad optimizer.
model = FashionModel(user_model, fashion_model)
retrieval_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=retrieval_optimizer)

In [None]:
# Batch sizes used for the retrieval run.
# NOTE(review): `train` holds int(0.8 * len(df)) records, so 2500 / 522 do not
# line up with the 80/20 split made earlier — confirm the intended sizes.
TRAIN_BATCH_SIZE = 2500
TEST_BATCH_SIZE = 522

# Batches are cached so later epochs reuse the materialised tensors.
cached_train = (
    train
    .shuffle(len(df))
    .batch(TRAIN_BATCH_SIZE)
    .cache()
)
cached_test = test.batch(TEST_BATCH_SIZE).cache()

In [None]:
# Scratch arithmetic: number of records left over after taking 2500.
len(df)-2500

522

In [None]:
# Train the retrieval model for 30 epochs on the cached training batches.
model.fit(cached_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7faeb1646070>

In [None]:
# Evaluate on the held-out batches; return_dict=True yields {metric name: value}.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0,
 'factorized_top_k/top_10_categorical_accuracy': 0.0,
 'factorized_top_k/top_50_categorical_accuracy': 0.00826446246355772,
 'factorized_top_k/top_100_categorical_accuracy': 0.00826446246355772,
 'loss': 5529.19921875,
 'regularization_loss': 0,
 'total_loss': 5529.19921875}

In [None]:
# Displays the bound method only; nothing is evaluated here.
# NOTE(review): leftover exploration cell — candidate for removal.
model.evaluate_generator

<bound method Model.evaluate_generator of <__main__.FashionModel object at 0x7faeb1651910>>

In [None]:
# Displays the bound method only; no prediction is run here.
# NOTE(review): leftover exploration cell — candidate for removal.
model.predict

<bound method Model.predict of <__main__.FashionModel object at 0x7faeb1651910>>

In [None]:
# `FashionModel` defines `compute_loss` only (no `call`), so `model.predict(...)`
# has no forward pass to run. Recommendations are instead retrieved by indexing
# the item embeddings and querying that index with user embeddings.
candidate_ids = tf.data.Dataset.from_tensor_slices(unique_asin).batch(128)
retrieval_index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
retrieval_index.index_from_dataset(
    tf.data.Dataset.zip((candidate_ids, candidate_ids.map(model.fashion_model)))
)

# Top-scoring item IDs for the reviewers in the first held-out batch.
test_reviewers = next(iter(cached_test))["reviewerID"]
recommendation_scores, recommended_asin = retrieval_index(test_reviewers)
recommended_asin[:5]

In [None]:
# Scratch arithmetic: number of records beyond the first 3000.
len(df) -3000

22

In [None]:
# Re-read the same CSV for the ranking experiment (this time the rating column is needed too).
df1 = pd.read_csv('/drive/MyDrive/Datasets/amazon_fashion/cleaned_amazon_fashion.csv')

In [None]:
# Summarise columns, dtypes and non-null counts of the review table.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3022 entries, 0 to 3021
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      3022 non-null   int64  
 1   overall         3022 non-null   float64
 2   verified        3022 non-null   bool   
 3   reviewTime      3022 non-null   object 
 4   reviewerID      3022 non-null   object 
 5   asin            3022 non-null   object 
 6   reviewerName    3022 non-null   object 
 7   reviewText      3006 non-null   object 
 8   summary         3022 non-null   object 
 9   unixReviewTime  3022 non-null   int64  
 10  size            3009 non-null   object 
 11  color           3009 non-null   object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 262.8+ KB


In [None]:
# Build a tf.data.Dataset of {"asin", "reviewerID", "overall"} records for the ranking model.
# NOTE(review): this rebinds `df1` from a pandas DataFrame to a Dataset, so this cell
# cannot be re-run on its own; the read_csv cell has to be re-run first.
df1 = tf.data.Dataset.from_tensor_slices(df1[['asin', 'reviewerID','overall']].to_dict(orient='list'))

In [None]:
def _select_rating_fields(row):
  """Return the item ID, reviewer ID and rating of one review record."""
  return {
      "asin": row["asin"],
      "reviewerID": row["reviewerID"],
      "overall": row["overall"],
  }


# Labelled interaction records used to train the ranking model.
ratings1 = df1.map(_select_rating_fields)

In [None]:
# Reproducible 80/20 split of the labelled ratings.
tf.random.set_seed(42)
shuffled = ratings1.shuffle(len(df1), seed=42, reshuffle_each_iteration=False)
# The cutoff is derived from `df1`, the dataset actually being split here
# (the original used `len(df)`, the dataset of the earlier retrieval section).
cutoff = int(0.8*len(df1))
train1 = shuffled.take(cutoff)
test1 = shuffled.skip(cutoff).take(len(df1) - cutoff)

In [None]:
# Pull every ID of the ranking dataset into a single batch to build the vocabularies.
# Both columns come from `ratings1`; the original read `reviewerID` from
# `ratings`, the dataset of the earlier retrieval section.
asin = ratings1.batch(len(df1)).map(lambda x: x["asin"])
reviewerID = ratings1.batch(len(df1)).map(lambda x: x["reviewerID"])

# Sorted, de-duplicated vocabularies consumed by RankingModel's lookup layers.
unique_asin = np.unique(np.concatenate(list(asin)))
unique_reviewerID = np.unique(np.concatenate(list(reviewerID)))


In [None]:
class RankingModel(tf.keras.Model):
  """Scores a (reviewerID, asin) pair with embeddings followed by a small MLP."""

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # User branch: reviewer ID string -> index -> embedding.
    # The extra embedding row covers out-of-vocabulary IDs.
    reviewer_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_reviewerID, mask_token=None)
    reviewer_table = tf.keras.layers.Embedding(
        len(unique_reviewerID) + 1, embedding_dimension)
    self.user_embeddings = tf.keras.Sequential([reviewer_lookup, reviewer_table])

    # Item branch: asin string -> index -> embedding.
    item_lookup = tf.keras.layers.StringLookup(
        vocabulary=unique_asin, mask_token=None)
    item_table = tf.keras.layers.Embedding(
        len(unique_asin) + 1, embedding_dimension)
    self.movie_embeddings = tf.keras.Sequential([item_lookup, item_table])

    # Prediction head: two hidden ReLU layers, then a single linear output unit.
    hidden_wide = tf.keras.layers.Dense(256, activation="relu")
    hidden_narrow = tf.keras.layers.Dense(64, activation="relu")
    rating_output = tf.keras.layers.Dense(1)
    self.ratings = tf.keras.Sequential([hidden_wide, hidden_narrow, rating_output])

  def call(self, inputs):
    """Return the predicted rating for each (reviewerID, asin) pair in `inputs`."""
    reviewer_ids, item_ids = inputs

    reviewer_vectors = self.user_embeddings(reviewer_ids)
    item_vectors = self.movie_embeddings(item_ids)

    # Concatenate both embeddings feature-wise and feed them to the head.
    pair_features = tf.concat([reviewer_vectors, item_vectors], axis=1)
    return self.ratings(pair_features)

In [None]:
# Smoke test: score one (reviewer, item) pair with a freshly initialised model.
untrained_ranker = RankingModel()
untrained_ranker((["A3DDWDH9PX2YX2"], ["B000K2PJ4K"]))



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.01580527]], dtype=float32)>

In [None]:
# Ranking task: mean-squared-error loss, reported as RMSE.
# NOTE(review): this rebinds the notebook-level `task`, which FashionModel and
# NoBaseClassFashionModel read when they are constructed.
task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [None]:
class MovielensModel(tfrs.models.Model):
  """Rating-prediction model: a RankingModel trained with an MSE ranking task."""

  def __init__(self):
    super().__init__()
    # RankingModel implements `call` on a (reviewerID, asin) tuple, which is
    # what `call` below passes in. (FashionModel only implements
    # `compute_loss` and cannot be invoked this way.)
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    """Predict ratings for a batch dict holding "reviewerID" and "asin"."""
    return self.ranking_model(
        (features["reviewerID"], features["asin"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the ranking loss for one batch.

    Args:
      features: Batch dict with "reviewerID", "asin" and the label "overall".
      training: Accepted for API compatibility; not used by this method.

    Returns:
      The loss tensor produced by `self.task`.
    """
    # Read the label without mutating the caller's dict.
    labels = features["overall"]

    rating_predictions = self({
        "reviewerID": features["reviewerID"],
        "asin": features["asin"],
    })

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [None]:
# Build the rating-prediction model and attach an Adagrad optimizer.
# Note: this rebinds `model`, replacing the retrieval model defined earlier.
model = MovielensModel()
ranking_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=ranking_optimizer)

In [None]:
# Batch sizes for the ranking run: 2417 + 605 = 3022, the row count reported
# by df1.info().
RANKING_TRAIN_BATCH_SIZE = 2417
RANKING_TEST_BATCH_SIZE = 605

cached_train1 = (
    train1
    .shuffle(len(df1))
    .batch(RANKING_TRAIN_BATCH_SIZE)
    .cache()
)
cached_test1 = test1.batch(RANKING_TEST_BATCH_SIZE).cache()

In [None]:
# Display the dataset to check its element spec (keys, shapes, dtypes).
cached_train1

<CacheDataset element_spec={'asin': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'reviewerID': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'overall': TensorSpec(shape=(None,), dtype=tf.float32, name=None)}>

In [None]:
# Train the rating-prediction model for 3 epochs on the cached training batches.
model.fit(cached_train1, epochs=3)

In [None]:
# Evaluate on the held-out batches; return_dict=True yields {metric name: value}.
model.evaluate(cached_test1, return_dict=True)



{'root_mean_squared_error': 2.0439889430999756,
 'loss': 4.177891254425049,
 'regularization_loss': 0,
 'total_loss': 4.177891254425049}

In [None]:
# Score a few candidate items for one reviewer and list them best-first.
TEST_REVIEWER = "A3DDWDH9PX2YX2"
test_asin = ["B000K2PJ4K", "B00G8Q7JZ4", "B00GKF5BAS"]

test_ratings = {}
# The loop variable is deliberately not called `asin`, so the notebook-level
# `asin` dataset is not overwritten by a plain string.
for candidate_asin in test_asin:
  test_ratings[candidate_asin] = model({
      "reviewerID": np.array([TEST_REVIEWER]),
      "asin": np.array([candidate_asin])
  })

print("overall:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

overall:
B000K2PJ4K: [[5.4934473]]
B00G8Q7JZ4: [[5.4934473]]
B00GKF5BAS: [[5.4934473]]
