In [None]:
!pip install -q tensorflow-recommenders
!pip install -q scann

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from typing import Dict, Text
import os
import pprint
import tempfile
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the click-data CSV export from the mounted Drive path and preview
# its first rows.
click_data = pd.read_csv(
    "/content/drive/MyDrive/Datasets/recommender/CLEVERTRAP MASTER Test-1687804060501.csv"
)
click_data.head()

In [None]:
# 'event_props' column contains string values instead of dictionaries
# ast.literal_eval() function is used to parse the string values in the 'event_props' column into dictionaries
import ast
def parse_event_props(x):
    """Parse a stringified ``event_props`` value into a Python object.

    Args:
        x: A raw cell value. Strings are evaluated as Python literals
            (typically a dict); any other type is treated as missing.

    Returns:
        The object produced by ``ast.literal_eval`` for a well-formed string,
        or ``np.nan`` when ``x`` is not a string or cannot be parsed.
    """
    if not isinstance(x, str):
        return np.nan  # Non-string values (e.g. NaN for empty cells) carry no properties.
    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # One malformed or truncated row must not abort parsing of the whole
        # column; report it as missing, exactly like a non-string value.
        return np.nan

# Swap each raw 'event_props' cell for its parsed value; cells that are not
# strings become NaN.
click_data['event_props'] = click_data['event_props'].map(parse_event_props)

In [None]:
def get_item_id(x):
    """Return the ``'item_id'`` entry of a parsed ``event_props`` value.

    Args:
        x: A parsed ``event_props`` value; only ``dict`` instances are inspected.

    Returns:
        ``x['item_id']`` when ``x`` is a dict containing that key, otherwise ``None``.
    """
    # dict.get already yields None for a missing key, so a single guard on
    # the type covers both the "not a dict" and the "no item_id" cases.
    return x.get('item_id') if isinstance(x, dict) else None

# Derive the 'sku' column from the parsed properties, then drop (in place)
# every row that is missing either a SKU or an email.
click_data['sku'] = click_data['event_props'].map(get_item_id)
click_data.dropna(axis=0, how='any', subset=['sku', 'email'], inplace=True)

In [None]:
# Numeric weight assigned to each event type.
event_type_weights = {
    'remove_from_cart': 0.0,
    'view_item': 1.0,
    'homepage_carousel_product_press': 1.0,
    'add_to_cart': 2.0,
    'image_download': 2.0,
}

# Replace every event name with its weight. The lookup is a plain dict
# index, so an event type missing from the table raises KeyError.
# NOTE: the column is overwritten, so running this cell a second time fails
# because the values are then floats rather than event names.
click_data['events'] = click_data['events'].map(
    lambda event_name: event_type_weights[event_name]
)

In [None]:
# Narrow the frame to the email, sku and events columns (as an independent
# copy) and preview the result.
click_data = click_data[['email', 'sku', 'events']].copy()
click_data.head()

In [None]:
# Turn the DataFrame columns into a tf.data pipeline of per-row feature dicts.
interactions = tf.data.Dataset.from_tensor_slices(dict(click_data))

# tf.cast replaces Python's float(): calling float() on a symbolic tensor only
# works when AutoGraph manages to rewrite the lambda, whereas tf.cast is a
# native TensorFlow op and produces the same float32 value.
interactions = interactions.map(lambda x: {
    'email': x['email'],
    'sku': x['sku'],
    'events': tf.cast(x['events'], tf.float32),
})

In [None]:
# Build a dataset that yields each distinct SKU once: de-duplicate in pandas,
# slice the single-column frame into a tf.data pipeline, then unwrap the
# feature dict so every element is just the SKU value.
unique_sku_frame = pd.DataFrame(click_data["sku"].unique(), columns=["sku"])
items = (
    tf.data.Dataset
    .from_tensor_slices(dict(unique_sku_frame))
    .map(lambda row: row['sku'])
)

In [None]:
# Fix the global seed and shuffle once; with reshuffle_each_iteration=False
# every pass over `shuffled` sees the same order.
tf.random.set_seed(42)
shuffled = interactions.shuffle(
    buffer_size=100_000,
    seed=42,
    reshuffle_each_iteration=False,
)

# The training set is at most the first 100_000 shuffled interactions.
# A held-out split is currently disabled:
# test = shuffled.skip(80_000).take(20_000)
train = shuffled.take(count=100_000)

In [None]:
# Batch both datasets so their contents can be pulled into NumPy in a few
# large chunks instead of element by element.
item_names = items.batch(1_000)
emails = interactions.batch(1_000_000).map(lambda x: x["email"])

# Vocabularies handed to the StringLookup layers: sorted, de-duplicated
# SKUs and emails.
unique_items = np.unique(np.concatenate([chunk.numpy() for chunk in item_names]))
unique_emails = np.unique(np.concatenate([chunk.numpy() for chunk in emails]))



In [None]:
class RecommendationModel(tfrs.Model):
  """Two-tower retrieval model pairing user emails with item SKUs.

  Each tower is a ``StringLookup`` followed by an ``Embedding``. The towers are
  trained with a ``tfrs.tasks.Retrieval`` task whose ``FactorizedTopK`` metric
  uses the embeddings of every element of the module-level ``items`` dataset
  as candidates.

  The module-level names ``unique_items``, ``unique_emails`` and ``items`` must
  be defined before the model is instantiated.

  Args:
    embedding_dimension: Width of the email and item embedding vectors.
  """

  def __init__(self, embedding_dimension: int = 32):
    super().__init__()

    # The embedding tables get one row more than the vocabulary so the
    # lookup layer's out-of-vocabulary index (mask_token=None, default single
    # OOV bucket) also has an embedding.
    self.item_model: tf.keras.Model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_items, mask_token=None),
        tf.keras.layers.Embedding(len(unique_items) + 1, embedding_dimension),
    ])

    self.email_model: tf.keras.Model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_emails, mask_token=None),
        tf.keras.layers.Embedding(len(unique_emails) + 1, embedding_dimension),
    ])

    metrics = tfrs.metrics.FactorizedTopK(candidates=items.batch(128).map(self.item_model))
    self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(metrics=metrics)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch of interactions.

    Args:
      features: Batch dict with ``"email"`` and ``"sku"`` entries and,
        optionally, an ``"events"`` entry holding one weight per example.
      training: Unused; accepted for compatibility with the Keras training loop.

    Returns:
      The loss tensor produced by the retrieval task.
    """
    user_embeddings = self.email_model(features["email"])
    item_embeddings = self.item_model(features["sku"])

    # Weight each interaction by its event score when one is supplied, so a
    # zero-weight event (e.g. remove_from_cart) no longer counts as a positive
    # example. Without an "events" feature the weight is None and every
    # example is weighted equally.
    return self.task(
        user_embeddings,
        item_embeddings,
        sample_weight=features.get("events"),
    )

In [None]:
# Instantiate the model and attach an Adagrad optimizer.
model = RecommendationModel()
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

# Give every run its own timestamped TensorBoard log directory.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [None]:
# Shuffle, batch and cache the training data. The cache sits after the
# shuffle and batch steps, so it stores the already-batched elements.
cached_train = (
    train
    .shuffle(100_000)
    .batch(8192)
    .cache()
)
# cached_test = test.batch(4096).cache()

In [None]:
# Train for 10 epochs with TensorBoard logging and keep fit()'s return value.
model_hist = model.fit(
    cached_train,
    epochs=10,
    callbacks=[tensorboard_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Create a ScaNN top-k layer that uses the email tower as its query model.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.email_model)

# Index pairs of (SKU batch, embedding batch) so lookups return the SKU
# values themselves as identifiers.
sku_batches = items.batch(100)
scann_index.index_from_dataset(
  tf.data.Dataset.zip((sku_batches, sku_batches.map(model.item_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.ScaNN at 0x79f3ad9f5c90>

In [None]:
# Query the index for one email and show the identifiers returned for it.
query_email = tf.constant(["abcdefghijk@gmail.com"])
_, titles = scann_index(query_email)
top_titles = titles[0, :].numpy().astype(str)
print(f"Recommendations for user: {top_titles}")

In [None]:
# Time a single-email query against the ScaNN index.
%timeit _, titles = scann_index(tf.constant(["abcdef@gmail.com"]))

3.35 ms ± 618 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
# Persist the index to Drive. `path` must outlive this cell because the next
# cell loads the model back from it, so a TemporaryDirectory cannot be used.
# (Joining a temp dir with this absolute path would discard the temp dir
# anyway: os.path.join drops earlier components when a later one is absolute.)
path = "/content/drive/MyDrive/Datasets"

# Save the index. The "Scann" namespace is whitelisted so the custom ScaNN
# ops can be serialized into the SavedModel.
tf.saved_model.save(
    scann_index,
    path,
    options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"]),
)



In [None]:
# Reload the saved index from `path` and run the same query against the
# restored copy.
loaded = tf.saved_model.load(path)
query_email = tf.constant(["abcdefghijk@gmail.com"])
_, titles = loaded(query_email)
top_titles = titles[0, :].numpy().astype(str)
print(f"Recommendations for user: {top_titles}")