In [2]:
%%capture
!pip install tensorflow-recommenders
!pip install scann

In [3]:
### Import necessary libraries

from typing import Dict, Text

import numpy as np
import tensorflow as tf

import tensorflow_recommenders as tfrs

import time
import datetime

import os
import pprint
import tempfile

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
def reduce_size(df):
    df = df.copy()
    for col in df.select_dtypes(include='int64').columns:
        df[col] = pd.to_numeric(df[col], downcast='unsigned')

    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype('category')

    df.info(verbose=False)
    return df

In [5]:
df_1 = reduce_size(pd.read_csv(r'/kaggle/input/online-retail-ii-data-set-from-ml-repository/Year 2009-2010.csv', encoding='unicode_escape'))
df_2 = reduce_size(pd.read_csv(r'/kaggle/input/online-retail-ii-data-set-from-ml-repository/Year 2010-2011.csv', encoding='unicode_escape'))
master_df = df_1.append(df_2, ignore_index = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Columns: 8 entries, Invoice to Country
dtypes: category(5), float64(2), int64(1)
memory usage: 19.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541910 entries, 0 to 541909
Columns: 8 entries, Invoice to Country
dtypes: category(5), float64(2), int64(1)
memory usage: 19.3 MB


  master_df = df_1.append(df_2, ignore_index = True)


In [6]:
master_df = master_df.dropna(subset=['Customer ID']).reset_index(drop=True)
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824364 entries, 0 to 824363
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Invoice      824364 non-null  object 
 1   StockCode    824364 non-null  object 
 2   Description  824364 non-null  object 
 3   Quantity     824364 non-null  int64  
 4   InvoiceDate  824364 non-null  object 
 5   Price        824364 non-null  float64
 6   Customer ID  824364 non-null  float64
 7   Country      824364 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 50.3+ MB


In [42]:
user_df = master_df[['StockCode', 'Description', 'Quantity', 'InvoiceDate', 'Customer ID', 'Country']]
item_df = master_df[['StockCode', 'Description']]

In [43]:
import time
import datetime
user_df['timestamp'] = user_df['InvoiceDate'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, timestamp_format).timetuple()))
user_df['Customer ID'] = user_df['Customer ID'].astype(str)
user_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df['timestamp'] = user_df['InvoiceDate'].apply(lambda x: time.mktime(datetime.datetime.strptime(x, timestamp_format).timetuple()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_df['Customer ID'] = user_df['Customer ID'].astype(str)


Unnamed: 0,StockCode,Description,Quantity,InvoiceDate,Customer ID,Country,timestamp
0,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/2009 7:45,13085.0,United Kingdom,1259654000.0
1,79323P,PINK CHERRY LIGHTS,12,12/1/2009 7:45,13085.0,United Kingdom,1259654000.0
2,79323W,WHITE CHERRY LIGHTS,12,12/1/2009 7:45,13085.0,United Kingdom,1259654000.0
3,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/2009 7:45,13085.0,United Kingdom,1259654000.0
4,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/2009 7:45,13085.0,United Kingdom,1259654000.0


In [44]:
interactions_dict = user_df.groupby(['Customer ID', 'StockCode', 'Description', 'timestamp', 'Country'])[ 'Quantity'].sum().reset_index()

interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
interactions = tf.data.Dataset.from_tensor_slices(interactions_dict)

items_dict = item_df[['StockCode', 'Description']].drop_duplicates(subset=['StockCode'])

items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(items_dict)

interactions = interactions.map(lambda x: {
                                            'user_id' : x['Customer ID'], 
                                            'item_id' : x['StockCode'], 
                                            'quantity' : float(x['Quantity']),
                                            "timestamp": x["timestamp"],
                                            'country': x['Country'],
                                            'item_name': x['Description'],
                                        })

items = items.map(lambda x: {'item_id': x['StockCode'],
                             'item_name': x['Description'],
                            })

In [45]:
## Basic housekeeping to prepare feature vocabularies

## timestamp is an exmaple of continuous features, which needs to be rescaled, or otherwise it will be 
## too large for the model.
## there are other methods to reduce the size of the timestamp, ,such as standardization and normalization
## here we use discretization, which puts them into buckets of categorical features, 

timestamps = np.concatenate(list(interactions.map(lambda x: x["timestamp"]).batch(100)))
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,)

unique_user_ids = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["user_id"]))))
unique_user_quantity = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["quantity"]))))
unique_user_country = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["country"]))))

unique_item_ids = np.unique(np.concatenate(list(items.batch(1_000).map(lambda x: x["item_id"]))))
unique_item_name = np.unique(np.concatenate(list(items.batch(1_000).map(lambda x: x["item_name"]))))

# item_titles = interactions.batch(10_000).map(lambda x: x["item_id"])
# user_ids = interactions.batch(10_000).map(lambda x: x["user_id"])

# unique_item_titles = np.unique(np.concatenate(list(item_titles)))
# unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
 
# unique_user_gender = np.unique(np.concatenate(list(ratings.batch(1_000).map(
#     lambda x: x["user_gender"]))))
 
# unique_bucketized_user_age = np.unique(np.concatenate(list(ratings.batch(1_000).map(
#     lambda x: x["bucketized_user_age"]))))
# unique_user_occupation_label = np.unique(np.concatenate(list(ratings.batch(1_000).map(
#     lambda x: x["user_occupation_label"]))))

In [46]:
class UserModel(tf.keras.Model):
   
  def __init__(self):
    super().__init__()
    
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32),
    ])
 
    self.country_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_country, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_country) + 1, 32),
    ])
         
    self.timestamp_embedding = tf.keras.Sequential([
        tf.keras.layers.Discretization(timestamp_buckets.tolist()),
        tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
    ])
    self.normalized_timestamp = tf.keras.layers.Normalization(
        axis=None
    )
 
    self.normalized_timestamp.adapt(timestamps)
 
  def call(self, inputs):
    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.user_embedding(inputs["user_id"]),
        self.country_embedding(inputs["country"]),
        self.timestamp_embedding(inputs["timestamp"]),
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1)),
    ], axis=1)

In [47]:
class QueryModel(tf.keras.Model):
  '"""Model for encoding user queries."""'
 
  def __init__(self, layer_sizes):
    """Model for encoding user queries.
 
    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()
 
    # We first use the user model for generating embeddings.
    self.embedding_model = UserModel()
 
    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()
 
    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
 
    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))
     
  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [78]:
class ItemModel(tf.keras.Model):
   
  def __init__(self):
    super().__init__()
 
    max_tokens = 10_000
    
    self.item_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_item_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_item_ids) + 1, 32),
    ])
 
    self.name_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_item_name,mask_token=None),
      tf.keras.layers.Embedding(len(unique_item_name) + 1, 32)
    ])
 
    self.name_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)
 
    self.name_text_embedding = tf.keras.Sequential([
      self.name_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])
 
    self.name_vectorizer.adapt(items.map(lambda x: x['item_name']))
 
  def call(self, inputs):
    return tf.concat([
        self.item_embedding(inputs['item_id']),
        self.name_embedding(inputs['item_name']),
        self.name_text_embedding(inputs['item_name']),
    ], axis=1)

In [79]:
class CandidateModel(tf.keras.Model):
  """Model for encoding items."""
 
  def __init__(self, layer_sizes):
    """Model for encoding items.
 
    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()
 
    self.embedding_model = ItemModel()
 
    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()
 
    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))
 
    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))
     
  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [80]:
class ItemlensModel(tfrs.models.Model):
 
  def __init__(self, layer_sizes):
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=items.batch(128).map(self.candidate_model),
        ),
    )
 
  def compute_loss(self, features, training=False):
 
    query_embeddings = self.query_model({
        "user_id": features["user_id"],
        "country": features["country"],
        "timestamp": features["timestamp"],
    })
    item_embeddings = self.candidate_model({
        "item_id": features["item_id"],
        "item_name": features["item_name"],
    })
 
    return self.task(
        query_embeddings, item_embeddings, compute_metrics=not training)

In [81]:
tf.random.set_seed(42)
shuffled = interactions.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)
 
cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [82]:
num_epochs = 50
 
model = ItemlensModel([32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
 
one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)
 
accuracy = one_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"][-1]
print(f"Top-100 accuracy: {accuracy:.2f}.")

Top-100 accuracy: 0.29.


In [94]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
  tf.data.Dataset.zip((items.batch(100).map(lambda x: x["item_name"]), items.batch(100).map(model.candidate_model)))
)

In [98]:
_, titles = index({
    "user_id": np.array(['13085.0']),
    "country": np.array(["United Kingdom"]),
    "timestamp": np.array([1.259654e+09])},
    k=50
)
titles[0].numpy()

array([b'LUNCHBOX WITH CUTLERY FAIRY CAKES ',
       b'FANCY FONT HOME SWEET HOME DOORMAT',
       b'UNION JACK GUNS & ROSES  DOORMAT',
       b'SALT AND PEPPER SHAKERS TOADSTOOLS',
       b'72 CAKE CASES VINTAGE CHRISTMAS',
       b'WOOLLY HAT SOCK GLOVE ADVENT STRING', b'STRAWBERRY CANDY BAG',
       b'PACK OF 20 FAIRY CAKE PAPER NAPKINS', b'RED SPOTTY COIR DOORMAT',
       b'RETRO "TEA FOR ONE" ', b'TRIANGULAR POUFFE VINTAGE ',
       b'6 CROCHET STRAWBERRIES', b'UNION JACK HOT WATER BOTTLE ',
       b'DINOSAURS WATER TRANSFER TATTOOS ',
       b'VINTAGE SEASIDE JIGSAW PUZZLES', b'SET 10 LIGHTS NIGHT OWL',
       b'WHITE HANGING BEADS CANDLE HOLDER', b'MILK PAN BLUE RETROSPOT',
       b'FRYING PAN BLUE POLKADOT ',
       b'JUNGLE POPSICLES ICE LOLLY HOLDERS', b' WHITE CHERRY LIGHTS',
       b'VINTAGE SNAP CARDS', b'CURIOUS  IMAGES NOTEBOOK SET',
       b'WRAP BLIZZARD', b'HANGING FAIRY CAKE DECORATION',
       b'SET OF MEADOW  FLOWER STICKERS', b'RED SPOTTY ROUND CAKE TINS',
       

In [101]:
_, titles = index({
    "user_id": np.array(['12680.0']),
    "country": np.array(["France"]),
    "timestamp": np.array([1.323435e+09])},
    k=50
)

titles[0].numpy()

array([b'CHILDRENS CUTLERY CIRCUS PARADE', b'CHILDRENS CUTLERY SPACEBOY ',
       b'ALARM CLOCK BAKELIKE GREEN', b'ALARM CLOCK BAKELIKE IVORY',
       b'ALARM CLOCK BAKELIKE PINK', b'BOYS VINTAGE TIN SEASIDE BUCKET',
       b'RED METAL BEACH SPADE ', b'CHILDRENS CUTLERY DOLLY GIRL ',
       b'MAGIC DRAWING SLATE SPACEBOY ', b'ALARM CLOCK BAKELIKE RED ',
       b'PLASTERS IN TIN CIRCUS PARADE ',
       b'WOODLAND BUNNIES LOLLY MAKERS', b'POSTAGE',
       b'DINOSAUR HEIGHT CHART STICKER SET',
       b'BAKING SET 9 PIECE RETROSPOT ', b'BLUE HARMONICA IN BOX ',
       b'PASTEL COLOUR HONEYCOMB FAN', b'COSY SLIPPER SHOES LARGE GREEN',
       b'FAIRY CAKE BIRTHDAY CANDLE SET', b'SET OF 6 SOLDIER SKITTLES',
       b'PINK DINER WALL CLOCK', b'ALARM CLOCK BAKELIKE CHOCOLATE',
       b'GINGERBREAD MAN COOKIE CUTTER', b'4 TRADITIONAL SPINNING TOPS',
       b'DOLLY GIRL BABY GIFT SET', b'TREASURE ISLAND BOOK BOX',
       b'HOLIDAY FUN LUDO', b'MINI JIGSAW DOLLY GIRL',
       b'GIRLS VINTAGE TIN SE

In [102]:
user_df[user_df['Customer ID'] == '12680.0'].head(20)

Unnamed: 0,StockCode,Description,Quantity,InvoiceDate,Customer ID,Country,timestamp
638501,21981,PACK OF 12 WOODLAND TISSUES,24,8/18/2011 15:44,12680.0,France,1313682000.0
638502,21986,PACK OF 12 PINK POLKADOT TISSUES,24,8/18/2011 15:44,12680.0,France,1313682000.0
638503,22037,ROBOT BIRTHDAY CARD,12,8/18/2011 15:44,12680.0,France,1313682000.0
638504,23190,BUNDLE OF 3 SCHOOL EXERCISE BOOKS,12,8/18/2011 15:44,12680.0,France,1313682000.0
638505,22555,PLASTERS IN TIN STRONGMAN,12,8/18/2011 15:44,12680.0,France,1313682000.0
638506,22629,SPACEBOY LUNCH BOX,12,8/18/2011 15:44,12680.0,France,1313682000.0
638507,22980,PANTRY SCRUBBING BRUSH,12,8/18/2011 15:44,12680.0,France,1313682000.0
638508,22979,PANTRY WASHING UP BRUSH,12,8/18/2011 15:44,12680.0,France,1313682000.0
638509,22712,CARD DOLLY GIRL,12,8/18/2011 15:44,12680.0,France,1313682000.0
638510,22630,DOLLY GIRL LUNCH BOX,12,8/18/2011 15:44,12680.0,France,1313682000.0


In [None]:
import tempfile
import os
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")
  
  # Save the index.
  tf.saved_model.save(index, path)
  
  # Load it back; can also be done in TensorFlow Serving.
  loaded = tf.saved_model.load(path)
  
  # Pass a user id in, get top predicted movie titles back.
  _, titles = loaded({
    "bucketized_user_age": np.array([25]),
    "user_occupation_label": np.array([17]),
    "user_gender": np.array([True]),
    "timestamp": np.array([879024327])}
)
  
  print(f"Recommendations: {titles[0][:10]}")

In [106]:
import pickle
pkl_model = "model.h5"
pkl_index = 'index.h5'

# with open(pkl_model, 'wb') as file:  
#     pickle.dump(model, file)

with open(pkl_index, 'wb') as file:     
    pickle.dump(index, file)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......query_model
.........dense_layers
............layers
...............dense
..................vars
.....................0
.....................1
............vars
.........embedding_model
............country_embedding
...............layers
..................embedding
.....................vars
........................0
..................string_lookup
.....................vars
...............vars
............layers
...............normalization
..................vars
.....................0
.....................1
.....................2
...............sequential
..................layers
.....................embedding
........................vars
...........................0
.....................string_lookup
........................vars
..................vars
...............sequential_2
..................layers
.....................discretization
........................vars
.....................embedding
.......