In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import os
from tqdm.auto import tqdm

tqdm.pandas()

# params

In [2]:
dataset_location = "gs://mlteam-ml-specialization-2021-blackfriday/dataset/raw/"
trainset = os.path.join(dataset_location, "train.csv") 
testset = os.path.join(dataset_location, "test.csv") 
model_location = "gs://mlteam-ml-specialization-2021-blackfriday/aiplatform_jobs/bf_20210513_110816/2/2/Scann"
k_min = 10
model_num_neighbors = 10

# baseline

In [3]:
class BlackFridayBaseline:
    def __init__(self, trainset_location, user_features_list, build_inplace=False):
        self.trainset_location, self.user_features_list = trainset_location, user_features_list
        if build_inplace:
            self.build()
        
    def build(self):
        self.df = pd.read_csv(self.trainset_location).groupby(
            user_features+["Product_ID"])["User_ID"].count().reset_index().rename(
            columns={"User_ID":"count_product"})
        
    def predict(self, user_features, n_products):
        mask = None
        for k,v in user_features.items():
            if mask is None:
                mask = self.df[k]==v
            else:
                mask = mask&(self.df[k]==v)
        return self.df[mask].sort_values(self.user_features_list+["count_product"])["Product_ID"].values[:n_products]
        

In [4]:
user_features = [
    "Gender",
    "Age",
    "Occupation",
    "City_Category",
    "Stay_In_Current_City_Years",
    "Marital_Status"
    ]

In [5]:
# test
BlackFridayBaseline(trainset, user_features, True).predict(
    {'Gender': 'F',
     'Age': '0-17',
     'Occupation': 0,
     'City_Category': 'A',
     'Stay_In_Current_City_Years': '2',
     'Marital_Status': 0}, 
    10)

array(['P00034742', 'P00035542', 'P00048742', 'P00057542', 'P00058242',
       'P00102642', 'P00129542', 'P00137242', 'P00145042', 'P00154642'],
      dtype=object)

# model

In [6]:
model = tf.keras.models.load_model(model_location)
model



<tensorflow.python.keras.saving.saved_model.load.ScaNN at 0x7f83546d7f10>

In [9]:
class PredictionModel:
    def __init__(self, model_path, build_inplace=False):
        self.odel_path=model_path
        if build_inplace:
            self.build()
    
    def build(self):
        self.model=tf.keras.models.load_model(model_location)
        
    def predict_batch(self, model_input):
        scores, products = self.model({k:tf.constant(v) for k,v in model_input.items()})
        return products.numpy()
    
    def predict_single(self, model_input):
        scores, products = self.model({k:tf.constant([v]) for k,v in model_input.items()})
        return products.numpy()

In [11]:
pred_model=PredictionModel(model_location, True)
pred_model.model()



<tensorflow.python.keras.saving.saved_model.load.ScaNN at 0x7f834c914e90>

In [14]:
model_input = {k: tf.constant([str(v)]) for k,v in 
    {'Gender': 'F',
     'Age': '0-17',
     'Occupation': 0,
     'City_Category': 'A',
     'Stay_In_Current_City_Years': '2',
     'Marital_Status': 0}.items()}

In [17]:
pred_model.model(model_input)

(<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
 array([[5.0978603, 4.69528  , 4.6860933, 4.6760817, 4.6086154, 4.4028254,
         4.400078 , 4.3954306, 4.315371 , 4.213595 ]], dtype=float32)>,
 <tf.Tensor: shape=(1, 10), dtype=string, numpy=
 array([[b'P00137142', b'P00265142', b'P00293742', b'P00248742',
         b'P00178342', b'P00183542', b'P00337642', b'P00339042',
         b'P00214842', b'P00087342']], dtype=object)>)

In [10]:
PredictionModel(model_location, True).predict_single({k: str(v) for k,v in 
    {'Gender': 'F',
     'Age': '0-17',
     'Occupation': 0,
     'City_Category': 'A',
     'Stay_In_Current_City_Years': '2',
     'Marital_Status': 0}.items()})



array([[b'P00137142', b'P00265142', b'P00293742', b'P00248742',
        b'P00178342', b'P00183542', b'P00337642', b'P00339042',
        b'P00214842', b'P00087342']], dtype=object)

# ground truth

In [None]:
df=pd.read_csv(testset).groupby(user_features)["Product_ID"].apply(list).reset_index()

In [None]:
df

In [None]:
baseline = BlackFridayBaseline(trainset, user_features, True)

In [None]:
df["baseline"]=df[user_features].progress_apply(lambda x: baseline.predict(x, k_min), axis=1)

In [None]:
model = PredictionModel(model_location, True)

In [None]:
df["predictions"]=df[user_features].progress_apply(lambda x: model.predict_single({k:str(v) for k,v in x.items()})[0].astype(str), axis=1)

In [None]:
df

In [None]:
df_orig=df
df=df_orig[df_orig["Product_ID"].apply(len) >= k_min]
df_metrics = pd.DataFrame(index=df.index)
df_metrics["baseline_precision"]=df[["Product_ID","baseline"]].progress_apply(lambda x: np.intersect1d(x["Product_ID"],x["baseline"]).shape[0]/k_min,axis=1)
df_metrics["predictions_precision"]=df[["Product_ID","predictions"]].progress_apply(lambda x: np.intersect1d(x["Product_ID"],x["predictions"]).shape[0]/k_min,axis=1)

In [None]:
pd.DataFrame([
    df_metrics["baseline_precision"].apply(lambda x: round(x,2)).value_counts(),
    df_metrics["predictions_precision"].apply(lambda x: round(x,2)).value_counts()
]).transpose().fillna(0).plot.bar(figsize=(20,10))

In [None]:
df_metrics[["baseline_precision","predictions_precision"]].mean()