# Evaluations of Our System

In [1]:
%load_ext autoreload
%autoreload 2

# notebook path get folder path, python path gets abs path
from pathlib import Path
import os
current_dir = Path.cwd().parent
os.chdir(current_dir)
print(f"Current working directory is now: {Path.cwd()}")

import tensorflow as tf
import warnings
# Silences all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): these project imports come after the os.chdir() call above —
# presumably `rsdb` is resolved relative to the repository root; confirm
# before reordering the imports.
from rsdb.preprocess.data_preprocessing import get_clean_review_data
from rsdb.features.featuring import featuring_engineering
from rsdb.eval.eval_model import eval_result, eval_downstream

from rsdb.train import tdlf_df_to_tf_dataset, fpmc_df_to_tf_dataset, blf_df_to_tf_dataset


# Gzip-compressed JSON sources from the UCSD `googlelocal` data repository:
# `url` points at the California review file and `meta_url` at the California
# metadata file. Both are handed to get_clean_review_data() in the next cell.
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

Current working directory is now: /Users/guoxuanxu/Documents/local_repo/RSDB


In [2]:
# Build the cleaned review table from the review and metadata sources, then
# run feature engineering on top of it.
cleaned_df = get_clean_review_data(url, meta_url)
featured_df = featuring_engineering(cleaned_df)

# Narrow view holding only the business id, reviewer id and rating columns.
query_columns = ['gmap_id', 'reviewer_id', 'rating']
data_query = featured_df[query_columns]

/Users/guoxuanxu/Documents/local_repo/RSDB/rsdb
Loading metadata from: /Users/guoxuanxu/Documents/local_repo/RSDB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/guoxuanxu/Documents/local_repo/RSDB/rsdb/data/data.json.gz


Processing chunks: 101it [00:21,  4.80it/s, Processed Rows=949381]                         


Processed 949381 review entries.
finished finding generalized categories. Takes 0.5371699333190918
finished bining locations. Takes 0.422374963760376
finished featuring hours. Takes 3.5618338584899902
finished creating model specalizied feature. Takes 1.206752061843872


In [21]:
# Reproducible 80/20 split (fixed random_state). `test_df` is what the models
# are scored on; `train_df` defines the complement and also feeds the
# `train_data` pipelines built in the model sections below.
train_df = featured_df.sample(frac=0.8, random_state=42)
test_df = featured_df.drop(train_df.index)

# drop() removes rows by index *label*: if featured_df's index were not unique,
# every row sharing a sampled label would vanish from the test set as well.
# Fail loudly instead of evaluating on a silently shrunken hold-out set.
assert len(train_df) + len(test_df) == len(featured_df), (
    "train/test split does not partition featured_df; "
    "its index is probably not unique (reset_index(drop=True) before splitting)"
)

## Load the Baseline Latent Factor Model


In [4]:
# Load the saved baseline latent factor model from the directory
# 'trained_blf_model', resolved relative to the current working directory
# (printed first so a wrong directory is easy to spot).
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_blf_model'
model_blf = tf.keras.models.load_model(model_path)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_blf.summary()

Current working directory is now: /Users/guoxuanxu/Documents/local_repo/RSDB
Model: "latent_factor_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 20)                3685020   
                                                                 
 sequential_1 (Sequential)   (None, 20)                9043460   
                                                                 
 sequential_2 (Sequential)   (None, 1)                 184251    
                                                                 
 sequential_3 (Sequential)   (None, 1)                 452173    
                                                                 
 ranking (Ranking)           multiple                  0         
                                                                 
Total params: 13364905 (50.98 MB)
Trainable params: 13364905 (50.98 MB)
Non-trainable params: 0 (0.00 

In [5]:
# Convert both splits with the BLF-specific converter. The training pipeline
# is shuffled (buffer of 1024) before batching; the test pipeline keeps its
# order and is only batched. Both use batches of 4096.
# NOTE(review): `train_data` is reassigned by the FPMC and TDLF cells further
# down without being read in between — confirm whether it is needed at all.
train_data = (
    blf_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_blf = (
    blf_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Load the FPMC Model

In [6]:
# Load the saved FPMC model from the directory 'trained_fpmc_model', resolved
# relative to the current working directory (printed first so a wrong
# directory is easy to spot).
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_fpmc_model'
model_fpmc = tf.keras.models.load_model(model_path)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_fpmc.summary()

Current working directory is now: /Users/guoxuanxu/Documents/local_repo/RSDB
Model: "fpmc_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  18724680  
                                                                 
 embedding_1 (Embedding)     multiple                  3968224   
                                                                 
 embedding_2 (Embedding)     multiple                  3968224   
                                                                 
 embedding_3 (Embedding)     multiple                  452172    
                                                                 
 embedding_4 (Embedding)     multiple                  184250    
                                                                 
 embedding_5 (Embedding)     multiple                  312       
                                          

In [7]:
# Convert both splits with the FPMC-specific converter. The training pipeline
# is shuffled (buffer of 1024) before batching; the test pipeline keeps its
# order and is only batched. Both use batches of 4096.
# NOTE(review): this overwrites the `train_data` built in the BLF cell.
train_data = (
    fpmc_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_fpmc = (
    fpmc_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Load the TDLF Model

In [8]:
# Load the saved TDLF model from the directory 'trained_tdlf_model', resolved
# relative to the current working directory (printed first so a wrong
# directory is easy to spot).
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_tdlf_model'
model_tdlf = tf.keras.models.load_model(model_path)
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model_tdlf.summary()

Current working directory is now: /Users/guoxuanxu/Documents/local_repo/RSDB
Model: "temporal_dynamic_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup (StringLooku  multiple                  0         
 p)                                                              
                                                                 
 string_lookup_1 (StringLoo  multiple                  0         
 kup)                                                            
                                                                 
 integer_lookup (IntegerLoo  multiple                  0         
 kup)                                                            
                                                                 
 embedding (Embedding)       multiple                  9043440   
                                                                 
 embedding_1 (Embedding)     m

In [9]:
# Convert both splits with the TDLF-specific converter. The training pipeline
# is shuffled (buffer of 1024) before batching; the test pipeline keeps its
# order and is only batched. Both use batches of 4096.
# NOTE(review): this overwrites the `train_data` built in the earlier cells.
train_data = (
    tdlf_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_tdlf = (
    tdlf_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Evaluate Metrics

In [10]:
# Evaluate the three loaded models. Both lists follow the same order
# (BLF, FPMC, TDLF), so each model lines up with the test pipeline that was
# built by its own converter.
eval_result(
    [model_blf, model_fpmc, model_tdlf],
    [test_data_blf, test_data_fpmc, test_data_tdlf],
)



Unnamed: 0,mse,rmse,r2,mase
latent_factor_model,1.260431,1.122689,0.106109,0.831818
fpmc_variants,1.410808,1.187775,-0.000538,0.892633
temporal_dynamic_variants,1.268289,1.126184,0.100536,0.810097


# Downstream Applications

In [58]:
# Run the downstream evaluation for the same three models on the cleaned
# (pre-feature-engineering) review table. The label list follows the same
# order as the model list (BLF, FPMC, TDLF).
eval_downstream(
    [model_blf, model_fpmc, model_tdlf],
    cleaned_df,
    ['BasicLatentFactors', 'FPMCVariants', 'TemporalDynamicVariants'],
)

please make sure all the tensorflow data is same
finished finding generalized categories. Takes 0.5869848728179932
finished bining locations. Takes 0.46233201026916504
finished featuring hours. Takes 4.263368844985962
finished creating model specalizied feature. Takes 1.26283597946167
for model: BasicLatentFactors
The given gmapids has the property ['Senior citizen center', 'Transportation infrastructure']
Of all the recommended user, their categorical visited business are in these categories
category
Senior citizen center    0.053073
Grocery store            0.022346
Auto repair shop         0.022346
Name: proportion, dtype: float64


Their average rating is:
reviewer_id
1.018359e+20    5.000000
1.028439e+20    5.000000
1.041582e+20    4.916667
1.042955e+20    5.000000
1.059067e+20    5.000000
1.063647e+20    5.000000
1.065096e+20    5.000000
1.076373e+20    5.000000
1.077612e+20    4.600000
1.081316e+20    4.916667
1.085644e+20    5.000000
1.086065e+20    4.600000
1.095809e+20    4.7