# Evaluations of Our System

In [1]:
%load_ext autoreload
%autoreload 2

# notebook path get folder path, python path gets abs path
from pathlib import Path
import os
current_dir = Path.cwd().parent
os.chdir(current_dir)
print(f"Current working directory is now: {Path.cwd()}")

import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

from rsdb.preprocess.data_preprocessing import get_clean_review_data
from rsdb.features.featuring import featuring_engineering
from rsdb.eval.eval_model import eval_result

from rsdb.train import tdlf_df_to_tf_dataset, fpmc_df_to_tf_dataset, blf_df_to_tf_dataset


url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB


In [2]:
cleaned_df = get_clean_review_data(url, meta_url)
featured_df = featuring_engineering(cleaned_df)
data_query = featured_df[['gmap_id', 'reviewer_id', 'rating']]

/Users/kevinb/Desktop/cse158/RSDB/rsdb
Loading metadata from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/data.json.gz


Processing chunks: 101it [00:32,  3.14it/s, Processed Rows=949381]                         


Processed 949381 review entries.
finished finding generalized categories. Takes 0.8954708576202393
finished bining locations. Takes 0.8869030475616455
finished featuring hours. Takes 3.995192289352417
finished creating model specalizied feature. Takes 1.7601001262664795


In [3]:
train_df = featured_df.sample(frac=0.8, random_state=42) # not gonna use here
test_df = featured_df.drop(train_df.index)

## Load the Baseline Latent Factor Model


In [5]:
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_blf_model'
model_blf = tf.keras.models.load_model(model_path)
print(model_blf.summary())

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "latent_factor_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 1)                 38157     
                                                                 
 sequential_1 (Sequential)   (None, 1)                 180046    
                                                                 
 sequential_2 (Sequential)   (None, 1)                 38157     
                                                                 
 sequential_3 (Sequential)   (None, 1)                 180046    
                                                                 
 ranking (Ranking)           multiple                  0         
                                                                 
Total params: 436407 (1.66 MB)
Trainable params: 436407 (1.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________

In [6]:
train_data = blf_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data_blf = blf_df_to_tf_dataset(test_df).batch(4096)

## Load the FMPC Model

In [14]:
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_fpmc_model'
model_fpmc = tf.keras.models.load_model(model_path)
print(model_fpmc.summary())

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "fpmc_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  21605400  
                                                                 
 embedding_1 (Embedding)     multiple                  4578720   
                                                                 
 embedding_2 (Embedding)     multiple                  4578720   
                                                                 
 embedding_3 (Embedding)     multiple                  180045    
                                                                 
 embedding_4 (Embedding)     multiple                  38156     
                                                                 
 embedding_5 (Embedding)     multiple                  360       
                                                   

In [15]:
train_data = fpmc_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data_fpmc = fpmc_df_to_tf_dataset(test_df).batch(4096)

## Load the TDLF Model

In [16]:
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_tdlf_model'
model_tdlf = tf.keras.models.load_model(model_path)
print(model_tdlf.summary())

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "temporal_dynamic_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup (StringLooku  multiple                  0         
 p)                                                              
                                                                 
 string_lookup_1 (StringLoo  multiple                  0         
 kup)                                                            
                                                                 
 integer_lookup (IntegerLoo  multiple                  0         
 kup)                                                            
                                                                 
 embedding (Embedding)       multiple                  21605400  
                                                                 
 embedding_1 (Embedding)     multiple  

In [17]:
train_data = tdlf_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data_tdlf = tdlf_df_to_tf_dataset(test_df).batch(4096)

## Evaluate Metrics

In [18]:
eval_result([model_blf, model_fpmc, model_tdlf], [test_data_blf, test_data_fpmc, test_data_tdlf])



Unnamed: 0,mse,rmse,r2,mase
latent_factor_model,1.262906,1.123791,0.104354,0.835496
fpmc_variants,1.442097,1.200874,-0.022728,0.941939
temporal_dynamic_variants,1.267707,1.125925,0.100949,0.80945


# Downstream Applications

In [None]:
from rsdb.recommendation import Recommendation
rec = Recommendation(model_tdlf, cleaned_df, "TemporalDynamicVariants")

finished finding generalized categories. Takes 0.879101037979126
finished bining locations. Takes 0.7532398700714111
finished featuring hours. Takes 5.123684883117676
finished creating model specalizied feature. Takes 1.5045130252838135


For reproducibility

In [None]:
seed = 1

In [None]:
rand_sample = cleaned_df.sample(5, random_state=seed)['gmap_id'].iloc[1]
print(rand_sample)

0x80c2c916f135cf75:0xe9cf50202b11b15d


In [None]:
rec_users = rec.recommend(rand_sample)

Let's look at what our business do here

In [None]:
cleaned_df[cleaned_df['gmap_id'] == rand_sample].iloc[1].category

['Auto repair shop']

Let's merge the main data frame and look at what does our reconmanded users for our business likes to do else

In [None]:
rec_users["reviewer_id"] = rec_users["reviewer_id"].astype(float)
merged = cleaned_df.merge(rec_users, on="reviewer_id")
merged.category.explode().value_counts()

Auto repair shop             34
Gas station                  24
Restaurant                   23
ATM                          21
Fashion accessories store    19
                             ..
Foundation                    1
Dessert restaurant            1
Meat products                 1
Hunan restaurant              1
Outdoor sports store          1
Name: category, Length: 490, dtype: int64

Let's see our user's rating for thes second most popular category

In [None]:
# Filter the dataframe to include only rows where the category contains the target category
tourist_attraction_df = cleaned_df[cleaned_df['category'].apply(lambda x: 'Gas station' in x)]

# Check the ratings of the filtered dataframe
tourist_attraction_ratings = tourist_attraction_df['rating']
print(tourist_attraction_ratings.describe())
print(tourist_attraction_ratings.value_counts(normalize=True))

count    17351.000000
mean         3.944787
std          1.274876
min          1.000000
25%          3.000000
50%          4.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64
5    0.473344
4    0.223157
3    0.166561
1    0.088122
2    0.048816
Name: rating, dtype: float64


Rouphly 70% of the `user_id` that matches our `gmap_id` also give high ratings to similar places in there historical interactions