# Evaluations of Our System

In [1]:
%load_ext autoreload
%autoreload 2

# notebook path get folder path, python path gets abs path
from pathlib import Path
import os
current_dir = Path.cwd().parent
os.chdir(current_dir)
print(f"Current working directory is now: {Path.cwd()}")

import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

from rsdb.preprocess.data_preprocessing import get_clean_review_data
from rsdb.features.featuring import featuring_engineering

url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB


In [2]:
# Clean the raw reviews (joined with the business metadata), then derive the
# engineered features from the cleaned table.
cleaned_df = get_clean_review_data(url, meta_url)
featured_df = featuring_engineering(cleaned_df)

# Narrow view holding only the business id, the reviewer id and the rating.
query_columns = ['gmap_id', 'reviewer_id', 'rating']
data_query = featured_df[query_columns]

/Users/kevinb/Desktop/cse158/RSDB/rsdb
Loading metadata from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/data.json.gz


Processing chunks: 101it [00:31,  3.23it/s, Processed Rows=949381]                         


Processed 949381 review entries.
finished finding generalized categories. Takes 0.8849248886108398
finished bining locations. Takes 0.846621036529541
finished featuring hours. Takes 3.940809965133667
finished creating model specalizied feature. Takes 1.7310199737548828


In [9]:
# Seeded 80/20 split. The models used in this notebook are loaded from disk,
# so the training part is kept only for completeness.
# NOTE(review): presumably this reproduces the split used when the saved
# models were trained — confirm, otherwise test rows may have been seen
# during training.
train_df = featured_df.sample(frac=0.8, random_state=42)
# Everything whose index label was not drawn into the training sample.
test_df = featured_df[~featured_df.index.isin(train_df.index)]

## Load the FPMC Model

In [13]:
# The model path below is relative, so show the directory it resolves against.
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_fpmc_model'
model = tf.keras.models.load_model(model_path)
# Model.summary() prints the layer table itself and returns None; wrapping it
# in print() only appends a stray "None" line to the output.
model.summary()

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "fpmc_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  18724680  
                                                                 
 embedding_1 (Embedding)     multiple                  3968224   
                                                                 
 embedding_2 (Embedding)     multiple                  3968224   
                                                                 
 embedding_3 (Embedding)     multiple                  180045    
                                                                 
 embedding_4 (Embedding)     multiple                  38156     
                                                                 
 embedding_5 (Embedding)     multiple                  312       
                                                   

In [75]:
from rsdb.train import tdlf_df_to_tf_dataset, fpmc_df_to_tf_dataset

# FPMC-formatted pipelines for both splits: the training split is shuffled
# (1024) before batching, the test split is only batched. Batch size is 4096.
train_data = (
    fpmc_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data = (
    fpmc_df_to_tf_dataset(test_df)
    .batch(4096)
)

In [19]:
# Score whatever is currently bound to `model` on the test batches;
# return_dict=True yields a mapping from metric name to value.
# NOTE(review): the recorded output is an RMSE of exactly 0.0, which looks
# implausible for a rating model — confirm the metric state after loading.
test_metrics = model.evaluate(test_data, return_dict=True)
test_rmse = test_metrics['root_mean_squared_error']
print(f"Test RMSE: {test_rmse}")

Test RMSE: 0.0


## Load the TDLF Model

In [7]:
# The model path below is relative, so show the directory it resolves against.
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_tdlf_model'
# Rebinds `model`: from here on it refers to the TDLF model, not the FPMC one.
model = tf.keras.models.load_model(model_path)
# Model.summary() prints the layer table itself and returns None; wrapping it
# in print() only appends a stray "None" line to the output.
model.summary()

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "temporal_dynamic_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup (StringLooku  multiple                  0         
 p)                                                              
                                                                 
 string_lookup_1 (StringLoo  multiple                  0         
 kup)                                                            
                                                                 
 integer_lookup (IntegerLoo  multiple                  0         
 kup)                                                            
                                                                 
 embedding (Embedding)       multiple                  21605400  
                                                                 
 embedding_1 (Embedding)     multiple  

In [13]:
# TDLF-formatted pipelines for both splits; these rebind `train_data` and
# `test_data`. The training split is shuffled (1024) before batching, the
# test split is only batched. Batch size is 4096.
train_data = (
    tdlf_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data = (
    tdlf_df_to_tf_dataset(test_df)
    .batch(4096)
)

In [71]:
# Score whatever is currently bound to `model` on the test batches;
# return_dict=True yields a mapping from metric name to value.
# NOTE(review): the recorded output is an RMSE of exactly 0.0, which looks
# implausible for a rating model — confirm the metric state after loading.
test_metrics = model.evaluate(test_data, return_dict=True)
test_rmse = test_metrics['root_mean_squared_error']
print(f"Test RMSE: {test_rmse}")

Test RMSE: 0.0


# Downstream Applications

In [47]:
from rsdb.recommendation import Recommendation
# Build the recommender from whatever is currently bound to `model`, plus the
# cleaned review table.
# NOTE(review): the string presumably selects the model variant and should
# correspond to the TDLF model — confirm, and make sure the TDLF load cell
# (not the FPMC one) was the last to assign `model` before running this.
rec = Recommendation(model, cleaned_df, "TemporalDynamicVariants")

finished finding generalized categories. Takes 0.879101037979126
finished bining locations. Takes 0.7532398700714111
finished featuring hours. Takes 5.123684883117676
finished creating model specalizied feature. Takes 1.5045130252838135


Fix the random seed for reproducibility.

In [63]:
# Used as `random_state` when sampling the example business.
seed = 1

In [64]:
# Draw five reviews reproducibly and keep the business id of the second one.
sampled_reviews = cleaned_df.sample(5, random_state=seed)
rand_sample = sampled_reviews['gmap_id'].iloc[1]
print(rand_sample)

0x80c2c916f135cf75:0xe9cf50202b11b15d


In [65]:
# Ask the recommender for users matching the sampled business id.
# NOTE(review): the result is later used as a DataFrame with a `reviewer_id`
# column — confirm against Recommendation.recommend.
rec_users = rec.recommend(rand_sample)

Let's look at what this business does.

In [66]:
# Category list of the sampled business, read from its first matching review.
# `rand_sample` was drawn from `cleaned_df`, so at least one row always
# matches; indexing the second row (`.iloc[1]`) would raise IndexError for a
# business that has only a single review.
# NOTE(review): assumes `category` is identical on every row of one business.
cleaned_df.loc[cleaned_df['gmap_id'] == rand_sample, 'category'].iloc[0]

['Auto repair shop']

Let's merge with the main data frame and look at what else the users recommended for our business like to do.

In [67]:
# Cast the join key on a derived copy so the frame returned by the
# recommender is not modified in place.
# NOTE(review): casting ids to float can lose precision for very large
# integer ids — confirm `cleaned_df["reviewer_id"]` really is float.
rec_users_keyed = rec_users.assign(
    reviewer_id=rec_users["reviewer_id"].astype(float)
)

# All reviews written by the recommended users (inner join on reviewer_id).
merged = cleaned_df.merge(rec_users_keyed, on="reviewer_id")

# How often each category appears across those reviews; `category` holds a
# list per row, so explode it first.
merged["category"].explode().value_counts()

Auto repair shop             34
Gas station                  24
Restaurant                   23
ATM                          21
Fashion accessories store    19
                             ..
Foundation                    1
Dessert restaurant            1
Meat products                 1
Hunan restaurant              1
Outdoor sports store          1
Name: category, Length: 490, dtype: int64

Let's see our users' ratings for the second most popular category.

In [69]:
# Filter the dataframe to include only rows where the category contains the target category
# Category examined in this cell. The variables were previously named after
# "tourist attraction" although the filter targets gas stations.
target_category = 'Gas station'

# Rows whose category list contains the target category.
# NOTE(review): this filters `cleaned_df`, i.e. reviews from all users, not
# only the recommended users in `merged` — confirm that is intended, since
# the surrounding text talks about the recommended users.
category_reviews = cleaned_df[
    cleaned_df['category'].apply(lambda cats: target_category in cats)
]

# Summary statistics and the share of each star value.
category_ratings = category_reviews['rating']
print(category_ratings.describe())
print(category_ratings.value_counts(normalize=True))

count    17351.000000
mean         3.944787
std          1.274876
min          1.000000
25%          3.000000
50%          4.000000
75%          5.000000
max          5.000000
Name: rating, dtype: float64
5    0.473344
4    0.223157
3    0.166561
1    0.088122
2    0.048816
Name: rating, dtype: float64


Roughly 70% of the `user_id`s that match our `gmap_id` also give high ratings (4 or 5 stars) to similar places in their historical interactions.