# Evaluations of Our System

In [1]:
%load_ext autoreload
%autoreload 2

# A notebook kernel starts in the notebook's own folder, while the package imports
# and the relative model paths used below are resolved from the project root.
from pathlib import Path
import os


def resolve_project_root(start, marker="rsdb"):
    """Return the directory that should be used as the working directory.

    Parameters
    ----------
    start : str or os.PathLike
        Directory to start from (normally the current working directory).
    marker : str, default "rsdb"
        Name of a sub-directory whose presence means `start` is already the
        project root.

    Returns
    -------
    pathlib.Path
        `start` itself when it contains `marker`, otherwise its parent.
    """
    start = Path(start)
    # Blindly taking `.parent` climbs one more level on every re-run of the cell;
    # checking for the package folder keeps the cell idempotent.
    return start if (start / marker).is_dir() else start.parent


current_dir = resolve_project_root(Path.cwd())
os.chdir(current_dir)
print(f"Current working directory is now: {Path.cwd()}")

import tensorflow as tf
import warnings
# Silences every Python warning for the rest of the session; this keeps the cell
# output short but also hides deprecation notices.
warnings.filterwarnings("ignore")

from rsdb.preprocess.data_preprocessing import get_clean_review_data
from rsdb.features.featuring import featuring_engineering
from rsdb.eval.eval_model import eval_result, eval_downstream

from rsdb.train import tdlf_df_to_tf_dataset, fpmc_df_to_tf_dataset, blf_df_to_tf_dataset


# Remote sources handed to get_clean_review_data: the California review file
# and the matching metadata file.
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB


In [2]:
# Build the cleaned review table, then derive the engineered features from it.
cleaned_df = get_clean_review_data(url, meta_url)
featured_df = featuring_engineering(cleaned_df)

# Narrow view holding only the id and rating columns.
data_query = featured_df.loc[:, ['gmap_id', 'reviewer_id', 'rating']]

/Users/kevinb/Desktop/cse158/RSDB/rsdb
Loading metadata from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/data.json.gz


Processing chunks: 445it [04:24,  1.68it/s, Processed Rows=2119537]                         


Processed 2119537 review entries.
finished finding generalized categories. Takes 1.9581727981567383
finished bining locations. Takes 1.6777291297912598
finished featuring hours. Takes 10.277329921722412
finished creating model specalizied feature. Takes 3.792642831802368


In [3]:
# Recreate the seeded 80/20 split. The models below are loaded already trained,
# so the metrics are computed on the held-out rows; train_df is still needed to
# derive those rows and to build the train_data datasets further down.
train_df = featured_df.sample(
    frac=0.8,
    random_state=42,
)
test_df = featured_df.drop(index=train_df.index)

## Load the Baseline Latent Factor Model


In [4]:
# Load the stored baseline latent factor model; the path is relative, so echo the
# working directory it is resolved against.
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_blf_model'
model_blf = tf.keras.models.load_model(model_path)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model_blf.summary()

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "latent_factor_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 20)                3685020   
                                                                 
 sequential_1 (Sequential)   (None, 20)                9043460   
                                                                 
 sequential_2 (Sequential)   (None, 1)                 184251    
                                                                 
 sequential_3 (Sequential)   (None, 1)                 452173    
                                                                 
 ranking (Ranking)           multiple                  0         
                                                                 
Total params: 13364905 (50.98 MB)
Trainable params: 13364905 (50.98 MB)
Non-trainable params: 0 (0.00 Byte)
___

In [5]:
# Datasets in the input format of the baseline latent factor model: the training
# stream is shuffled before batching, the held-out stream keeps its order.
train_data = (
    blf_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_blf = (
    blf_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Load the FPMC Model

In [6]:
# Load the stored FPMC model; the path is relative, so echo the working directory
# it is resolved against.
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_fpmc_model'
model_fpmc = tf.keras.models.load_model(model_path)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model_fpmc.summary()

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "fpmc_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  9043440   
                                                                 
 embedding_1 (Embedding)     multiple                  3685000   
                                                                 
 embedding_2 (Embedding)     multiple                  3685000   
                                                                 
 embedding_3 (Embedding)     multiple                  452172    
                                                                 
 embedding_4 (Embedding)     multiple                  184250    
                                                                 
 embedding_5 (Embedding)     multiple                  60        
                                                   

In [7]:
# Datasets in the input format of the FPMC model: the training stream is shuffled
# before batching, the held-out stream keeps its order.
train_data = (
    fpmc_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_fpmc = (
    fpmc_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Load the TDLF Model

In [17]:
# Load the stored TDLF model; the path is relative, so echo the working directory
# it is resolved against.
print(f"Current working directory is now: {Path.cwd()}")
model_path = 'trained_tdlf_model'
model_tdlf = tf.keras.models.load_model(model_path)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model_tdlf.summary()

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB
Model: "temporal_dynamic_variants"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 string_lookup (StringLooku  multiple                  0         
 p)                                                              
                                                                 
 string_lookup_1 (StringLoo  multiple                  0         
 kup)                                                            
                                                                 
 integer_lookup (IntegerLoo  multiple                  0         
 kup)                                                            
                                                                 
 embedding (Embedding)       multiple                  13565160  
                                                                 
 embedding_1 (Embedding)     multiple  

In [18]:
# Datasets in the input format of the TDLF model: the training stream is shuffled
# before batching, the held-out stream keeps its order.
train_data = (
    tdlf_df_to_tf_dataset(train_df)
    .shuffle(1024)
    .batch(4096)
)
test_data_tdlf = (
    tdlf_df_to_tf_dataset(test_df)
    .batch(4096)
)

## Evaluate Metrics

In [19]:
# Score every model on its own held-out dataset; the two lists are matched by position.
eval_result(
    [model_blf, model_fpmc, model_tdlf],
    [test_data_blf, test_data_fpmc, test_data_tdlf],
)



Unnamed: 0,mse,rmse,r2,mase
latent_factor_model,0.968255,0.983999,0.092328,0.817679
fpmc_variants,1.286426,1.134207,-0.205936,1.064545
temporal_dynamic_variants,0.967203,0.983465,0.093314,0.801941


# Downstream Applications

In [20]:
# Run the downstream evaluation for all three models on the cleaned reviews; the
# label list is given in the same order as the model list.
eval_downstream(
    [model_blf, model_fpmc, model_tdlf],
    cleaned_df,
    ['BasicLatentFactors', 'FPMCVariants', 'TemporalDynamicVariants'],
)

please make sure all the tensorflow data is same
finished finding generalized categories. Takes 2.1818060874938965
finished bining locations. Takes 1.8872990608215332
finished featuring hours. Takes 10.597061157226562
finished creating model specalizied feature. Takes 3.7900006771087646
for model: BasicLatentFactors
The given gmapids has the property ['Italian restaurant', 'Restaurant']
Of all the recommended user, their categorical visited business are in these categories
Restaurant              0.056936
Fast food restaurant    0.041618
Breakfast restaurant    0.029480
Name: category, dtype: float64


Their average rating is:
reviewer_id
1.013470e+20    5.000000
1.015551e+20    5.000000
1.025808e+20    4.900000
1.030183e+20    5.000000
1.033885e+20    4.957447
1.050070e+20    4.875000
1.056125e+20    4.961165
1.061405e+20    4.979167
1.062844e+20    5.000000
1.077835e+20    5.000000
1.097545e+20    4.885714
1.103570e+20    5.000000
1.108545e+20    4.902439
1.115175e+20    5.000000
1.1