# Running Full Model

In [3]:
%load_ext autoreload
%autoreload 2
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings("ignore")

from preprocess.data_preprocessing import *
from features.featuring import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
# Locations of the Google Local California review file and its business metadata file.
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

# Only a single chunk of reviews is loaded here; the author left
# get_clean_review_data(url, meta_url) as the alternative loader call.
cleaned_df = get_single_chunk(url,meta_url)
# Derive the model features from the cleaned reviews (helper comes from features.featuring).
featured_df = featuring_engineering(cleaned_df)

Loading metadata from: /Users/kevinb/Desktop/cse158/RDSB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/kevinb/Desktop/cse158/RDSB/rsdb/data/data.json.gz
Processed 79763 entries in the first chunk.


In [57]:
from models.tdlf.temporal_dynamic_v import TemporalDynamicVariants
from models.fpmc.fpmc_v import FPMCVariants

In [41]:
# Display the engineered feature table as a visual sanity check before modelling.
featured_df

Unnamed: 0,review_time(unix),reviewer_id,gmap_id,rating,isin_category_restaurant,isin_category_park,isin_category_store,lon_bin_0,lon_bin_1,lon_bin_2,...,lat_bin_15,lat_bin_16,lat_bin_17,lat_bin_18,lat_bin_19,closed_on_weekend,weekly_operating_hours,time_bin,user_mean_time,prev_item_id
76001,1.322523,1.000046e+20,0x8091aad7a7c4d8f3:0xe4aa0b5a0d2fbcb8,5,0,0,0,0,0,0,...,0,0,0,0,0,False,47.0,2676141,0.634746,0x8090504ec98781ad:0xd5235eff697582d1
83574,0.394244,1.000331e+20,0x808fcecafdb31371:0xd9acab442d78ae57,5,0,0,0,0,0,0,...,0,0,0,0,0,False,75.0,2598092,0.029116,0x808fcecafdb31371:0x8bd0cb4a080e362f
12856,1.226621,1.000378e+20,0x80c333fe0f143baf:0xd4c005c08bb117d6,2,0,0,0,0,0,0,...,0,0,0,0,0,False,40.0,2668078,1.160554,0x80dcac8730afa353:0x51a1d5eebf63f15f
98068,-0.507204,1.000410e+20,0x80c3281188d0d8a7:0xd9055ec15e626693,5,0,0,1,0,0,0,...,0,0,0,0,0,False,91.0,2522298,-0.539851,0x80dcad120946ced7:0x61926b580ff47c0a
13096,0.926559,1.000463e+20,0x80db57d6f3f45b17:0xa2494325a8e241cf,5,0,0,0,0,0,0,...,0,0,0,0,0,False,77.0,2642849,0.722403,0x80db41130e809e9b:0x96166498b19c5e97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64884,-0.147489,1.184288e+20,0x80c2b66db72fe81b:0xfd5e2f86736d05f4,5,0,0,0,0,0,0,...,0,0,0,0,0,False,70.0,2552542,-0.194477,0x80dd33c48f82119f:0xace1cb276e60af3e
103333,1.006453,1.184296e+20,0x80c32fa16dbdf329:0xf8cbe9d0e8fb186c,4,0,0,0,0,0,0,...,0,0,0,0,0,False,76.0,2649566,0.369517,0x80c32789715c74dd:0x9c332f98ca8c42cc
14720,-0.291660,1.184301e+20,0x80c2d39636c5d87f:0x3d868d3def90a396,5,0,0,0,0,0,0,...,0,0,0,0,0,False,58.0,2540421,-0.523079,0x80c2cf91a4492b67:0xf36a895f62d7c8f5
30486,0.282890,1.184347e+20,0x80c36440129b0c55:0x94d1998653c8f3e9,4,0,0,0,0,0,0,...,0,0,0,0,0,False,37.0,2588729,-0.044054,0x80c365d97d417abf:0xdeb65caeafecadfe


***
# Prepare Features
***

In [42]:
# `df` is an alias of featured_df (plain assignment, no copy).
df = featured_df
# Id and rating columns that are later handed to the model constructors as `data_query`.
data_query = df[['gmap_id', 'reviewer_id', 'rating']]
df.info()

# TODO(review): the split below is purely random. The frame carries prev_item_id, so
# confirm whether a chronological / per-user sequential split is required instead
# (author's original note: "need sequential?").
# 80% of the rows are sampled with a fixed seed for training; every row whose index
# label was not sampled becomes the test set.
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6157 entries, 76001 to 73346
Data columns (total 52 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   review_time(unix)         6157 non-null   float64
 1   reviewer_id               6157 non-null   float64
 2   gmap_id                   6157 non-null   object 
 3   rating                    6157 non-null   int64  
 4   isin_category_restaurant  6157 non-null   int64  
 5   isin_category_park        6157 non-null   int64  
 6   isin_category_store       6157 non-null   int64  
 7   lon_bin_0                 6157 non-null   int64  
 8   lon_bin_1                 6157 non-null   int64  
 9   lon_bin_2                 6157 non-null   int64  
 10  lon_bin_3                 6157 non-null   int64  
 11  lon_bin_4                 6157 non-null   int64  
 12  lon_bin_5                 6157 non-null   int64  
 13  lon_bin_6                 6157 non-null   int64  
 14  lon

In [43]:
def tlfm_df_to_tf_dataset(dataframe):
    '''Turn a feature DataFrame into a ``tf.data.Dataset`` of per-row feature dicts.

    The ids are cast to strings, the time, category and opening-hour features are
    cast to float, and ``user_mean_time`` / ``rating`` are passed through as stored.
    Any ``lon_bin_<i>`` / ``lat_bin_<i>`` column with ``i`` in 0..19 that exists in
    ``dataframe`` is included as a float feature; missing bin columns are skipped.
    '''
    features = {}

    # Identifier and temporal inputs.
    features["reviewer_id"] = dataframe["reviewer_id"].astype(str)
    features["gmap_id"] = dataframe["gmap_id"].astype(str)
    features["time"] = dataframe["review_time(unix)"].astype(float)
    features["time_bin"] = dataframe["time_bin"].astype(float)
    features["user_mean_time"] = dataframe["user_mean_time"]
    features["rating"] = dataframe["rating"]

    # Business descriptors, all fed to the model as floats.
    features["isin_category_restaurant"] = dataframe["isin_category_restaurant"].astype(float)
    features["isin_category_park"] = dataframe["isin_category_park"].astype(float)
    features["isin_category_store"] = dataframe["isin_category_store"].astype(float)
    features["closed_on_weekend"] = dataframe["closed_on_weekend"].astype(float)
    features["weekly_operating_hours"] = dataframe["weekly_operating_hours"].astype(float)

    available_columns = dataframe.columns

    # Longitude bins
    for i in range(20):
        if f"lon_bin_{i}" in available_columns:
            features[f"lon_bin_{i}"] = dataframe[f"lon_bin_{i}"].astype(float)

    # Latitude bins
    for i in range(20):
        if f"lat_bin_{i}" in available_columns:
            features[f"lat_bin_{i}"] = dataframe[f"lat_bin_{i}"].astype(float)

    return tf.data.Dataset.from_tensor_slices(features)

def fpmc_df_to_tf_dataset(dataframe, user_vocabulary=None, item_vocabulary=None):
    '''Turn a feature DataFrame into a ``tf.data.Dataset`` of FPMC inputs.

    ``reviewer_id``, ``prev_item_id`` and ``gmap_id`` are cast to strings and mapped
    to integer indices with ``tf.keras.layers.StringLookup`` (``mask_token=None``).
    ``prev_item_id`` and ``gmap_id`` share one item lookup; ``gmap_id`` is emitted
    under the key ``next_item_id``. The caller's ``dataframe`` is left unmodified.

    Args:
        dataframe: frame holding the id columns, ``rating``, the category and
            opening-hour columns, and any ``lon_bin_<i>`` / ``lat_bin_<i>`` columns
            (only ``i`` in 0..19 that are present are used).
        user_vocabulary: optional unique reviewer-id strings for the user lookup.
            Defaults to the unique ``reviewer_id`` values of ``dataframe``.
        item_vocabulary: optional unique item-id strings for the item lookup.
            Defaults to the unique ``gmap_id`` values of ``dataframe``.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts of per-row features.

    Note:
        With the default vocabularies the integer indices depend on the frame
        passed in, so two different frames (e.g. a train and a test split) are
        encoded with different mappings. Pass the same vocabularies to both calls
        when the indices have to agree.
    '''
    # Local string views only: assigning them back into `dataframe` would silently
    # change the dtypes of the caller's frame.
    reviewer_ids = dataframe["reviewer_id"].astype(str)
    prev_item_ids = dataframe["prev_item_id"].astype(str)
    next_item_ids = dataframe["gmap_id"].astype(str)

    if user_vocabulary is None:
        user_vocabulary = reviewer_ids.unique()
    if item_vocabulary is None:
        # Built from gmap_id only, as before: a prev_item_id that never occurs as a
        # gmap_id falls outside the vocabulary.
        item_vocabulary = next_item_ids.unique()

    user_lookup = tf.keras.layers.StringLookup(
        vocabulary=user_vocabulary, mask_token=None
    )
    item_lookup = tf.keras.layers.StringLookup(
        vocabulary=item_vocabulary, mask_token=None
    )

    return tf.data.Dataset.from_tensor_slices({
        "reviewer_id": user_lookup(reviewer_ids),
        "prev_item_id": item_lookup(prev_item_ids),
        "next_item_id": item_lookup(next_item_ids),
        "rating": dataframe["rating"].astype(float),
        "isin_category_restaurant": dataframe["isin_category_restaurant"].astype(float),
        "isin_category_park": dataframe["isin_category_park"].astype(float),
        "isin_category_store": dataframe["isin_category_store"].astype(float),
        "closed_on_weekend": dataframe["closed_on_weekend"].astype(float),
        "weekly_operating_hours": dataframe["weekly_operating_hours"].astype(float),

        # Longitude bins
        **{f"lon_bin_{i}": dataframe[f"lon_bin_{i}"].astype(float) for i in range(20) if f"lon_bin_{i}" in dataframe.columns},
        # Latitude bins
        **{f"lat_bin_{i}": dataframe[f"lat_bin_{i}"].astype(float) for i in range(20) if f"lat_bin_{i}" in dataframe.columns},
    })


***
# Description (Many Assumptions)
***

## Temporal Latent Factor Model + Neural Collaborative Filtering (Dynamic Latent Factors)
**Note that $i$ indexes an item (a business) and $u$ indexes a user (a reviewer).**

**Modification**:

$$
\gamma_{u,k}(t) = \gamma_{u,k} + \alpha_{u,k} \cdot \text{dev}_u(t) + \gamma_{u,k,t}
$$

$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_{u,k}(t), \gamma_{i,k})
$$

**Optimization**:

$$
\arg \min_{\alpha, b, \gamma} \sum_{u,i,t} \left(\mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_{u,k}(t), \gamma_{i,k}) - R_{u,i,t} \right)^2 + \lambda \left[ \sum_u b_u^2 + \sum_i b_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

In [46]:
# Input pipelines for the temporal latent-factor model. Only the training split is
# shuffled (buffer of 1024 elements); both splits are served in batches of up to 4096 rows.
train_data = tlfm_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = tlfm_df_to_tf_dataset(test_df).batch(4096)

In [50]:
# Hyperparameters for the temporal latent-factor model.
embedding_dim = 30
dense_units = 30
l2_reg = 0.0201
time_bins= 30
# Arguments are positional; their order is dictated by TemporalDynamicVariants
# (imported from models.tdlf.temporal_dynamic_v, definition not shown in this notebook).
model = TemporalDynamicVariants(l2_reg, dense_units, embedding_dim, data_query, time_bins)

# Stop when validation RMSE has not improved by at least 0.001 for 10 epochs and
# roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

# Learning rate starts at 1e-2 and decays exponentially by a factor of 0.8 per
# 1000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2, 
    decay_steps=1000, 
    decay_rate=0.8
)

# NOTE(review): validation_data is the same test_data that is evaluated in the next
# cell, and early stopping selects the weights on it, so the reported test RMSE is
# optimistically biased. Consider holding out a separate validation split.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=500, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500


<keras.src.callbacks.History at 0x438229250>

In [51]:
# Metrics as reported by Keras on the held-out split.
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

# Recompute the RMSE by hand from per-batch predictions as an independent check.
predictions = []
actual_ratings = []
for batch in test_data:
    # Flatten both sides: an (n, 1) model output combined with (n,) targets would
    # silently broadcast to (n, n) and corrupt every metric computed below.
    predicted_ratings = model(batch).numpy().reshape(-1)
    actual_ratings.extend(batch["rating"].numpy().reshape(-1))
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)
assert predictions.shape == actual_ratings.shape, "expected exactly one prediction per rating"

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

# Exact-match accuracy after rounding only makes sense for integer-valued ratings.
if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 1.1441680192947388
RECHECK RMSE: 1.1441680726347163
Rounded Accuracy: 0.27213647441104794


***
# Description (Fewer Assumptions)
***

# Factorized Personalized Markov Chain

$$
p(i_{t+1} \mid i_t, i_{t-1}, \dots, i_1, u) = p(i_{t+1} \mid i_t, u)
$$

In Factorized Personalized Markov Chain (FPMC), we do this calculation by:

$$
f(i \mid u, j) = \underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{f(i \mid u)} + \underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{f(i \mid j)} + \underbrace{\gamma_{uj} \cdot \gamma_{ju}}_{f(u, j)}.
$$

Dropping the last term, which does not depend on the candidate next item $i$ and therefore does not change how items are ranked:

$$
f(i \mid u, j) = 
\underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{\text{user's compatibility with the next item}} + 
\underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{\text{next item's compatibility with the previous item}}
$$

In [58]:
# Rebind train_data / test_data to the FPMC feature layout (integer-encoded user,
# previous item and next item), replacing the TLFM pipelines built earlier.
# NOTE(review): fpmc_df_to_tf_dataset builds its StringLookup vocabularies from the
# frame it receives, so these two calls encode ids with different mappings and the
# same reviewer / business can get different indices in train and test. Verify that
# this is what FPMCVariants expects.
train_data = fpmc_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = fpmc_df_to_tf_dataset(test_df).batch(4096)

In [61]:
# Hyperparameters for the FPMC model; this run uses a constant learning rate.
embedding_dim = 32
l2_reg = 0.0201
lr = 1e-3
model = FPMCVariants(l2_reg=l2_reg, embedding_dim=embedding_dim, data_query=data_query)

# Stop when validation RMSE has not improved by at least 0.001 for 10 epochs and
# roll the model back to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

# NOTE(review): validation_data is the same test_data that is evaluated in the next
# cell, and early stopping selects the weights on it, so the reported test RMSE is
# optimistically biased. Consider holding out a separate validation split.
history = model.fit(
    train_data, 
    validation_data=test_data, 
    epochs=500, 
    callbacks=[early_stopping]
)



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [38]:
# Metrics as reported by Keras on the held-out split.
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

# Recompute the RMSE by hand from per-batch predictions as an independent check.
predictions = []
actual_ratings = []
for batch in test_data:
    # Flatten both sides: an (n, 1) model output combined with (n,) targets would
    # silently broadcast to (n, n) and corrupt every metric computed below.
    predicted_ratings = model(batch).numpy().reshape(-1)
    actual_ratings.extend(batch["rating"].numpy().reshape(-1))
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)
assert predictions.shape == actual_ratings.shape, "expected exactly one prediction per rating"

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

# Exact-match accuracy after rounding only makes sense for integer-valued ratings.
if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 1.1811875104904175
RECHECK RMSE: 1.1811874576036936
Rounded Accuracy: 0.31844029244516653
