# Training a Recommendation System ⚙️

In [1]:
%load_ext autoreload
%autoreload 2
from pathlib import Path
import os
current_dir = Path.cwd().parent
os.chdir(current_dir)
print(f"Current working directory is now: {Path.cwd()}")

import numpy as np
import tensorflow as tf
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from rsdb.preprocess.data_preprocessing import get_clean_review_data
from rsdb.features.featuring import featuring_engineering

url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-California_10.json.gz"
meta_url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-California.json.gz"

Current working directory is now: /Users/kevinb/Desktop/cse158/RSDB


In [2]:
cleaned_df = get_clean_review_data(url,meta_url)
featured_df = featuring_engineering(cleaned_df)

/Users/kevinb/Desktop/cse158/RSDB/rsdb
Loading metadata from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/metadata.json.gz
Loaded 463034 metadata entries.
Processing review data from: /Users/kevinb/Desktop/cse158/RSDB/rsdb/data/data.json.gz


Processing chunks: 445it [04:22,  1.70it/s, Processed Rows=2119537]                         


Processed 2119537 review entries.
finished finding generalized categories. Takes 1.9211199283599854
finished bining locations. Takes 1.7784991264343262
finished featuring hours. Takes 10.298488855361938
finished creating model specalizied feature. Takes 3.8551278114318848


In [3]:
from rsdb.models.tdlf.temporal_dynamic_v import TemporalDynamicVariants
from rsdb.models.fpmc.fpmc_v import FPMCVariants
from rsdb.train import tdlf_df_to_tf_dataset, fpmc_df_to_tf_dataset

Let's view the featured data frame

In [4]:
featured_df

Unnamed: 0,reviewer_id,reviewer_name,review_time(unix),rating,text,resp,gmap_id,has_rep,gmap_name,address,...,lat_bin_17,lat_bin_18,lat_bin_19,hours_dict,closed_on_weekend,operating_hours,weekly_operating_hours,time_bin,user_mean_time,prev_item_id
1681942,1.000000e+20,Nidia Arce,-0.274298,5,good activities for the family's!,,0x80c2c5b647201a83:0x56c42931ca38e173,False,El Sereno Recreation Center,"El Sereno Recreation Center, 4721 Klamath St, ...",...,0,0,0,"{'Saturday': 'Closed', 'Sunday': 'Closed', 'Mo...",True,"{'Saturday': 'Closed', 'Sunday': 'Closed', 'Mo...",55.0,2540809,-0.393874,0x80c2c44c5a048859:0xc0434f92a925d033
1062018,1.000000e+20,Johnathan Kirkconnell,1.067820,5,Farmer boys always does a really good and serv...,,0x80db7e24219d21c5:0xe913b948a5d3484e,False,Farmer Boys,"Farmer Boys, 41700 Winchester Rd, Temecula, CA...",...,0,0,0,"{'Friday': '6AM–9PM', 'Saturday': '6:30AM–9PM'...",False,"{'Friday': (6, 21), 'Saturday': (6, 21), 'Sund...",105.0,2633997,1.088781,0x80dcd773a025114b:0x31108f717afb10a3
995204,1.000000e+20,Brittany Webb,-0.454389,5,,,0x80dce08b102d8dc9:0x1d9555185b9b6364,False,"Tommy Bahama Restaurant, Bar & Store","Tommy Bahama Restaurant, Bar & Store, Corona d...",...,0,0,0,"{'Saturday': '11AM–9PM', 'Sunday': '11AM–9PM',...",False,"{'Saturday': (11, 21), 'Sunday': (11, 21), 'Mo...",70.0,2528305,-0.362812,0x80dce0860254d0d1:0x6f405d7d31682430
1670961,1.000000e+20,Brittany Webb,-0.439019,5,"Excellent food, good service. One of the few p...",,0x80dce08605fbc3db:0xd2d3b6c46257455b,False,Fleming’s Prime Steakhouse & Wine Bar,"Fleming’s Prime Steakhouse & Wine Bar, 455 New...",...,0,0,0,"{'Saturday': '4–10PM', 'Sunday': '4–9PM', 'Mon...",False,"{'Saturday': (4, 22), 'Sunday': (4, 21), 'Mond...",125.0,2529372,-0.362812,0x80dce08b102d8dc9:0x1d9555185b9b6364
479617,1.000000e+20,Brittany Webb,-0.316381,4,,,0x80dce08bb087fcfd:0xcb418a5c11354e1e,False,Bristol Farms,"Bristol Farms, 810 Avocado Ave, Newport Beach,...",...,0,0,0,"{'Sunday': '7AM–10PM', 'Monday': '7AM–10PM', '...",False,"{'Sunday': (7, 22), 'Monday': (7, 22), 'Tuesda...",105.0,2537887,-0.362812,0x80dce08605fbc3db:0xd2d3b6c46257455b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1642729,1.184467e+20,Sam Bledsoe,-5.777381,4,"Innovative, creative and constantly changing m...",,0x809ad0dee7757ff1:0x8e3bc7bff4e43fb4,False,Magpie Café,"Magpie Café, 1601 16th St, Sacramento, CA 95814",...,0,0,0,"{'Saturday': '12AM–8PM', 'Sunday': '12–8PM', '...",False,"{'Saturday': (0, 20), 'Sunday': (12, 20), 'Mon...",56.0,2158714,-6.912044,0x808580d7481293a5:0x72e362536fe79765
1295348,1.184467e+20,C.D- MACK,0.209885,5,,,0x80dcaf42e812758d:0x249236a0c15635a6,False,Guitar Center,"Guitar Center, 2550 Canyon Springs Pkwy Suite ...",...,0,0,0,"{'Saturday': '10AM–9PM', 'Sunday': '11AM–7PM',...",False,"{'Saturday': (10, 21), 'Sunday': (11, 19), 'Mo...",74.0,2574428,-0.809298,0x80c2cb604e81a429:0x64ac626154cb85b8
367467,1.184467e+20,C.D- MACK,0.209887,4,,,0x80c2cb5f3b70bbc7:0x76e71218cc8b3829,False,Winchell's Donut House,"Winchell's Donut House, 1300 E Alondra Blvd, C...",...,0,0,0,"{'Thursday': 'Open 24 hours', 'Friday': 'Open ...",False,"{'Thursday': (0, 24), 'Friday': (0, 24), 'Satu...",0.0,2574428,-0.809298,0x80dcaf42e812758d:0x249236a0c15635a6
552369,1.184467e+20,C.D- MACK,0.209903,3,,,0x80c2cc9e481f0cb5:0x590a63d81901a3e7,False,Not Your Mama's Kitchen,"Not Your Mama's Kitchen, 1701 E Compton Blvd, ...",...,0,0,0,"{'Thursday': '11AM–6:30PM', 'Friday': '11AM–6:...",False,"{'Thursday': (11, 18), 'Friday': (11, 18), 'Sa...",32.0,2574429,-0.809298,0x80c2cb5f3b70bbc7:0x76e71218cc8b3829


In [5]:
cleaned_df['review_time(unix)'] = cleaned_df['review_time(unix)'] / 1000
cleaned_df['review_time(unix)'] = pd.to_datetime(cleaned_df['review_time(unix)'], unit='s', errors='coerce')
cleaned_df["review_time(unix)"].dt.year.value_counts()

2019    665883
2018    543590
2020    361181
2017    290311
2021    135756
2016     90690
2015     15564
2014      5688
2013      4330
2012      2531
2011      2394
2010      1410
2009        66
2008        56
2007        36
2005        28
2006        23
Name: review_time(unix), dtype: int64

# Prepare Features

In [None]:
data_query = featured_df[['gmap_id', 'reviewer_id', 'rating']]
train_df = featured_df.sample(frac=0.8, random_state=42)
test_df = featured_df.drop(train_df.index)
featured_df.info()

# Temporal Dynamic Latent Factor Model With Neural Correlative Variants (TDLF-V)

This is a model with many assumptions. Notice that item $i$ refers to the business and user $u$ refers to the user.


$$
\hat{r}_{u,i,t} = \mu + \beta_i + \beta_i(t) + \beta_u + f(\gamma_u, \gamma_i)
$$

$$
\beta_i(t) = \beta_i + \beta_{i,\text{bin}}(t) + \beta_{i,\text{period}}(t)
$$

Notice that we are not using a deviation term in here.


**Static User/Item Bias**:
- Static bias for item $ i $:
  $$
  \beta_i = \text{Embedding}(\text{gmap\_id})
  $$
- Static bias for user $ u $:
  $$
  \beta_u = \text{Embedding}(\text{reviewer\_id})
  $$


**Latent Interaction**:
- User embedding:
  $$
  \gamma_u = \text{Embedding}(\text{reviewer\_id})
  $$
- Item embedding:
  $$
  \gamma_i = \text{Embedding}(\text{gmap\_id})
  $$
- Interaction between user and item embeddings is the following where $ \text{NN} $ is a dense neural network:
  $$
  f(\gamma_u, \gamma_i) = \text{NN}([\gamma_u, \gamma_i])
  $$

**Final Prediction**

$$
\gamma_{u,k}(t) = \gamma_{u,k} + \gamma_{u,k,t}
$$

$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + f(\gamma_{u,k}(t), \gamma_{i,k})
$$

**Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left(\mu + b_i + b_i(t) + b_u + f(\gamma_{u,k}(t), \gamma_{i,k}) - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

**Variants**:

$$
\hat{r}_{u,i,t} = 
\underbrace{\mu}_{\text{Global bias}} + 
\underbrace{b_i}_{\text{Static item bias}} + 
\underbrace{b_i(t)}_{\text{Dynamic item bias}} + 
\underbrace{b_u}_{\text{Static user bias}} + 
\underbrace{f(\gamma_{u,k}(t), \gamma_{i,k})}_{\text{Interaction score}} + 
\underbrace{\mathbf{w}_{\text{item}}^\top \mathbf{F}_{\text{item}}}_{\text{Item-specific feature effect}}
$$

**Variants Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma, \mathbf{w}} \sum_{u,i} 
\left(
\mu + b_i + b_i(t) + b_u + f(\gamma_{u,k}(t), \gamma_{i,k}) + \mathbf{w}_{\text{item}}^\top \mathbf{F}_{\text{item}} - R_{u,i}
\right)^2 
+ 
\lambda \left( \sum_u b_u^2 + \sum_i b_i^2 + \sum_u \|\gamma_u\|_2^2 + \sum_i \|\gamma_i\|_2^2 + \sum \|\mathbf{w}\|_2^2 \right).
$$


In [46]:
train_data = tdlf_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = tdlf_df_to_tf_dataset(test_df).batch(4096)

In [None]:
# demo purpose
embedding_dim = 30
dense_units = 30
l2_reg = 1e-3
time_bins= 20
model = TemporalDynamicVariants(l2_reg, dense_units, embedding_dim, data_query, time_bins)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2, 
    decay_steps=1000, 
    decay_rate=0.8
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=500, validation_data=test_data, callbacks=[early_stopping])

In [None]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

# Factorized Personalized Markov Chain Variants (FMPC-V)

This is a model with less assumptions.

$$
p(i_{t+1} \mid i_t, u) = p(i_{t+1} \mid i_t, u)
$$

In Factorized Personalized Markov Chain (FPMC), we do **Tensor decomposition** and we get the following:

$$
f(i \mid u, j) = \underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{\mathclap{f(i \mid u)}} + \underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{\mathclap{f(i \mid j)}} + \underbrace{\gamma_{uj} \cdot \gamma_{ju}}_{\mathclap{f(u, j)}}.
$$

Neglecting terms that is trvial (user's compatibility with previous terms that user already rated):

$$
f(i \mid u, j) = 
\underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{\text{user's compatibility with the next item}} + 
\underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{\text{next item's compatibility with the previous item}}
$$

For our variants:

$$
f(i \mid u, j, \mathbf{F}) = 
\underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{\text{user, next-item's compatibility}} + 
\underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{\text{next, prev item's compatibility}} + 
\underbrace{\beta_u + \beta_i}_{\text{user and next-item biases}} + 
\underbrace{\mathbf{w}^\top \mathbf{F}_{\text{cat}}}_{\text{categorical embeddings}} + 
\underbrace{\mathbf{v}^\top \mathbf{F}_{\text{num}}}_{\text{numerical embeddings}} + 
\underbrace{b_g}_{\text{global bias}}
$$


Where
- $\gamma_{ui}, \gamma_{iu}, \gamma_{ij}, \gamma_{ji}: \text{Embedding vectors capturing user-item and item-item interactions.}$
- $\beta_u, \beta_i: \text{Bias terms for the user and the next item.}$
- $\mathbf{F}_{\text{cat}}: \text{Categorical feature embeddings.}$
- $\mathbf{F}_{\text{num}}: \text{Dense representations of numerical features (e.g., from a dense layer).}$
- $\mathbf{w}, \mathbf{v}: \text{Learnable weights for categorical and numerical features, respectively.}$
- $b_g: \text{Global bias.}$


In [58]:
train_data = fpmc_df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = fpmc_df_to_tf_dataset(test_df).batch(4096)

In [None]:
# for demo purpose
embedding_dim = 30
l2_reg = 1e-3
lr = 1e-3
model = FPMCVariants(l2_reg=l2_reg, embedding_dim=embedding_dim, data_query=data_query)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

history = model.fit(
    train_data, 
    validation_data=test_data, 
    epochs=500, 
    callbacks=[early_stopping]
)

In [None]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")