# Running Models

In [2]:
%load_ext autoreload
%autoreload 2

import gzip
from collections import defaultdict
import math
import numpy as np
import string
import random
import string
from pathlib import Path

import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import keras_tuner as kt

import warnings
warnings.filterwarnings("ignore")

***
# Description
***

## Latent Factor Models + Neural Corrolative Filteration

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left( \alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

Single terms:
- **Global bias, $ \alpha $**: the overall average rating across all users and items.
- **User bias, $ \beta_u $**: captures the tendency of user $ u $ to rate items higher or lower than the global average.
- **Item bias, $ \beta_i $**: inherent popularity or quality of item $ i $.
- **User and item latent factors, $ \gamma_u $ and $ \gamma_i $**: capture the latent preferences of user $ u $ and the latent characteristics of item $ i $, respectively.

Combinations of terms:
- **Prediction error**: The expression $ \left( \alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i - R_{u,i} \right)^2 $ measures the squared difference between the predicted rating $ (\alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i) $ and the actual rating $ R_{u,i} $ for user $ u $ and item $ i $.
- **Regularization term**: The term $ \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right] $ penalizes large values of the biases and latent factors to prevent overfitting. Here:
  - $ \sum_u \beta_u^2 $ and $ \sum_i \beta_i^2 $ apply regularization to the user and item biases, respectively.
  - $ \sum_i \left\| \gamma_i \right\|_2^2 $ and $ \sum_u \left\| \gamma_u \right\|_2^2 $ apply regularization to the latent factors of items and users, respectively.
- **Regularization coefficient, $ \lambda $**: This parameter controls the strength of the regularization, balancing the fit to the data with the complexity of the model.


In [3]:
from latent_factor import LatentFactorModel

In [6]:
base_path = Path.cwd().parent.parent
meta_file_path = base_path / "data" / "california_clean_metadata.json.gz"
data_path = base_path / "data" / "california_clean_data.json.gz"

meta_df = pd.read_json(meta_file_path, compression = 'gzip', lines = True)
df = pd.read_json(data_path, compression = 'gzip', lines = True)
df = df.reset_index()

merged_df = df.merge(meta_df, on='gmap_id', how='inner')
columns_to_remove = [col for col in df.columns if col.endswith('_y')]
merged_clean = df.drop(columns=columns_to_remove)

data_query = merged_clean[['gmap_id', 'reviewer_id', 'rating']]
merged_clean["gmap_id"] = merged_clean["gmap_id"].astype(str)
merged_clean["reviewer_id"] = merged_clean["reviewer_id"].astype(str)

merged_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158336 entries, 0 to 1158335
Data columns (total 21 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   index              1158336 non-null  int64  
 1   reviewer_id        1158336 non-null  object 
 2   reviewer_name      1158336 non-null  object 
 3   review_time(unix)  1158336 non-null  int64  
 4   rating             1158336 non-null  int64  
 5   text               650897 non-null   object 
 6   resp               73022 non-null    object 
 7   gmap_id            1158336 non-null  object 
 8   has_rep            1158336 non-null  bool   
 9   gmap_name          1158336 non-null  object 
 10  address            1158336 non-null  object 
 11  latitude           1158336 non-null  float64
 12  longitude          1158336 non-null  float64
 13  description        838359 non-null   object 
 14  category           1158336 non-null  object 
 15  avg_rating         1158336 non-n

In [5]:
train_df = merged_clean.sample(frac=0.8, random_state=42)
test_df = merged_clean.drop(train_df.index)

def df_to_tf_dataset(dataframe):
    return tf.data.Dataset.from_tensor_slices({
        "gmap_id": dataframe["gmap_id"].values,
        "reviewer_id": dataframe["reviewer_id"].values,
        "rating": dataframe["rating"].values
    })

train_data = df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = df_to_tf_dataset(test_df).batch(4096)

In [31]:
embedding_dim = 32
dense_units = 32
l2_reg = 0.0201
model = LatentFactorModel(l2_reg, dense_units, embedding_dim, data_query)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, 
    decay_steps=10000, 
    decay_rate=0.9
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=200, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/200








Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.src.callbacks.History at 0x3bd312100>

Evaluation

In [32]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 0.8058447241783142
RECHECK RMSE: 0.8058448037566474
Rounded Accuracy: 0.5649272447090005


***
# Description
***

## Temporal Latent Factor Model + Neural Corrolative (Static Latent)
**Notice that item $i$ refers to the business and user $u$ refers to the user.**

$$
\hat{r}_{u,i,t} = \mu + \beta_i + \beta_i(t) + \beta_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i)
$$

$$
\beta_i(t) = \beta_i + \beta_{i,\text{bin}}(t) + \beta_{i,\text{period}}(t)
$$


**Static User/Item Bias**:
- Static bias for item $ i $:
  $$
  \beta_i = \text{Embedding}(\text{gmap\_id})
  $$
- Static bias for user $ u $:
  $$
  \beta_u = \text{Embedding}(\text{reviewer\_id})
  $$


**Temporal User/Item Bias**:
- Temporal bias for item $ i $ based on time bins:
  $$
  \beta_i(t) = \text{Embedding}(\text{time\_bin})
  $$
- Temporal deviation for user $ u $:
  $$
  \text{dev}_u(t) = \text{sgn}(t - \bar{t}_u) \cdot |t - \bar{t}_u|^{0.4}
  $$
  - $ t $: Timestamp of the rating.
  - $ \bar{t}_u $: Mean timestamp of user $ u $'s ratings.
  - $ \text{sgn}(x) $: Sign function, returning $ -1 $ if $ x < 0 $, and $ 1 $ otherwise.
- Scaled user deviation:
  $$
  \alpha_u \cdot \text{dev}_u(t)
  $$
  - $ \alpha_u $: Trainable scaling factor for user $ u $.

**Latent Interaction**:
- User embedding:
  $$
  \gamma_u = \text{Embedding}(\text{reviewer\_id})
  $$
- Item embedding:
  $$
  \gamma_i = \text{Embedding}(\text{gmap\_id})
  $$
- Interaction between user and item embeddings is the following where $ \text{NN} $ is a dense neural network:
  $$
  f(\gamma_u, \gamma_i) = \text{NN}([\gamma_u, \gamma_i])
  $$

**Final Prediction**
$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i)
$$

**Loss Function**:
$$
\mathcal{L} = \frac{1}{N} \sum_{(u, i, t) \in \text{Train}} \left( r_{u,i,t} - \hat{r}_{u,i,t} \right)^2
$$

**Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left(\mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i) - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

In [28]:
from temporal_static import TemporalStaticModel
data_query = merged_clean

Preparation and normalization (key)

In [None]:
# Weekly bins for item bias
data_query["time_bin"] = data_query["review_time(unix)"] // (7 * 24 * 3600) # same week in same bin

# Mean timestamp for user bias
user_mean_times = data_query.groupby("reviewer_id")["review_time(unix)"].mean()
data_query["user_mean_time"] = data_query["reviewer_id"].map(user_mean_times)

train_df = data_query.sample(frac=0.8, random_state=42)
test_df = data_query.drop(train_df.index)

# Calculate mean and std for normalization
time_mean, time_std = train_df["review_time(unix)"].mean(), train_df["review_time(unix)"].std()
user_mean_time_mean, user_mean_time_std = train_df["user_mean_time"].mean(), train_df["user_mean_time"].std()

# Normalize the training and test data
train_df["review_time(unix)"] = (train_df["review_time(unix)"] - time_mean) / time_std
test_df["review_time(unix)"] = (test_df["review_time(unix)"] - time_mean) / time_std
train_df["user_mean_time"] = (train_df["user_mean_time"] - user_mean_time_mean) / user_mean_time_std
test_df["user_mean_time"] = (test_df["user_mean_time"] - user_mean_time_mean) / user_mean_time_std

Put in TF format

In [31]:
# Function to convert DataFrame to TensorFlow dataset
def df_to_tf_dataset(dataframe):
    return tf.data.Dataset.from_tensor_slices({
        "reviewer_id": dataframe["reviewer_id"].astype(str),
        "gmap_id": dataframe["gmap_id"].astype(str),
        "time": dataframe["review_time(unix)"].astype(float),
        "time_bin": dataframe["time_bin"].astype(float),
        "user_mean_time": dataframe["user_mean_time"],
        "rating": dataframe["rating"]
    })

# Create TensorFlow datasets
train_data = df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = df_to_tf_dataset(test_df).batch(4096)

train_df[["reviewer_id", "gmap_id", "review_time(unix)", "time_bin", "user_mean_time", "rating"]].sample(6)

Unnamed: 0,reviewer_id,gmap_id,review_time(unix),time_bin,user_mean_time,rating
196909,1.0916236319999998e+20,0x7954c9dceb584fe7:0x78519543b84f347f,0.891973,2616794,0.715646,5
790084,1.090580207e+20,0x7c00688823ee72fb:0x40d02d372515187e,-0.017285,2559611,-0.020666,5
13726,1.181338133e+20,0x7c006df045a36271:0x821f7e897cfe859f,1.195468,2635880,0.9427,5
6863,1.016425418e+20,0x79524b7280778479:0x60eed0aa0e92f995,-0.48322,2530309,-0.100013,5
974016,1.149971521e+20,0x7c00660d84ab0d0b:0x1ae443215ed5b146,0.291537,2579033,0.089732,5
557289,1.031438527e+20,0x7c0065ffde090f89:0xc2eb0075cafc16d,-0.467079,2531324,-0.63308,3


In [36]:
embedding_dim = 32
dense_units = 32
l2_reg = 0.0201
time_bins= 10
model = TemporalStaticModel(l2_reg, dense_units, embedding_dim, data_query, time_bins)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3, 
    decay_steps=1000, 
    decay_rate=0.8
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=500, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x8a39abd90>

In [37]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 0.8352956175804138
RECHECK RMSE: 0.8338168255728705
Rounded Accuracy: 0.5068093427203701


***
# Description
***

## Temporal Latent Factor Model + Neural Corrolative (Dynamic Latent)
**Notice that item $i$ refers to the business and user $u$ refers to the user.**

**Modification**:

$$
\gamma_{u,k}(t) = \gamma_{u,k} + \alpha_{u,k} \cdot \text{dev}_u(t) + \gamma_{u,k,t}
$$

$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_{u,k}(t), \gamma_{i,k})
$$

**Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left(\mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_{u,k}(t), \gamma_{i,k}) - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

In [39]:
from temporal_dynamic import TemporalDynamicModel

In [42]:
embedding_dim = 90
dense_units = 90
l2_reg = 0.0201
time_bins= 10
model = TemporalDynamicModel(l2_reg, dense_units, embedding_dim, data_query, time_bins)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2, 
    decay_steps=1000, 
    decay_rate=0.8
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=500, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500


<keras.src.callbacks.History at 0x83b595310>

In [43]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

 1/57 [..............................] - ETA: 0s - root_mean_squared_error: 0.8852 - loss: 0.7836 - regularization_loss: 0.0147 - total_loss: 0.7983

Test RMSE: 0.8028972148895264
RECHECK RMSE: 0.8028971318795725
Rounded Accuracy: 0.5753128412764874
