# Running Models

In [1]:
import gzip
from collections import defaultdict
import math
import numpy as np
import string
import random
import string
from pathlib import Path

import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import keras_tuner as kt

import warnings
warnings.filterwarnings("ignore")

***
# Description
***

## Latent Factor Models + Neural Corrolative Filteration

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left( \alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

Single terms:
- **Global bias, $ \alpha $**: the overall average rating across all users and items.
- **User bias, $ \beta_u $**: captures the tendency of user $ u $ to rate items higher or lower than the global average.
- **Item bias, $ \beta_i $**: inherent popularity or quality of item $ i $.
- **User and item latent factors, $ \gamma_u $ and $ \gamma_i $**: capture the latent preferences of user $ u $ and the latent characteristics of item $ i $, respectively.

Combinations of terms:
- **Prediction error**: The expression $ \left( \alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i - R_{u,i} \right)^2 $ measures the squared difference between the predicted rating $ (\alpha + \beta_u + \beta_i + \gamma_u \cdot \gamma_i) $ and the actual rating $ R_{u,i} $ for user $ u $ and item $ i $.
- **Regularization term**: The term $ \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right] $ penalizes large values of the biases and latent factors to prevent overfitting. Here:
  - $ \sum_u \beta_u^2 $ and $ \sum_i \beta_i^2 $ apply regularization to the user and item biases, respectively.
  - $ \sum_i \left\| \gamma_i \right\|_2^2 $ and $ \sum_u \left\| \gamma_u \right\|_2^2 $ apply regularization to the latent factors of items and users, respectively.
- **Regularization coefficient, $ \lambda $**: This parameter controls the strength of the regularization, balancing the fit to the data with the complexity of the model.


In [2]:
from latent_factor import LatentFactorModel

In [3]:
base_path = Path.cwd().parent.parent
meta_file_path = base_path / "data" / "california_clean_metadata.json.gz"
data_path = base_path / "data" / "california_clean_data.json.gz"

meta_df = pd.read_json(meta_file_path, compression = 'gzip', lines = True)
df = pd.read_json(data_path, compression = 'gzip', lines = True)
df = df.reset_index()

merged_df = df.merge(meta_df, on='gmap_id', how='inner')
columns_to_remove = [col for col in df.columns if col.endswith('_y')]
merged_clean = df.drop(columns=columns_to_remove)
merged_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158336 entries, 0 to 1158335
Data columns (total 21 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   index              1158336 non-null  int64  
 1   reviewer_id        1158336 non-null  float64
 2   reviewer_name      1158336 non-null  object 
 3   review_time(unix)  1158336 non-null  int64  
 4   rating             1158336 non-null  int64  
 5   text               650897 non-null   object 
 6   resp               73022 non-null    object 
 7   gmap_id            1158336 non-null  object 
 8   has_rep            1158336 non-null  bool   
 9   gmap_name          1158336 non-null  object 
 10  address            1158336 non-null  object 
 11  latitude           1158336 non-null  float64
 12  longitude          1158336 non-null  float64
 13  description        838359 non-null   object 
 14  category           1158336 non-null  object 
 15  avg_rating         1158336 non-n

In [4]:
merged_clean["gmap_id"] = merged_clean["gmap_id"].astype(str)
merged_clean["reviewer_id"] = merged_clean["reviewer_id"].astype(str)

train_df = merged_clean.sample(frac=0.8, random_state=42)
test_df = merged_clean.drop(train_df.index)

def df_to_tf_dataset(dataframe):
    return tf.data.Dataset.from_tensor_slices({
        "gmap_id": dataframe["gmap_id"].values,
        "reviewer_id": dataframe["reviewer_id"].values,
        "rating": dataframe["rating"].values
    })

train_data = df_to_tf_dataset(train_df).shuffle(1024).batch(4096)
test_data = df_to_tf_dataset(test_df).batch(4096)
data_query = merged_clean[['gmap_id', 'reviewer_id', 'rating']]

In [31]:
embedding_dim = 32
dense_units = 32
l2_reg = 0.0201
model = LatentFactorModel(l2_reg, dense_units, embedding_dim, data_query)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, 
    decay_steps=10000, 
    decay_rate=0.9
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=200, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/200








Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

<keras.src.callbacks.History at 0x3bd312100>

Evaluation

In [32]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 0.8058447241783142
RECHECK RMSE: 0.8058448037566474
Rounded Accuracy: 0.5649272447090005


***
# Description
***

## Temporal Latent Factor Model + Neural Corrolative (Static Latent)
**Notice that item $i$ refers to the business and user $u$ refers to the user.**

$$
\hat{r}_{u,i,t} = \mu + \beta_i + \beta_i(t) + \beta_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i)
$$

$$
\beta_i(t) = \beta_i + \beta_{i,\text{bin}}(t) + \beta_{i,\text{period}}(t)
$$


**Static User/Item Bias**:
- Static bias for item $ i $:
  $$
  \beta_i = \text{Embedding}(\text{gmap\_id})
  $$
- Static bias for user $ u $:
  $$
  \beta_u = \text{Embedding}(\text{reviewer\_id})
  $$


**Temporal User/Item Bias**:
- Temporal bias for item $ i $ based on time bins:
  $$
  \beta_i(t) = \text{Embedding}(\text{time\_bin})
  $$
- Temporal deviation for user $ u $:
  $$
  \text{dev}_u(t) = \text{sgn}(t - \bar{t}_u) \cdot |t - \bar{t}_u|^{0.4}
  $$
  - $ t $: Timestamp of the rating.
  - $ \bar{t}_u $: Mean timestamp of user $ u $'s ratings.
  - $ \text{sgn}(x) $: Sign function, returning $ -1 $ if $ x < 0 $, and $ 1 $ otherwise.
- Scaled user deviation:
  $$
  \alpha_u \cdot \text{dev}_u(t)
  $$
  - $ \alpha_u $: Trainable scaling factor for user $ u $.

**Latent Interaction**:
- User embedding:
  $$
  \gamma_u = \text{Embedding}(\text{reviewer\_id})
  $$
- Item embedding:
  $$
  \gamma_i = \text{Embedding}(\text{gmap\_id})
  $$
- Interaction between user and item embeddings is the following where $ \text{NN} $ is a dense neural network:
  $$
  f(\gamma_u, \gamma_i) = \text{NN}([\gamma_u, \gamma_i])
  $$

**Final Prediction**
$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i)
$$

**Loss Function**:
$$
\mathcal{L} = \frac{1}{N} \sum_{(u, i, t) \in \text{Train}} \left( r_{u,i,t} - \hat{r}_{u,i,t} \right)^2
$$

**Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left(\mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_u, \gamma_i) - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

In [5]:
from temporal_static import TemporalStaticModel

In [14]:
data_query = merged_clean
train_df = merged_clean

# Weekly bins
data_query["time_bin"] = data_query["review_time(unix)"] // (7 * 24 * 3600)
user_mean_times = data_query.groupby("reviewer_id")["review_time(unix)"].mean()
data_query["user_mean_time"] = data_query["reviewer_id"].map(user_mean_times)

train_data = tf.data.Dataset.from_tensor_slices({
    "reviewer_id": data_query["reviewer_id"].astype(str),
    "gmap_id": data_query["gmap_id"].astype(str),
    "time": data_query["review_time(unix)"].astype(int),
    "time_bin": data_query["time_bin"].astype(int),
    "user_mean_time": data_query["user_mean_time"],
    "rating": data_query["rating"]
}).batch(4096).cache()

train_data = train_data.take(int(0.8 * len(data_query)))
test_data = train_data.skip(int(0.8 * len(data_query)))

In [17]:
embedding_dim = 90
dense_units = 90
l2_reg = 0.0201
time_bins= 100
model = TemporalStaticModel(l2_reg, dense_units, embedding_dim, data_query, time_bins)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", 
    patience=10,
    min_delta=0.001,
    restore_best_weights=True
)

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-4, 
    decay_steps=10000, 
    decay_rate=0.9
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule))
model.fit(train_data, epochs=500, validation_data=test_data, callbacks=[early_stopping])



Epoch 1/500
 65/283 [=====>........................] - ETA: 2s - root_mean_squared_error: 2118.0627 - loss: 4486189.3385 - regularization_loss: 98.9962 - total_loss: 4486288.3423

In [37]:
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

predictions = []
actual_ratings = []
for batch in test_data:
    predicted_ratings = model(batch).numpy()
    actual_ratings.extend(batch["rating"].numpy())
    predictions.extend(predicted_ratings)

predictions = np.array(predictions)
actual_ratings = np.array(actual_ratings)

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 0.7611459493637085
RECHECK RMSE: 0.7611459465026031
Rounded Accuracy: 0.5991099293382312


***
# Description
***

## Temporal Latent Factor Model + Neural Corrolative (Dynamic Latent)
**Notice that item $i$ refers to the business and user $u$ refers to the user.**

**Modification**:

$$
\gamma_{u,k}(t) = \gamma_{u,k} + \alpha_{u,k} \cdot \text{dev}_u(t) + \gamma_{u,k,t}
$$

$$
\hat{r}_{u,i,t} = \mu + b_i + b_i(t) + b_u + \alpha_u \cdot \text{dev}_u(t) + f(\gamma_{u,k}(t), \gamma_{i,k})
$$

**Optimization**:

$$
\arg \min_{\alpha, \beta, \gamma} \sum_{u,i} \left(\gamma_{u,k} + \alpha_{u,k} \cdot \text{dev}_u(t) + \gamma_{u,k,t} - R_{u,i} \right)^2 + \lambda \left[ \sum_u \beta_u^2 + \sum_i \beta_i^2 + \sum_i \left\| \gamma_i \right\|_2^2 + \sum_u \left\| \gamma_u \right\|_2^2 \right]
$$

In [76]:
from temporal_dynamic import *