# Running Models

In [1]:
%load_ext autoreload
%autoreload 2

import gzip
from collections import defaultdict
import math
import numpy as np
import string
import random
import string
from pathlib import Path

import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
import keras_tuner as kt

import warnings
warnings.filterwarnings("ignore")

***
# Description
***

# Factorized Personalized Markov Chain

$$
p(i_{t+1} \mid i_t, i_{t-1}, \ldots, i_1, u) = p(i_{t+1} \mid i_t, u)
$$

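In a Factorized Personalized Markov Chain (FPMC), the compatibility of a candidate next item $i$ with user $u$ and previous item $j$ is scored as a sum of pairwise embedding inner products: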

$$
f(i \mid u, j) = \underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{f(i \mid u)} + \underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{f(i \mid j)} + \underbrace{\gamma_{uj} \cdot \gamma_{ju}}_{f(u, j)}.
$$

The last term, $f(u, j)$, does not depend on the candidate item $i$, so it is the same for every candidate and can be dropped:

$$
f(i \mid u, j) = 
\underbrace{\gamma_{ui} \cdot \gamma_{iu}}_{\text{user's compatibility with the next item}} + 
\underbrace{\gamma_{ij} \cdot \gamma_{ji}}_{\text{next item's compatibility with the previous item}}
$$

In [12]:
from fpmc import FPMCModel

In [3]:
# Load the review data and the place metadata, join them on gmap_id, and
# build the frames used by the rest of the notebook.
base_path = Path.cwd().parent.parent
meta_file_path = base_path / "data" / "california_clean_metadata.json.gz"
data_path = base_path / "data" / "california_clean_data.json.gz"

meta_df = pd.read_json(meta_file_path, compression = 'gzip', lines = True)
df = pd.read_json(data_path, compression = 'gzip', lines = True)
df = df.reset_index()

# Attach the metadata to every review. The empty left suffix keeps the review
# columns under their original names; any metadata column that clashes with a
# review column is suffixed "_y" and dropped right below.
merged_df = df.merge(meta_df, on='gmap_id', how='inner', suffixes=('', '_y'))

# Filter and drop on the MERGED frame. Filtering on `df.columns` and dropping
# from `df` would silently throw the merge result away.
columns_to_remove = [col for col in merged_df.columns if col.endswith('_y')]
merged_clean = merged_df.drop(columns=columns_to_remove)

# Cast the ID columns to str before slicing `data_query`, so that
# `data_query` and `merged_clean` hold the IDs with the same dtype.
merged_clean["gmap_id"] = merged_clean["gmap_id"].astype(str)
merged_clean["reviewer_id"] = merged_clean["reviewer_id"].astype(str)
data_query = merged_clean[['gmap_id', 'reviewer_id', 'rating']]

merged_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158336 entries, 0 to 1158335
Data columns (total 21 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   index              1158336 non-null  int64  
 1   reviewer_id        1158336 non-null  object 
 2   reviewer_name      1158336 non-null  object 
 3   review_time(unix)  1158336 non-null  int64  
 4   rating             1158336 non-null  int64  
 5   text               650897 non-null   object 
 6   resp               73022 non-null    object 
 7   gmap_id            1158336 non-null  object 
 8   has_rep            1158336 non-null  bool   
 9   gmap_name          1158336 non-null  object 
 10  address            1158336 non-null  object 
 11  latitude           1158336 non-null  float64
 12  longitude          1158336 non-null  float64
 13  description        838359 non-null   object 
 14  category           1158336 non-null  object 
 15  avg_rating         1158336 non-n

In [13]:
# Sort data by user and timestamp and create (previous item -> next item) sequences
data = merged_clean.sort_values(by=["reviewer_id", "review_time(unix)"])
data["prev_item_id"] = data.groupby("reviewer_id")["gmap_id"].shift(1)
# The first review of each reviewer has no predecessor (shift yields NaN), so
# it cannot form a transition and is removed.
data = data.dropna(subset=["prev_item_id"])

# Instantiate StringLookup layers.
# The vocabularies are built from `merged_clean`, whose ID columns are
# explicitly cast to str, so the vocabulary entries have the same type as the
# values that are looked up later. It covers every row, including the first
# review of each reviewer, so every `prev_item_id` is in the vocabulary.
user_lookup = tf.keras.layers.StringLookup(
    vocabulary=merged_clean["reviewer_id"].unique(), mask_token=None
)
item_lookup = tf.keras.layers.StringLookup(
    vocabulary=merged_clean["gmap_id"].unique(), mask_token=None
)

# Apply StringLookup to the dataset
def df_to_tf_dataset(df):
    """Turn a frame of transitions into a `tf.data.Dataset` of feature dicts.

    Args:
        df: DataFrame with the columns `reviewer_id`, `prev_item_id`,
            `gmap_id` and `rating`.

    Returns:
        A dataset sliced row-wise from a dict with the keys `reviewer_id`,
        `prev_item_id`, `next_item_id` (all passed through the module-level
        lookup layers) and `rating` (cast to float).
    """
    features = {}
    features["reviewer_id"] = user_lookup(df["reviewer_id"])
    features["prev_item_id"] = item_lookup(df["prev_item_id"])
    # The item reviewed in this row is the "next" item of the transition.
    features["next_item_id"] = item_lookup(df["gmap_id"])
    features["rating"] = df["rating"].astype(float)
    return tf.data.Dataset.from_tensor_slices(features)

# Random 80/20 split of the transitions; the fixed random_state keeps the
# split itself reproducible.
train_df = data.sample(frac=0.8, random_state=42)
test_df = data.drop(index=train_df.index)

# Both pipelines use the same batch size; only the training stream is shuffled.
batch_size = 4096
train_data = df_to_tf_dataset(train_df).shuffle(1024).batch(batch_size)
test_data = df_to_tf_dataset(test_df).batch(batch_size)

# Quick visual check of a few training rows.
train_df.sample(3)

Unnamed: 0,index,reviewer_id,reviewer_name,review_time(unix),rating,text,resp,gmap_id,has_rep,gmap_name,...,longitude,description,category,avg_rating,num_of_reviews,price,hours,MISC,relative_results,prev_item_id
85136,85136,1.065456092e+20,redwine,1530505857212,5,(Translated by Google) This is the 8th time th...,,0x7c006d8a4c2dd1ab:0xe200a1829f0437e9,False,Four Paddle Condominium,...,-157.82924,,[Hotel],4.4,68,,,,"[0x7c006d89cc6ab0c3:0x356d5cfd0e454750, 0x7c00...",0x7c006e6de49e8a1f:0xa44b7af81f555ead
703347,703347,1.099487128e+20,Analu Morris,1559406429033,5,,,0x7c001265ac2c9e61:0x10baeee14a815372,False,Bubbies Homemade Ice Cream and Desserts,...,-157.704846,"Casual, counter-serve joint featuring mochi ic...","[Ice cream shop, Dessert shop]",4.5,378,$$,"[[Tuesday, 10AM–11PM], [Wednesday, 10AM–11PM],...",{'Accessibility': ['Wheelchair accessible park...,"[0x7c006e741cc3c1e3:0x5ee37990fd95cdda, 0x7c00...",0x7c0012675bf5ab2b:0x295c3d9f236971f7
692075,692075,1.009809914e+20,chan an,1523495350593,4,(Translated by Google) I went on a company tri...,,0x7c006d80a119ccfd:0xa67649fe6e24a623,False,Side Street Inn On Da Strip,...,-157.814106,No-frills eatery is a buzzy stop for American ...,[Restaurant],4.6,943,$$,"[[Tuesday, 4–9PM], [Wednesday, 4–9PM], [Thursd...","{'Service options': ['Curbside pickup', 'Deliv...","[0x7c006e08c20839b5:0x35691945f7610652, 0x7c00...",0x7c00660d84ab0d0b:0x1ae443215ed5b146


In [18]:
# Hyperparameters for this run.
embedding_dim = 90
l2_reg = 0.0201
lr = 1e-3

model = FPMCModel(data_query=data_query, embedding_dim=embedding_dim, l2_reg=l2_reg)

optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
model.compile(optimizer=optimizer)

# Stop when the validation RMSE has not improved by at least `min_delta` for
# `patience` epochs, and roll the model back to its best weights.
# NOTE(review): `validation_data` below is `test_data`, the same set the
# evaluation cell scores, so the reported test RMSE also drives model
# selection. Consider a separate validation split.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error",
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    train_data,
    epochs=50,
    validation_data=test_data,
    callbacks=[early_stopping],
)



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50


# Evaluation

In [19]:
# Score the model with Keras' own evaluation loop.
test_metrics = model.evaluate(test_data, return_dict=True)
print(f"Test RMSE: {test_metrics['root_mean_squared_error']}")

# Recompute the RMSE by hand as a sanity check on the metric above.
predictions = []
actual_ratings = []
for batch in test_data:
    # The model's output shape is not visible here. Flatten both sides so a
    # (batch, 1) output cannot broadcast against the 1-D ratings into an
    # (N, N) matrix in the subtraction below.
    predicted_ratings = np.ravel(model(batch).numpy())
    predictions.append(predicted_ratings)
    actual_ratings.append(np.ravel(batch["rating"].numpy()))

# Join whole per-batch arrays instead of extending Python lists one element
# at a time. An empty dataset still yields empty arrays.
predictions = np.concatenate(predictions) if predictions else np.array([])
actual_ratings = np.concatenate(actual_ratings) if actual_ratings else np.array([])

rmse = np.sqrt(np.mean((predictions - actual_ratings) ** 2))
print(f"RECHECK RMSE: {rmse}")

# Exact-match accuracy only makes sense when the ground truth is integer-valued.
if np.all(actual_ratings == actual_ratings.round()):
    correct = np.mean(predictions.round() == actual_ratings)
    print(f"Rounded Accuracy: {correct}")
else:
    print("Actual ratings are not integers, skipping rounded accuracy calculation.")

Test RMSE: 0.7975631356239319
RECHECK RMSE: 0.7975630972223813
Rounded Accuracy: 0.5646894294972281
