# SVD

In [1]:
import os
import sys
import surprise
import scrapbook as sb
import pandas as pd

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions

print(f"System version: {sys.version}")
print(f"Surprise version: {surprise.__version__}")

System version: 3.8.0 (tags/v3.8.0:fa919fd, Oct 14 2019, 19:37:50) [MSC v.1916 64 bit (AMD64)]
Surprise version: 1.1.3


In [42]:
df = pd.read_csv('../datasets/reviews-cleaned.csv')
df = df.rename(columns={'user':'userID', 'name':'itemID'})
df['rating'] = df['rating'] / 2
df.head()

Unnamed: 0,userID,itemID,rating
0,1 Family Meeple,10 Days in Europe,2.05
1,1 Family Meeple,12 Days,3.5
2,1 Family Meeple,7 Wonders,3.25
3,1 Family Meeple,A Column of Fire,2.5
4,1 Family Meeple,A Feast for Odin,5.0


In [43]:
train, test = python_random_split(df, 0.75)
train_set = surprise.Dataset.load_from_df(train, reader=surprise.Reader('ml-100k')).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x184ba128190>

In [None]:
from surprise import accuracy

svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=40, verbose=True)

with Timer() as train_time:
    for epoch in range(30):  # Set the maximum number of epochs
        svd.fit(train_set)
        
        # Make predictions on the test set
        predictions = svd.test(test_set)
        
        # Compute RMSE
        current_rmse = accuracy.rmse(predictions)
        print(f"Epoch {epoch + 1} - Test RMSE: {current_rmse}")
        
        # Check if RMSE is increasing
        if current_rmse > prev_rmse + tolerance:
            print("Stopping training as RMSE is increasing.")
            break
        
        prev_rmse = current_rmse

In [66]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=40, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)
    
print(f"Took {train_time.interval} seconds for training.")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3


KeyboardInterrupt: 

In [64]:
predictions = predict(svd, test, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,twohu2001,Dream Home,3.40039
1,strubs42,Chimera Station,3.426129
2,Magnus the Blue,Ticket to Ride: The Card Game,2.855045
3,dontylw,Thurn and Taxis,3.40253
4,Hessu68,Room 25,2.968334


In [61]:
predictions = predict(svd, test, usercol='userID', itemcol='itemID')
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,twohu2001,Dream Home,3.392358
1,strubs42,Chimera Station,3.415
2,Magnus the Blue,Ticket to Ride: The Card Game,2.775777
3,dontylw,Thurn and Taxis,3.380153
4,Hessu68,Room 25,2.90688


In [57]:
# df_test = test[:1]
# df_test['userID'] = 'kop'
# df_test['itemID'] = 'Ticket to Ride: The Card Game'

In [58]:
# predictions = predict(svd, df_test, usercol='userID', itemcol='itemID')
# predictions.head()

In [13]:
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(svd, train, usercol='userID', itemcol='itemID', remove_seen=True)
    
print(f"Took {test_time.interval} seconds for prediction.")

Took 371.59553730000005 seconds for prediction.


In [65]:
TOP_K = 10

eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

RMSE:		0.559294
MAE:		0.412656
rsquared:	0.397795
exp var:	0.397801


In [62]:
TOP_K = 10

eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

RMSE:		0.559032
MAE:		0.412289
rsquared:	0.398359
exp var:	0.398361


In [59]:
TOP_K = 10

eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)


print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

print('----')

print("MAP:\t\t%f" % eval_map,
      "NDCG:\t\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

RMSE:		0.590728
MAE:		0.437791
rsquared:	0.328201
exp var:	0.328201
----


# Standard VAE	

In [2]:
import warnings
warnings.filterwarnings('ignore')

import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns
# sns.set()
import tensorflow as tf
import keras
from tqdm import tqdm 

from recommenders.utils.timer import Timer
# from recommenders.datasets import movielens
from recommenders.datasets.split_utils import min_rating_filter_pandas
from recommenders.datasets.python_splitters import numpy_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED

from recommenders.datasets.sparse import AffinityMatrix
from recommenders.utils.python_utils import binarize
from recommenders.models.vae.standard_vae import StandardVAE

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))
print("Keras version: {}".format(keras.__version__))

ModuleNotFoundError: No module named 'flatbuffers'

In [2]:
# top 100 items to recommend
TOP_K = 100

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '1m'

# Model parameters
HELDOUT_USERS = 10 # CHANGE FOR DIFFERENT DATASIZE
INTERMEDIATE_DIM = 200
LATENT_DIM = 70
EPOCHS = 400
BATCH_SIZE = 100

# temporary Path to save the optimal model's weights
tmp_dir = '../model'
WEIGHTS_PATH = os.path.join(tmp_dir, "svae_weights.hdf5")

SEED = 98765

In [3]:
df = pd.read_csv('../datasets/reviews-cleaned.csv')
df = df[:100000]
df = df.rename(columns={'user':'userID', 'name':'itemID'})
df.head()

Unnamed: 0,userID,itemID,rating
0,1 Family Meeple,10 Days in Europe,4.1
1,1 Family Meeple,12 Days,7.0
2,1 Family Meeple,7 Wonders,6.5
3,1 Family Meeple,A Column of Fire,5.0
4,1 Family Meeple,A Feast for Odin,10.0


In [4]:
df.shape

(100000, 3)

In [5]:
# Binarize the data (only keep ratings >= 7)
df_preferred = df[df['rating'] >= 7]
print (df_preferred.shape)
df_low_rating = df[df['rating'] < 7]

df_preferred.head(10)

(58538, 3)


Unnamed: 0,userID,itemID,rating
1,1 Family Meeple,12 Days,7.0
4,1 Family Meeple,A Feast for Odin,10.0
5,1 Family Meeple,Above and Below,9.0
6,1 Family Meeple,Abyss,8.0
7,1 Family Meeple,Africana,7.0
11,1 Family Meeple,Agricola,7.0
12,1 Family Meeple,Agricola (Revised Edition),8.0
13,1 Family Meeple,Akrotiri,8.0
14,1 Family Meeple,Alien Frontiers,8.0
15,1 Family Meeple,Altiplano,8.5


In [6]:
# # Keep users who clicked on at least 5 movies
# df = min_rating_filter_pandas(df_preferred, min_rating=5, filter_by="user")

# # Keep movies that were clicked on by at least on 1 user
# df = min_rating_filter_pandas(df, min_rating=1, filter_by="item")

In [7]:
# Obtain both usercount and itemcount after filtering
usercount = df[['userID']].groupby('userID', as_index = False).size()
itemcount = df[['itemID']].groupby('itemID', as_index = False).size()

# Compute sparsity after filtering
sparsity = 1. * df.shape[0] / (usercount.shape[0] * itemcount.shape[0])

print("After filtering, there are %d watching events from %d users and %d movies (sparsity: %.3f%%)" % 
      (df.shape[0], usercount.shape[0], itemcount.shape[0], sparsity * 100))

After filtering, there are 100000 watching events from 154 users and 4506 movies (sparsity: 14.411%)


In [8]:
unique_users = sorted(df.userID.unique())
np.random.seed(SEED)
unique_users = np.random.permutation(unique_users)

In [9]:
# Create train/validation/test users
n_users = len(unique_users)
print("Number of unique users:", n_users)

train_users = unique_users[:(n_users - HELDOUT_USERS * 2)]
print("\nNumber of training users:", len(train_users))

val_users = unique_users[(n_users - HELDOUT_USERS * 2) : (n_users - HELDOUT_USERS)]
print("\nNumber of validation users:", len(val_users))

test_users = unique_users[(n_users - HELDOUT_USERS):]
print("\nNumber of test users:", len(test_users))

Number of unique users: 154

Number of training users: 134

Number of validation users: 10

Number of test users: 10


In [10]:
# For training set keep only users that are in train_users list
train_set = df.loc[df['userID'].isin(train_users)]
print("Number of training observations: ", train_set.shape[0])

# For validation set keep only users that are in val_users list
val_set = df.loc[df['userID'].isin(val_users)]
print("\nNumber of validation observations: ", val_set.shape[0])

# For test set keep only users that are in test_users list
test_set = df.loc[df['userID'].isin(test_users)]
print("\nNumber of test observations: ", test_set.shape[0])

# train_set/val_set/test_set contain user - movie interactions with rating 4 or 5

Number of training observations:  87541

Number of validation observations:  6555

Number of test observations:  5904


In [11]:
# Obtain list of unique movies used in training set
unique_train_items = pd.unique(train_set['itemID'])
print("Number of unique movies that rated in training set", unique_train_items.size)

Number of unique movies that rated in training set 4485


In [12]:
# For validation set keep only movies that used in training set
val_set = val_set.loc[val_set['itemID'].isin(unique_train_items)]
print("Number of validation observations after filtering: ", val_set.shape[0])

# For test set keep only movies that used in training set
test_set = test_set.loc[test_set['itemID'].isin(unique_train_items)]
print("\nNumber of test observations after filtering: ", test_set.shape[0])

Number of validation observations after filtering:  6542

Number of test observations after filtering:  5893


In [13]:
# Instantiate the sparse matrix generation for train, validation and test sets
# use list of unique items from training set for all sets
am_train = AffinityMatrix(df=train_set, items_list=unique_train_items)

am_val = AffinityMatrix(df=val_set, items_list=unique_train_items)

am_test = AffinityMatrix(df=test_set, items_list=unique_train_items)

In [14]:
# Obtain the sparse matrix for train, validation and test sets
train_data, _, _ = am_train.gen_affinity_matrix()
print(train_data.shape)

val_data, val_map_users, val_map_items = am_val.gen_affinity_matrix()
print(val_data.shape)

test_data, test_map_users, test_map_items = am_test.gen_affinity_matrix()
print(test_data.shape)

(134, 4485)
(10, 4485)
(10, 4485)


In [15]:
# Split validation and test data into training and testing parts
val_data_tr, val_data_te = numpy_stratified_split(val_data, ratio=0.75, seed=SEED)
test_data_tr, test_data_te = numpy_stratified_split(test_data, ratio=0.75, seed=SEED)

# Binarize train, validation and test data
train_data = binarize(a=train_data, threshold=3.5)
val_data = binarize(a=val_data, threshold=3.5)
test_data = binarize(a=test_data, threshold=3.5)

# Binarize validation data: training part  
val_data_tr = binarize(a=val_data_tr, threshold=3.5)

# Binarize validation data: testing part (save non-binary version in the separate object, will be used for calculating NDCG)
val_data_te_ratings = val_data_te.copy()
val_data_te = binarize(a=val_data_te, threshold=3.5)

# Binarize test data: training part 
test_data_tr = binarize(a=test_data_tr, threshold=3.5)

# Binarize test data: testing part (save non-binary version in the separate object, will be used for calculating NDCG)
test_data_te_ratings = test_data_te.copy()
test_data_te = binarize(a=test_data_te, threshold=3.5)

In [16]:
# retrieve real ratings from initial dataset 

test_data_te_ratings=pd.DataFrame(test_data_te_ratings)
val_data_te_ratings=pd.DataFrame(val_data_te_ratings)

total_iterations = len(df_low_rating)

for index,i in tqdm(df_low_rating.iterrows(), total=total_iterations, desc="Processing Rows"):
  user_old= i['userID'] # old value 
  item_old=i['itemID'] # old value 

  if (test_map_users.get(user_old) is not None)  and (test_map_items.get(item_old) is not None) :
      user_new=test_map_users.get(user_old) # new value 
      item_new=test_map_items.get(item_old) # new value 
      rating=i['rating'] 
      test_data_te_ratings.at[user_new,item_new]= rating   

  if (val_map_users.get(user_old) is not None)  and (val_map_items.get(item_old) is not None) :
      user_new=val_map_users.get(user_old) # new value 
      item_new=val_map_items.get(item_old) # new value 
      rating=i['rating'] 
      val_data_te_ratings.at[user_new,item_new]= rating   


val_data_te_ratings=val_data_te_ratings.to_numpy()    
test_data_te_ratings=test_data_te_ratings.to_numpy()    
# test_data_te_ratings  

Processing Rows:   3%|▎         | 1238/41462 [00:00<00:07, 5588.48it/s]

Processing Rows: 100%|██████████| 41462/41462 [00:03<00:00, 10617.87it/s]


In [17]:
model_without_anneal = StandardVAE(n_users=train_data.shape[0], # Number of unique users in the training set
                                   original_dim=train_data.shape[1], # Number of unique items in the training set
                                   intermediate_dim=INTERMEDIATE_DIM, 
                                   latent_dim=LATENT_DIM, 
                                   n_epochs=EPOCHS, 
                                   batch_size=BATCH_SIZE, 
                                   k=TOP_K,
                                   verbose=0,
                                   seed=SEED,
                                   save_path=WEIGHTS_PATH,
                                   drop_encoder=0.5,
                                   drop_decoder=0.5,
                                   annealing=False,
                                   beta=1.0
                                   )

In [18]:
am_train = AffinityMatrix(df=train_set, items_list=unique_train_items)

am_val = AffinityMatrix(df=val_set, items_list=unique_train_items)

am_test = AffinityMatrix(df=test_set, items_list=unique_train_items)

In [19]:
# from tensorflow.python.framework.ops import disable_eager_execution
# disable_eager_execution()

In [20]:
with Timer() as t:
    model_without_anneal.fit(x_train=train_data, 
                             x_valid=val_data, 
                             x_val_tr=val_data_tr, 
                             x_val_te=val_data_te_ratings, # with the original ratings 
                             mapper=am_val,
                             )
print("Took {} seconds for training.".format(t))

TypeError: in user code:

    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 1051, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 1040, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 1030, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 890, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\training.py", line 948, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\compile_utils.py", line 239, in __call__
        self._loss_metric.update_state(
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\utils\metrics_utils.py", line 70, in decorated
        update_op = update_state_fn(*args, **kwargs)
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\metrics\base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\metrics\base_metric.py", line 449, in update_state  **
        sample_weight = tf.__internal__.ops.broadcast_weights(
    File "C:\Users\Cop\AppData\Roaming\Python\Python38\site-packages\keras\engine\keras_tensor.py", line 254, in __array__
        raise TypeError(

    TypeError: You are passing KerasTensor(type_spec=TensorSpec(shape=(), dtype=tf.float32, name=None), name='Placeholder:0', description="created by layer 'tf.cast_2'"), an intermediate Keras symbolic input/output, to a TF API that does not allow registering custom dispatchers, such as `tf.cond`, `tf.function`, gradient tapes, or `tf.map_fn`. Keras Functional model construction only supports TF API calls that *do* support dispatching, such as `tf.math.add` or `tf.reshape`. Other APIs cannot be called directly on symbolic Kerasinputs/outputs. You can work around this limitation by putting the operation in a custom Keras layer `call` and calling that layer on this symbolic input/output.
