In [1]:
# run_experiment("configuration/file/path")

In [20]:
import tensorflow as tf
from tensorflow.python.client import device_lib

# Set the AutoGraph verbosity level to 5 for this session.
tf.autograph.set_verbosity(5)

# Report how many GPUs TensorFlow detects, then display every local device
# (the last expression is rendered as the cell output).
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))
device_lib.list_local_devices()

Num GPUs Available:  1


[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15398128136902885495
 xla_global_id: -1,
 name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 10709499904
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 15177779476710547821
 physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060, pci bus id: 0000:01:00.0, compute capability: 8.6"
 xla_global_id: 416903419]

# Imports

In [21]:
import os
import sys

# Add the directory three levels above the current working directory to the
# import search path.  The membership check keeps the cell idempotent:
# re-running it does not append the same entry again.
project_root = os.path.abspath('') + '/../../..'
if project_root not in sys.path:
    sys.path.append(project_root)

In [22]:
from data import ImplicitData, getBucketsHoldouts
from plot_utils import lineplot_recallxholdout, recall_heatmap
# NOTE(review): wildcard import - the names it brings in cannot be seen from this notebook.
from dataset_evaluation_utils import *
from recommenders_implicit import ISGD, RAISGD, RSISGD  # ISGD framework; BISGD is left out of the import
from eval_implicit import EvaluateHoldouts, EvaluateAndStore, EvalPrequential # EvaluateAndStore stores model states and holdouts; prequential evaluation of implicit ratings is optional; EvalHoldout is left out of the import

from datetime import datetime
import joblib
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's 'whitegrid' style for all subsequent plots.
sns.set_style('whitegrid')

___
# BWT FWT

ACC and BWT follow Lopez-Paz and Ranzato (GEM, 2017); FWT follows Díaz-Rodríguez et al. (2018).

In [5]:
def avg_recall(results_matrix): # Lopez-Paz and Ranzato, GEM 2017
    """Return the mean of the main diagonal of ``results_matrix``.

    Parameters
    ----------
    results_matrix : 2-D array-like (e.g. pandas.DataFrame or numpy.ndarray)
        Matrix of evaluation results.  Presumably entry ``[i, i]`` is the
        score of the i-th model on its own holdout - confirm with the caller.

    Returns
    -------
    numpy floating scalar
        Average of the diagonal entries.
    """
    diagonal = np.diag(results_matrix)
    return diagonal.mean()

def compute_BWT(results_matrix): # Lopez-Paz and Ranzato, GEM 2017
    """Compute backward transfer (BWT) for every checkpoint after the first.

    For checkpoint ``T`` (row ``T``) the value is the average, over the ``T``
    earlier columns ``i < T``, of ``R[T, i] - R[i, i]``: the result of the
    later model on an earlier holdout minus the diagonal result for that
    holdout.

    Parameters
    ----------
    results_matrix : pandas.DataFrame or 2-D array-like
        Square matrix of results, one row per model checkpoint and one column
        per holdout.  Plain numpy arrays are accepted as well as DataFrames.

    Returns
    -------
    tuple (list, float)
        The per-checkpoint BWT values (``n_checkpoints - 1`` entries) and
        their mean.  With fewer than two checkpoints there is nothing to
        compare, so the result is ``([], nan)`` and no warning is raised.
    """
    scores = np.asarray(results_matrix)
    diagonal = np.diag(scores)  # loop-invariant, so computed once
    BWT = []
    for T in range(1, scores.shape[0]): # 1 means holdout 2, 2 means 3, and so on
        # later model on the previous holdouts minus the diagonal results
        gaps = scores[T, :T] - diagonal[:T]
        # builtin sum keeps the original left-to-right accumulation order
        BWT.append( sum(gaps) / T )
    if not BWT:
        # avoid numpy's "Mean of empty slice" RuntimeWarning
        return BWT, float('nan')
    return BWT, np.mean( BWT )

def compute_FWT(results_matrix): # Díaz-Rodriguez et al. 2018
    """Return the mean of the entries strictly above the main diagonal.

    Parameters
    ----------
    results_matrix : pandas.DataFrame
        Square matrix of results; converted with ``to_numpy()``.

    Returns
    -------
    numpy floating scalar
        Average of all entries ``[i, j]`` with ``i < j``.
    """
    n_checkpoints = results_matrix.shape[0]
    row_pos, col_pos = np.triu_indices(n_checkpoints, k=1)  # k=1 skips the diagonal
    scores = results_matrix.to_numpy()
    return scores[row_pos, col_pos].mean()

___
# Read Data

In [6]:
# Column names used by the following cells to address users and items.
user_col, item_col = 'UserID', 'ItemID'

# Load the sampled MovieLens dataset.
data = pd.read_csv('../../output/movielens_dump/sampled_movielens.csv')

In [7]:
# Overview: frame shape, number of distinct users, number of distinct items.
data.shape, data[user_col].nunique(), data[item_col].nunique()

((50742, 7), 1427, 2492)

In [8]:
# Number of rows whose (user, item) pair already appeared in an earlier row.
data[[user_col, item_col]].duplicated().sum()

0

In [9]:
# Preview the first rows of the dataset.
data.head()

Unnamed: 0,UserID,ItemID,Timestamp,date2,year,month,date
0,4448,902,965087178,2000-07-31 23:46:18,2000,7,2000-07-01 00:00:00
1,4448,3793,965087267,2000-07-31 23:47:47,2000,7,2000-07-01 00:00:00
2,4448,3751,965087267,2000-07-31 23:47:47,2000,7,2000-07-01 00:00:00
3,4448,3578,965087349,2000-07-31 23:49:09,2000,7,2000-07-01 00:00:00
4,4448,3481,965087470,2000-07-31 23:51:10,2000,7,2000-07-01 00:00:00


In [10]:
# Column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50742 entries, 0 to 50741
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   UserID     50742 non-null  int64 
 1   ItemID     50742 non-null  int64 
 2   Timestamp  50742 non-null  int64 
 3   date2      50742 non-null  object
 4   year       50742 non-null  int64 
 5   month      50742 non-null  int64 
 6   date       50742 non-null  object
dtypes: int64(5), object(2)
memory usage: 2.7+ MB


### Convert timestamp

In [11]:
%%time 
# 2.42s
data['date'] = data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %X'))
# data.sort_values(by='timestamp', inplace=True)

CPU times: user 304 ms, sys: 0 ns, total: 304 ms
Wall time: 303 ms


# EXPERIMENT 7

CODE TO GET LAST N INTERACTIONS FROM EACH USER AS HOLDOUT  
* IF USER DID NOT INTERACT WITH AT LEAST N+1 ITEMS, THEN IT IS NOT USED FOR HOLDOUT  
* LAST 10 INTERACTIONS FROM EACH USER AS HOLDOUT
* RECOMMENDING SEEN ITEMS IS ALLOWED
* DON'T REMOVE INTERACTIONS IN HOLDOUT FROM BUCKETS
* DON'T REMOVE INTERACTIONS **SENT** TO HOLDOUT FROM BUCKET

In [12]:
# Settings for the holdout construction, followed by the bucket split.
# N: number of trailing interactions per user taken as holdout; a user needs
#    more than N interactions in a bucket to be used.
# cold_start_buckets: index of the first bucket for which a holdout is built.

N = 10
cold_start_buckets = 0
print('Creating buckets. . .')

# Distinct values of the 'date' column (one per month), sorted in place.
months = data['date'].unique()
months.sort()

# One bucket per month; each bucket keeps the original row labels and order.
buckets = [data[data['date'] == interval] for interval in months]

Creating buckets. . .


In [13]:
print('Creating holdouts. . .')
# For every bucket, the holdout gathers the last N rows of each user that has
# more than N rows in that bucket.
# NOTE(review): "last" means last in row order - this assumes each bucket is
# already in chronological order; confirm.
holdouts = []

for position, bucket in enumerate(buckets):
    if position < cold_start_buckets:
        continue  # buckets before the cold-start threshold get no holdout

    interactions_per_user = bucket[user_col].value_counts()
    frequent_users = interactions_per_user[interactions_per_user > N].index

    # Row labels of the last N interactions of every frequent user,
    # grouped user by user in value_counts order.
    holdout_idx = [
        row_label
        for user in frequent_users
        for row_label in bucket[bucket[user_col] == user].tail(N).index
    ]

    holdouts.append(bucket.loc[holdout_idx].reset_index(drop=True))
    # Holdout rows are kept in the bucket; only the index is reset.
    buckets[position] = bucket.reset_index(drop=True)

Creating holdouts. . .


Store buckets and holdouts

In [14]:
# Export headerless "user,item" CSV samples: the first 1000 rows of bucket 0
# and the first 100 rows of holdout 0 (drop the .iloc slices to export the
# full frames).
# NOTE(review): the Elliot log further down reports "Users: 1000  Items: 1000
# Transactions: 1000" for the 1000-row bucket file, which looks inconsistent
# with a correctly parsed user/item table - confirm the delimiter and columns
# Elliot expects.
export_columns = [user_col, item_col]
buckets[0].iloc[:1000].to_csv('movielens_bucket_0.csv', columns=export_columns, header=False, index=False)
holdouts[0].iloc[:100].to_csv('movielens_holdout_0.csv', columns=export_columns, header=False, index=False)

In [15]:
# Shape (rows, columns) of every bucket.
for bucket_df in buckets:
    print(bucket_df.shape)

(16617, 7)
(13148, 7)
(20977, 7)


In [16]:
# Shape (rows, columns) of every holdout.
for holdout_df in holdouts:
    print(holdout_df.shape)

(3710, 7)
(3240, 7)
(5040, 7)


In [17]:
# Export holdout 1 in full as a headerless "user,item" CSV.
holdouts[1].to_csv(
    'movielens_holdout_1.csv',
    columns=[user_col, item_col],
    header=False,
    index=False,
)

Run experiment

# Running experiment with base configuration - there is optimization and training

In [18]:
from elliot.run import run_experiment


__/\\\\\\\\\\\\\\\___/\\\\\\______/\\\\\\_________________________________________        
 _\/\\\///////////___\////\\\_____\////\\\_________________________________________       
  _\/\\\_________________\/\\\________\/\\\______/\\\_____________________/\\\______      
   _\/\\\\\\\\\\\_________\/\\\________\/\\\_____\///_______/\\\\\______/\\\\\\\\\\\_     
    _\/\\\///////__________\/\\\________\/\\\______/\\\____/\\\///\\\___\////\\\////__    
     _\/\\\_________________\/\\\________\/\\\_____\/\\\___/\\\__\//\\\_____\/\\\______   
      _\/\\\_________________\/\\\________\/\\\_____\/\\\__\//\\\__/\\\______\/\\\_/\\__  
       _\/\\\\\\\\\\\\\\\___/\\\\\\\\\___/\\\\\\\\\__\/\\\___\///\\\\\/_______\//\\\\\___ 
        _\///////////////___\/////////___\/////////___\///______\/////__________\/////____
Version Number: 0.3.1


In [19]:
# Base configuration run: hyperparameter tuning and training are performed.
# NOTE(review): the recorded output ends with
# "TypeError: Object of type int64 is not JSON serializable" - cause still to be investigated.
run_experiment('elliot_example_configuration.yml')

2023-03-14 18:23:37.104512: I Start experiment
2023-03-14 18:23:37.169382: I /home/jupyter-kpereira/streamRec-forgetting/notebooks/elliot_experiments/elliot_example/movielens_bucket_0.csv - Loaded
2023-03-14 18:23:37.173600: I Test Fold 0
2023-03-14 18:23:37.805460: I Statistics	Users:	1000	Items:	1000	Transactions:	1000	Sparsity:	0.999
2023-03-14 18:23:38.626188: I Loading parameters
2023-03-14 18:23:38.626314: I Tuning begun for MultiVAE\n
2023-03-14 18:23:38.626801: I Hyperparameter tuning exploration:
2023-03-14 18:23:38.626742: I Parameter intermediate_dim set to 155
2023-03-14 18:23:38.627219: I Parameter latent_dim set to 101
2023-03-14 18:23:38.627348: I batch_size set to 17
2023-03-14 18:23:38.627682: I Parameter reg_lambda set to 0.021991645792416692
2023-03-14 18:23:38.627819: I dropout_pkeep set to 0.25040322158625117
2023-03-14 18:23:38.628139: I Parameter lr set to 0.30987825830054144
2023-03-14 18:23:38.628282: I epochs set to 10
2023-03-14 18:23:38.628678: I intermediat

TypeError: Object of type int64 is not JSON serializable

Running experiment with RESTORE - model weights are read and there is no training

In [22]:
# Restore run: stored model weights are loaded and no training takes place.
run_experiment('elliot_example_configuration Load Test.yml')

2023-03-06 17:45:28.229007: I Start experiment
2023-03-06 17:45:28.238097: I /home/kpfra/streamRec-forgetting/notebooks/elliot_experiments/elliot_example/movielens_bucket_1.csv - Loaded
2023-03-06 17:45:28.244117: I Test Fold 0
2023-03-06 17:45:29.429240: I Statistics	Users:	1000	Items:	1000	Transactions:	1000	Sparsity:	0.999
2023-03-06 17:45:29.949316: I Training begun for MultiVAE\n
!!!!!!!!!!!!!!!!!!
ENTERS RESTORE IF
!!!!!!!!!!!!!!!!!!
Model correctly Restored
2023-03-06 17:45:29.962517: I Hyperparameters:
2023-03-06 17:45:29.965603: I meta set to namespace(restore=True, save_recs=False, save_weights=True, validation_metric='Recall@20', verbose=False)
2023-03-06 17:45:29.969028: I epochs set to 8
2023-03-06 17:45:29.971113: I batch_size set to 470
2023-03-06 17:45:29.973375: I intermediate_dim set to 331
2023-03-06 17:45:29.977459: I latent_dim set to 493
2023-03-06 17:45:29.979519: I mf_factors set to 11
2023-03-06 17:45:29.981817: I reg_lambda set to 0.021991645792416692
2023-03-

Running experiment with RESTORE, but with ranges defined for the optimization
* This does not work, because the framework tries to find a folder from a previous model that contains in its name the same hyperparameters it is running with in an iteration.
* This means datasets must be stored in versions that contain an increasing number of buckets and holdouts:  
    * b0.csv - h0.csv
    * b0_b1.csv - h1.csv
    * b0_b1_b2.csv - h2.csv

In [25]:
# Restore run with hyperparameter ranges defined. The recorded output shows the
# restore failing because no weights folder matches the sampled hyperparameters.
run_experiment('elliot_example_configuration 2.yml')

2023-03-06 17:49:27.277543: I Start experiment
2023-03-06 17:49:27.285791: I /home/kpfra/streamRec-forgetting/notebooks/elliot_experiments/elliot_example/movielens_bucket_1.csv - Loaded
2023-03-06 17:49:27.293861: I Test Fold 0
2023-03-06 17:49:28.459648: I Statistics	Users:	1000	Items:	1000	Transactions:	1000	Sparsity:	0.999
2023-03-06 17:49:28.956628: I Tuning begun for MultiVAE\n
!!!!!!!!!!!!!!!!!!
ENTERS RESTORE IF
!!!!!!!!!!!!!!!!!!
2023-03-06 17:49:28.981404: I Hyperparameter tuning exploration:
2023-03-06 17:49:28.987666: I batch_size set to 928
2023-03-06 17:49:29.036221: I dropout_pkeep set to 0.6342344071271315
2023-03-06 17:49:29.078902: I epochs set to 3
2023-03-06 17:49:29.085776: I intermediate_dim set to 141
2023-03-06 17:49:29.097875: I latent_dim set to 304
2023-03-06 17:49:29.100747: I lr set to 0.09139786666274591
2023-03-06 17:49:29.108933: I mf_factors set to 11
2023-03-06 17:49:29.143828: I reg_lambda set to 0.4291070790802781


Exception: Error in model restoring operation! Unsuccessful TensorSliceReader constructor: Failed to find any matching files for /home/kpfra/streamRec-forgetting/notebooks/elliot_experiments/elliot_example/results/weights/MultiVAE_seed=42_e=3_bs=928_intermediate_dim=141_latent_dim=304_reg_lambda=0$4291070790802781_lr=0$09139786666274591_dropout_pkeep=0$3657655928728685/best-weights-MultiVAE_seed=42_e=3_bs=928_intermediate_dim=141_latent_dim=304_reg_lambda=0$4291070790802781_lr=0$09139786666274591_dropout_pkeep=0$3657655928728685

2023-03-06 17:49:29.145544: I Exploration: Hyperparameter exploration number 1
2023-03-06 17:49:29.147012: I Exploration: Test Fold exploration number 1
2023-03-06 17:49:29.148684: I Exploration: Train-Validation Fold exploration number 1
