In [1]:
# run_experiment("configuration/file/path")

In [2]:
# import tensorflow as tf
# from tensorflow.python.client import device_lib

# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
# device_lib.list_local_devices()

# Import stuff

In [3]:
import os
import sys
sys.path.append(os.path.abspath('') + '/../..')

In [4]:
from data import ImplicitData, getBucketsHoldouts
from plot_utils import lineplot_recallxholdout, recall_heatmap
from dataset_evaluation_utils import *
from recommenders_implicit import ISGD, RAISGD, RSISGD  # ISGD framework, BISGD,
from eval_implicit import EvaluateHoldouts, EvaluateAndStore, EvalPrequential # EvaluateAndStore para guardar estados do modelo e holdouts, a avaliação prequencial de ratings implicitos é opcional, , EvalHoldout

from datetime import datetime
import joblib
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

___
# BWT FWT

ACC, BWT, e FWT - Lopez-Paz e Ranzato GEM

In [5]:
def avg_recall(results_matrix): # Lopez-Paz e Ranzato GEM 2017
    return np.mean( np.diag(results_matrix) )

def compute_BWT(results_matrix): # Lopez-Paz e Ranzato GEM 2017
    BWT = []
    n_checkpoints = results_matrix.shape[0]
    for T in range(1, n_checkpoints): # 1 means holdout 2, 2 means 3, so on
        Rti = results_matrix.iloc[T, 0:T] # get models performances' on previous holdouts
        Rii = np.diag(results_matrix)[0:T] # get models performances' on their closest holdouts (diagonal)
        E = sum( Rti - Rii ) # future models performances' - performances' of models closest to holdouts (diagonal)
        BWT.append( E/T ) # store average BWT for model
    return BWT, np.mean( BWT ) # return BWT and average BWT for all models

def compute_FWT(results_matrix): # Díaz-Rodriguez et al. 2018
    upper_tri = results_matrix.to_numpy()[np.triu_indices(results_matrix.shape[0], k=1)]
    return np.mean(upper_tri)

___
# Read Data

In [6]:
# importa dataset 'movieles'
data = pd.read_csv('../output/movielens_dump/sampled_movielens.csv')
user_col = 'UserID'
item_col = 'ItemID'

In [7]:
data.shape, data[user_col].nunique(), data[item_col].nunique()

((50742, 7), 1427, 2492)

In [8]:
data[[user_col, item_col]].duplicated().sum()

0

In [9]:
data.head()

Unnamed: 0,UserID,ItemID,Timestamp,date2,year,month,date
0,4448,902,965087178,2000-07-31 23:46:18,2000,7,2000-07-01 00:00:00
1,4448,3793,965087267,2000-07-31 23:47:47,2000,7,2000-07-01 00:00:00
2,4448,3751,965087267,2000-07-31 23:47:47,2000,7,2000-07-01 00:00:00
3,4448,3578,965087349,2000-07-31 23:49:09,2000,7,2000-07-01 00:00:00
4,4448,3481,965087470,2000-07-31 23:51:10,2000,7,2000-07-01 00:00:00


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50742 entries, 0 to 50741
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   UserID     50742 non-null  int64 
 1   ItemID     50742 non-null  int64 
 2   Timestamp  50742 non-null  int64 
 3   date2      50742 non-null  object
 4   year       50742 non-null  int64 
 5   month      50742 non-null  int64 
 6   date       50742 non-null  object
dtypes: int64(5), object(2)
memory usage: 2.7+ MB


### Convert timestamp

In [11]:
%%time 
# 2.42s
data['date'] = data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %X'))
# data.sort_values(by='timestamp', inplace=True)

CPU times: user 405 ms, sys: 927 µs, total: 406 ms
Wall time: 405 ms


# EXPERIMENT 7

CODE TO GET LAST N INTERACTIONS FROM EACH USER AS HOLDOUT  
* IF USER DID NOT INTERACT WITH AT LEAST N+1 ITEMS, THEN IT IS NOT USED FOR HOLDOUT  
* LAST 10 INTERACTIONS FROM EACH USER AS HOLDOUT
* RECOMENDING SEEN ITEMS IS ALLOWED
* DON'T REMOVE INTERACTIONS IN HOLDOUT FROM BUCKETS
* DON'T REMOVE INTERACTIONS **SENT** TO HOLDOUT FROM BUCKET

In [12]:
# CODE TO GET LAST N INTERACTIONS FROM EACH USER AS HOLDOUT
# IF USER DID NOT INTERACT WITH AT LEAST N+1 ITEMS, THEN IT IS NOT USED FOR HOLDOUT

N = 10
cold_start_buckets = 0
#     print('0',data.shape[0]) # debug
print('Creating buckets. . .')
buckets = []
# assert interval_type in ['W', 'M', 'QS', 'F'], "interval must be one of W, M, QS, or F"
# create buckets based on months
months = data['date'].unique()
for interval in months:
    idx = (data['date'] == interval)
    buckets.append( data[idx] )

Creating buckets. . .


In [13]:
print('Creating holdouts. . .')
# create holdouts with last user interaction
holdouts = []

for i, b in enumerate( buckets ):
    if i >= cold_start_buckets:
        condition = (b[user_col].value_counts() > N)
        frequent_users = b[user_col].value_counts()[ condition ].index
        holdout_idx = []
        for u in frequent_users:
            tail_idx = list( b[b[user_col] == u].tail(N).index )
            holdout_idx += tail_idx
        holdout = b.loc[holdout_idx].reset_index(drop=True)
        holdouts.append(holdout)
        # buckets[i] = b.drop(index=holdout_idx).reset_index(drop=True)
        buckets[i] = b.reset_index(drop=True)

Creating holdouts. . .


Store buckets and holdouts

In [14]:
buckets[0].iloc[:1000].to_csv('movielens_bucket_0.csv', columns=[user_col, item_col], header=False, index=False)
holdouts[0].iloc[:100].to_csv('movielens_holdout_0.csv', columns=[user_col, item_col], header=False, index=False)

In [16]:
for b in buckets:
    print(b.shape)    

(20977, 7)
(13148, 7)
(16617, 7)


In [17]:
for h in holdouts:
    print(h.shape)

(5040, 7)
(3240, 7)
(3710, 7)


Run experiment

In [18]:
from elliot.run import run_experiment


__/\\\\\\\\\\\\\\\___/\\\\\\______/\\\\\\_________________________________________        
 _\/\\\///////////___\////\\\_____\////\\\_________________________________________       
  _\/\\\_________________\/\\\________\/\\\______/\\\_____________________/\\\______      
   _\/\\\\\\\\\\\_________\/\\\________\/\\\_____\///_______/\\\\\______/\\\\\\\\\\\_     
    _\/\\\///////__________\/\\\________\/\\\______/\\\____/\\\///\\\___\////\\\////__    
     _\/\\\_________________\/\\\________\/\\\_____\/\\\___/\\\__\//\\\_____\/\\\______   
      _\/\\\_________________\/\\\________\/\\\_____\/\\\__\//\\\__/\\\______\/\\\_/\\__  
       _\/\\\\\\\\\\\\\\\___/\\\\\\\\\___/\\\\\\\\\__\/\\\___\///\\\\\/_______\//\\\\\___ 
        _\///////////////___\/////////___\/////////___\///______\/////__________\/////____
Version Number: 0.3.1


In [19]:
import tensorflow as tf
tf.autograph.set_verbosity(5)

In [None]:
holdouts[1].iloc[:100].to_csv('movielens_holdout_1.csv', columns=[user_col, item_col], header=False, index=False)

In [20]:
run_experiment('elliot_example_configuration.yml')

2023-03-03 17:58:43.614575: I Start experiment
2023-03-03 17:58:43.629356: I /home/kpfra/streamRec-forgetting/notebooks/elliot example/movielens_bucket_0.csv - Loaded
2023-03-03 17:58:43.635234: I Test Fold 0
2023-03-03 17:58:44.735569: I Statistics	Users:	1000	Items:	1000	Transactions:	1000	Sparsity:	0.999
2023-03-03 17:58:45.325980: I Tuning begun for MultiVAE\n
2023-03-03 17:58:45.325837: I Loading parameters
2023-03-03 17:58:45.327138: I Parameter intermediate_dim set to 331
2023-03-03 17:58:45.328261: I Parameter latent_dim set to 493
2023-03-03 17:58:45.328648: I Hyperparameter tuning exploration:
2023-03-03 17:58:45.329624: I batch_size set to 470
2023-03-03 17:58:45.329500: I Parameter reg_lambda set to 0.021991645792416692
2023-03-03 17:58:45.330603: I dropout_pkeep set to 0.29826163136513073
2023-03-03 17:58:45.331581: I epochs set to 8
2023-03-03 17:58:45.331436: I Parameter lr set to 0.0014786280507223606
2023-03-03 17:58:45.332715: I intermediate_dim set to 331
2023-03-03 

In [21]:
# from elliot.recommender import MultiVAE
# _ = MultiVAE()

In [23]:
buckets[1].iloc[:1000].to_csv('movielens_bucket_1.csv', columns=[user_col, item_col], header=False, index=False)
holdouts[1].iloc[:100].to_csv('movielens_holdout_1.csv', columns=[user_col, item_col], header=False, index=False)

In [24]:
run_experiment('elliot_example_configuration Load Test.yml')

2023-03-03 18:06:23.277707: I Start experiment


AttributeError: 'types.SimpleNamespace' object has no attribute 'train_path'