## Holdout evaluation - Amazon Books

Interactions in the 'Amazon Books' dataset are well distributed.
Several users are present during the whole considered period (2014):
<!-- * 37067 users of 190248 (19.484%) occur in 80.0% or more months. -->
* 500 users of 500 (100.0%) occur in 80.0% or more months.

In [1]:
import os
import sys
# Append the parent of the current working directory to the import path
# (presumably where the project-local modules live — confirm).
sys.path.append(os.path.abspath('') + '/..')

In [2]:
from data import ImplicitData, getBucketsHoldouts
from plot_utils import lineplot_recallxholdout, recall_heatmap
from dataset_evaluation_utils import *
from recommenders_implicit import ISGD, RAISGD, RSISGD  # ISGD framework, BISGD,
from eval_implicit import EvaluateHoldouts, EvaluateAndStore # EvaluateAndStore keeps model states and holdouts; the prequential evaluation of implicit ratings is optional. (EvalHoldout is not imported here.)

from datetime import datetime
import joblib
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')

___
## BWT FWT

ACC and BWT follow Lopez-Paz and Ranzato (GEM, 2017); FWT follows Díaz-Rodríguez et al. (2018).

In [3]:
def avg_recall(results_matrix): # Lopez-Paz and Ranzato, GEM 2017
    """Return the mean of the main diagonal of ``results_matrix``.

    The diagonal is extracted with ``np.diag``, so any 2-D array-like
    (NumPy array or DataFrame) is accepted.
    """
    diagonal_scores = np.diag(results_matrix)
    return diagonal_scores.mean()

def compute_BWT(results_matrix): # Lopez-Paz and Ranzato, GEM 2017
    """Compute backward transfer (BWT) per checkpoint and its overall mean.

    Parameters
    ----------
    results_matrix : 2-D array-like (pandas DataFrame or NumPy array)
        Entry ``[T, i]`` is read as the score of model/checkpoint ``T`` on
        holdout ``i``; the diagonal holds each model's score on its own
        (closest) holdout.

    Returns
    -------
    (list, float)
        ``BWT[T-1]`` is the average of ``R[T, i] - R[i, i]`` over the
        ``T`` earlier holdouts ``i < T``, for ``T = 1 .. n-1``; the second
        element is the mean of that list. With fewer than two checkpoints
        the list is empty and the mean is ``nan``.
    """
    # Work on a plain array so DataFrames and ndarrays are both accepted.
    scores = np.asarray(results_matrix)
    # The diagonal does not depend on T, so extract it once.
    own_scores = np.diag(scores)
    n_checkpoints = scores.shape[0]

    BWT = []
    for T in range(1, n_checkpoints): # 1 means holdout 2, 2 means 3, and so on
        later_on_past = scores[T, :T] # model T evaluated on the previous holdouts
        # Built-in sum keeps the same left-to-right accumulation as before.
        drift = sum(later_on_past - own_scores[:T])
        BWT.append(drift / T) # average BWT for model T

    # Avoid np.mean([]) (RuntimeWarning) when there is nothing to average.
    mean_BWT = np.mean(BWT) if BWT else float('nan')
    return BWT, mean_BWT

def compute_FWT(results_matrix): # Díaz-Rodriguez et al. 2018
    """Return the mean of the strictly upper-triangular entries.

    ``results_matrix`` must expose ``.shape`` and ``.to_numpy()`` (e.g. a
    pandas DataFrame); entries above the main diagonal (``k=1``) are averaged.
    """
    values = results_matrix.to_numpy()
    size = results_matrix.shape[0]
    row_idx, col_idx = np.triu_indices(size, k=1)
    return values[row_idx, col_idx].mean()

___
# Amazon Books
Small subset, ratings only.  
https://nijianmo.github.io/amazon/index.html


In [4]:
# Load the sampled Amazon Books interactions (the original note called it the 'books playlists' dataset).
# NOTE(review): in the head() output, `user_id` values look like numeric book codes and
# `item_id` values look like Amazon reviewer IDs — confirm the two columns are not swapped.
data = pd.read_csv('output/amazonbooks_dump/2nd_sampled_amazon_books.csv')
dataset_name = 'Amazon_Books'
# Column names passed to getBucketsHoldouts as user_col / item_col.
user_col = 'user_id'
item_col = 'item_id'

In [5]:
# (rows, columns) of the loaded interactions table.
data.shape

(564099, 4)

In [6]:
# Preview the first rows of the raw interactions.
data.head()

Unnamed: 0,user_id,item_id,timestamp,date
0,141353678,A2YYLQVZSEPYH7,1388534400,2014-01-01
1,812469496,A139ERFHMIYLXL,1388534400,2014-01-01
2,297867288,A2LF0FQQG9ANHJ,1388534400,2014-01-01
3,297867288,A2T098MZGADPLT,1388534400,2014-01-01
4,297867288,A1NTTZPH8YU0FN,1388534400,2014-01-01


In [7]:
# Column dtypes and non-null counts; `date` is still a string (object) column at this point.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564099 entries, 0 to 564098
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   user_id    564099 non-null  object
 1   item_id    564099 non-null  object
 2   timestamp  564099 non-null  int64 
 3   date       564099 non-null  object
dtypes: int64(1), object(3)
memory usage: 17.2+ MB


### Convert timestamp

In [8]:
%%time 
# 5s
data['date'] = data['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
data.sort_values(by='timestamp', inplace=True)

KeyboardInterrupt: 

In [9]:
# Preview the first rows again after the date conversion / sort cell.
data.head()

Unnamed: 0,user_id,item_id,timestamp,date
0,141353678,A2YYLQVZSEPYH7,1388534400,2014-01-01
1,812469496,A139ERFHMIYLXL,1388534400,2014-01-01
2,297867288,A2LF0FQQG9ANHJ,1388534400,2014-01-01
3,297867288,A2T098MZGADPLT,1388534400,2014-01-01
4,297867288,A1NTTZPH8YU0FN,1388534400,2014-01-01


In [10]:
%%time
# 198ms
interactions_per_month = data.groupby(by=['date']).count().iloc[:, 0]
interactions_per_month.name = 'count'
interactions_per_month=interactions_per_month.reset_index()
_ = interactions_per_month.copy()
_['date'] = _['date'].dt.date
_.groupby('date').sum().plot(kind='bar')
plt.title('interactions per month')

AttributeError: Can only use .dt accessor with datetimelike values

___
## Get intervals and Frequent users (threshold = 0.8)

time intervals

In [11]:
# Load precomputed artifacts from disk: two CSV tables and a joblib-serialized object.
# `frequent_users_month` is later passed to getBucketsHoldouts as `frequent_users`.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you trust.
user_presence_df = pd.read_csv('output/amazonbooks_dump/2nd_sample_user_presence_df.csv')
user_month_interactions = pd.read_csv('output/amazonbooks_dump/2nd_sample_user_month_interactions.csv')
frequent_users_month = joblib.load('output/amazonbooks_dump/2nd_sample_frequent_users_month.joblib')

___
## Experiments using months

In [12]:
# Sanity check of the table size before building buckets and holdouts.
data.shape

(564099, 4)

In [None]:
%%time
# 29.7 s
# Split the interactions into buckets and holdouts using the project helper.
# interval_type='M' presumably selects monthly intervals (this section is
# "Experiments using months") — confirm against getBucketsHoldouts.
buckets, holdouts = getBucketsHoldouts(
    data=data,
    user_col=user_col,
    item_col=item_col,
    frequent_users=frequent_users_month,
    interval_type='M',
    intervals=None, 
    cold_start_buckets=1)

Creating buckets. . .
Creating holdouts. . .


Debugging buckets and holdouts - **only works if they're not converted to implicit data in get_buckets_and_holdouts.getBucketsHoldouts**

In [None]:
# # join buckets and holdouts, to check if they have any equal interactions (they should not)
# a = pd.concat( buckets ).set_index([user_col, item_col])
# b = pd.concat( holdouts )[[user_col, item_col]].set_index([user_col, item_col])
# a.join(b, how='inner').shape[0]

In [None]:
# # concatenate buckets and holdouts, to check if the result is equal to the original data (it should be equal)
# _  = pd.concat( [pd.concat( buckets ), pd.concat( holdouts )], ignore_index=True).sort_values(by=['timestamp', 'user_id','item_id']).reset_index(drop=True)
# _.equals(data.sort_values(by=['timestamp', 'user_id','item_id']).reset_index(drop=True))

In [None]:
# bucket_sizes = [b.shape[0] for b in buckets]
# holdout_sizes = [h.shape[0] for h in holdouts]
# bucket_sizes, holdout_sizes

In [None]:
# Persist buckets and holdouts so the ISGD section can reload them from disk.
joblib.dump(buckets, 'output/amazonbooks_dump/2nd_sample_buckets.joblib')
joblib.dump(holdouts, 'output/amazonbooks_dump/2nd_sample_holdouts.joblib')

### ISGD

In [None]:
# Reload the buckets and holdouts from the joblib files written by the dump cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files you trust.
buckets = joblib.load('output/amazonbooks_dump/2nd_sample_buckets.joblib')
holdouts = joblib.load('output/amazonbooks_dump/2nd_sample_holdouts.joblib')

Hyperparameters

In [None]:
# define hyperparameters (SAME AS LASTFM)
# num_factors, num_iter, learn_rate and regularization are passed to the ISGD constructor;
# regularization is used for both u_regularization and i_regularization.
num_factors = 160
num_iter = 4
learn_rate = 0.5
regularization = 0.4
# NOTE(review): num_nodes is not used in the cells visible here — confirm it is still needed.
num_nodes = 8

In [None]:
# ImplicitData turns interactions into an object holding user-item and item-user mappings, plus support methods; it takes lists
# stream = ImplicitData(data[user_col], data[item_col])
# The model must be initialized with an empty list
empty_stream = ImplicitData([], [])
# If the stream is passed, recall is always 0 when known items are excluded. When recommending already-seen items is allowed, recall is not 0.
model = ISGD(empty_stream, num_factors, num_iter, learn_rate = learn_rate, u_regularization = regularization, i_regularization = regularization, random_seed = 10)

In [None]:
%%time
# 1d 14min 19s
# create an EvaluateHoldouts instance to train the model and create checkpoints
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of the notebook.
eval = EvaluateHoldouts(model=model, buckets=buckets, holdouts=holdouts)

In [None]:
%%time
# 14h 22min 35s
# Run the holdout evaluation with N_recommendations=20 and known items excluded.
eval.EvaluateHoldouts(N_recommendations=20, exclude_known_items=True, default_user='none')

In [None]:
# Persist the whole evaluator object to disk.
joblib.dump(eval, 'output/amazonbooks_dump/2nd_sample_amazon_books ISGD eval.joblib')

In [None]:
# Transpose the evaluator's results matrix, wrap it in a DataFrame and save it as CSV.
# NOTE(review): compute_BWT indexes rows as models and columns as holdouts —
# confirm the transpose yields that orientation.
rm = eval.results_matrix
df = pd.DataFrame(rm.T)
df.to_csv('output/amazonbooks_dump/2nd_sample_amazon_books month_bucket ISGD results.csv', index=False)

In [None]:
# Draw the results matrix as a heatmap via the project helper; `filepath` is presumably
# where the image is written — confirm against plot_utils.recall_heatmap.
recall_heatmap(df,
    title='Recall@20 for ISGD models across Holdouts - Amazon Books',
    filepath='images/heatmaps/amazonbooks_dump/2nd_sample_amazon_books month_bucket ISGD heatmap.png') #='images/heatmaps/palco_2010 month_bucket ISGD heatmap.png'

In [None]:
# Mean of the results-matrix diagonal (see avg_recall).
arecall = avg_recall(df)
arecall

In [None]:
# Per-checkpoint backward transfer and its mean (see compute_BWT).
BWT, meanBWT = compute_BWT(df)
BWT, meanBWT

In [None]:
# Mean of the entries above the diagonal of the results matrix (see compute_FWT).
FWT = compute_FWT(df)
FWT
# which items that the user consumed in the past and stopped consuming can the system still recommend

In [None]:
# NOTE(review): repeats the earlier BWT cell with identical inputs — likely removable.
BWT, meanBWT = compute_BWT(df)
BWT, meanBWT