In [1]:
import os 
import pandas as pd 
import recbole
import torch 

from recbole.model.general_recommender.bpr import BPR
from recbole.model.general_recommender.ease import EASE

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color
from recbole.quick_start import load_data_and_model, run_recbole, objective_function
from recbole.utils.case_study import full_sort_topk
from recbole.trainer import HyperTuning


from logging import getLogger


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ratings = pd.read_csv('../data/ratings.csv')

In [3]:
ratings

Unnamed: 0,session_id,user_id,outfit_id,timestamp
0,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83783,23-07-19 09:26:07
1,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83800,23-07-19 09:26:10
2,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83791,23-07-19 09:26:12
3,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,84029,23-07-19 09:26:16
4,7708c8e7-4292-4ff9-99b1-27be20427e42,1.0,83706,23-07-19 09:26:18
...,...,...,...,...
2246,22af75f5-cabe-46e8-90b3-08613327f389,,83291,2023-07-21 09:18:48.784861
2247,22af75f5-cabe-46e8-90b3-08613327f389,,91336,2023-07-21 09:19:34.289190
2248,22af75f5-cabe-46e8-90b3-08613327f389,,91601,2023-07-21 09:20:03.409300
2249,22af75f5-cabe-46e8-90b3-08613327f389,,90043,2023-07-21 09:20:13.284585


In [4]:
train = ratings[['session_id', 'outfit_id','timestamp']]
train.rename(columns={'session_id':'user', 'outfit_id':'item', 'timestamp':'Time'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [5]:
user2idx = {v:k for k,v in enumerate(sorted(set(train.user)))}
item2idx = {v:k for k,v in enumerate(sorted(set(train.item)))}
uidx2user = {k:v for k,v in enumerate(sorted(set(train.user)))}
iidx2item = {k:v for k,v in enumerate(sorted(set(train.item)))}

In [6]:
train.user = train.user.map(user2idx)
train.item = train.item.map(item2idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [7]:
train.Time = pd.to_datetime(train.Time)
train.Time = train.Time.view('int64')

In [8]:
train.columns=['user_id:token','item_id:token','timestamp:float']
train[:2]

Unnamed: 0,user_id:token,item_id:token,timestamp:float
0,114,698,1563873967000000000
1,114,709,1563873970000000000


In [9]:
outpath = f"dataset/recbole_train"
os.makedirs(outpath, exist_ok=True)
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
train.to_csv(os.path.join(outpath,"recbole_train.inter"),sep='\t',index=False)


In [10]:
yaml_data = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""

with open("bpr.yaml","w") as f:
    f.write(yaml_data)

In [11]:
train['user_id:token'].nunique() 

229

In [12]:
train['item_id:token'].nunique()

1485

In [13]:
229 * 1485

340065

In [14]:
2251 / 340065 * 100

0.6619322776528017

In [15]:
model_name = 'bpr' 
parameter_dict = {'embedding_size' : 8}
config = Config(model=model_name.upper(), dataset='recbole_train', config_file_list=['bpr.yaml'], config_dict = parameter_dict)

In [16]:
config['epochs'] = 100 
config['show_progress'] = False
config['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config['train_batch_size'] = 128 
config['train_neg_sample_args'] = {'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
config['eval_args'] = {'group_by':'user','order': 'TO','split': {'LS' : 'valid_and_test'},'mode': 'full'}

In [17]:
logger = getLogger()
init_seed(config['seed'], config['reproducibility'])
init_logger(config)

logger.info(config)

21 Jul 04:37    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/recbole_train
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 128
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'group_by': 'user', 'order': 'TO', 'split': {'LS': 'valid_and_test'}, 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
s

In [18]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

21 Jul 04:37    INFO  recbole_train
The number of users: 230
Average actions of users: 9.829694323144105
The number of items: 1486
Average actions of items: 1.5158249158249157
The number of inters: 2251
The sparsity of the dataset: 99.34138919772954%
Remain Fields: ['user_id', 'item_id', 'timestamp']
21 Jul 04:37    INFO  [Training]: train_batch_size = [128] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
21 Jul 04:37    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'group_by': 'user', 'order': 'TO', 'split': {'LS': 'valid_and_test'}, 'mode': 'full'}]


In [19]:
init_seed(config['seed'], config['reproducibility'])
model = BPR(config, train_data.dataset).to(config['device'])
logger.info(model)

21 Jul 04:37    INFO  BPR(
  (user_embedding): Embedding(230, 8)
  (item_embedding): Embedding(1486, 8)
  (loss): BPRLoss()
)
Trainable parameters: 13728


In [20]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

21 Jul 04:37    INFO  epoch 0 training [time: 0.14s, train loss: 52.5917]
21 Jul 04:37    INFO  epoch 0 evaluating [time: 0.77s, valid_score: 0.004700]
21 Jul 04:37    INFO  valid result: 
recall@10 : 0.0173    mrr@10 : 0.0047    ndcg@10 : 0.0076    hit@10 : 0.0173    precision@10 : 0.0017
21 Jul 04:37    INFO  Saving current: saved/BPR-Jul-21-2023_04-37-32.pth
21 Jul 04:37    INFO  epoch 1 training [time: 0.13s, train loss: 52.1756]
21 Jul 04:37    INFO  epoch 1 evaluating [time: 0.07s, valid_score: 0.011600]
21 Jul 04:37    INFO  valid result: 
recall@10 : 0.0116    mrr@10 : 0.0116    ndcg@10 : 0.0116    hit@10 : 0.0116    precision@10 : 0.0012
21 Jul 04:37    INFO  Saving current: saved/BPR-Jul-21-2023_04-37-32.pth
21 Jul 04:37    INFO  epoch 2 training [time: 0.17s, train loss: 51.3589]
21 Jul 04:37    INFO  epoch 2 evaluating [time: 0.07s, valid_score: 0.015100]
21 Jul 04:37    INFO  valid result: 
recall@10 : 0.0231    mrr@10 : 0.0151    ndcg@10 : 0.0169    hit@10 : 0.0231    pre

In [31]:
0.0185 / 0.004545

4.07040704070407

In [30]:
10 / 2200 

0.004545454545454545

In [22]:
yaml_data = """
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]
"""

with open("ease.yaml","w") as f:
    f.write(yaml_data)

In [23]:
model_name = 'ease' 
parameter_dict = {'embedding_size' : 8}
config = Config(model=model_name.upper(), dataset='recbole_train', config_file_list=['ease.yaml'], config_dict = parameter_dict)

In [24]:
config['epochs'] = 100 
config['show_progress'] = False
config['device'] = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
config['train_batch_size'] = 128 
config['train_neg_sample_args'] = {'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
config['eval_args'] = {'group_by':'user','order': 'TO','split': {'LS' : 'valid_and_test'},'mode': 'full'}

In [25]:
logger = getLogger()
init_seed(config['seed'], config['reproducibility'])
init_logger(config)

logger.info(config)

21 Jul 04:39    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/recbole_train
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 100
train_batch_size = 128
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'group_by': 'user', 'order': 'TO', 'split': {'LS': 'valid_and_test'}, 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 4096
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
s

In [26]:
# dataset filtering
dataset = create_dataset(config)
logger.info(dataset)

# dataset splitting
train_data, valid_data, test_data = data_preparation(config, dataset)

21 Jul 04:39    INFO  recbole_train
The number of users: 230
Average actions of users: 9.829694323144105
The number of items: 1486
Average actions of items: 1.5158249158249157
The number of inters: 2251
The sparsity of the dataset: 99.34138919772954%
Remain Fields: ['user_id', 'item_id', 'timestamp']
21 Jul 04:39    INFO  [Training]: train_batch_size = [128] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 5, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
21 Jul 04:39    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'group_by': 'user', 'order': 'TO', 'split': {'LS': 'valid_and_test'}, 'mode': 'full'}]


In [28]:
init_seed(config['seed'], config['reproducibility'])
model = EASE(config, train_data.dataset).to(config['device'])
logger.info(model)

21 Jul 04:39    INFO  EASE()
Trainable parameters: 1


In [29]:
# trainer loading and initialization
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)

# model training
best_valid_score, best_valid_result = trainer.fit(
    train_data, valid_data, saved=True, show_progress=config['show_progress']
)

21 Jul 04:39    INFO  epoch 0 training [time: 0.07s, train loss: 0.0000]
21 Jul 04:39    INFO  epoch 0 evaluating [time: 0.07s, valid_score: 0.163600]
21 Jul 04:39    INFO  valid result: 
recall@10 : 0.185    mrr@10 : 0.1636    ndcg@10 : 0.1686    hit@10 : 0.185    precision@10 : 0.0185
21 Jul 04:39    INFO  Saving current: saved/EASE-Jul-21-2023_04-39-54.pth
