In [1]:
import os
from experiment_setup import setups
import torch

In [3]:
# download the dataset, the links can be fund in the README
dataset_path = "../datasets/coveo_ecommerce"
model_path = "../trained_models"

## Run the preprocess script, specific to the dataset you chose

- The preprocessing script in general, executes the following steps:
    - Loads the raw data, with correct types
    - Creates the sessions
    - Removes duplicated items. An item is considered as a duplicate if the preceding (based on time) event in the same session contains the exact same item.
    - Performes iterative support filtering
        - Removes sessions with only one event
        - Removes items with less than 5 events
        - Until the size of the dataset changes


In [4]:
%run ../Preprocess/coveo_preproc.py --path $dataset_path

1566074 274797 11365
1464757 173480 11344
1463706 173480 10869
1463649 173423 10869
1463645 173423 10868
1463645 173423 10868
1463645 173423 10868


  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(


                                             Dataset  NumEvents  NumSessions  \
0      coveo_ecommerce\coveo_processed_view_full.tsv    1463645       173423   
1      coveo_ecommerce\coveo_processed_view_test.tsv      52501         7748   
2  coveo_ecommerce\coveo_processed_view_train_ful...    1411113       165673   
3  coveo_ecommerce\coveo_processed_view_train_tr.tsv    1368003       159766   
4  coveo_ecommerce\coveo_processed_view_train_val...      43032         5905   

   NumItems    NumDays                   StartTime  \
0     10868  17.999833  2018-12-08 00:00:11.994000   
1      8230   0.998696  2018-12-25 00:01:50.223000   
2     10868  16.999566  2018-12-08 00:00:11.994000   
3     10868  15.999713  2018-12-08 00:00:11.994000   
4      8014   0.997503  2018-12-24 00:03:10.240000   

                      EndTime  AvgItemViews  MinSessionLength  \
0  2018-12-25 23:59:57.577000    134.674733                 2   
1  2018-12-25 23:59:57.577000      6.379222                 2   

  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(


## Use a specific setup for your dataset

In [5]:
params = setups["coveo"]["params_bprmax"]

In [6]:
train_path = os.path.join(dataset_path,"coveo_processed_view_train_full.tsv")
test_path = os.path.join(dataset_path,"coveo_processed_view_test.tsv")

In [7]:
def create_gru4rec_pytorch_script(model_name, train_folder, train_data, test_data, model_path, loss, optim, final_act, layers, batch_size, dropout_p_embed, dropout_p_hidden, learning_rate, n_epochs, m, eval_hidden_reset, use_correct_loss, use_correct_mask_reset):
    checkpoint_dir = f"{model_path}\\{model_name}"
    s_train_full = (
        f"python ..\\GRU4REC-pytorch\\main.py --data_folder {train_folder} "
        f"--train_data {train_data} --valid_data {test_data} --checkpoint_dir {checkpoint_dir} "
        f"--num_layers 1 --embedding_dim {layers} --hidden_size {layers} "
        f"--loss_type {'BPR-max' if loss == 'bpr-max' else 'CrossEntropy'} --final_act {final_act} "
        f"--n_epochs {n_epochs} --batch_size {batch_size} --dropout_input {dropout_p_embed} "
        f"--dropout_hidden {dropout_p_hidden} --lr {learning_rate} --momentum 0.0 "
        f"--optimizer_type {'Adagrad' if optim == 'adagrad' else ''}"
        f"{' --eval_hidden_reset' if eval_hidden_reset else ''}"
        f"{' --use_correct_loss' if use_correct_loss else ''}"
        f"{' --use_correct_mask_reset' if use_correct_mask_reset else ''}"
    )
    s_test_full = s_train_full + f" --is_eval --load_model {checkpoint_dir}\\model_0000{n_epochs-1}.pt --m {m}"
    return s_train_full, s_test_full

In [9]:
loss = params["loss"]
optim = params["optim"]
const_emb = params["constrained_embedding"]
embed = params["embedding"]
final_act = params["final_act"]
layers = params["layers"]
batch_size = params["batch_size"]
dropout_p_embed = params["dropout_p_embed"]
dropout_p_hidden = params["dropout_p_hidden"]
learning_rate = params["learning_rate"]
momentum = params["momentum"]
sample_alpha = params["sample_alpha"]
bpreg = params["bpreg"]
logq = params["logq"]
hidden_act = params["hidden_act"]
n_epochs = 5
m = '1 5 10 20'

## Train & test the out-of-the-box model

In [8]:
train_folder, train_data = '/'.join(train_path.split('/')[:-1]), train_path.split('/')[-1]
test_folder, test_data = '/'.join(test_path.split('/')[:-1]), test_path.split('/')[-1]

In [9]:
train_script_oob, test_script_oob = create_gru4rec_pytorch_script(model_name='gru4rec_pytorch_oob_bprmax', train_folder=train_folder, train_data=train_data, test_data=test_data, model_path=model_path, loss=loss, optim=optim, final_act=final_act, layers=layers, batch_size=batch_size, dropout_p_embed=0.0, dropout_p_hidden=0.0, learning_rate=learning_rate, n_epochs=n_epochs, m=m, eval_hidden_reset=False, use_correct_loss=False, use_correct_mask_reset=False)

In [10]:
print(train_script_oob)
print(test_script_oob)

CUDA_VISIBLE_DEVICES=0 python3 ../GRU4REC-pytorch/main.py --data_folder ../datasets/coveo_ecommerce --train_data coveo_processed_view_train_full.tsv --valid_data coveo_processed_view_test.tsv --checkpoint_dir ../trained_models/gru4rec_pytorch_oob_bprmax --num_layers 1 --embedding_dim 512 --hidden_size 512 --loss_type BPR-max --final_act elu-1 --n_epochs 5 --batch_size 144 --dropout_input 0.0 --dropout_hidden 0.0 --lr 0.05 --momentum 0.0 --optimizer_type Adagrad
CUDA_VISIBLE_DEVICES=0 python3 ../GRU4REC-pytorch/main.py --data_folder ../datasets/coveo_ecommerce --train_data coveo_processed_view_train_full.tsv --valid_data coveo_processed_view_test.tsv --checkpoint_dir ../trained_models/gru4rec_pytorch_oob_bprmax --num_layers 1 --embedding_dim 512 --hidden_size 512 --loss_type BPR-max --final_act elu-1 --n_epochs 5 --batch_size 144 --dropout_input 0.0 --dropout_hidden 0.0 --lr 0.05 --momentum 0.0 --optimizer_type Adagrad --is_eval --load_model ../trained_models/gru4rec_pytorch_oob_bprmax/

In [11]:
checkpoint_dir = "../trained_models/gru4rec_pytorch_oob_bprmax"
eval_hidden_reset=False
use_correct_loss=False
use_correct_mask_reset=False

In [12]:
import subprocess

# Comando de entrenamiento
process_train = subprocess.Popen(train_script_oob, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_train, stderr_train = process_train.communicate()

print("Salida de STDOUT (Entrenamiento):")
print(stdout_train.decode())
print("Salida de STDERR (Entrenamiento):")
print(stderr_train.decode())

# Comando de evaluación
process_test = subprocess.Popen(test_script_oob, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout_test, stderr_test = process_test.communicate()

print("Salida de STDOUT (Evaluación):")
print(stdout_test.decode())
print("Salida de STDERR (Evaluación):")
print(stderr_test.decode())


Salida de STDOUT (Entrenamiento):

Salida de STDERR (Entrenamiento):
Traceback (most recent call last):
  File "../GRU4REC-pytorch/main.py", line 3, in <module>
    import lib
  File "/home/juancagp/RecSysProject/RecSys-Project-2024-1/GRU4REC-pytorch/lib/__init__.py", line 1, in <module>
    from .dataset import Dataset, DataLoader
  File "/home/juancagp/RecSysProject/RecSys-Project-2024-1/GRU4REC-pytorch/lib/dataset.py", line 3, in <module>
    import torch
ModuleNotFoundError: No module named 'torch'

Salida de STDOUT (Evaluación):

Salida de STDERR (Evaluación):
Traceback (most recent call last):
  File "../GRU4REC-pytorch/main.py", line 3, in <module>
    import lib
  File "/home/juancagp/RecSysProject/RecSys-Project-2024-1/GRU4REC-pytorch/lib/__init__.py", line 1, in <module>
    from .dataset import Dataset, DataLoader
  File "/home/juancagp/RecSysProject/RecSys-Project-2024-1/GRU4REC-pytorch/lib/dataset.py", line 3, in <module>
    import torch
ModuleNotFoundError: No module nam

### Train the out-of-the-box model

In [17]:
os.system(train_script_oob)

sh: 1: python: not found


32512

### Test the out-of-the-box model

In [None]:
os.system(test_script_oob)

## Train & test inference fix model

In [None]:
train_script_inffix, test_script_inffix = create_gru4rec_pytorch_script(model_name='gru4rec_pytorch_inffix_bprmax', train_folder=train_folder, train_data=train_data, test_data=test_data, model_path=model_path, loss=loss, optim=optim, final_act=final_act, layers=layers, batch_size=batch_size, dropout_p_embed=0.0, dropout_p_hidden=0.0, learning_rate=learning_rate, n_epochs=n_epochs, m=m, eval_hidden_reset=True, use_correct_loss=False, use_correct_mask_reset=False)

### Train the out-of-the-box eval fix model

In [None]:
os.system(train_script_inffix)

### Test the out-of-the-box eval fix model

In [None]:
os.system(test_script_inffix)

## Train & test the major fix model

In [None]:
train_script_majorfix, test_script_majorfix = create_gru4rec_pytorch_script(model_name='gru4rec_pytorch_majorfix_bprmax', train_folder=train_folder, train_data=train_data, test_data=test_data, model_path=model_path, loss=loss, optim=optim, final_act=final_act, layers=layers, batch_size=batch_size, dropout_p_embed=dropout_p_embed, dropout_p_hidden=dropout_p_hidden, learning_rate=learning_rate, n_epochs=n_epochs, m=m, eval_hidden_reset=True, use_correct_loss=True, use_correct_mask_reset=True)

### Train the major fix model

In [None]:
os.system(train_script_majorfix)

### Test the major fix model

In [None]:
os.system(test_script_majorfix)