In [1]:
import os
from experiment_setup import setups
from model_tracker import track_training_C02_emissions

In [2]:
# Experiment selection: `dataset` and `loss_function` together choose the
# hyperparameter setup (setups[dataset]["params_<loss_function>"]) and name the model.
dataset = "yoochoose"   # one of: coveo, diginetica, rees46, retailrocket, yoochoose
loss_function = "xe"   # one of: bprmax, xe

# Relative paths, resolved against the notebook's working directory.
dataset_path = f"../datasets/{dataset}"   # folder holding the files of the selected dataset
model_path = "../trained_models"   # root folder under which per-model checkpoint folders are created

## Run the preprocess script, specific to the dataset you chose

- In general, the preprocessing script executes the following steps:
    - Loads the raw data, with correct types
    - Creates the sessions
    - Removes duplicated items. An item is considered a duplicate if the preceding (based on time) event in the same session contains the exact same item.
    - Performs iterative support filtering
        - Removes sessions with only one event
        - Removes items with fewer than 5 events
        - Repeats until the size of the dataset no longer changes


In [None]:
# NOTE(review): the script name is hard-coded to the coveo preprocessor. When
# `dataset` is not "coveo", point this at the matching script in ../Preprocess
# before running the cell (script names for the other datasets: TODO confirm).
%run ../Preprocess/coveo_ecommerce_preproc.py --path $dataset_path

## Use a specific setup for your dataset

In [3]:
# Look up the hyperparameter dictionary registered for this dataset / loss combination.
params = setups[dataset][f"params_{loss_function}"]

In [4]:
# Train and test files inside the dataset folder
# (presumably written by the preprocessing script; TODO confirm file names).
train_path = os.path.join(dataset_path,f"{dataset}_processed_view_train_full.tsv")
test_path = os.path.join(dataset_path,f"{dataset}_processed_view_test.tsv")

In [5]:
def create_gru4rec_pytorch_script(model_name, train_folder, train_data, test_data, model_path, loss, optim, final_act, layers, batch_size, dropout_p_embed, dropout_p_hidden, learning_rate, n_epochs, m, eval_hidden_reset, use_correct_loss, use_correct_mask_reset):
    """Build the command lines that train and evaluate GRU4REC-pytorch.

    The commands use Windows-style backslash separators for the script path
    and the checkpoint directory.

    Args:
        model_name: Name of the sub-folder of ``model_path`` used for checkpoints.
        train_folder: Value passed as ``--data_folder``.
        train_data: Value passed as ``--train_data``.
        test_data: Value passed as ``--valid_data``.
        model_path: Root folder for checkpoints.
        loss: ``'bpr-max'`` selects ``BPR-max``; anything else selects ``CrossEntropy``.
        optim: ``'adagrad'`` selects ``Adagrad``; see the note in the body for other values.
        final_act: Value passed as ``--final_act``.
        layers: Used for both ``--embedding_dim`` and ``--hidden_size``.
        batch_size: Value passed as ``--batch_size``.
        dropout_p_embed: Value passed as ``--dropout_input``.
        dropout_p_hidden: Value passed as ``--dropout_hidden``.
        learning_rate: Value passed as ``--lr``.
        n_epochs: Integer number of training epochs; also determines which
            checkpoint the evaluation command loads (the last one).
        m: Cut-off list for the evaluation command, e.g. ``'1 5 10 20'``.
        eval_hidden_reset: Append ``--eval_hidden_reset`` when truthy.
        use_correct_loss: Append ``--use_correct_loss`` when truthy.
        use_correct_mask_reset: Append ``--use_correct_mask_reset`` when truthy.

    Returns:
        A ``(train_command, test_command)`` tuple of strings. The test command
        is the train command followed by ``--is_eval``, ``--load_model`` and ``--m``.
    """
    checkpoint_dir = f"{model_path}\\{model_name}"
    loss_type = 'BPR-max' if loss == 'bpr-max' else 'CrossEntropy'
    # NOTE(review): any optimizer other than 'adagrad' produces an empty value
    # after --optimizer_type, exactly as before; only 'adagrad' is usable here.
    optimizer_type = 'Adagrad' if optim == 'adagrad' else ''

    s_train_full = (
        f"python ..\\GRU4REC-pytorch\\main.py --data_folder {train_folder} "
        f"--train_data {train_data} --valid_data {test_data} --checkpoint_dir {checkpoint_dir} "
        f"--num_layers 1 --embedding_dim {layers} --hidden_size {layers} "
        f"--loss_type {loss_type} --final_act {final_act} "
        f"--n_epochs {n_epochs} --batch_size {batch_size} --dropout_input {dropout_p_embed} "
        f"--dropout_hidden {dropout_p_hidden} --lr {learning_rate} --momentum 0.0 "
        f"--optimizer_type {optimizer_type}"
        f"{' --eval_hidden_reset' if eval_hidden_reset else ''}"
        f"{' --use_correct_loss' if use_correct_loss else ''}"
        f"{' --use_correct_mask_reset' if use_correct_mask_reset else ''}"
    )

    # Zero-pad the last epoch index to a fixed width of five digits. The old
    # literal "0000" prefix produced a six-digit name (model_000010.pt) as soon
    # as n_epochs exceeded 10. Assumes checkpoints are named with a 5-digit
    # index, as in model_00004.pt.
    last_checkpoint = f"{checkpoint_dir}\\model_{n_epochs - 1:05d}.pt"
    s_test_full = s_train_full + f" --is_eval --load_model {last_checkpoint} --m {m}"
    return s_train_full, s_test_full

In [6]:
# Unpack the selected setup into individual variables.
# Only loss, optim, final_act, layers, batch_size, dropout_p_embed,
# dropout_p_hidden and learning_rate are forwarded to
# create_gru4rec_pytorch_script in this notebook; the remaining values are
# read but not referenced by any other cell visible here.
loss = params["loss"]
optim = params["optim"]
const_emb = params["constrained_embedding"]
embed = params["embedding"]
final_act = params["final_act"]
layers = params["layers"]
batch_size = params["batch_size"]
dropout_p_embed = params["dropout_p_embed"]
dropout_p_hidden = params["dropout_p_hidden"]
learning_rate = params["learning_rate"]
# NOTE(review): the generated command hard-codes "--momentum 0.0", so this value is not applied.
momentum = params["momentum"]
sample_alpha = params["sample_alpha"]
bpreg = params["bpreg"]
logq = params["logq"]
hidden_act = params["hidden_act"]
# Fixed for this run rather than taken from the setup.
n_epochs = 5
# Space-separated cut-offs, passed as "--m" to the evaluation command.
m = '1 5 10 20'

In [7]:
# Cut each path at its last '/' into (folder, file name).
# NOTE(review): only '/' is treated as a separator. A backslash inserted by
# os.path.join on Windows stays inside the "file name" part.
train_folder, _, train_data = train_path.rpartition('/')
test_folder, _, test_data = test_path.rpartition('/')

print("Training folder: ", train_folder)
print("Train data: ", train_data)

Training folder:  ../datasets
Train data:  yoochoose\yoochoose_processed_view_train_full.tsv


## Train & test (major fix model)

In [8]:
# Build the train / test commands for the "major fix" variant: all three
# correction flags of the PyTorch implementation are switched on.
train_script_majorfix, test_script_majorfix = create_gru4rec_pytorch_script(
    model_name=f'gru4rec_pytorch_{loss_function}',
    train_folder=train_folder,
    train_data=train_data,
    test_data=test_data,
    model_path=model_path,
    loss=loss,
    optim=optim,
    final_act=final_act,
    layers=layers,
    batch_size=batch_size,
    dropout_p_embed=dropout_p_embed,
    dropout_p_hidden=dropout_p_hidden,
    learning_rate=learning_rate,
    n_epochs=n_epochs,
    m=m,
    eval_hidden_reset=True,
    use_correct_loss=True,
    use_correct_mask_reset=True,
)

### Train

In [9]:
# Hand the training command to the project's emissions tracker (model_tracker).
# Presumably it executes the command while CodeCarbon measures energy use, as
# the log output of this cell suggests; TODO confirm against model_tracker.
track_training_C02_emissions(train_script_majorfix, f"gru4rec_pytorch_{loss_function}", dataset)

[codecarbon INFO @ 13:51:52] [setup] RAM Tracking...
[codecarbon INFO @ 13:51:52] [setup] GPU Tracking...
[codecarbon INFO @ 13:51:53] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 13:51:53] [setup] CPU Tracking...
[codecarbon INFO @ 13:51:54] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 13:51:54] >>> Tracker's metadata:
[codecarbon INFO @ 13:51:54]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 13:51:54]   Python version: 3.12.3
[codecarbon INFO @ 13:51:54]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 13:51:54]   Available RAM : 31.746 GB
[codecarbon INFO @ 13:51:54]   CPU count: 32
[codecarbon INFO @ 13:51:54]   CPU model: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 13:51:54]   GPU count: 1
[codecarbon INFO @ 13:51:54]   GPU model: 1 x NVIDIA GeForce RTX 4090 Laptop GPU
[codecarbon INFO @ 13:52:12] Energy consumed for RAM : 0.000050 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 13:52:12

Salida de STDOUT:                       Args                                             Values
0              hidden_size                                                480
1               num_layers                                                  1
2               batch_size                                                 48
3            dropout_input                                                0.0
4           dropout_hidden                                                0.2
5                 n_epochs                                                  5
6                        m                                               [20]
7           optimizer_type                                            Adagrad
8                final_act                                            softmax
9                       lr                                               0.07
10            weight_decay                                                  0
11                momentum                    

0.15998174396346027

### Test

In [10]:
# Show the evaluation command so it can be run from a terminal.
# Uncomment the os.system line to execute it from the notebook instead.
print(test_script_majorfix)
# os.system(test_script_majorfix)

python ..\GRU4REC-pytorch\main.py --data_folder ../datasets --train_data yoochoose\yoochoose_processed_view_train_full.tsv --valid_data yoochoose\yoochoose_processed_view_test.tsv --checkpoint_dir ../trained_models\gru4rec_pytorch_xe --num_layers 1 --embedding_dim 480 --hidden_size 480 --loss_type CrossEntropy --final_act softmax --n_epochs 5 --batch_size 48 --dropout_input 0.0 --dropout_hidden 0.2 --lr 0.07 --momentum 0.0 --optimizer_type Adagrad --eval_hidden_reset --use_correct_loss --use_correct_mask_reset --is_eval --load_model ../trained_models\gru4rec_pytorch_xe\model_00004.pt --m 1 5 10 20
