In [1]:
import os

from experiment_setup import setups
from model_tracker import track_training_C02_emissions

In [2]:
# Download the dataset first; the links can be found in the README.
# Folder holding the dataset files; passed to the preprocessing script via --path
# and used as the parent folder of the train/test files.
dataset_path = "../datasets/coveo_ecommerce"
# Base folder used to build each model's --checkpoint_dir (<model_path>/<model_name>).
model_path = "../trained_models"

## Run the preprocess script, specific to the dataset you chose

- In general, the preprocessing script executes the following steps:
    - Loads the raw data, with correct types
    - Creates the sessions
    - Removes duplicated items. An item is considered a duplicate if the immediately preceding event (by time) in the same session contains the exact same item.
    - Performs iterative support filtering
        - Removes sessions with only one event
        - Removes items with fewer than 5 events
        - Repeats until the size of the dataset stops changing


In [4]:
%run ../Preprocess/coveo_preproc.py --path $dataset_path

1566074 274797 11365
1464757 173480 11344
1463706 173480 10869
1463649 173423 10869
1463645 173423 10868
1463645 173423 10868
1463645 173423 10868


  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(


                                             Dataset  NumEvents  NumSessions  \
0      coveo_ecommerce\coveo_processed_view_full.tsv    1463645       173423   
1      coveo_ecommerce\coveo_processed_view_test.tsv      52501         7748   
2  coveo_ecommerce\coveo_processed_view_train_ful...    1411113       165673   
3  coveo_ecommerce\coveo_processed_view_train_tr.tsv    1368003       159766   
4  coveo_ecommerce\coveo_processed_view_train_val...      43032         5905   

   NumItems    NumDays                   StartTime  \
0     10868  17.999833  2018-12-08 00:00:11.994000   
1      8230   0.998696  2018-12-25 00:01:50.223000   
2     10868  16.999566  2018-12-08 00:00:11.994000   
3     10868  15.999713  2018-12-08 00:00:11.994000   
4      8014   0.997503  2018-12-24 00:03:10.240000   

                      EndTime  AvgItemViews  MinSessionLength  \
0  2018-12-25 23:59:57.577000    134.674733                 2   
1  2018-12-25 23:59:57.577000      6.379222                 2   

  dt.datetime.utcfromtimestamp(data.Time.min() / 1000).strftime(
  dt.datetime.utcfromtimestamp(data.Time.max() / 1000).strftime(


## Use a specific setup for your dataset

In [3]:
# Select the hyperparameter set stored under setups["coveo"]["params_xe"].
# NOTE(review): "params_xe" presumably denotes the cross-entropy-loss setup — confirm in experiment_setup.
params = setups["coveo"]["params_xe"]

In [4]:
# Locations of the full-train and test splits inside the dataset folder.
train_path, test_path = (
    os.path.join(dataset_path, split_file)
    for split_file in (
        "coveo_processed_view_train_full.tsv",
        "coveo_processed_view_test.tsv",
    )
)

In [5]:
def create_gru4rec_tensorflow_script(model_name, train_path, test_path, model_path, loss, final_act, layers, batch_size, dropout_p_hidden, learning_rate, hidden_act, n_epochs, m, decay, initial_accumulator):
    """Build the shell commands that train and evaluate the TensorFlow GRU4Rec model.

    Both commands invoke ``../GRU4Rec_TensorFlow/gru4rec_BP/main.py``. The
    evaluation command is the training command with
    ``--train 0 --test <n_epochs - 1> --m <m>`` appended.

    Args:
        model_name: Folder name appended to ``model_path`` to form ``--checkpoint_dir``.
        train_path: Value passed as ``--train_path``.
        test_path: Value passed as ``--test_path``.
        model_path: Base folder of the checkpoint directory.
        loss: Value passed as ``--loss``.
        final_act: Value passed as ``--final_act``.
        layers: Value passed as ``--size`` (``--layer`` is always ``1``).
        batch_size: Value passed as ``--batch_size``.
        dropout_p_hidden: Value passed as ``--dropout``.
        learning_rate: Value passed as ``--lr``.
        hidden_act: Value passed as ``--hidden_act``.
        n_epochs: Value passed as ``--epoch``; ``n_epochs - 1`` is passed as ``--test``.
        m: Value passed as ``--m`` in the evaluation command.
        decay: Value passed as ``--decay``.
        initial_accumulator: Value passed as ``--initial_accumulator_value``.

    Returns:
        A ``(train_command, test_command)`` tuple of strings.
    """
    # Flag/value pairs in the exact order they appear on the command line.
    cli_options = [
        ("--train_path", train_path),
        ("--test_path", test_path),
        ("--checkpoint_dir", f"{model_path}/{model_name}"),
        ("--layer", 1),
        ("--size", layers),
        ("--epoch", n_epochs),
        ("--lr", learning_rate),
        ("--hidden_act", hidden_act),
        ("--final_act", final_act),
        ("--loss", loss),
        ("--dropout", dropout_p_hidden),
        ("--batch_size", batch_size),
        ("--initial_accumulator_value", initial_accumulator),
        ("--decay", decay),
    ]
    rendered_options = " ".join(f"{flag} {value}" for flag, value in cli_options)
    train_command = f"python ../GRU4Rec_TensorFlow/gru4rec_BP/main.py {rendered_options}"
    # The evaluation command reuses every training flag and adds the test-only ones.
    test_command = f"{train_command} --train 0 --test {n_epochs-1} --m {m}"
    return train_command, test_command

In [6]:
# Hyperparameters that the cells below forward to create_gru4rec_tensorflow_script.
loss = params["loss"]
final_act = params["final_act"]
hidden_act = params["hidden_act"]
layers = params["layers"]
batch_size = params["batch_size"]
dropout_p_hidden = params["dropout_p_hidden"]
learning_rate = params["learning_rate"]

# Remaining entries of the setup; the script builder has no parameter for them.
optim = params["optim"]
const_emb = params["constrained_embedding"]
embed = params["embedding"]
dropout_p_embed = params["dropout_p_embed"]
momentum = params["momentum"]
sample_alpha = params["sample_alpha"]
bpreg = params["bpreg"]
logq = params["logq"]

# Run settings fixed in the notebook rather than read from `params`.
n_epochs = 5
m = '1 5 10 20'  # forwarded verbatim as the --m argument of the test command

## Train & test the out-of-the-box model

In [7]:
# Out-of-the-box run: batch size, decay and the initial accumulator value are fixed
# here; every other hyperparameter comes from the selected setup.
# NOTE(review): 50 / 0.96 / 1e-1 are presumably the implementation's own defaults — confirm.
train_script_oob, test_script_oob = create_gru4rec_tensorflow_script(
    model_name="gru4rec_tensorflow_oob",
    train_path=train_path,
    test_path=test_path,
    model_path=model_path,
    loss=loss,
    final_act=final_act,
    layers=layers,
    batch_size=50,
    dropout_p_hidden=dropout_p_hidden,
    learning_rate=learning_rate,
    hidden_act=hidden_act,
    n_epochs=n_epochs,
    m=m,
    decay=0.96,
    initial_accumulator=1e-1,
)

### Train the out-of-the-box model

In [8]:
# Run the out-of-the-box training command under the emissions tracker.
# The call is the last expression of the cell, so its return value is displayed.
track_training_C02_emissions(train_script_oob, "gru4rec_tensorflow_oob", "coveo")

[codecarbon INFO @ 18:13:36] [setup] RAM Tracking...
[codecarbon INFO @ 18:13:36] [setup] GPU Tracking...
[codecarbon INFO @ 18:13:37] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 18:13:37] [setup] CPU Tracking...
[codecarbon INFO @ 18:13:39] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 18:13:40] >>> Tracker's metadata:
[codecarbon INFO @ 18:13:40]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 18:13:40]   Python version: 3.12.3
[codecarbon INFO @ 18:13:40]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 18:13:40]   Available RAM : 31.746 GB
[codecarbon INFO @ 18:13:40]   CPU count: 32
[codecarbon INFO @ 18:13:40]   CPU model: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 18:13:40]   GPU count: 1
[codecarbon INFO @ 18:13:40]   GPU model: 1 x NVIDIA GeForce RTX 4090 Laptop GPU


CarbonTracker: The following components were found: GPU with device(s) NVIDIA GeForce RTX 4090 Laptop GPU.
The training of models in this work is estimated to use 0.000 kWh of electricity contributing to 0.000 kg of CO2eq. Measured by carbontracker (https://github.com/lfwa/carbontracker).


[codecarbon INFO @ 18:13:58] Energy consumed for RAM : 0.000050 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:13:58] Energy consumed for all GPUs : 0.000103 kWh. Total GPU Power : 24.584543573911386 W
[codecarbon INFO @ 18:13:58] Energy consumed for all CPUs : 0.000177 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:13:58] 0.000329 kWh of electricity used since the beginning.
[codecarbon INFO @ 18:14:13] Energy consumed for RAM : 0.000099 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:14:14] Energy consumed for all GPUs : 0.000164 kWh. Total GPU Power : 14.768583193100739 W
[codecarbon INFO @ 18:14:14] Energy consumed for all CPUs : 0.000360 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:14:14] 0.000623 kWh of electricity used since the beginning.
[codecarbon INFO @ 18:14:28] Energy consumed for RAM : 0.000147 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:14:28] Energy consumed for all GPUs : 0.000218 kWh. Total GPU Power : 13.339880338

CarbonTracker: Average carbon intensity during training was 374.46 gCO2/kWh at detected location: Santiago, Santiago Metropolitan, CL.
CarbonTracker: 
Actual consumption for 1 epoch(s):
	Time:	0:07:06
	Energy:	0.058889227016 kWh
	CO2eq:	22.051850160622 g
	This is equivalent to:
	0.205133489866 km travelled by car
CarbonTracker: Live carbon intensity could not be fetched at detected location: Santiago, Santiago Metropolitan, CL. Defaulted to average carbon intensity for CL in 2021 of 374.46 gCO2/kWh. at detected location: Santiago, Santiago Metropolitan, CL.
CarbonTracker: 
Predicted consumption for 1 epoch(s):
	Time:	0:07:06
	Energy:	0.058889227016 kWh
	CO2eq:	22.051850160622 g
	This is equivalent to:
	0.205133489866 km travelled by car
CarbonTracker: Finished monitoring.
Salida de STDOUT:                          Args                                    Values
0                     n_items                                     10868
1                      layers                          

0.002594780853018689

### Test the out-of-the-box model

In [11]:
# Run the out-of-the-box evaluation command through the system shell;
# the cell displays the status value returned by os.system.
os.system(test_script_oob)

0


## Train & test the minor fix model


In [12]:
# Minor-fix run: takes the batch size from the selected setup and overrides
# decay (1.0) and the initial accumulator value (1e-12).
train_script_minor, test_script_minor = create_gru4rec_tensorflow_script(
    model_name='gru4rec_tensorflow_minorfix',
    train_path=train_path,
    test_path=test_path,
    model_path=model_path,
    loss=loss,
    final_act=final_act,
    layers=layers,
    batch_size=batch_size,
    dropout_p_hidden=dropout_p_hidden,
    learning_rate=learning_rate,
    hidden_act=hidden_act,
    n_epochs=n_epochs,
    m=m,
    decay=1.0,
    initial_accumulator=1e-12,
)

In [15]:
# Show the complete evaluation command so its flags can be inspected before running it.
print(test_script_minor)

python ../GRU4Rec_TensorFlow/gru4rec_BP/main.py --train_path ../datasets/coveo_ecommerce\coveo_processed_view_train_full.tsv --test_path ../datasets/coveo_ecommerce\coveo_processed_view_test.tsv --checkpoint_dir ../trained_models/gru4rec_tensorflow_minorfix --layer 1 --size 512 --epoch 5 --lr 0.03 --hidden_act tanh --final_act softmax --loss cross-entropy --dropout 0.15 --batch_size 32 --initial_accumulator_value 1e-12 --decay 1.0 --train 0 --test 4 --m 1 5 10 20


### Train the minor fix model

In [13]:
# Run the minor-fix training command under the emissions tracker;
# the call's return value is displayed as the cell output.
track_training_C02_emissions(train_script_minor, "gru4rec_tensorflow_minorfix", "coveo")

[codecarbon INFO @ 18:32:47] [setup] RAM Tracking...
[codecarbon INFO @ 18:32:47] [setup] GPU Tracking...
[codecarbon INFO @ 18:32:47] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 18:32:48] [setup] CPU Tracking...
[codecarbon INFO @ 18:32:50] CPU Model on constant consumption mode: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 18:32:50] >>> Tracker's metadata:
[codecarbon INFO @ 18:32:50]   Platform system: Windows-11-10.0.22631-SP0
[codecarbon INFO @ 18:32:50]   Python version: 3.12.3
[codecarbon INFO @ 18:32:50]   CodeCarbon version: 2.4.2
[codecarbon INFO @ 18:32:50]   Available RAM : 31.746 GB
[codecarbon INFO @ 18:32:50]   CPU count: 32
[codecarbon INFO @ 18:32:50]   CPU model: 13th Gen Intel(R) Core(TM) i9-13900HX
[codecarbon INFO @ 18:32:50]   GPU count: 1
[codecarbon INFO @ 18:32:50]   GPU model: 1 x NVIDIA GeForce RTX 4090 Laptop GPU


CarbonTracker: The following components were found: GPU with device(s) NVIDIA GeForce RTX 4090 Laptop GPU.
CarbonTracker: The following components were found: GPU with device(s) NVIDIA GeForce RTX 4090 Laptop GPU.
The training of models in this work is estimated to use 0.000 kWh of electricity contributing to 0.000 kg of CO2eq. Measured by carbontracker (https://github.com/lfwa/carbontracker).


[codecarbon INFO @ 18:33:09] Energy consumed for RAM : 0.000051 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:33:09] Energy consumed for all GPUs : 0.000043 kWh. Total GPU Power : 9.992181239419622 W
[codecarbon INFO @ 18:33:09] Energy consumed for all CPUs : 0.000187 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:33:09] 0.000281 kWh of electricity used since the beginning.
[codecarbon INFO @ 18:33:24] Energy consumed for RAM : 0.000099 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:33:24] Energy consumed for all GPUs : 0.000071 kWh. Total GPU Power : 6.953635775127584 W
[codecarbon INFO @ 18:33:24] Energy consumed for all CPUs : 0.000364 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 18:33:24] 0.000534 kWh of electricity used since the beginning.
[codecarbon INFO @ 18:33:39] Energy consumed for RAM : 0.000147 kWh. RAM Power : 11.904736518859863 W
[codecarbon INFO @ 18:33:39] Energy consumed for all GPUs : 0.000108 kWh. Total GPU Power : 9.196471982601

CarbonTracker: Average carbon intensity during training was 374.46 gCO2/kWh at detected location: Santiago, Santiago Metropolitan, CL.
CarbonTracker: Average carbon intensity during training was 374.46 gCO2/kWh at detected location: Santiago, Santiago Metropolitan, CL.
CarbonTracker: 
Actual consumption for 1 epoch(s):
	Time:	0:06:37
	Energy:	0.093370648305 kWh
	CO2eq:	34.963874551308 g
	This is equivalent to:
	0.325245344663 km travelled by car
CarbonTracker: 
Actual consumption for 1 epoch(s):
	Time:	0:06:37
	Energy:	0.093370648305 kWh
	CO2eq:	34.963874551308 g
	This is equivalent to:
	0.325245344663 km travelled by car
CarbonTracker: Live carbon intensity could not be fetched at detected location: Santiago, Santiago Metropolitan, CL. Defaulted to average carbon intensity for CL in 2021 of 374.46 gCO2/kWh. at detected location: Santiago, Santiago Metropolitan, CL.
CarbonTracker: Live carbon intensity could not be fetched at detected location: Santiago, Santiago Metropolitan, CL. Defa

0.0022904249022593446

### Test the minor fix model

In [14]:
# Run the minor-fix evaluation command through the system shell;
# the cell displays the status value returned by os.system.
os.system(test_script_minor)

0