In [2]:
import logging
import os
import threadpoolctl
import torch
from pathlib import Path
from lightning_fabric import seed_everything

import numpy as np
import pandas as pd
from rectools import Columns

from implicit.als import AlternatingLeastSquares

from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel
from rectools.models import SASRecModel
from rectools.models.nn.item_net import CatFeaturesItemNet, IdEmbeddingsItemNet

In [3]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()

# Data

In [4]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# !unzip -o data_original.zip
# !rm data_original.zip

In [5]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [6]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
test_users_sasrec = test[Columns.User].unique()
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test[Columns.User].unique()


In [7]:
items = pd.read_csv(DATA_PATH / 'items.csv')

In [8]:
# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)

catalog=train[Columns.Item].unique()

In [9]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_item_features = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

# list with metrics results of all models
features_results = []

# SASRec

In [11]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

## Softmax loss

In [12]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    recommend_device="cuda",
)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 5min 37s, sys: 9.52 s, total: 5min 47s
Wall time: 5min 33s


<rectools.models.nn.sasrec.SASRecModel at 0x7f0a06d874f0>

In [14]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 26.7 s, sys: 4.77 s, total: 31.5 s
Wall time: 22 s


In [15]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "softmax"
features_results.append(metric_values)


In [16]:
features_results

[{'MAP@1': 0.04826695924739594,
  'MAP@5': 0.08188109538342359,
  'MAP@10': 0.09111905060251475,
  'MIUF@1': 3.6361608221537636,
  'MIUF@5': 4.3807416255519165,
  'MIUF@10': 5.026356912227157,
  'Serendipity@1': 0.0008844170021458072,
  'Serendipity@5': 0.0007337182385033877,
  'Serendipity@10': 0.0006787460793393714,
  'model': 'softmax'}]

In [17]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,73446,9728,2.312602,1
1,73446,7793,2.170433,2
2,73446,7829,1.772398,3
3,73446,5434,1.463491,4
4,73446,3182,1.446990,5
...,...,...,...,...
947045,857162,3734,2.158359,6
947046,857162,4151,2.016740,7
947047,857162,2657,1.590997,8
947048,857162,9996,1.377985,9


## BCE loss

In [19]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

In [20]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    loss="BCE",
    n_negatives=2,
    item_net_block_types=(IdEmbeddingsItemNet, )  # Use only item ids in ItemNetBlock
)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [21]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 5min 32s, sys: 9.23 s, total: 5min 41s
Wall time: 5min 29s


<rectools.models.nn.sasrec.SASRecModel at 0x7f0a96ee6550>

In [22]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn",
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 27.8 s, sys: 3.64 s, total: 31.4 s
Wall time: 22.7 s


In [23]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "bce"
features_results.append(metric_values)

In [24]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,73446,7793,4.407569,1
1,73446,7829,4.235948,2
2,73446,9728,3.968448,3
3,73446,5434,3.567275,4
4,73446,3784,3.505455,5
...,...,...,...,...
947045,857162,3734,2.976863,6
947046,857162,12995,2.927455,7
947047,857162,9996,2.644526,8
947048,857162,4436,2.476946,9


## gBCE loss

In [26]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

In [27]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    loss="gBCE",
    n_negatives=256,
    gbce_t=0.75,
    item_net_block_types=(IdEmbeddingsItemNet, )  # Use only item ids in ItemNetBlock
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 2h 22min 18s, sys: 40.7 s, total: 2h 22min 59s
Wall time: 10min 59s


<rectools.models.nn.sasrec.SASRecModel at 0x7f0a381e7550>

In [29]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)


                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 29 s, sys: 3.6 s, total: 32.6 s
Wall time: 22.5 s


In [30]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "gBCE"
features_results.append(metric_values)

## Softmax with key_padding_mask + causal_mask

In [32]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    recommend_device="cuda",
    use_key_padding_mask=True,
)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [33]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 5min 25s, sys: 11 s, total: 5min 36s
Wall time: 5min 17s


<rectools.models.nn.sasrec.SASRecModel at 0x7f089ccb2a30>

In [34]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 28.7 s, sys: 4.43 s, total: 33.2 s
Wall time: 23.9 s


In [35]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)

In [36]:
metric_values["model"] = "softmax_padding_mask"
features_results.append(metric_values)

In [37]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
softmax,0.048267,0.081881,0.091119,3.636161,4.380742,5.026357,0.000884,0.000734,0.000679
gBCE,0.046643,0.081198,0.090133,3.21193,3.868336,4.564376,0.00058,0.000526,0.000512
softmax_padding_mask,0.046936,0.080251,0.089373,4.102551,4.677506,5.163588,0.001046,0.000827,0.000748
bce,0.043669,0.074606,0.083078,3.791983,4.484389,5.035991,0.000561,0.000537,0.000554


### sasrec with item ids embeddings in ItemNetBlock

In [39]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, )  # Use only item ids in ItemNetBlock
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [40]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 9min 7s, sys: 14.6 s, total: 9min 22s
Wall time: 9min 8s


<rectools.models.nn.sasrec.SASRecModel at 0x7f085f448c10>

In [41]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 30.4 s, sys: 4.79 s, total: 35.2 s
Wall time: 24.1 s


In [42]:
# TODO: drop `apply(str)`
recos["item_id"] = recos["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_ids"
features_results.append(metric_values)

### sasrec with item ids and category features embeddings in ItemNetBlock

model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, CatFeaturesItemNet)  # Use item ids and cat features in ItemNetBlock
)

In [43]:
#%%time
model.fit(dataset_item_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 927 K 
---------------------------------------------------------------
927 K     Trainable params
0         Non-trainable params
927 K     Total params
3.709     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


<rectools.models.nn.sasrec.SASRecModel at 0x7f085f448c10>

In [44]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_item_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 28.1 s, sys: 3.92 s, total: 32 s
Wall time: 22.5 s


In [45]:
# TODO: drop `apply(str)`
recos["item_id"] = recos["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_ids_cat"
features_results.append(metric_values)

### sasrec with category item features embeddings in ItemNetBlock

In [46]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(CatFeaturesItemNet, )  # Use only cat item features in ItemNetBlock
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [47]:
#%%time
model.fit(dataset_item_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 211 K 
---------------------------------------------------------------
211 K     Trainable params
0         Non-trainable params
211 K     Total params
0.847     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


<rectools.models.nn.sasrec.SASRecModel at 0x7f07f41532b0>

In [48]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_item_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 30.7 s, sys: 5.03 s, total: 35.7 s
Wall time: 26.2 s


In [49]:
# TODO: drop `apply(str)`
recos["item_id"] = recos["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_cat"
features_results.append(metric_values)

In [50]:
features_results

[{'MAP@1': 0.04826695924739594,
  'MAP@5': 0.08188109538342359,
  'MAP@10': 0.09111905060251475,
  'MIUF@1': 3.6361608221537636,
  'MIUF@5': 4.3807416255519165,
  'MIUF@10': 5.026356912227157,
  'Serendipity@1': 0.0008844170021458072,
  'Serendipity@5': 0.0007337182385033877,
  'Serendipity@10': 0.0006787460793393714,
  'model': 'softmax'},
 {'MAP@1': 0.04366861238832978,
  'MAP@5': 0.07460647524917209,
  'MAP@10': 0.08307848040450251,
  'MIUF@1': 3.791982822943762,
  'MIUF@5': 4.484389444400757,
  'MIUF@10': 5.0359912497143915,
  'Serendipity@1': 0.0005614126292513184,
  'Serendipity@5': 0.0005366042437046948,
  'Serendipity@10': 0.0005543113372254213,
  'model': 'bce'},
 {'MAP@1': 0.04664341232598377,
  'MAP@5': 0.08119792078428736,
  'MAP@10': 0.0901325286557952,
  'MIUF@1': 3.2119302462140364,
  'MIUF@5': 3.868336197031158,
  'MIUF@10': 4.564376052893403,
  'Serendipity@1': 0.0005803230086690105,
  'Serendipity@5': 0.0005258138960338016,
  'Serendipity@10': 0.000511724878859585,
  

### Item to item

In [51]:
target_items = [13865, 4457, 15297]

In [52]:
%%time
recos = model.recommend_to_items(
    target_items=target_items, 
    dataset=dataset_no_features,
    k=10,
    filter_itself=True,
    items_to_recommend=None, #white_list,
)

CPU times: user 1.27 s, sys: 303 ms, total: 1.57 s
Wall time: 1.24 s


In [53]:
recos

Unnamed: 0,target_item_id,item_id,score,rank
0,13865,15648,1.0,1
1,13865,3386,1.0,2
2,13865,147,0.891782,3
3,13865,16194,0.891782,4
4,13865,12309,0.891782,5
5,13865,12586,0.891782,6
6,13865,6661,0.891782,7
7,13865,2255,0.891782,8
8,13865,3792,0.891782,9
9,13865,4130,0.891782,10


In [54]:
# TODO: change model for recos (here is the last one trained and is is the worst in quality)
recos.merge(items[["item_id", "title"]], on="item_id")

Unnamed: 0,target_item_id,item_id,score,rank,title
0,13865,15648,1.0,1,Черное золото
1,13865,3386,1.0,2,Спартак
2,13865,147,0.891782,3,Единичка
3,13865,16194,0.891782,4,Голубая линия
4,13865,12309,0.891782,5,Враг у ворот
5,13865,12586,0.891782,6,Вспоминая 1942
6,13865,6661,0.891782,7,Солдатик
7,13865,2255,0.891782,8,Пленный
8,13865,3792,0.891782,9,Собибор
9,13865,4130,0.891782,10,Пустота
