In [2]:
import sys
sys.path.append("../")

In [3]:
import logging
import os
import threadpoolctl
import torch
from pathlib import Path
from lightning_fabric import seed_everything

import numpy as np
import pandas as pd
from rectools import Columns

from implicit.als import AlternatingLeastSquares

from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models import ImplicitALSWrapperModel
from rectools.models import SASRecModel
from rectools.models.nn.item_net import CatFeaturesItemNet, IdEmbeddingsItemNet

In [4]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# For implicit ALS
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
logger = logging.getLogger()

# Data

In [4]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# !unzip -o data_original.zip
# !rm data_original.zip

In [5]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

# Split dataset

In [6]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
test_users_sasrec = test[Columns.User].unique()
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test[Columns.User].unique()


In [48]:
items = pd.read_csv(DATA_PATH / 'items.csv')

In [49]:
# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

items["director"] = items["directors"].str.lower().str.replace(" ", "", regex=False).replace(", ", ",", regex=False).str.split(",")
directors_feature = items[["item_id", "director"]].explode("director")
directors_feature.columns = ["id", "value"]
directors_feature["feature"] = "director"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features_genre_content = pd.concat((genre_feature, content_feature))
item_features_genre_director = pd.concat((genre_feature, directors_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)

catalog=train[Columns.Item].unique()

In [9]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_item_features = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features_genre_content,
    cat_item_features=["genre", "content_type"],
)

dataset_item_features_genre_director = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features_genre_director,
    cat_item_features=["genre", "director"],
)

In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

# list with metrics results of all models
features_results = []

# SASRec

In [11]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

## Softmax loss

In [13]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [14]:
%%time
model.fit(dataset_no_features)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.2 M  | train
-----------------------------------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.991     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 6min 12s, sys: 7.72 s, total: 6min 20s
Wall time: 6min 9s


<rectools.models.nn.sasrec.SASRecModel at 0x7f4f9d4b6940>

In [15]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 27s, sys: 5.38 s, total: 2min 32s
Wall time: 19.2 s


In [16]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "softmax"
features_results.append(metric_values)


In [17]:
features_results

[{'MAP@1': 0.04846577699474078,
  'MAP@5': 0.0816953145406517,
  'MAP@10': 0.09070442769366964,
  'MIUF@1': 3.871426206344739,
  'MIUF@5': 4.573068555853547,
  'MIUF@10': 5.159742458558834,
  'Serendipity@1': 0.001116687417059873,
  'Serendipity@5': 0.0008645696959881002,
  'Serendipity@10': 0.0007632648657992071,
  'model': 'softmax'}]

In [18]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,73446,9728,2.401881,1
1,73446,7793,1.923069,2
2,73446,3784,1.824613,3
3,73446,3182,1.666528,4
4,73446,7829,1.662176,5
...,...,...,...,...
947045,857162,12995,2.385432,6
947046,857162,6809,2.360935,7
947047,857162,657,1.940931,8
947048,857162,4702,1.866479,9


## BCE loss

In [20]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

In [21]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    loss="BCE",
    n_negatives=2,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [22]:
%%time
model.fit(dataset_no_features)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.2 M  | train
-----------------------------------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.991     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 5min 41s, sys: 7.63 s, total: 5min 48s
Wall time: 5min 33s


<rectools.models.nn.sasrec.SASRecModel at 0x7f5066f3f400>

In [23]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn",
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 53s, sys: 6.47 s, total: 2min 59s
Wall time: 21.1 s


In [24]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "bce"
features_results.append(metric_values)

In [25]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,73446,3182,3.370286,1
1,73446,12965,3.088001,2
2,73446,6774,3.056905,3
3,73446,16270,2.966968,4
4,73446,7582,2.965708,5
...,...,...,...,...
947045,857162,4151,2.733006,6
947046,857162,142,2.687315,7
947047,857162,9728,2.634741,8
947048,857162,3734,2.558933,9


## gBCE loss

In [27]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

In [28]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    loss="gBCE",
    n_negatives=256,
    gbce_t=0.75,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [29]:
%%time
model.fit(dataset_no_features)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.2 M  | train
-----------------------------------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.991     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 1h 57min 29s, sys: 32.5 s, total: 1h 58min 1s
Wall time: 10min 24s


<rectools.models.nn.sasrec.SASRecModel at 0x7f4e6bd44400>

In [30]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)


                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 34s, sys: 4.91 s, total: 2min 39s
Wall time: 18.6 s


In [31]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "gBCE"
features_results.append(metric_values)

## Softmax with key_padding_mask + causal_mask

In [34]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    use_key_padding_mask=True,
)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [37]:
%%time
model.fit(dataset_no_features)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.2 M  | train
-----------------------------------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.991     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [None]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/data/home/amsemenov2/git/RecTools_origin/RecTools/.venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 25.6 s, sys: 3.04 s, total: 28.6 s
Wall time: 19.8 s


In [None]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)

In [None]:
metric_values["model"] = "softmax_padding_mask"
features_results.append(metric_values)

In [36]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
softmax,0.048466,0.081695,0.090704,3.871426,4.573069,5.159742,0.001117,0.000865,0.000763
gBCE,0.040848,0.072356,0.080166,2.332397,3.093763,3.942205,0.000103,0.000118,0.000134
bce,0.027035,0.051244,0.05908,3.882081,4.384314,4.734298,0.000104,0.000121,0.000131


### sasrec with item ids embeddings in ItemNetBlock

In [38]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [39]:
%%time
model.fit(dataset_no_features)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.2 M  | train
-----------------------------------------------------------------------
2.2 M     Trainable params
0         Non-trainable params
2.2 M     Total params
8.991     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


CPU times: user 6min 18s, sys: 7.6 s, total: 6min 26s
Wall time: 6min 11s


<rectools.models.nn.sasrec.SASRecModel at 0x7f4dd865b310>

In [40]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_no_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 35s, sys: 4.93 s, total: 2min 40s
Wall time: 23.4 s


In [41]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_ids"
features_results.append(metric_values)

### sasrec with item ids and category features embeddings in ItemNetBlock

In [43]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, CatFeaturesItemNet),  # Use item ids and cat features in ItemNetBlock
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [44]:
#%%time
model.fit(dataset_item_features_genre_director)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 3.4 M  | train
-----------------------------------------------------------------------
3.4 M     Trainable params
0         Non-trainable params
3.4 M     Total params
13.621    Total estimated model params size (MB)
39        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


<rectools.models.nn.sasrec.SASRecModel at 0x7f4db089cbb0>

In [45]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_item_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 37s, sys: 8.99 s, total: 2min 46s
Wall time: 21.8 s


In [50]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_id_and_cat_features"
features_results.append(metric_values)

### sasrec with category item features embeddings in ItemNetBlock

In [52]:
model = SASRecModel(
    n_blocks=2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    deterministic=True,
    item_net_block_types=(CatFeaturesItemNet, ),  # Use only cat item features in ItemNetBlock
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [53]:
#%%time
model.fit(dataset_item_features_genre_director)

  unq_values = pd.unique(values)
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params | Mode 
-----------------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 2.0 M  | train
-----------------------------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.832     Total estimated model params size (MB)
36        Modules in train mode
0         Modules in eval mode
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `nu

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


<rectools.models.nn.sasrec.SASRecModel at 0x7f4f9e7f72e0>

In [54]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_item_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.nn.sasrec.SASRecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/home/maspirina1/git_repos/RecTools/venv/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

CPU times: user 2min 31s, sys: 8.96 s, total: 2min 40s
Wall time: 18.7 s


In [55]:
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "sasrec_cat_features"
features_results.append(metric_values)

In [56]:
features_results

[{'MAP@1': 0.04846577699474078,
  'MAP@5': 0.0816953145406517,
  'MAP@10': 0.09070442769366964,
  'MIUF@1': 3.871426206344739,
  'MIUF@5': 4.573068555853547,
  'MIUF@10': 5.159742458558834,
  'Serendipity@1': 0.001116687417059873,
  'Serendipity@5': 0.0008645696959881002,
  'Serendipity@10': 0.0007632648657992071,
  'model': 'softmax'},
 {'MAP@1': 0.02703450310364319,
  'MAP@5': 0.05124396949349954,
  'MAP@10': 0.05907958022653049,
  'MIUF@1': 3.882081042459438,
  'MIUF@5': 4.384313936251787,
  'MIUF@10': 4.734298278984563,
  'Serendipity@1': 0.00010437879417622002,
  'Serendipity@5': 0.0001209341551851975,
  'Serendipity@10': 0.0001308852660453074,
  'model': 'bce'},
 {'MAP@1': 0.04084812884382748,
  'MAP@5': 0.07235604259743772,
  'MAP@10': 0.08016616686270196,
  'MIUF@1': 2.33239724771057,
  'MIUF@5': 3.093763291371006,
  'MIUF@10': 3.9422054591506033,
  'Serendipity@1': 0.00010303205538126172,
  'Serendipity@5': 0.00011795153034776448,
  'Serendipity@10': 0.00013442022189753792,
  

In [57]:
features_df = (
    pd.DataFrame(features_results)
    .set_index("model")
    .sort_values(by=["MAP@10", "Serendipity@10"], ascending=False)
)
features_df

Unnamed: 0_level_0,MAP@1,MAP@5,MAP@10,MIUF@1,MIUF@5,MIUF@10,Serendipity@1,Serendipity@5,Serendipity@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
sasrec_id_and_cat_features,0.047662,0.082258,0.091474,3.941583,4.572004,5.181503,0.00126,0.000932,0.000823
sasrec_ids,0.048148,0.081748,0.090828,18.82462,18.82462,18.82462,0.099308,0.060183,0.044193
softmax,0.048466,0.081695,0.090704,3.871426,4.573069,5.159742,0.001117,0.000865,0.000763
gBCE,0.040848,0.072356,0.080166,2.332397,3.093763,3.942205,0.000103,0.000118,0.000134
sasrec_cat_features,0.043106,0.070367,0.078227,4.184666,5.596602,6.135927,0.001027,0.000888,0.000773
bce,0.027035,0.051244,0.05908,3.882081,4.384314,4.734298,0.000104,0.000121,0.000131


### Item to item

In [58]:
target_items = [13865, 4457, 15297]

In [59]:
%%time
recos = model.recommend_to_items(
    target_items=target_items, 
    dataset=dataset_no_features,
    k=10,
    filter_itself=True,
    items_to_recommend=None, #white_list,
)

CPU times: user 3.14 s, sys: 4.21 s, total: 7.35 s
Wall time: 1.15 s


In [60]:
recos

Unnamed: 0,target_item_id,item_id,score,rank
0,13865,11863,1.0,1
1,13865,7107,1.0,2
2,13865,6409,0.628877,3
3,13865,142,0.55963,4
4,13865,2657,0.514484,5
5,13865,4457,0.503537,6
6,13865,15297,0.500209,7
7,13865,6809,0.487185,8
8,13865,10772,0.485932,9
9,13865,10440,0.47383,10


In [61]:
# TODO: change model for recos (here is the last one trained and is is the worst in quality)
recos.merge(items[["item_id", "title"]], on="item_id")

Unnamed: 0,target_item_id,item_id,score,rank,title
0,13865,11863,1.0,1,Девятаев - сериал
1,13865,7107,1.0,2,Девятаев
2,13865,6409,0.628877,3,Особо опасен
3,13865,142,0.55963,4,Маша
4,13865,2657,0.514484,5,Подслушано
5,13865,4457,0.503537,6,2067: Петля времени
6,13865,15297,0.500209,7,Клиника счастья
7,13865,6809,0.487185,8,Дуров
8,13865,10772,0.485932,9,Зелёная книга
9,13865,10440,0.47383,10,Хрустальный
