In [1]:
import sys
sys.path.append("/data/home/maspirina1/tasks/repo/RecTools/")

In [2]:
import os
import torch
import threadpoolctl
from pathlib import Path
from lightning_fabric import seed_everything

import numpy as np
import pandas as pd
from rectools import Columns


from rectools.dataset import Dataset
from rectools.metrics import MAP, calc_metrics, MeanInvUserFreq, Serendipity
from rectools.models.bert4rec import CatFeaturesItemNet, IdEmbeddingsItemNet, BERT4RecModel

In [3]:
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
threadpoolctl.threadpool_limits(1, "blas")

<threadpoolctl.threadpool_limits at 0x7f0b2806fd60>

# Prepare data

In [4]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# !unzip -o data_original.zip
# !rm data_original.zip

In [5]:
DATA_PATH = Path("data_original")

interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": "datetime"})
)

In [6]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

# Split to train / test
max_date = interactions[Columns.Datetime].max()
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()
train.drop(train.query("total_dur < 300").index, inplace=True)

# drop items with less than 20 interactions in train
items = train["item_id"].value_counts()
items = items[items >= 20]
items = items.index.to_list()
train = train[train["item_id"].isin(items)]
    
# drop users with less than 2 interactions in train
users = train["user_id"].value_counts()
users = users[users >= 2]
users = users.index.to_list()
train = train[(train["user_id"].isin(users))]

users = train["user_id"].drop_duplicates().to_list()

# drop cold users from test
test_users_sasrec = test[Columns.User].unique()
cold_users = set(test[Columns.User]) - set(train[Columns.User])
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)
test_users = test[Columns.User].unique()


In [7]:
items = pd.read_csv(DATA_PATH / 'items.csv')

In [8]:
# Process item features to the form of a flatten dataframe
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

candidate_items = interactions['item_id'].drop_duplicates().astype(int)
test["user_id"] = test["user_id"].astype(int)
test["item_id"] = test["item_id"].astype(int)

catalog=train[Columns.Item].unique()

In [9]:
dataset_no_features = Dataset.construct(
    interactions_df=train,
)

dataset_item_features = Dataset.construct(
    interactions_df=train,
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [10]:
metrics_name = {
    'MAP': MAP,
    'MIUF': MeanInvUserFreq,
    'Serendipity': Serendipity
    

}
metrics = {}
for metric_name, metric in metrics_name.items():
    for k in (1, 5, 10):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

# list with metrics results of all models
features_results = []


# BERT4Rec

In [11]:
RANDOM_SEED = 32
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_SEED, workers=True)

Seed set to 32


32

### BERT4Rec with item ids embeddings in ItemNetBlock

In [20]:
model = BERT4RecModel(
    n_blocks=3,
    n_heads=4,
    dropout_rate=0.2,
    session_max_len=32,
    lr=1e-3,
    epochs=5,
    verbose=1,
    mask_prob=0.5,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),
)

Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [21]:
%%time
model.fit(dataset_no_features)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 1.3 M 
---------------------------------------------------------------
1.3 M     Trainable params
0         Non-trainable params
1.3 M     Total params
5.291     Total estimated model params size (MB)
/data/home/maspirina1/tasks/repo/RecTools/.venv/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=143` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 

In [None]:
%%time
recos = model.recommend(
    users=test_users_sasrec, 
    dataset=dataset_item_features,
    k=10,
    filter_viewed=True,
    on_unsupported_targets="warn"
)

                Model `<class 'rectools.models.bert4rec.BERT4RecModel'>` doesn't support recommendations for cold users,
                but some of given users are cold: they are not in the `dataset.user_id_map`
            
100%|██████████| 740/740 [00:15<00:00, 49.03it/s]


CPU times: user 17min 5s, sys: 46.6 s, total: 17min 52s
Wall time: 34.8 s


In [None]:
del interactions
del model
torch.cuda.empty_cache()

In [None]:
recos["item_id"] = recos["item_id"].apply(str)
test["item_id"] = test["item_id"].astype(str)
metric_values = calc_metrics(metrics, recos[["user_id", "item_id", "rank"]], test, train, catalog)
metric_values["model"] = "bert4rec_ids"
features_results.append(metric_values)

In [None]:
# major recommend
recos.sort_values(["user_id", "rank"])

Unnamed: 0,user_id,item_id,score,rank
575550,3,13865,0.641595,1
575551,3,15297,0.240784,2
575552,3,4495,0.072106,3
575553,3,7829,0.046310,4
575554,3,7102,-0.149929,5
...,...,...,...,...
224955,1097544,7102,-0.363039,6
224956,1097544,4151,-0.399935,7
224957,1097544,7793,-0.440667,8
224958,1097544,4457,-0.652825,9


With timeline mask in the end of the block, with attention mask

In [17]:
features_results

[{'MAP@1': 0.03386095770656615,
  'MAP@5': 0.059875092311754766,
  'MAP@10': 0.06626564554123239,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.06777889234992873,
  'Serendipity@5': 0.04409114066936074,
  'Serendipity@10': 0.031205145274404236,
  'model': 'bert4rec_ids'}]

Without timeline mask, with attention mask

In [17]:
features_results

[{'MAP@1': 0.031715044770102244,
  'MAP@5': 0.058107653322795036,
  'MAP@10': 0.06400667270068171,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.0633651866321736,
  'Serendipity@5': 0.04325255649454838,
  'Serendipity@10': 0.030283831925392017,
  'model': 'bert4rec_ids'}]

With timeline mask in the end of the block, whithout attention mask

In [17]:
features_results

[{'MAP@1': 0.03521807589657321,
  'MAP@5': 0.0635501105108066,
  'MAP@10': 0.07042686574268418,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.07181247030251835,
  'Serendipity@5': 0.048066313492978796,
  'Serendipity@10': 0.03423476251676267,
  'model': 'bert4rec_ids'}]

With timeline mask, whithout attention mask, 5 and 7 epochs

In [24]:
features_results

[{'MAP@1': 0.03521807589657321,
  'MAP@5': 0.0635501105108066,
  'MAP@10': 0.07042686574268418,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.07181247030251835,
  'Serendipity@5': 0.048066313492978796,
  'Serendipity@10': 0.03423476251676267,
  'model': 'bert4rec_ids'},
 {'MAP@1': 0.03613885396129421,
  'MAP@5': 0.0626756506459862,
  'MAP@10': 0.06914741192133474,
  'MIUF@1': 18.824620072061013,
  'MIUF@5': 18.824620072061013,
  'MIUF@10': 18.824620072061013,
  'Serendipity@1': 0.07439945092656143,
  'Serendipity@5': 0.04685051880216868,
  'Serendipity@10': 0.033803948060973046,
  'model': 'bert4rec_ids'}]