In [2]:
# TODO: will remove
import sys
sys.path.append("../../")

In [3]:
import numpy as np
import os
import pandas as pd
import itertools
import torch
import typing as tp
import warnings
from collections import Counter
from pathlib import Path
from functools import partial

from lightning_fabric import seed_everything
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from rectools import Columns, ExternalIds
from rectools.dataset import Dataset
from rectools.metrics import NDCG, Recall, Serendipity, calc_metrics

from rectools.models import BERT4RecModel, SASRecModel
from rectools.models.nn.item_net import IdEmbeddingsItemNet
from rectools.models.nn.transformer_base import TransformerModelBase

# Enable deterministic behaviour with CUDA >= 10.2
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
warnings.simplefilter("ignore", UserWarning)

# Load data

In [4]:
# %%time
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_en.zip -O data_en.zip
# !unzip -o data_en.zip
# !rm data_en.zip

In [5]:
# Download dataset
DATA_PATH = Path("./data_en")
items = pd.read_csv(DATA_PATH / 'items_en.csv', index_col=0)
interactions = (
    pd.read_csv(DATA_PATH / 'interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={"last_watch_dt": Columns.Datetime})
)

# TODO: for test
unique_users = interactions[Columns.User].unique()[: 2_000]
interactions = interactions[interactions[Columns.User].isin(unique_users)]

print(interactions.shape)
interactions.head(2)

(52318, 5)


Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0


In [6]:
interactions[Columns.User].nunique(), interactions[Columns.Item].nunique()

(2000, 6179)

In [7]:
# Process interactions
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)
raw_interactions = interactions[["user_id", "item_id", "datetime", "weight"]]
print(raw_interactions.shape)
raw_interactions.head(2)

(52318, 4)


Unnamed: 0,user_id,item_id,datetime,weight
0,176549,9506,2021-05-11,3
1,699317,1659,2021-05-29,3


In [8]:
# Process item features
# items = items.loc[items[Columns.Item].isin(raw_interactions[Columns.Item])].copy()
# items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
# genre_feature = items[["item_id", "genre"]].explode("genre")
# genre_feature.columns = ["id", "value"]
# genre_feature["feature"] = "genre"
# content_feature = items.reindex(columns=[Columns.Item, "content_type"])
# content_feature.columns = ["id", "value"]
# content_feature["feature"] = "content_type"
# item_features = pd.concat((genre_feature, content_feature))

In [87]:
RANDOM_STATE=60
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_STATE, workers=True)

Seed set to 60


60

In [10]:
dataset_no_features = Dataset.construct(raw_interactions)
dataset_no_features

Dataset(user_id_map=IdMap(external_ids=array([ 176549,  699317,  656683, ...,  465028, 1055957,  761297])), item_id_map=IdMap(external_ids=array([ 9506,  1659,  7107, ...,  9107, 13252, 13105])), interactions=Interactions(df=       user_id  item_id  weight   datetime
0            0        0     3.0 2021-05-11
1            1        1     3.0 2021-05-29
2            2        2     1.0 2021-05-09
3            3        3     3.0 2021-07-05
4            4        0     3.0 2021-04-30
...        ...      ...     ...        ...
52313      754     3450     1.0 2021-03-23
52314       73     1988     3.0 2021-07-16
52315     1365     1921     3.0 2021-04-24
52316      452     1456     3.0 2021-04-30
52317      884     1610     3.0 2021-03-23

[52318 rows x 4 columns]), user_features=None, item_features=None)

In [11]:
def get_log_dir(model: TransformerModelBase) -> Path:
    """
    Get logging directory.
    """
    path = model.fit_trainer.log_dir
    return Path(path) / "metrics.csv"


def get_losses(epoch_metrics_df: pd.DataFrame, is_val: bool) -> pd.DataFrame:
    loss_df = epoch_metrics_df[["epoch", "train/loss"]].dropna()
    if is_val:
        val_loss_df = epoch_metrics_df[["epoch", "val/loss"]].dropna()
        loss_df = pd.merge(loss_df, val_loss_df, how="inner", on="epoch")
    return loss_df.reset_index(drop=True)


def get_val_metrics(epoch_metrics_df: pd.DataFrame) -> pd.DataFrame:
    metrics_df = epoch_metrics_df.drop(columns=["train/loss", "val/loss"]).dropna()
    return metrics_df.reset_index(drop=True)


def get_log_values(model: TransformerModelBase, is_val: bool = False) -> tp.Tuple[pd.DataFrame, tp.Optional[pd.DataFrame]]:
    log_path = get_log_dir(model)
    epoch_metrics_df = pd.read_csv(log_path)

    loss_df = get_losses(epoch_metrics_df, is_val)
    val_metrics = None
    if is_val:
        val_metrics = get_val_metrics(epoch_metrics_df)
    return loss_df, val_metrics

# **Training Objective**

https://arxiv.org/pdf/2205.04507

## **Next Action**

**Data Preparator**

In [202]:
from typing import Dict, List, Tuple

from rectools.models.nn.transformer_data_preparator import SessionEncoderDataPreparatorBase


class NextItemDataPreparator(SessionEncoderDataPreparatorBase):
    """Data preparator for SASRecModel."""

    train_session_max_len_addition: int = 1

    def _collate_fn_train(
        self,
        batch: List[Tuple[List[int], List[float]]],
    ) -> Dict[str, torch.Tensor]:
        """
        Truncate each session from right to keep `session_max_len` items.
        Do left padding until `session_max_len` is reached.
        Split to `x`, `y`, and `yw`.
        """
        batch_size = len(batch)
        x = np.zeros((batch_size, self.session_max_len))
        y = np.zeros((batch_size, 1))
        yw = np.zeros((batch_size, 1))
        for i, (ses, ses_weights) in enumerate(batch):
            x[i, -len(ses) + 1 :] = ses[:-1]  # ses: [session_len] -> x[i]: [session_max_len]
            y[i] = ses[-1]  # ses: [session_len] -> y[i]: [1]
            yw[i] = ses_weights[-1]  # ses_weights: [session_len] -> yw[i]: [1]

        batch_dict = {"x": torch.LongTensor(x), "y": torch.LongTensor(y), "yw": torch.FloatTensor(yw)}
        if self.n_negatives is not None:
            negatives = torch.randint(
                low=self.n_item_extra_tokens,
                high=self.item_id_map.size,
                size=(batch_size, 1, self.n_negatives),
            )  # [batch_size, 1, n_negatives]
            batch_dict["negatives"] = negatives
        return batch_dict


In [203]:
from rectools.models.nn.transformer_base import SessionEncoderLightningModule


class NextItemSessionEncoder(SessionEncoderLightningModule):

    def training_step(self, batch: tp.Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Training step."""
        x, y, w = batch["x"], batch["y"], batch["yw"]
        if self.loss == "softmax":
            logits = self._get_full_catalog_logits(x)[:, -1: :]
            loss = self._calc_softmax_loss(logits, y, w)
        elif self.loss == "BCE":
            negatives = batch["negatives"]
            logits = self._get_pos_neg_logits(x, y, negatives)[:, -1: :]
            loss = self._calc_bce_loss(logits, y, w)
        elif self.loss == "gBCE":
            negatives = batch["negatives"]
            logits = self._get_pos_neg_logits(x, y, negatives)[:, -1: :]
            loss = self._calc_gbce_loss(logits, y, w, negatives)
        else:
            loss = self._calc_custom_loss(batch, batch_idx)

        self.log(self.train_loss_name, loss, on_step=False, on_epoch=True, prog_bar=self.verbose > 0)

        return loss 

In [204]:
from rectools.models.nn.transformer_base import TransformerModelBase

PADDING_VALUE = "PAD"


class NextItemTransformer(TransformerModelBase):
    # TODO: add to base model
    def _init_data_preparator(self) -> None:
        self.data_preparator: SessionEncoderDataPreparatorBase = self.data_preparator_type(
            session_max_len=self.session_max_len,
            n_negatives=self.n_negatives if self.loss != "softmax" else None,
            batch_size=self.batch_size,
            dataloader_num_workers=self.dataloader_num_workers,
            train_min_user_interactions=self.train_min_user_interactions,
            item_extra_tokens=(PADDING_VALUE,),
            get_val_mask_func=self.get_val_mask_func,
        )

In [205]:
MIN_EPOCHS = 3
MAX_EPOCHS = 3
TRAIN_MIN_USER_INTERACTIONS = 5
SESSION_MAX_LEN = 50

In [206]:
nextitem_trainer = Trainer(
    accelerator='gpu',
    devices=[0],
    min_epochs=MIN_EPOCHS,
    max_epochs=MAX_EPOCHS, 
    deterministic=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [207]:
nextitem_model = NextItemTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=nextitem_trainer,
    data_preparator_type=NextItemDataPreparator,
    lightning_module_type=NextItemSessionEncoder,
)

N_NEGATIVES = 5

nextaction_bce_model = NextItemTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    lr=1e-3,
    batch_size=128,
    loss="BCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=nextitem_trainer,
    data_preparator_type=NextItemDataPreparator,
    lightning_module_type=NextItemSessionEncoder,
)

nextaction_gbce_model = NextItemTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    lr=1e-3,
    batch_size=128,
    loss="gBCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=nextitem_trainer,
    data_preparator_type=NextItemDataPreparator,
    lightning_module_type=NextItemSessionEncoder,
)

In [208]:
%%time
nextitem_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.684     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.26 s, sys: 161 ms, total: 2.42 s
Wall time: 1.41 s


<__main__.NextItemTransformer at 0x7f7c204e31c0>

In [209]:
%%time
nextaction_bce_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.684     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.06 s, sys: 81.7 ms, total: 2.14 s
Wall time: 1.16 s


<__main__.NextItemTransformer at 0x7f7c1bc07790>

In [210]:
%%time
nextaction_gbce_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.684     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.24 s, sys: 143 ms, total: 2.38 s
Wall time: 1.27 s


<__main__.NextItemTransformer at 0x7f7c1bc076a0>

In [212]:
softmax_loss_df, _ = get_log_values(nextitem_model, is_val=False)
softmax_loss_df["loss_type"] = "softmax"
bce_loss_df, _ = get_log_values(nextaction_bce_model, is_val=False)
bce_loss_df["loss_type"] = "bce"å
gbce_loss_df, _ = get_log_values(nextaction_gbce_model, is_val=False)
gbce_loss_df["loss_type"] = "gbce"
pd.concat([softmax_loss_df, bce_loss_df, gbce_loss_df], axis=1)

Unnamed: 0,epoch,train/loss,loss_type,epoch.1,train/loss.1,loss_type.1,epoch.2,train/loss.2,loss_type.2
0,0,19.713676,softmax,0,1.622266,bce,0,1.490729,gbce
1,1,16.933022,softmax,1,1.48065,bce,1,1.378032,gbce
2,2,15.447347,softmax,2,1.133395,bce,2,1.052926,gbce


In [213]:
nextitem_model_with_casual_mask = NextItemTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=True,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=nextitem_trainer,
    data_preparator_type=NextItemDataPreparator,
    lightning_module_type=NextItemSessionEncoder,
)

In [214]:
%%time
nextitem_model_with_casual_mask.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.684     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.49 s, sys: 65.6 ms, total: 2.56 s
Wall time: 1.28 s


<__main__.NextItemTransformer at 0x7f7c28295e20>

In [215]:
loss_df, _ = get_log_values(nextitem_model_with_casual_mask, is_val=False)
loss_df 

Unnamed: 0,epoch,train/loss
0,0,19.471478
1,1,16.581562
2,2,15.517271


# All Action

In [110]:
batch_size = 10
session_max_len = 10
train_session_max_len_addition = 1
x = np.zeros((batch_size, session_max_len))
y = np.zeros((batch_size, session_max_len))
yw = np.zeros((batch_size, session_max_len))

In [111]:
ses = torch.randint(2, 20, size=(batch_size, session_max_len + train_session_max_len_addition))
ses

tensor([[ 7, 16, 19, 19, 12, 10,  5, 11, 12,  7, 13],
        [16, 18, 11, 16, 16, 15,  9, 19, 13,  3, 18],
        [ 7, 17, 11,  3,  2, 12, 16, 14, 17, 14, 10],
        [ 3,  6, 13, 11,  6, 14,  8,  4, 15,  4, 10],
        [11,  4, 15, 16, 11, 12,  4, 16, 13, 15, 11],
        [11,  3,  3, 13, 15, 12,  6, 19, 16,  8,  5],
        [15,  6,  5,  2, 16, 19, 12, 12,  2, 14, 18],
        [18,  8, 15,  7,  4,  4, 10,  5,  5,  5, 17],
        [ 4,  3, 19,  5, 10, 13, 10, 15,  8, 17,  6],
        [10,  9, 18, 15, 12,  5,  3, 18, 17, 15,  7]])

In [112]:
-len(ses[0]), len(ses[0][2 :-train_session_max_len_addition])

(-11, 8)

In [113]:
ses1 = ses[0]
ses1

tensor([ 7, 16, 19, 19, 12, 10,  5, 11, 12,  7, 13])

In [114]:
x[0, -len(ses1) + 1 :] = ses1[: -1]
x

array([[ 7., 16., 19., 19., 12., 10.,  5., 11., 12.,  7.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [115]:
y[0, -len(ses1) + 1 :] = ses1[1:]
y

array([[16., 19., 19., 12., 10.,  5., 11., 12.,  7., 13.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [116]:
-len(ses1) + 1 

-10

In [17]:
K_ACTIONS = 5

In [16]:
from typing import Dict, List, Tuple

from rectools.models.nn.transformer_data_preparator import SessionEncoderDataPreparatorBase


class AllActionDataPreparator(SessionEncoderDataPreparatorBase):
    """Data preparator for SASRecModel."""

    train_session_max_len_addition: int = K_ACTIONS

    def _collate_fn_train(
        self,
        batch: List[Tuple[List[int], List[float]]],
    ) -> Dict[str, torch.Tensor]:
        """
        Truncate each session from right to keep `session_max_len` items.
        Do left padding until `session_max_len` is reached.
        Split to `x`, `y`, and `yw`.
        """
        batch_size = len(batch)
        x = np.zeros((batch_size, self.session_max_len))
        y = np.zeros((batch_size, self.train_session_max_len_addition))
        yw = np.zeros((batch_size, self.train_session_max_len_addition))
        for i, (ses, ses_weights) in enumerate(batch):
            x[i, -len(ses) + self.train_session_max_len_addition :] = ses[: -self.train_session_max_len_addition]  # ses: [session_len] -> x[i]: [session_max_len]
            y[i, -self.train_session_max_len_addition :] = ses[-self.train_session_max_len_addition :]  # ses: [session_len] -> y[i]: [train_session_max_len_addition]
            yw[i, -self.train_session_max_len_addition :] = ses_weights[-self.train_session_max_len_addition :]  # ses_weights: [session_len] -> yw[i]: [train_session_max_len_addition]

        batch_dict = {"x": torch.LongTensor(x), "y": torch.LongTensor(y), "yw": torch.FloatTensor(yw)}
        if self.n_negatives is not None:
            negatives = torch.randint(
                low=self.n_item_extra_tokens,
                high=self.item_id_map.size,
                size=(batch_size, self.train_session_max_len_addition, self.n_negatives),
            )  # [batch_size, train_session_max_len_addition, n_negatives]
            batch_dict["negatives"] = negatives
        return batch_dict


NameError: name 'K_ACTIONS' is not defined

In [342]:
from rectools.models.nn.transformer_base import SessionEncoderLightningModule


class AllActionSessionEncoder(SessionEncoderLightningModule):

    def training_step(self, batch: tp.Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Training step."""
        prediction_k_actions = self.data_preparator.train_session_max_len_addition

        x, y, w = batch["x"], batch["y"], batch["yw"]
        if self.loss == "softmax":
            logits = self._get_full_catalog_logits(x)[:, -1: :]
            # [batch_size, prediction_k_actions, n_items]
            repeated_logits = logits.repeat(1, 1, prediction_k_actions, 1).squeeze()
            loss = self._calc_softmax_loss(repeated_logits, y, w)
        elif self.loss == "BCE":
            negatives = batch["negatives"]
            # [batch_size, prediction_k_actions, n_negatives + 1]
            repeated_logits = self._get_pos_neg_logits(x, y, negatives, prediction_k_actions)
            loss = self._calc_bce_loss(repeated_logits, y, w)
        elif self.loss == "gBCE":
            negatives = batch["negatives"]
            # [batch_size, prediction_k_actions, n_negatives + 1]
            repeated_logits = self._get_pos_neg_logits(x, y, negatives, prediction_k_actions)
            loss = self._calc_gbce_loss(repeated_logits, y, w, negatives)
        else:
            loss = self._calc_custom_loss(batch, batch_idx)

        self.log(self.train_loss_name, loss, on_step=False, on_epoch=True, prog_bar=self.verbose > 0)

        return loss
    
    def _get_pos_neg_logits(self, x: torch.Tensor, y: torch.Tensor, negatives: torch.Tensor, prediction_k_actions: int) -> torch.Tensor:
        # [n_items + n_item_extra_tokens, n_factors], [batch_size, session_max_len, n_factors]
        item_embs, session_embs = self.torch_model(x)
        # [batch_size, prediction_k_actions, n_factors]
        encoded_sessions = session_embs[:, -1:, :].repeat(1, 1, prediction_k_actions, 1).squeeze()
        pos_neg = torch.cat([y.unsqueeze(-1), negatives], dim=-1)  # [batch_size, prediction_k_actions, n_negatives + 1]
        pos_neg_embs = item_embs[pos_neg]  # [batch_size, session_max_len, n_negatives + 1, n_factors]
        # [batch_size, prediction_k_actions, n_negatives + 1]
        logits = (pos_neg_embs @ encoded_sessions.unsqueeze(-1)).squeeze(-1)
        return logits

In [343]:
from rectools.models.nn.transformer_base import TransformerModelBase

PADDING_VALUE = "PAD"


class AllActionPredictionTransformer(TransformerModelBase):

    def _init_data_preparator(self) -> None:
        if self.session_max_len > self.data_preparator_type.train_session_max_len_addition:
            self.data_preparator: SessionEncoderDataPreparatorBase = self.data_preparator_type(
                session_max_len=self.session_max_len,
                n_negatives=self.n_negatives if self.loss != "softmax" else None,
                batch_size=self.batch_size,
                dataloader_num_workers=self.dataloader_num_workers,
                train_min_user_interactions=self.train_min_user_interactions,
                item_extra_tokens=(PADDING_VALUE,),
                get_val_mask_func=self.get_val_mask_func,
            )
        else:
            raise ValueError("`session_max_len` must be more than `train_session_max_len_addition`")

In [344]:
MIN_EPOCHS = 3
MAX_EPOCHS = 3
TRAIN_MIN_USER_INTERACTIONS = K_ACTIONS + 5
SESSION_MAX_LEN = 50

In [345]:
allaction_trainer = Trainer(
    accelerator='gpu',
    devices=[0],
    min_epochs=MIN_EPOCHS,
    max_epochs=MAX_EPOCHS, 
    deterministic=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [346]:
allaction_model_raise = AllActionPredictionTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=K_ACTIONS,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=allaction_trainer,
    data_preparator_type=AllActionDataPreparator,
    lightning_module_type=AllActionSessionEncoder,
)

ValueError: `session_max_len` must be more than `train_session_max_len_addition`

In [347]:
allaction_model = AllActionPredictionTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=allaction_trainer,
    data_preparator_type=AllActionDataPreparator,
    lightning_module_type=AllActionSessionEncoder,
)


N_NEGATIVES = 5

allaction_bce_model = AllActionPredictionTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="BCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=nextitem_trainer,
    data_preparator_type=AllActionDataPreparator,
    lightning_module_type=AllActionSessionEncoder,
)

allaction_gbce_model = AllActionPredictionTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="gBCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=allaction_trainer,
    data_preparator_type=AllActionDataPreparator,
    lightning_module_type=AllActionSessionEncoder,
)

In [348]:
%%time
allaction_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.686     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.54 s, sys: 102 ms, total: 2.65 s
Wall time: 1.16 s


<__main__.AllActionPredictionTransformer at 0x7f7c1167a3d0>

In [349]:
%%time
allaction_bce_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.686     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 1.9 s, sys: 66.7 ms, total: 1.97 s
Wall time: 836 ms


<__main__.AllActionPredictionTransformer at 0x7f7c1167a100>

In [350]:
%%time
allaction_gbce_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.686     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 1.72 s, sys: 26.5 ms, total: 1.75 s
Wall time: 871 ms


<__main__.AllActionPredictionTransformer at 0x7f7c1167a430>

In [351]:
softmax_loss_df, _ = get_log_values(allaction_model, is_val=False)
softmax_loss_df["loss_type"] = "softmax"
bce_loss_df, _ = get_log_values(allaction_bce_model, is_val=False)
bce_loss_df["loss_type"] = "bce"
gbce_loss_df, _ = get_log_values(allaction_gbce_model, is_val=False)
gbce_loss_df["loss_type"] = "gbce"
pd.concat([softmax_loss_df, bce_loss_df, gbce_loss_df], axis=1)

Unnamed: 0,epoch,train/loss,loss_type,epoch.1,train/loss.1,loss_type.1,epoch.2,train/loss.2,loss_type.2
0,0,19.867401,softmax,0,1.606314,bce,0,1.460153,gbce
1,1,18.023518,softmax,1,1.259503,bce,1,1.073478,gbce
2,2,17.088499,softmax,2,0.901277,bce,2,0.642338,gbce


In [352]:
allaction_model_casual = AllActionPredictionTransformer(
    n_factors=64,
    n_blocks=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=True,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(IdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=allaction_trainer,
    data_preparator_type=AllActionDataPreparator,
    lightning_module_type=AllActionSessionEncoder,
)


In [353]:
%%time
allaction_model_casual.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                           | Params
---------------------------------------------------------------
0 | torch_model | TransformerBasedSessionEncoder | 421 K 
---------------------------------------------------------------
421 K     Trainable params
0         Non-trainable params
421 K     Total params
1.686     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.22 s, sys: 60.1 ms, total: 2.28 s
Wall time: 796 ms


<__main__.AllActionPredictionTransformer at 0x7f7c12c97be0>

In [354]:
loss_df, _ = get_log_values(allaction_model, is_val=False)
loss_df

Unnamed: 0,epoch,train/loss
0,0,19.867401
1,1,18.023518
2,2,17.088499


# ALBERT

**without ItemNetConstructor**

In [88]:
import torch
import typing_extensions as tpe
import torch.nn as nn

from rectools.models.nn.item_net import IdEmbeddingsItemNet, ItemNetBase, ItemNetConstructor
from rectools.models.nn.transformer_data_preparator import SessionEncoderDataPreparatorBase
from rectools.models import BERT4RecModel
from rectools.models.nn.bert4rec import (
    BERT4RecModelConfig, 
    BERT4RecDataPreparator,
    PADDING_VALUE, 
    MASKING_VALUE,
) 
from rectools.models.nn.item_net import CatFeaturesItemNet
from rectools.models.nn.transformer_base import (
    SessionEncoderLightningModule,
    SessionEncoderLightningModuleBase,
    TransformerBasedSessionEncoder
)
from rectools.models.nn.transformer_net_blocks import (
    LearnableInversePositionalEncoding,
    PreLNTransformerLayers,
    PositionalEncodingBase,
    TransformerLayersBase,
)

# TODO: add MLM Data n-gramm?


class AlBertIdEmbeddingsItemNet(IdEmbeddingsItemNet):
    
    def __init__(self, n_factors: int, n_items: int, dropout_rate: float, emb_factors: int):
        super().__init__(
            n_factors=emb_factors,
            n_items=n_items,
            dropout_rate=dropout_rate,
        )
        self.fc_proj = nn.Linear(emb_factors, n_factors)

    def forward(self, items: torch.Tensor) -> torch.Tensor:
        item_embs = self.ids_emb(items.to(self.device))
        item_proj = self.fc_proj(item_embs)
        item_proj = self.drop_layer(item_proj)
        return item_proj
    
    @classmethod
    def from_dataset(cls, dataset: Dataset, n_factors: int, dropout_rate: float, emb_factors: int) -> tpe.Self:
        n_items = dataset.item_id_map.size
        return cls(n_factors, n_items, dropout_rate, emb_factors)


class AlBertItemNetConstructor(ItemNetConstructor):

    @classmethod
    def from_dataset(
        cls,
        dataset: Dataset,
        n_factors: int,
        dropout_rate: float,
        item_net_block_types: tp.Sequence[tp.Type[ItemNetBase]],
        emb_factors: tp.Optional[int] = None,
    ) -> tpe.Self:
        """
        Construct ItemNet from RecTools dataset and from various blocks of item networks.

        Parameters
        ----------
        dataset : Dataset
            RecTools dataset.
        n_factors : int
            Latent embedding size of item embeddings.
        dropout_rate : float
            Probability of a hidden unit of item embedding to be zeroed.
        item_net_block_types : sequence of `type(ItemNetBase)`
            Sequence item network block types.
        """
        n_items = dataset.item_id_map.size

        item_net_blocks: tp.List[ItemNetBase] = []
        for item_net in item_net_block_types:
            # AlBert Embs only for Item ids.
            if emb_factors is not None:
                item_net_block = item_net.from_dataset(dataset, n_factors, dropout_rate, emb_factors)
            else:
                item_net_block = item_net.from_dataset(dataset, n_factors, dropout_rate)
            if item_net_block is not None:
                item_net_blocks.append(item_net_block)

        return cls(n_items, item_net_blocks)    
    # TODO: add `from_dataset_scheme`


class AlBERTSessionEncoder(TransformerBasedSessionEncoder):

    def __init__(
        self,
        n_blocks: int,
        n_hidden_groups: int,
        n_inner_groups: int,
        n_factors: int,
        emb_factors: int,
        n_heads: int,
        session_max_len: int,
        dropout_rate: float,
        use_pos_emb: bool = True,
        use_causal_attn: bool = True,
        use_key_padding_mask: bool = False,
        transformer_layers_type: tp.Type[TransformerLayersBase] = PreLNTransformerLayers,
        item_net_block_types: tp.Sequence[tp.Type[ItemNetBase]] = (IdEmbeddingsItemNet, CatFeaturesItemNet),
        pos_encoding_type: tp.Type[PositionalEncodingBase] = LearnableInversePositionalEncoding,
    ) -> None:
        # TODO
        # n_hidden_blocks = int(n_hidden_groups * n_inner_groups)
        # self.n_hidden_blocks = n_blocks
        # self.n_inner_groups = n_inner_groups
        # self.n_layers_per_group = n_blocks / n_hidden_groups

        super().__init__(
            n_blocks=n_blocks,
            n_factors=n_factors,
            n_heads=n_heads,
            session_max_len=session_max_len,
            dropout_rate=dropout_rate,
            use_pos_emb=use_pos_emb,
            use_causal_attn=use_causal_attn,
            use_key_padding_mask=use_key_padding_mask,
            transformer_layers_type=transformer_layers_type,
            item_net_block_types=item_net_block_types,
            pos_encoding_type=pos_encoding_type,
        )
        del self.transformer_layers

        self.transformer_layer_groups = nn.ModuleList(
            [
                transformer_layers_type(
                    # number of encoder layer (AlBERTLayers)
                    # https://github.com/huggingface/transformers/blob/main/src/transformers/models/albert/modeling_albert.py#L428
                    n_blocks=n_inner_groups,
                    n_factors=n_factors,
                    n_heads=n_heads,
                    dropout_rate=dropout_rate,
                )
                # number of hidden groups (same weights)  AlBERTLayerGroups
                # https://github.com/huggingface/transformers/blob/main/src/transformers/models/albert/modeling_albert.py#L469
                for _ in range(n_hidden_groups)
            ]
        )
        self.n_blocks = n_blocks
        self.emb_factors = emb_factors

        self.n_layers_per_group = n_blocks / n_hidden_groups

    def construct_item_net(self, dataset: Dataset) -> None:
        """
        Construct network for item embeddings from dataset.

        Parameters
        ----------
        dataset : Dataset
            RecTools dataset with user-item interactions.
        """
        self.item_model = AlBertItemNetConstructor.from_dataset(
            dataset, self.n_factors, self.dropout_rate, self.item_net_block_types, self.emb_factors
        )

    def encode_sessions(self, sessions: torch.Tensor, item_embs: torch.Tensor) -> torch.Tensor:
        session_max_len = sessions.shape[1]
        attn_mask = None
        key_padding_mask = None

        timeline_mask = (sessions != 0).unsqueeze(-1)  # [batch_size, session_max_len, 1]

        seqs = item_embs[sessions]  # [batch_size, session_max_len, n_factors]
        seqs = self.pos_encoding(seqs)
        seqs = self.emb_dropout(seqs)

        if self.use_causal_attn:
            attn_mask = ~torch.tril(
                torch.ones((session_max_len, session_max_len), dtype=torch.bool, device=sessions.device)
            )
        if self.use_key_padding_mask:
            key_padding_mask = sessions == 0
            if attn_mask is not None:  # merge masks to prevent nan gradients for torch < 2.5.0
                attn_mask = self._merge_masks(attn_mask, key_padding_mask, seqs)
                key_padding_mask = None

        for layer_idx_in_group in range(self.n_blocks):
            group_idx = int(layer_idx_in_group / self.n_layers_per_group)
            # for layer_idx_in_inner_group in range(self.n_inner_groups): TODO
            seqs = self.transformer_layer_groups[group_idx](seqs, timeline_mask, attn_mask, key_padding_mask)
        return seqs


class AlBERT4RecModelConfig(BERT4RecModelConfig):

    n_hidden_groups: int = 1
    n_inner_groups: int = 1


class AlBERT4RecModel(BERT4RecModel):
    """
    https://arxiv.org/pdf/1909.11942
    """
    
    config_class = AlBERT4RecModelConfig

    def __init__(  # pylint: disable=too-many-arguments, too-many-locals
        self,
        n_blocks: int = 2,
        n_hidden_groups: int = 1,
        n_inner_groups: int = 1,
        n_heads: int = 4,
        n_factors: int = 256,
        emb_factors: int = 64,
        use_pos_emb: bool = True,
        use_causal_attn: bool = False,
        use_key_padding_mask: bool = True,
        dropout_rate: float = 0.0,
        epochs: int = 3,
        verbose: int = 0,
        deterministic: bool = False,
        recommend_batch_size: int = 256,
        recommend_accelerator: str = "auto",
        recommend_devices: tp.Union[int, tp.List[int]] = 1,
        recommend_n_threads: int = 0,
        recommend_use_gpu_ranking: bool = True,
        session_max_len: int = 100,
        n_negatives: int = 1,
        batch_size: int = 128,
        loss: str = "softmax",
        gbce_t: float = 0.2,
        lr: float = 0.001,
        dataloader_num_workers: int = 0,
        train_min_user_interactions: int = 2,
        mask_prob: float = 0.15,
        trainer: tp.Optional[Trainer] = None,
        item_net_block_types: tp.Sequence[tp.Type[ItemNetBase]] = (AlBertIdEmbeddingsItemNet, ),
        pos_encoding_type: tp.Type[PositionalEncodingBase] = LearnableInversePositionalEncoding,
        transformer_layers_type: tp.Type[TransformerLayersBase] = PreLNTransformerLayers,
        data_preparator_type: tp.Type[SessionEncoderDataPreparatorBase] = BERT4RecDataPreparator,
        lightning_module_type: tp.Type[SessionEncoderLightningModuleBase] = SessionEncoderLightningModule,
        get_val_mask_func: tp.Optional[tp.Callable] = None,
    ):
        self.n_hidden_groups = n_hidden_groups
        self.n_inner_groups = n_inner_groups
        self.emb_factors = emb_factors

        if n_blocks < n_hidden_groups:
            warnings.warn(
                "When `n_hidden_groups` less than `n_blocks` that will use in the forward only one hidden group."
            ) 

        super().__init__(
            transformer_layers_type=transformer_layers_type,
            data_preparator_type=data_preparator_type,
            n_blocks=n_blocks,
            n_heads=n_heads,
            n_factors=n_factors,
            use_pos_emb=use_pos_emb,
            use_causal_attn=use_causal_attn,
            use_key_padding_mask=use_key_padding_mask,
            dropout_rate=dropout_rate,
            session_max_len=session_max_len,
            dataloader_num_workers=dataloader_num_workers,
            batch_size=batch_size,
            loss=loss,
            n_negatives=n_negatives,
            gbce_t=gbce_t,
            lr=lr,
            epochs=epochs,
            verbose=verbose,
            deterministic=deterministic,
            recommend_batch_size=recommend_batch_size,
            recommend_accelerator=recommend_accelerator,
            recommend_devices=recommend_devices,
            recommend_n_threads=recommend_n_threads,
            recommend_use_gpu_ranking=recommend_use_gpu_ranking,
            train_min_user_interactions=train_min_user_interactions,
            mask_prob=mask_prob,
            trainer=trainer,
            item_net_block_types=item_net_block_types,
            pos_encoding_type=pos_encoding_type,
            lightning_module_type=lightning_module_type,
            get_val_mask_func=get_val_mask_func,
        )

    # def _init_data_preparator(self) -> None:
    #     self.data_preparator: SessionEncoderDataPreparatorBase = self.data_preparator_type(
    #         session_max_len=self.session_max_len,
    #         n_negatives=self.n_negatives if self.loss != "softmax" else None,
    #         batch_size=self.batch_size,
    #         dataloader_num_workers=self.dataloader_num_workers,
    #         train_min_user_interactions=self.train_min_user_interactions,
    #         item_extra_tokens=(PADDING_VALUE, MASKING_VALUE),
    #         mask_prob=self.mask_prob,
    #         get_val_mask_func=self.get_val_mask_func,
    #     )
    
    def _init_torch_model(self) -> None:
        self._torch_model = AlBERTSessionEncoder(
            n_blocks=self.n_blocks,
            n_hidden_groups=self.n_hidden_groups,
            n_inner_groups=self.n_inner_groups,
            n_factors=self.n_factors,
            emb_factors=self.emb_factors,
            n_heads=self.n_heads,
            session_max_len=self.session_max_len,
            dropout_rate=self.dropout_rate,
            use_pos_emb=self.use_pos_emb,
            use_causal_attn=self.use_causal_attn,
            use_key_padding_mask=self.use_key_padding_mask,
            transformer_layers_type=self.transformer_layers_type,
            item_net_block_types=self.item_net_block_types,
            pos_encoding_type=self.pos_encoding_type,
        )


In [89]:
MIN_EPOCHS = 3
MAX_EPOCHS = 3
TRAIN_MIN_USER_INTERACTIONS = K_ACTIONS + 5
SESSION_MAX_LEN = 50
N_NEGATIVES = 5

albert_trainer = Trainer(
    accelerator='gpu',
    devices=[0],
    min_epochs=MIN_EPOCHS,
    max_epochs=MAX_EPOCHS, 
    deterministic=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [95]:
albert_model = AlBERT4RecModel(
    n_factors=64,
    emb_factors=16,
    n_blocks=4,
    n_hidden_groups=2,
    n_inner_groups=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="softmax",
    verbose=1,
    deterministic=True,
    item_net_block_types=(AlBertIdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=albert_trainer,
)


N_NEGATIVES = 5

albert_model_bce = AlBERT4RecModel(
    n_factors=64,
    emb_factors=16,
    n_blocks=4,
    n_hidden_groups=2,
    n_inner_groups=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="BCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(AlBertIdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=albert_trainer,
)

albert_model_gbce = AlBERT4RecModel(
    n_factors=64,
    emb_factors=16,
    n_blocks=4,
    n_hidden_groups=2,
    n_inner_groups=2,
    n_heads=2,
    dropout_rate=0.2,
    use_pos_emb=True,
    train_min_user_interactions=TRAIN_MIN_USER_INTERACTIONS,
    session_max_len=SESSION_MAX_LEN,
    use_causal_attn=False,
    lr=1e-3,
    batch_size=128,
    loss="gBCE",
    n_negatives=N_NEGATIVES,
    verbose=1,
    deterministic=True,
    item_net_block_types=(AlBertIdEmbeddingsItemNet, ),  # Use only item ids in ItemNetBlock
    trainer=albert_trainer,
)

In [96]:
%%time
albert_model.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                 | Params
-----------------------------------------------------
0 | torch_model | AlBERTSessionEncoder | 282 K 
-----------------------------------------------------
282 K     Trainable params
0         Non-trainable params
282 K     Total params
1.130     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 2.96 s, sys: 58 ms, total: 3.02 s
Wall time: 1.62 s


<__main__.AlBERT4RecModel at 0x7fdde79717f0>

In [97]:
%%time
albert_model_bce.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                 | Params
-----------------------------------------------------
0 | torch_model | AlBERTSessionEncoder | 282 K 
-----------------------------------------------------
282 K     Trainable params
0         Non-trainable params
282 K     Total params
1.130     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 4.65 s, sys: 42.4 ms, total: 4.69 s
Wall time: 1.85 s


<__main__.AlBERT4RecModel at 0x7fddfad60c40>

In [98]:
%%time
albert_model_gbce.fit(dataset_no_features)

  unq_values = pd.unique(values)
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name        | Type                 | Params
-----------------------------------------------------
0 | torch_model | AlBERTSessionEncoder | 282 K 
-----------------------------------------------------
282 K     Trainable params
0         Non-trainable params
282 K     Total params
1.130     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


CPU times: user 4.57 s, sys: 54.9 ms, total: 4.62 s
Wall time: 1.77 s


<__main__.AlBERT4RecModel at 0x7fdde7971070>

In [99]:
softmax_loss_df, _ = get_log_values(albert_model, is_val=False)
softmax_loss_df["loss_type"] = "softmax"
bce_loss_df, _ = get_log_values(albert_model_bce, is_val=False)
bce_loss_df["loss_type"] = "bce"
gbce_loss_df, _ = get_log_values(albert_model_gbce, is_val=False)
gbce_loss_df["loss_type"] = "gbce"
pd.concat([softmax_loss_df, bce_loss_df, gbce_loss_df], axis=1)

Unnamed: 0,epoch,train/loss,loss_type,epoch.1,train/loss.1,loss_type.1,epoch.2,train/loss.2,loss_type.2
0,0,23.841927,softmax,0,3.703355,bce,0,2.883951,gbce
1,1,22.312714,softmax,1,2.29379,bce,1,1.834833,gbce
2,2,21.233828,softmax,2,1.843714,bce,2,1.557601,gbce


In [71]:
a = PreLNTransformerLayers(
    n_blocks=4, n_factors=8, n_heads=8, dropout_rate=0
)
a

PreLNTransformerLayers(
  (multi_head_attn): ModuleList(
    (0-3): 4 x MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
    )
  )
  (layer_norm_1): ModuleList(
    (0-3): 4 x LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  )
  (dropout_1): ModuleList(
    (0-3): 4 x Dropout(p=0, inplace=False)
  )
  (layer_norm_2): ModuleList(
    (0-3): 4 x LayerNorm((8,), eps=1e-05, elementwise_affine=True)
  )
  (feed_forward): ModuleList(
    (0-3): 4 x PointWiseFeedForward(
      (ff_linear_1): Linear(in_features=8, out_features=32, bias=True)
      (ff_dropout_1): Dropout(p=0, inplace=False)
      (ff_activation): GELU(approximate='none')
      (ff_linear_2): Linear(in_features=32, out_features=8, bias=True)
    )
  )
  (dropout_2): ModuleList(
    (0-3): 4 x Dropout(p=0, inplace=False)
  )
  (dropout_3): ModuleList(
    (0-3): 4 x Dropout(p=0, inplace=False)
  )
)