# A Simple Convolutional Generative Network for Next Item Recommendation (NextItNet)



### class 관계

* [AbstractRecommender](https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/abstract_recommender.py#L25)  
    * [SequentialRecommender](https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/abstract_recommender.py#L146)
        * [NextItNet](https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/sequential_recommender/nextitnet.py)
         
                    


### Recbole NextItNet 코드실행 예시([참고](https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/sequential_recommender/nextitnet.py))

    

In [1]:
# RecBole NextItNet example run

# 1. config
from recbole.config import Config
parameter_dict = {
    'data_path': './data',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    # presumably keeps users with >= 30 and items with >= 40 interactions -- confirm against the RecBole docs
    'user_inter_num_interval': "[30,inf)",
    'item_inter_num_interval': "[40,inf)",
    'load_col': {'inter': ['user_id', 'item_id', 'timestamp']},
    'train_neg_sample_args': None,
    'epochs': 1,
    'eval_args': {
        # NOTE(review): ratio split [10, 0, 0] appears to leave valid/test empty -- confirm this is intended
        'split': {'RS': [10, 0, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'}
}
config = Config(model='NextItNet', dataset='recbox_data', config_dict=parameter_dict) 

# 2. dataset
from recbole.data import create_dataset, data_preparation
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

# 3. model (moved to the device named in the config)
from recbole.model.sequential_recommender import NextItNet
model = NextItNet(config, train_data.dataset).to(config['device']) 

# # 4. training (left disabled in this walkthrough)
# from recbole.trainer import Trainer
# trainer = Trainer(config, model)
# best_valid_score, best_valid_result = trainer.fit(train_data)

In [2]:
model

NextItNet(
  (item_embedding): Embedding(10962, 64, padding_idx=0)
  (residual_blocks): Sequential(
    (0): ResidualBlock_b(
      (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
      (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
      (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
    )
    (1): ResidualBlock_b(
      (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(4, 4))
      (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(8, 8))
      (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
    )
    (2): ResidualBlock_b(
      (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
      (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
      (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
      (ln2): LayerNorm

### train data 예시([참고](https://github.com/RUCAIBox/RecBole/blob/master/recbole/trainer/trainer.py#L234))

In [3]:
# Pull only the first mini-batch out of the training loader for inspection.
# `next(enumerate(...))` replaces the former `for ... break` loop (which also
# contained a no-op `batch_idx = batch_idx`); the same three names are bound.
batch_idx, batch_data = next(enumerate(train_data))
interaction = batch_data

# Field names used to index the interaction batch.
USER_ID = 'user_id'
POS_ITEM_ID = 'item_id'
ITEM_SEQ = 'item_id_list'
ITEM_SEQ_LEN = 'item_length'

# Per-row user id, item history and history length of the batch
# (their shapes are printed in the cells that follow).
user_seq = interaction[USER_ID]
item_seq = interaction[ITEM_SEQ]
item_seq_len = interaction[ITEM_SEQ_LEN]

In [4]:
user_seq # torch.Size([2048])

tensor([ 8681,  4622, 22968,  ..., 30080, 19206,   759])

In [5]:
item_seq # torch.Size([2048, 50])

tensor([[ 1868,   266,  2206,  ...,     0,     0,     0],
        [ 2549,  2549,    60,  ...,     0,     0,     0],
        [ 1017,   522,   265,  ...,     0,     0,     0],
        ...,
        [ 4208,   818,  6991,  ...,     0,     0,     0],
        [ 3813,  4103,  4103,  ...,  9211, 10242,   265],
        [  181,  1299,  1076,  ...,     0,     0,     0]])

In [6]:
item_seq_len # torch.Size([2048])

tensor([ 7, 26, 13,  ..., 39, 50,  7])

* 유저 8681 학습데이터 예시

In [8]:
import numpy as np
import pandas as pd

# Look up the token for internal user index 8681
# (presumably the raw user id from the .inter file -- verify against id2token).
uid = train_data.dataset.id2token(train_data.dataset.uid_field, [8681])[0]
# Boolean mask over the training rows whose user field equals 8681.
index = np.isin(train_data.dataset[train_data.dataset.uid_field].numpy(), 8681) 

# Keep only that user's training rows and display them.
user_interaction = train_data.dataset[index]
user_interaction

# df = pd.read_csv('./data/recbox_data/recbox_data.inter', sep='\t')
# ex = df[df['user_id:token'] == uid] # user 8681 has 42 log rows in total (39 of them are used as train data)

The batch_size of interaction: 39
    user_id, torch.Size([39]), cpu, torch.int64
    item_id, torch.Size([39]), cpu, torch.int64
    timestamp, torch.Size([39]), cpu, torch.float32
    item_length, torch.Size([39]), cpu, torch.int64
    item_id_list, torch.Size([39, 50]), cpu, torch.int64
    timestamp_list, torch.Size([39, 50]), cpu, torch.float32


In [11]:
user_interaction[USER_ID] # torch.Size([39])

tensor([8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681,
        8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681,
        8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681, 8681,
        8681, 8681, 8681])

In [12]:
# x
user_interaction[ITEM_SEQ] # torch.Size([39, 50]) max sequence length = 50

tensor([[1868,    0,    0,  ...,    0,    0,    0],
        [1868,  266,    0,  ...,    0,    0,    0],
        [1868,  266, 2206,  ...,    0,    0,    0],
        ...,
        [1868,  266, 2206,  ...,    0,    0,    0],
        [1868,  266, 2206,  ...,    0,    0,    0],
        [1868,  266, 2206,  ...,    0,    0,    0]])

In [13]:
user_interaction[ITEM_SEQ][-1] 

tensor([ 1868,   266,  2206,  2488,   439,  6038,  5231,  5693,    39,  6429,
         1234,  5944,  3629,   173,  8296,  7010,  6897,  6897,   106,  3858,
         3858,  1196,   643,  5705,  4854,  7869,  8703,  3170,  9258,  9774,
         9362, 10102, 10440, 10334,  8651, 10242,  6997, 10610, 10465,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [14]:
# y(label)
user_interaction[POS_ITEM_ID] # torch.Size([39])

tensor([  266,  2206,  2488,   439,  6038,  5231,  5693,    39,  6429,  1234,
         5944,  3629,   173,  8296,  7010,  6897,  6897,   106,  3858,  3858,
         1196,   643,  5705,  4854,  7869,  8703,  3170,  9258,  9774,  9362,
        10102, 10440, 10334,  8651, 10242,  6997, 10610, 10465,  2939])

In [15]:
user_interaction[ITEM_SEQ_LEN] # torch.Size([39])

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39])

### SequentialRecommender class
* [code](https://github.com/RUCAIBox/RecBole/blob/master/recbole/model/abstract_recommender.py#L146)

In [7]:
import torch
import torch.nn as nn

```python
class SequentialRecommender(AbstractRecommender):
    """Abstract base class for sequential recommenders.

    The constructor copies the field names, the maximum sequence length, the
    item count and the device out of ``config`` / ``dataset`` so that concrete
    sequential models can read them as attributes.
    """

    type = ModelType.SEQUENTIAL

    def __init__(self, config, dataset):
        super().__init__()

        # Field names taken from the config.
        user_field = config["USER_ID_FIELD"]
        item_field = config["ITEM_ID_FIELD"]
        self.USER_ID = user_field
        self.ITEM_ID = item_field

        # Names derived from the item field: history list, positive and negative item.
        self.ITEM_SEQ = item_field + config["LIST_SUFFIX"]
        self.ITEM_SEQ_LEN = config["ITEM_LIST_LENGTH_FIELD"]
        self.POS_ITEM_ID = item_field
        self.NEG_ITEM_ID = config["NEG_PREFIX"] + item_field

        # Sequence length limit and item vocabulary size.
        self.max_seq_length = config["MAX_ITEM_LIST_LENGTH"]
        self.n_items = dataset.num(item_field)

        # Device the model is expected to run on.
        self.device = config["device"]
```

In [8]:
# load dataset info
USER_ID = config["USER_ID_FIELD"] 
ITEM_ID = config["ITEM_ID_FIELD"]  
ITEM_SEQ = ITEM_ID + config["LIST_SUFFIX"]
ITEM_SEQ_LEN = config["ITEM_LIST_LENGTH_FIELD"]

POS_ITEM_ID = ITEM_ID
NEG_ITEM_ID = config["NEG_PREFIX"] + ITEM_ID

max_seq_length = config["MAX_ITEM_LIST_LENGTH"]
n_items = dataset.num(ITEM_ID)

# load parameters info
device = config["device"]

print('USER_ID:', USER_ID)
print('ITEM_ID:', ITEM_ID)
print('ITEM_SEQ:', ITEM_SEQ)
print('ITEM_SEQ_LEN:', ITEM_SEQ_LEN)
print('POS_ITEM_ID:', POS_ITEM_ID)
print('NEG_ITEM_ID:', NEG_ITEM_ID)
print('max_seq_length:', max_seq_length)
print('n_items:', n_items)
print('device:', device)

USER_ID: user_id
ITEM_ID: item_id
ITEM_SEQ: item_id_list
ITEM_SEQ_LEN: item_length
POS_ITEM_ID: item_id
NEG_ITEM_ID: neg_item_id
max_seq_length: 50
n_items: 10962
device: cpu


```python
 def get_attention_mask(self, item_seq, bidirectional=False):
     """Build an additive attention mask from an item-id sequence.

     Positions whose id is 0 are treated as padding. Allowed positions get
     0.0 and blocked positions get -10000.0. Unless ``bidirectional`` is
     set, the mask is additionally made lower-triangular so that a position
     cannot attend to later positions.
     """
     # torch.bool; for a [B, L] input this is [B, 1, 1, L]
     visible = (item_seq != 0).unsqueeze(1).unsqueeze(2)
     if not bidirectional:
         seq_len = item_seq.size(-1)
         visible = torch.tril(visible.expand((-1, -1, seq_len, -1)))
     return torch.where(visible, 0.0, -10000.0)
    
```

```python
 def gather_indexes(self, output, gather_index):
     """Pick one vector per batch row from ``output`` along dimension 1.

     ``gather_index`` holds one position per row; it is broadcast over the
     last dimension of ``output`` and the gathered step is returned with the
     singleton dimension 1 squeezed away.
     """
     width = output.shape[-1]
     positions = gather_index.view(-1, 1, 1).expand(-1, -1, width)
     picked = output.gather(dim=1, index=positions)
     return picked.squeeze(1)
```

## NextItNet class

* AbstractRecommender
    * SequentialRecommender
        * NextItNet
            * ResidualBlock_a
            * ResidualBlock_b

### ResidualBlock_b class

```python
class ResidualBlock_b(nn.Module):
    r"""
    Residual block (b) in the paper.

    Holds two unpadded ``(1, kernel_size)`` convolutions, each followed by a
    LayerNorm over ``out_channel``. The second convolution uses twice the
    dilation of the first.
    """

    def __init__(self, in_channel, out_channel, kernel_size=3, dilation=None):
        super().__init__()

        window = (1, kernel_size)  # convolve along the width axis only

        # first convolution + norm, at the given dilation
        self.conv1 = nn.Conv2d(
            in_channel, out_channel, kernel_size=window, padding=0, dilation=dilation
        )
        self.ln1 = nn.LayerNorm(out_channel, eps=1e-8)

        # second convolution + norm, at twice the dilation
        self.conv2 = nn.Conv2d(
            out_channel, out_channel, kernel_size=window, padding=0, dilation=dilation * 2
        )
        self.ln2 = nn.LayerNorm(out_channel, eps=1e-8)

        # stored on the instance for later use
        self.dilation = dilation
        self.kernel_size = kernel_size
```

In [14]:
from torch.nn import functional as F

in_channel = 64
out_channel = 64
kernel_size = 3
dilation = 1


# create the kernel (filter)
conv1 = nn.Conv2d(
            in_channel,                   # number of input channels
            out_channel,                  # number of filters
            kernel_size=(1, kernel_size), # filter size
            padding=0,
            dilation=dilation,
        )

In [15]:
conv1

Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))

In [16]:
conv1.weight.shape

torch.Size([64, 64, 1, 3])

In [13]:
# # 예시 
# conv1 = nn.Conv2d(3, 2, kernel_size=5, stride=1)
# print(conv1.weight.shape) # [필터수, 채널수(rgb), 필터사이즈]

torch.Size([2, 3, 5, 5])


In [17]:
ln1 = nn.LayerNorm(out_channel, eps=1e-8)
ln1  

LayerNorm((64,), eps=1e-08, elementwise_affine=True)

In [18]:
conv2 = nn.Conv2d(
    out_channel,
    out_channel,
    kernel_size=(1, kernel_size),
    padding=0,
    dilation=dilation * 2,
)


In [19]:
conv2.weight.shape

torch.Size([64, 64, 1, 3])

In [20]:
ln2 = nn.LayerNorm(out_channel, eps=1e-8)
ln2

LayerNorm((64,), eps=1e-08, elementwise_affine=True)

```python

def forward(self, x):  # x: [batch_size, seq_len, embed_size]
    """Run both padded convolutions and add the input back as a residual.

    Each stage is: ``conv_pad`` -> convolution -> drop dim 2 and swap the
    last two axes -> LayerNorm -> ReLU. The second stage uses twice the
    dilation of the first.
    """
    # [batch_size, embed_size, 1, seq_len+(self.kernel_size-1)*dilations]
    padded = self.conv_pad(x, self.dilation)
    first = self.conv1(padded)
    # [batch_size, seq_len+(self.kernel_size-1)*dilations-kernel_size+1, embed_size]
    first = first.squeeze(2).permute(0, 2, 1)
    first = F.relu(self.ln1(first))

    padded = self.conv_pad(first, self.dilation * 2)
    second = self.conv2(padded)
    second = second.squeeze(2).permute(0, 2, 1)
    second = F.relu(self.ln2(second))

    return second + x
```

In [21]:
def conv_pad(x, dilation, kernel_size=3):
    r"""Reshape a sequence batch for ``Conv2d`` and zero-pad it on the left.

    The input is permuted to channel-first, given a singleton height axis,
    and padded with ``(kernel_size - 1) * dilation`` zeros at the start of
    the sequence axis only. With an unpadded dilated convolution of the same
    ``kernel_size`` and ``dilation`` the output keeps the original sequence
    length, and position ``t`` only sees inputs at positions ``<= t`` (no
    future-item leakage).

    Args:
        x: tensor of shape ``[batch_size, seq_len, embed_size]``.
        dilation: dilation of the convolution that will consume the result.
        kernel_size: width of that convolution's kernel. Defaults to 3, the
            value that used to be hard-coded here.

    Returns:
        Tensor of shape
        ``[batch_size, embed_size, 1, seq_len + (kernel_size - 1) * dilation]``.
    """
    # [B, L, E] -> [B, E, L] -> [B, E, 1, L]
    inputs_pad = x.permute(0, 2, 1).unsqueeze(2)
    # ZeroPad2d takes (left, right, top, bottom): pad the left of the width axis only
    pad = nn.ZeroPad2d(((kernel_size - 1) * dilation, 0, 0, 0))
    return pad(inputs_pad)

# input 
x = torch.rand([2048, 50, 64]) # [batch_size, seq_len, embed_size]
x_pad = conv_pad(x, dilation)  # torch.Size([2048, 64, 1, 52])

out = conv1(x_pad).squeeze(2).permute(0, 2, 1) # torch.Size([2048, 64, 1, 50]) -> torch.Size([2048, 64, 50]) -> torch.Size([2048, 50, 64])
out = F.relu(ln1(out))                         # torch.Size([2048, 50, 64])


In [22]:
out_pad = conv_pad(out, dilation * 2)             # torch.Size([2048, 64, 1, 54])
out2 = conv2(out_pad).squeeze(2).permute(0, 2, 1) # torch.Size([2048, 64, 1, 50]) -> torch.Size([2048, 64, 50]) -> torch.Size([2048, 50, 64])
out2 = F.relu(ln2(out2))                          # torch.Size([2048, 50, 64])


In [23]:
output = out2 + x # torch.Size([2048, 50, 64])

In [None]:
# # 참고 dilation = 4
# dilation = 4

# x = torch.rand([2048, 50, 64])
# x_pad = conv_pad(x, dilation)                        # torch.Size([2048, 64, 1, 58])

# out = rb[1].conv1(x_pad).squeeze(2).permute(0, 2, 1) # torch.Size([2048, 64, 1, 50]) -> torch.Size([2048, 64, 50]) -> torch.Size([2048, 50, 64])
# out = F.relu(ln1(out))                               # torch.Size([2048, 50, 64])

# out_pad = conv_pad(out, dilation * 2)                   # torch.Size([2048, 64, 1, 66])
# out2 = rb[1].conv2(out_pad).squeeze(2).permute(0, 2, 1) # torch.Size([2048, 64, 1, 50]) -> torch.Size([2048, 64, 50]) -> torch.Size([2048, 50, 64])
# out2 = F.relu(ln2(out2))                                # torch.Size([2048, 50, 64]) 
   
# output = out2 + x # torch.Size([2048, 50, 64])

### NextItNet class

In [24]:
import torch
from torch import nn
from torch.nn.init import normal_

from recbole.model.abstract_recommender import SequentialRecommender
from recbole.model.loss import RegLoss, BPRLoss

```python

class NextItNet(SequentialRecommender):
    r"""The network architecture of the NextItNet model is formed of a stack of holed convolutional layers, which can
    efficiently increase the receptive fields without relying on the pooling operation.
    Also residual block structure is used to ease the optimization for much deeper networks.

    Note:
        As paper said, for comparison purpose, we only predict the next one item in our evaluation,
        and then stop the generating process. Although the number of parameters in residual block (a) is less
        than it in residual block (b), the performance of b is better than a.
        So in our model, we use residual block (b).
        In addition, when dilations is not equal to 1, the training may be slow. To speed up training,
        please set the parameter "reproducibility" to False.

    Args:
        config: configuration object; the keys ``embedding_size``, ``block_num``, ``dilations``,
            ``kernel_size``, ``reg_weight`` and ``loss_type`` are read here.
        dataset: dataset object forwarded to ``SequentialRecommender.__init__``.

    Raises:
        NotImplementedError: if ``loss_type`` is neither ``"BPR"`` nor ``"CE"``.
    """

    def __init__(self, config, dataset):
        super(NextItNet, self).__init__(config, dataset)

        # load parameters info
        self.embedding_size = config["embedding_size"]
        # the residual blocks work at the embedding width
        self.residual_channels = config["embedding_size"]
        self.block_num = config["block_num"]
        # repeat the configured dilation pattern once per block group
        self.dilations = config["dilations"] * self.block_num
        self.kernel_size = config["kernel_size"]
        self.reg_weight = config["reg_weight"]
        self.loss_type = config["loss_type"]

        # define layers and loss
        self.item_embedding = nn.Embedding(
            self.n_items, self.embedding_size, padding_idx=0
        )

        # residual blocks: one ResidualBlock_b per entry of self.dilations
        rb = [
            ResidualBlock_b(
                self.residual_channels,
                self.residual_channels,
                kernel_size=self.kernel_size,
                dilation=dilation,
            )
            for dilation in self.dilations
        ]
        self.residual_blocks = nn.Sequential(*rb)

        # fully-connected layer
        self.final_layer = nn.Linear(self.residual_channels, self.embedding_size)

        if self.loss_type == "BPR":
            self.loss_fct = BPRLoss()
        elif self.loss_type == "CE":
            self.loss_fct = nn.CrossEntropyLoss()
        else:
            raise NotImplementedError("Make sure 'loss_type' in ['BPR', 'CE']!")
        self.reg_loss = RegLoss()

        # parameters initialization
        self.apply(self._init_weights)

```

In [25]:
# load parameters info
embedding_size = config["embedding_size"]    # 64
residual_channels = config["embedding_size"] # 64
block_num = config["block_num"]              # 5
dilations = config["dilations"] * block_num  # [1, 4, 1, 4, 1, 4, 1, 4, 1, 4]
kernel_size = config["kernel_size"]          # 3
reg_weight = config["reg_weight"]            # 1e-05
loss_type = config["loss_type"]              # 'CE'          

reg_loss = RegLoss()

In [26]:
# define layers 
item_embedding = nn.Embedding(n_items, embedding_size, padding_idx=0) # (10962, 64)

In [27]:
from recbole.model.sequential_recommender.nextitnet import ResidualBlock_b

# residual blocks
# `dilations` is the per-block list ([1, 4] repeated block_num times here).
# Each ResidualBlock_b uses `dilation` for conv1 and `dilation * 2` for conv2,
# so the per-convolution dilations are 1,2,4,8, 1,2,4,8, ... across the stack.

rb = [
    ResidualBlock_b(
      residual_channels,
      residual_channels,
      kernel_size=kernel_size,
      dilation=dilation,
    )
    for dilation in dilations
    ]

residual_blocks = nn.Sequential(*rb)

In [28]:
rb

[ResidualBlock_b(
   (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
   (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
   (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
   (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
 ),
 ResidualBlock_b(
   (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(4, 4))
   (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
   (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(8, 8))
   (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
 ),
 ResidualBlock_b(
   (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
   (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
   (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
   (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
 ),
 ResidualBlock_b(
   (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(4, 4))
   (ln1): LayerNorm

In [29]:
residual_blocks = nn.Sequential(*rb)
residual_blocks

Sequential(
  (0): ResidualBlock_b(
    (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
    (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
    (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
    (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
  )
  (1): ResidualBlock_b(
    (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(4, 4))
    (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
    (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(8, 8))
    (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
  )
  (2): ResidualBlock_b(
    (conv1): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1))
    (ln1): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
    (conv2): Conv2d(64, 64, kernel_size=(1, 3), stride=(1, 1), dilation=(2, 2))
    (ln2): LayerNorm((64,), eps=1e-08, elementwise_affine=True)
  )
  (3): ResidualBlock_b(
    (conv1): Conv2d(64, 64, kernel_size=(1, 3), st

In [30]:
# fully-connected layer
final_layer = nn.Linear(residual_channels, embedding_size) # 64, 64 

In [31]:
# define loss
loss_fct = nn.CrossEntropyLoss()

In [None]:
# parameters initialization
# Outside the class there is no `self`, so call it on the model instance.
model.apply(model._init_weights)

# Module.apply visits every module in the tree (not only the direct
# children) as well as the model itself, so it does the same work as:
# for submodule in model.modules():
#     model._init_weights(submodule)

```python

def forward(self, item_seq):
    """Encode an item-id sequence into one vector per batch row.

    The ids are embedded, passed through the residual blocks, and the
    output at the last sequence position is projected by the final layer.
    """
    # [batch_size, seq_len, embed_size]
    embedded = self.item_embedding(item_seq)
    # Residual blocks
    block_out = self.residual_blocks(embedded)
    # keep only the last position -> [batch_size, embed_size]
    last_step = block_out[:, -1, :]
    hidden = last_step.view(-1, self.residual_channels)
    return self.final_layer(hidden)  # [batch_size, embedding_size]


```

In [32]:
item_seq.shape

torch.Size([2048, 50])

In [33]:
item_seq_emb = item_embedding(item_seq) # torch.Size([2048, 50, 64])

In [34]:
# Residual locks
dilate_outputs = residual_blocks(item_seq_emb) # torch.Size([2048, 50, 64])

In [35]:
# Take the output at the last sequence position of every row.
# NOTE(review): the item_seq batch printed earlier is zero-padded on the right,
# so for short sequences index -1 is a padding position -- confirm this is intended.

hidden = dilate_outputs[:, -1, :].view(-1, residual_channels)  # torch.Size([2048, 64])

In [36]:
seq_output = final_layer(hidden) # torch.Size([2048, 64])

```python 
def reg_loss_rb(self):
    r"""
    L2 loss on residual blocks: the sum of the 2-norms of every parameter
    of ``self.residual_blocks`` whose name ends with "weight", scaled by
    ``self.reg_weight``. Nothing is accumulated unless ``reg_weight`` is
    positive.
    """
    if not self.reg_weight > 0.0:
        return self.reg_weight * 0
    weight_norms = (
        torch.norm(tensor, 2)
        for param_name, tensor in self.residual_blocks.named_parameters()
        if param_name.endswith("weight")
    )
    return self.reg_weight * sum(weight_norms, 0)

```

In [37]:
loss_rb = 0
for name, parm in residual_blocks.named_parameters():
    if name.endswith("weight"):
        loss_rb += torch.norm(parm, 2) # l2 norm

l2_loss = reg_weight * loss_rb

```python
def calculate_loss(self, interaction):
    """Compute the training loss for one interaction batch.

    The sequence representation from ``forward`` is scored either against
    one positive and one negative item ("BPR") or against the whole item
    embedding table (any other ``loss_type``, i.e. "CE"). The weighted
    regularisation of the embedding / final-layer weights and the
    residual-block term are then added.
    """
    seq_output = self.forward(interaction[self.ITEM_SEQ])
    pos_items = interaction[self.POS_ITEM_ID]

    if self.loss_type == "BPR":
        neg_items = interaction[self.NEG_ITEM_ID]
        pos_score = (seq_output * self.item_embedding(pos_items)).sum(dim=-1)  # [B]
        neg_score = (seq_output * self.item_embedding(neg_items)).sum(dim=-1)  # [B]
        main_loss = self.loss_fct(pos_score, neg_score)
    else:  # self.loss_type = 'CE'
        all_item_emb = self.item_embedding.weight
        logits = seq_output @ all_item_emb.t()
        main_loss = self.loss_fct(logits, pos_items)

    l2_term = self.reg_loss([self.item_embedding.weight, self.final_layer.weight])
    return main_loss + self.reg_weight * l2_term + self.reg_loss_rb()
```

In [38]:
item_seq = interaction[ITEM_SEQ]
seq_output = torch.rand([2048, 64])  # self.forward(item_seq)
pos_items = interaction[POS_ITEM_ID] # torch.Size([2048])

In [39]:
test_item_emb = item_embedding.weight                            # torch.Size([10962, 64])
logits = torch.matmul(seq_output, test_item_emb.transpose(0, 1)) # torch.Size([2048, 10962])
loss = loss_fct(logits, pos_items)                               # scalar 

In [40]:
loss

tensor(18.6099, grad_fn=<NllLossBackward0>)

In [41]:
# NOTE: this rebinds `reg_loss` from the RegLoss() instance created earlier to
# the tensor it returns, so the cell cannot be re-run until RegLoss() is
# created again.
reg_loss = reg_loss([item_embedding.weight, final_layer.weight])

In [42]:
reg_loss

tensor(841.0498, grad_fn=<AddBackward0>)

In [43]:
loss = loss + reg_weight * reg_loss + l2_loss
loss

tensor(18.6209, grad_fn=<AddBackward0>)

```python

def predict(self, interaction):
    """Score the item given in each interaction row.

    The score is the dot product between the sequence representation from
    ``forward`` and the embedding of that row's ``ITEM_ID`` entry.
    """
    seq_output = self.forward(interaction[self.ITEM_SEQ])
    candidate_emb = self.item_embedding(interaction[self.ITEM_ID])
    return (seq_output * candidate_emb).sum(dim=1)

```

In [44]:
item_seq = interaction[ITEM_SEQ]
# test_item = interaction[ITEM_ID]
seq_output = torch.rand([2048, 64])  # self.forward(item_seq)

In [45]:
# Score every item at once: [2048, 64] @ [64, 10962] -> [2048, 10962].
# NOTE(review): predict() shown above scores only interaction[ITEM_ID] for each
# row; this all-item matmul looks like a full-ranking prediction -- confirm.
test_item_emb = item_embedding.weight
scores = torch.matmul(seq_output, test_item_emb.transpose(0, 1)) # scores.shape

In [46]:
scores

tensor([[ 0.0000, -4.1369,  1.3202,  ..., -3.4493, -1.1543,  2.7742],
        [ 0.0000, -5.8541, -2.3710,  ..., -1.0264, -7.8429,  7.3182],
        [ 0.0000, -8.0782, -6.4184,  ..., -0.7839, -5.5299,  7.7951],
        ...,
        [ 0.0000, -4.0559,  2.8834,  ..., -3.8122,  2.7357,  7.6134],
        [ 0.0000, -5.1901,  1.1562,  ..., -4.1311, -1.8685,  1.9412],
        [ 0.0000, -1.5092, -0.0786,  ..., -1.5749, -4.1443,  1.6243]],
       grad_fn=<MmBackward0>)