# Modules & Hyper-parameters

In [None]:
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
# model setting
max_len = 70  # maximum sequence length fed to the model
hidden_units = 50  # embedding / hidden size
num_heads = 1
num_layers = 2
dropout_rate = 0.5
num_workers = 1
# Fall back to the CPU so the notebook still runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# training setting
lr = 0.001
batch_size = 128
num_epochs = 200
mask_prob = 0.15  # for cloze task

In [None]:
%cd /content/drive/MyDrive/[22-2]DSL_Modeling/

/content/drive/MyDrive/[22-2]DSL_Modeling


# 데이터 전처리

In [None]:
# Load the raw interaction log; the path is relative to the current working directory.
df = pd.read_csv('menu_final.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,SessionID,Menu,MenuID,timestamp
0,0,0,찐빵,0.0,1
1,1,0,오징어찌개,1.0,2
2,2,0,육개장,2.0,3
3,3,0,단호박샌드,3.0,4
4,4,0,김치찌개,4.0,5


In [None]:
# Keep only the session id, menu name and timestamp columns.
df = df[['SessionID','Menu','timestamp']]

In [None]:
# Rename the three columns to generic user / item / time names
# (same order as the selection above), then preview the result.
col_list = ['user','item','time']
df.columns = col_list
df.head()

Unnamed: 0,user,item,time
0,0,찐빵,1
1,0,오징어찌개,2
2,0,육개장,3
3,0,단호박샌드,4
4,0,김치찌개,5


In [None]:
# data_path = 'dacon_menu_hr3.csv'
# df = pd.read_csv(data_path)

# Vocabulary of raw ids, in order of first appearance.
item_ids = df['item'].unique()
user_ids = df['user'].unique()
num_item, num_user = len(item_ids), len(user_ids)
num_batch = num_user // batch_size

# Re-index items to 1..num_item (0 is reserved for padding, num_item + 1 for
# the mask token) and users to 0..num_user - 1.
item2idx = pd.Series(data=np.arange(len(item_ids)) + 1, index=item_ids)
user2idx = pd.Series(data=np.arange(len(user_ids)), index=user_ids)

# Attach the integer indices to every interaction row and order each user's
# history chronologically.
df = pd.merge(df, pd.DataFrame({'item': item_ids, 'item_idx': item2idx[item_ids].values}), on='item', how='inner')
df = pd.merge(df, pd.DataFrame({'user': user_ids, 'user_idx': user2idx[user_ids].values}), on='user', how='inner')
df.sort_values(['user_idx', 'time'], inplace=True)

# Reverse lookups (index -> raw id) used when exporting recommendations.
# Plain dicts: a missing key should raise KeyError rather than be created.
check_user = dict(zip(df['user_idx'], df['user']))
check_item = dict(zip(df['item_idx'], df['item']))

del df['item'], df['user']

# Build the train / valid split: the last item of every user's sequence is
# held out for validation, everything before it is used for training.
users = defaultdict(list)
for u, i in zip(df['user_idx'], df['item_idx']):
    users[u].append(i)

user_train = {user: items[:-1] for user, items in users.items()}
user_valid = {user: [items[-1]] for user, items in users.items()}

print(f'num users: {num_user}, num items: {num_item}')

num users: 4583, num items: 2399


In [None]:
class SeqDataset(Dataset):
    """Per-user training sequences with BERT-style random masking (cloze task).

    Token ids: 0 is padding, 1..num_item are items and num_item + 1 is the
    mask token. A label of 0 marks a position that does not contribute to the
    loss.
    """

    def __init__(self, user_train, num_user, num_item, max_len, mask_prob):
        self.user_train = user_train
        self.num_user = num_user
        self.num_item = num_item
        self.max_len = max_len
        self.mask_prob = mask_prob

    def __len__(self):
        # One training sequence per user.
        return self.num_user

    def __getitem__(self, user):
        """Return ``(tokens, labels)`` LongTensors of length ``max_len``.

        Each position is selected with probability ``mask_prob``. A selected
        position is replaced by the mask token (80%), by a random item (10%)
        or left unchanged (10%), and its label is the original item. All
        other positions keep their item and get label 0. Sequences are
        truncated to the most recent ``max_len`` entries and left-padded
        with zeros.
        """
        mask_token = self.num_item + 1
        tokens, labels = [], []
        for item in self.user_train[user]:
            draw = np.random.random()
            if draw >= self.mask_prob:
                # Not selected: keep the item, exclude it from the loss.
                tokens.append(item)
                labels.append(0)
                continue

            # Rescale the draw to [0, 1) to choose the corruption type.
            draw /= self.mask_prob
            if draw < 0.8:
                corrupted = mask_token
            elif draw < 0.9:
                corrupted = np.random.randint(1, self.num_item + 1)
            else:
                corrupted = item
            tokens.append(corrupted)
            labels.append(item)

        # Keep the most recent interactions, then left-pad with zeros.
        tokens = tokens[-self.max_len:]
        labels = labels[-self.max_len:]
        padding = [0] * (self.max_len - len(tokens))
        return torch.LongTensor(padding + tokens), torch.LongTensor(padding + labels)

# Model

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention over 4-D (batch, head, seq, dim) tensors.

    Args:
        hidden_units: Size of the last dimension of Q and K; the scores are
            divided by ``sqrt(hidden_units)``.
        dropout_rate: Dropout probability applied to the attention weights.
    """

    def __init__(self, hidden_units, dropout_rate):
        super().__init__()
        self.hidden_units = hidden_units
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, Q, K, V, mask):
        """Return ``(output, attn_dist)`` for the given queries, keys and values.

        Positions where ``mask == 0`` are filled with a large negative score
        so that they receive (numerically) zero attention weight.
        """
        attn_score = torch.matmul(Q, K.transpose(2, 3)) / math.sqrt(self.hidden_units)
        attn_score = attn_score.masked_fill(mask == 0, -1e9)
        attn_dist = self.dropout(F.softmax(attn_score, dim=-1))  # attention distribution
        output = torch.matmul(attn_dist, V)  # batch x num_heads x seq_len x dim
        return output, attn_dist


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention followed by dropout, residual sum and LayerNorm.

    The ``hidden_units`` features produced by each projection are split evenly
    across ``num_heads`` heads of size ``hidden_units // num_heads``.

    Args:
        num_heads: Number of attention heads; must divide ``hidden_units``.
        hidden_units: Model (embedding) dimension.
        dropout_rate: Dropout probability.

    Raises:
        ValueError: If ``hidden_units`` is not divisible by ``num_heads``.
    """

    def __init__(self, num_heads, hidden_units, dropout_rate):
        super().__init__()
        if num_heads < 1 or hidden_units % num_heads != 0:
            raise ValueError(
                f'hidden_units ({hidden_units}) must be divisible by num_heads ({num_heads})'
            )
        self.num_heads = num_heads
        self.hidden_units = hidden_units
        # Per-head width. Reshaping the projections to
        # (num_heads, hidden_units) only works for a single head.
        self.head_dim = hidden_units // num_heads

        # Linear projections for query, key, value and output.
        self.W_Q = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_K = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_V = nn.Linear(hidden_units, hidden_units, bias=False)
        self.W_O = nn.Linear(hidden_units, hidden_units, bias=False)

        # Scores are scaled by sqrt(head_dim), the width each head attends over.
        self.attention = ScaledDotProductAttention(self.head_dim, dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6)

    def forward(self, enc, mask):
        """Apply self-attention to ``enc`` of shape (batch, seq_len, hidden_units).

        ``mask`` must be broadcastable to (batch, num_heads, seq_len, seq_len).
        Returns ``(output, attn_dist)`` where ``output`` has the shape of
        ``enc`` and ``attn_dist`` is (batch, num_heads, seq_len, seq_len).
        """
        residual = enc  # kept for the residual connection
        batch_size, seqlen = enc.size(0), enc.size(1)

        # Project, then split the feature dimension into heads.
        Q = self.W_Q(enc).view(batch_size, seqlen, self.num_heads, self.head_dim)
        K = self.W_K(enc).view(batch_size, seqlen, self.num_heads, self.head_dim)
        V = self.W_V(enc).view(batch_size, seqlen, self.num_heads, self.head_dim)

        # Move the head axis forward so every head attends independently.
        Q, K, V = Q.transpose(1, 2), K.transpose(1, 2), V.transpose(1, 2)
        output, attn_dist = self.attention(Q, K, V, mask)

        # Undo the transpose and concatenate the heads.
        output = output.transpose(1, 2).contiguous()
        output = output.view(batch_size, seqlen, -1)

        # Output projection, dropout, residual sum and layer normalization.
        output = self.layerNorm(self.dropout(self.W_O(output)) + residual)
        return output, attn_dist
    
class PositionwiseFeedForward(nn.Module):
    """Two-layer position-wise feed-forward block with residual and LayerNorm.

    The input is expanded to ``4 * hidden_units``, passed through dropout and
    GELU, projected back to ``hidden_units``, and then dropout, the residual
    sum and layer normalization are applied.
    """

    def __init__(self, hidden_units, dropout_rate):
        super().__init__()
        inner_units = 4 * hidden_units
        self.W_1 = nn.Linear(hidden_units, inner_units)
        self.W_2 = nn.Linear(inner_units, hidden_units)
        self.dropout = nn.Dropout(dropout_rate)
        self.layerNorm = nn.LayerNorm(hidden_units, 1e-6)

    def forward(self, x):
        """Return the transformed tensor, same shape as ``x``."""
        # Dropout is applied to the expanded features before the GELU.
        expanded = self.dropout(self.W_1(x))
        projected = self.W_2(F.gelu(expanded))
        return self.layerNorm(self.dropout(projected) + x)
    
class BERT4RecBlock(nn.Module):
    """One encoder layer: multi-head self-attention, then the feed-forward block."""

    def __init__(self, num_heads, hidden_units, dropout_rate):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads, hidden_units, dropout_rate)
        self.pointwise_feedforward = PositionwiseFeedForward(hidden_units, dropout_rate)

    def forward(self, input_enc, mask):
        """Return ``(output_enc, attn_dist)`` for the given input and attention mask."""
        attended, attn_dist = self.attention(input_enc, mask)
        return self.pointwise_feedforward(attended), attn_dist

# BERT4Rec

In [None]:
class BERT4Rec(nn.Module):
    """Bidirectional Transformer encoder for sequential recommendation.

    Token ids: 0 is padding, 1..num_item are items and num_item + 1 is the
    mask token (hence ``num_item + 2`` embedding rows). The output layer
    scores ids 0..num_item.

    Args:
        num_user: Number of users (stored, not used by the network itself).
        num_item: Number of distinct items.
        hidden_units: Embedding / hidden dimension.
        num_heads: Attention heads per block.
        num_layers: Number of stacked ``BERT4RecBlock`` layers.
        max_len: Maximum sequence length (size of the position table).
        dropout_rate: Dropout probability.
        device: Device identifier, kept as ``self.device`` for callers.
    """

    def __init__(self, num_user, num_item, hidden_units, num_heads, num_layers, max_len, dropout_rate, device):
        super().__init__()

        self.num_user = num_user
        self.num_item = num_item
        self.hidden_units = hidden_units
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.device = device

        self.item_emb = nn.Embedding(num_item + 2, hidden_units, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, hidden_units)  # learnable positional encoding
        self.dropout = nn.Dropout(dropout_rate)
        self.emb_layernorm = nn.LayerNorm(hidden_units, eps=1e-6)

        self.blocks = nn.ModuleList([BERT4RecBlock(num_heads, hidden_units, dropout_rate) for _ in range(num_layers)])
        self.out = nn.Linear(hidden_units, num_item + 1)

    def forward(self, log_seqs):
        """Score every item at every position.

        Args:
            log_seqs: Integer ids of shape (batch, seq_len), given as a tensor
                on any device, a NumPy array or nested lists.

        Returns:
            Logits of shape (batch, seq_len, num_item + 1).
        """
        # Use the device the weights actually live on. The legacy
        # torch.LongTensor(...) constructor rejects CUDA tensors.
        device = self.item_emb.weight.device
        item_ids = torch.as_tensor(log_seqs, dtype=torch.long, device=device)
        seq_len = item_ids.size(1)

        # Item embedding plus learned position embedding (broadcast over batch).
        positions = torch.arange(seq_len, device=device)
        seqs = self.item_emb(item_ids) + self.pos_emb(positions)
        seqs = self.emb_layernorm(self.dropout(seqs))

        # Key-padding mask of shape (batch, 1, seq_len, seq_len): a key
        # position is visible only if it holds a non-padding token.
        mask = (item_ids > 0).unsqueeze(1).unsqueeze(2).expand(-1, -1, seq_len, -1)
        for block in self.blocks:
            seqs, _ = block(seqs, mask)
        return self.out(seqs)

# Training

In [None]:
# Build the network on the target device (nn.Module.to returns the module itself).
model = BERT4Rec(
    num_user=num_user,
    num_item=num_item,
    hidden_units=hidden_units,
    num_heads=num_heads,
    num_layers=num_layers,
    max_len=max_len,
    dropout_rate=dropout_rate,
    device=device,
).to(device)

# Label 0 marks positions that were not masked; they are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# One randomly masked sequence per user, reshuffled every epoch.
seq_dataset = SeqDataset(
    user_train=user_train,
    num_user=num_user,
    num_item=num_item,
    max_len=max_len,
    mask_prob=mask_prob,
)
data_loader = DataLoader(
    seq_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

In [None]:
for epoch in range(1, num_epochs + 1):
    # Re-enable dropout every epoch: if this cell is re-run after the
    # evaluation cell, the model would otherwise still be in eval mode.
    model.train()
    tbar = tqdm(data_loader)
    for step, (log_seqs, labels) in enumerate(tbar):
        # Flatten the labels to (batch * seq_len,) to match the flattened logits.
        labels = labels.view(-1).to(device)
        if not labels.ne(0).any():
            # No position was masked in this batch: the loss would be NaN
            # (mean over zero targets) and would corrupt the weights.
            continue

        logits = model(log_seqs)
        logits = logits.view(-1, logits.size(-1))

        optimizer.zero_grad()
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        tbar.set_description(f'Epoch: {epoch:3d}| Step: {step:3d}| Train loss: {loss.item():.5f}')

Epoch:   1| Step:  35| Train loss: 7.24401: 100%|██████████| 36/36 [00:03<00:00, 10.18it/s]
Epoch:   2| Step:  35| Train loss: 6.93244: 100%|██████████| 36/36 [00:00<00:00, 51.80it/s]
Epoch:   3| Step:  35| Train loss: 6.87757: 100%|██████████| 36/36 [00:00<00:00, 51.71it/s]
Epoch:   4| Step:  35| Train loss: 6.88382: 100%|██████████| 36/36 [00:00<00:00, 51.81it/s]
Epoch:   5| Step:  35| Train loss: 6.85682: 100%|██████████| 36/36 [00:00<00:00, 52.40it/s]
Epoch:   6| Step:  35| Train loss: 6.96996: 100%|██████████| 36/36 [00:00<00:00, 53.50it/s]
Epoch:   7| Step:  35| Train loss: 6.55503: 100%|██████████| 36/36 [00:00<00:00, 52.26it/s]
Epoch:   8| Step:  35| Train loss: 6.75306: 100%|██████████| 36/36 [00:00<00:00, 53.40it/s]
Epoch:   9| Step:  35| Train loss: 6.50607: 100%|██████████| 36/36 [00:00<00:00, 54.04it/s]
Epoch:  10| Step:  35| Train loss: 6.44438: 100%|██████████| 36/36 [00:00<00:00, 53.74it/s]
Epoch:  11| Step:  35| Train loss: 6.32012: 100%|██████████| 36/36 [00:00<00:00,

In [None]:
model.eval()

# Placeholders for ranking metrics; they are not updated in this cell.
NDCG = 0.0 # NDCG@10
HIT = 0.0 # HIT@10

top_k = 10
a = []  # rows of [raw user id, raw item id], consumed when exporting results
for u in range(num_user):
    if u % 1000 == 0:
        print(u)
    history = user_train[u] + user_valid[u]

    # Append the mask token as the position to predict, keep the most recent
    # max_len tokens and left-pad with zeros exactly like the training
    # sequences, so the position embeddings line up with training.
    seq = (history + [num_item + 1])[-max_len:]
    seq = [0] * (max_len - len(seq)) + seq
    rated = set(history)

    # Candidate items are the valid item indices 1..num_item that the user
    # has not interacted with (0 is padding and must never be recommended).
    item_idx = [i for i in range(1, num_item + 1) if i not in rated]
    if not item_idx:
        continue

    with torch.no_grad():
        # Logits at the final (masked) position, restricted to the candidates.
        scores = model(np.array([seq]))[0, -1, item_idx]

    # Rank once instead of re-sorting for every one of the top-k entries.
    k = min(top_k, len(item_idx))
    for rank in torch.topk(scores, k).indices.tolist():
        a.append([check_user[u], check_item[item_idx[rank]]])

0
1000
2000
3000
4000


In [None]:
# Collect the recommendations into a two-column table and write it to disk
# without the index column. Note: this rebinds `df`.
df = pd.DataFrame(a,columns=['user', 'item'])
df.to_csv('result_bert.csv', index=False)

# 결과 확인

In [None]:
# Read the saved recommendations back from disk.
result = pd.read_csv('result_bert.csv')

In [None]:
# Keep only the rows whose user id is greater than 4531.
# NOTE(review): presumably these ids belong to survey respondents - confirm the cutoff.
survey = result[result['user'] > 4531]

In [None]:
# Write the filtered rows to disk.
# NOTE(review): unlike result_bert.csv this is written without index=False, so the
# row index is saved as an extra column - confirm that is intended.
survey.to_csv("origin.csv")