In [5]:
import os
# Dataset root: the `dataset` folder under the current working directory.
data_dir = os.path.join(os.getcwd(),'dataset')

In [6]:
# Folder of the Ciao dataset inside the dataset root.
ciao_dir = os.path.join(data_dir,'ciao')
# Despite the `_dir` suffix this is the path of the rating .mat file itself.
ciao_data_dir = os.path.join(ciao_dir,'rating.mat')

In [7]:
from scipy.io import loadmat
# Load the rating .mat file; the result is indexed by variable name ('rating') further down.
ciao_data = loadmat(ciao_data_dir)

In [8]:
import pandas as pd
import numpy as np
# Label the five columns of the raw `rating` array, then keep only the
# fields used downstream (category_id and helpfulness are not used).
rating_columns = ['user_id','product_id','category_id','rating', 'helpfulness']
ciao_df = pd.DataFrame(ciao_data['rating'], columns=rating_columns)
ciao_df = ciao_df[['user_id', 'product_id', 'rating']]

In [9]:
display(ciao_df.head())

Unnamed: 0,user_id,product_id,rating
0,1,1,3
1,1,2,4
2,1,3,4
3,1,4,5
4,1,5,4


# 사용하지 않는 columns
- Ciao
    - category_id, helpfulness
- Epinions
    - category_id, helpfulness, timestamp

## Sequence 정보를 drop하는 것에 대해
SocialTransformer
- sequence 정보 사용하지 않음
    - Epinions에는 timestamp 존재
    - Positional encoding 대신, centrality degree embedding 추가
    - 각 노드가 어느 정도의 영향력을 가지고 있는지에 대한 정보 제공
- sequence 정보가 없을 경우 생길 수 있는 문제점 
    - trend의 영향력을 간과할 수 있음
        - t시점의 interaction과 t+10 시점의 interaction을 동일 선상에 놓고 모델링하는 것은 비효율적일 수 있음
- **Sequence 정보에 대해서는 추후 더 생각해보기**

In [10]:
# Load the user-user trust edges; each row is one (user_id_1, user_id_2) pair.
trust_mat_path = os.path.join(ciao_dir, 'trustnetwork.mat')
trust_file_ciao = loadmat(trust_mat_path)
trust_df_ciao = pd.DataFrame(data=trust_file_ciao['trustnetwork'], columns=['user_id_1', 'user_id_2'])

In [11]:
display(trust_df_ciao.head())

Unnamed: 0,user_id_1,user_id_2
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6


trust_df
- social interaction 관계를 나타내는 데이터

networkx(nx)
- from_pandas_edgelist
    - pandas dataframe을 인자로 받아 네트워크 관계를 구성해줌

In [12]:
# reset and filter data (args: rating df, trust df)
# nx (networkx): graph library; edges are built from source/target column pairs
import networkx as nx
# Build a graph from the trust edge list.
social_network = nx.from_pandas_edgelist(trust_df_ciao, source='user_id_1', target='user_id_2')
# Every user id that appears in the trust graph.
social_ids = list(set(social_network.nodes))
# Every user id that appears in the rating table.
user_item_ids = ciao_df['user_id'].unique().tolist()

In [13]:
# setxor1d: values that appear in exactly one of the two arrays,
# i.e. users present in only one of the trust graph / the rating table.
non_users = np.setxor1d(social_ids, user_item_ids)

공통되지 않는 유저는 학습 데이터에 사용할 수 없으므로 제거
- $\because$ user-item / user-user

In [14]:
# Drop ratings from users that are not in both sources, then renumber the rows from 0.
ciao_df = ciao_df.loc[~ciao_df['user_id'].isin(non_users)].reset_index(drop=True)

In [15]:
# Relabel users and items with consecutive ids starting at 1 (id 0 is left free).
mapping_table_user = dict(zip(social_ids, range(1, len(social_ids) + 1)))
unique_items = ciao_df['product_id'].unique()
mapping_table_item = dict(zip(unique_items, range(1, len(unique_items) + 1)))

In [16]:
# Replace the raw ids with the consecutive ids from the mapping tables.
ciao_df = ciao_df.assign(
    user_id=ciao_df['user_id'].map(mapping_table_user),
    product_id=ciao_df['product_id'].map(mapping_table_item),
)

`reset_and_filter_data`
1. user-item 데이터와 user-user 데이터 비교해, 공통되는 부분만 남김(필터링)
2. 필터링 후, user_id / item_id에 대해 라벨링 진행(1부터 연속적으로) & id 변경

rating matrix(sparse matrix)
- 0번째 row/column은 비어있도록 설계되어 있음
    - zero padding 때문에?
    - 0번째 user/item은 비어있는 sequence를 채울 때 사용 -> 정상적인 sequence에 들어가면 안됨

In [17]:
from scipy import sparse
# Dense user x item rating matrix of shape (max user id + 1, max item id + 1).
# Row 0 and column 0 stay empty because ids start at 1.
num_rows = int(ciao_df['user_id'].max()) + 1
num_cols = int(ciao_df['product_id'].max()) + 1

# The row-by-row loop let the last rating win for a repeated (user, item) pair;
# keep that rule explicitly, since NumPy does not guarantee write order for
# repeated indices in a fancy assignment.
unique_ratings = ciao_df.drop_duplicates(subset=['user_id', 'product_id'], keep='last')
user_idx = unique_ratings['user_id'].to_numpy(dtype=np.int64)
item_idx = unique_ratings['product_id'].to_numpy(dtype=np.int64)

# One vectorized assignment instead of a Python loop over every rating.
rating_matrix = np.zeros((num_rows, num_cols), dtype=np.float64)
rating_matrix[user_idx, item_idx] = unique_ratings['rating'].to_numpy()

In [18]:
# Shuffle once with a fixed seed, then cut 10% test / 10% validation / 80% train.
from sklearn.utils import shuffle
ciao_df = shuffle(ciao_df, random_state=42)
num_test = int(len(ciao_df)*0.1)
rating_test_set, rating_valid_set, rating_train_set = (
    ciao_df.iloc[:num_test],
    ciao_df.iloc[num_test:2 * num_test],
    ciao_df.iloc[2 * num_test:],
)

In [19]:
# generate social dataset
# Users that appear in the training ratings (already relabelled through mapping_table_user).
users  = rating_train_set.user_id.unique().tolist()
# The raw trust edges still carry the original ids, so relabel them with the
# same mapping before comparing against `users`; otherwise the filter mixes
# two different id spaces. Work on a copy so trust_df_ciao stays untouched.
trust_df = trust_df_ciao.copy()
trust_df['user_id_1'] = trust_df['user_id_1'].map(mapping_table_user)
trust_df['user_id_2'] = trust_df['user_id_2'].map(mapping_table_user)
# Keep only edges whose two endpoints are both training users.
social_graph = trust_df[(trust_df['user_id_1'].isin(users))&(trust_df['user_id_2'].isin(users))]
# social graph -> trustnetwork_train_seed_42.csv

In [20]:
# Turn the filtered edge list into a graph and tabulate each user's degree, ordered by user id.
social_graph = nx.from_pandas_edgelist(social_graph, source='user_id_1', target='user_id_2')
user_degree = pd.DataFrame(
    list(social_graph.degree()), columns=['user_id','degree']
).sort_values(by='user_id')

In [21]:
# Item degree = number of distinct training users who rated the item.
rating_df = rating_train_set
item_degree = (
    rating_df.groupby('product_id')['user_id']
    .nunique()
    .rename('degree')
    .reset_index()
    .sort_values(by='product_id')
)

In [22]:
# Per user: the rated items, their ratings, and the degree of each rated item.
item_degree_dict = dict(zip(item_degree['product_id'], item_degree['degree']))
user_item_df = (
    rating_df.groupby('user_id')
    .agg({'product_id':list, 'rating':list})
    .reset_index()
)
user_item_df['product_degree'] = user_item_df['product_id'].apply(
    lambda product_ids: [item_degree_dict[product] for product in product_ids]
)
# Extra all-zero row for user id 0, placed at label -1 and shifted to 0 with the rest.
empty_data = [0, [0] * 4, [0] * 4, [0] * 4]
user_item_df.loc[-1] = empty_data
user_item_df.index = user_item_df.index+1
user_item_df = user_item_df.sort_values(by='user_id')

In [23]:
# generate interacted users table
# Per item: the users who rated it and the ratings they gave.
item_user_df = rating_df.groupby('product_id').agg({'user_id':list, 'rating':list}).reset_index()
# Sort the item table itself; sorting user_item_df here would overwrite the
# table just built with the per-user one.
item_user_df = item_user_df.sort_values(by='product_id')

In [24]:
from tqdm import tqdm

In [25]:
# generate social random walk sequence
# Seed NumPy's global generator so the anchor sample and the walks drawn with
# np.random.choice afterwards are reproducible (42 matches the split seed).
np.random.seed(42)
all_path_list = []
candidate_nodes = list(social_graph.nodes())
# Sample without replacement; cap at the node count so small graphs do not raise.
num_anchors = min(1000, len(candidate_nodes))
anchor_nodes = np.random.choice(candidate_nodes, size=num_anchors, replace=False)
# Each anchor appears 10 times, i.e. 10 walks are started from every anchor.
anchor_nodes = np.repeat(anchor_nodes,10)

In [26]:
def find_next_node(input_G, previous_node, current_node, RETURN_PARAMS):
    """
    Choose the next node of a random walk that currently stands on ``current_node``.

    Every neighbour of ``current_node`` other than ``previous_node`` is a
    forward candidate, and the candidates are equally likely (edge weights are
    not used). When ``previous_node`` is given, the walk steps back to it with
    probability ``RETURN_PARAMS`` and the forward candidates share the
    remaining ``1 - RETURN_PARAMS``. When ``previous_node`` is None there is
    nothing to return to, so the forward candidates share the full probability
    mass whatever ``RETURN_PARAMS`` is.

    Args:
        input_G: graph object exposing ``neighbors(node)``.
        previous_node: node visited just before ``current_node``, or None on the first step.
        current_node: node the walk is currently on.
        RETURN_PARAMS: probability in [0, 1] of stepping back to ``previous_node``.

    Returns:
        The selected node, or 0 when ``current_node`` has no neighbour other
        than ``previous_node`` (callers treat 0 as padding).
    """
    # Forward candidates, in neighbour order; the dict removes duplicates.
    select_probabilities = {
        node: 1 for node in input_G.neighbors(current_node) if node != previous_node
    }

    # Dead end: report it before any probability is computed.
    if not select_probabilities:
        return 0

    # Without a previous node nothing is reserved for the return step, so the
    # probabilities always sum to 1 (np.random.choice rejects anything else).
    forward_mass = 1 if previous_node is None else (1 - RETURN_PARAMS)
    candidate_count = len(select_probabilities)
    select_probabilities = {
        node: weight / candidate_count * forward_mass
        for node, weight in select_probabilities.items()
    }
    if previous_node is not None:
        # The return step gets exactly RETURN_PARAMS.
        select_probabilities[previous_node] = RETURN_PARAMS

    selected_node = np.random.choice(
        a=list(select_probabilities.keys()),
        p=list(select_probabilities.values())
    )
    return selected_node

- walk length==0(두 번째 sequence)
    - prob dictionary(select_probabilities)의 value는 전부 1
    - select_probabilities의 sum == dictionary의 길이
    - select_probabilities : 1/n (uniform distribution)
    - selected_node : uniform distribution 하에서 random node choice

- sequence 목록의 마지막 원소(path_dict[node][-1])가 0인 경우(의미 없는 sequence)
    - 이후로 계속 0 채움(zero padding)

- walk length!=0
    - `find_next_node`함수에
        - 'previous_node' = 마지막에서 두번째로 추가된 노드
            - 'previous_node'와 같은 노드가 아닌 경우만 후보군에 추가
        - 'current_node' = 가장 마지막에 추가된 노드
        - RETURN_PARAMS = return_params(임의로 지정하는 hyperparameter)/10
            - 'previous_node'로 설정된 값이 다시 next_node로 등장할 확률
            - '1'인 경우에 next_node=previous_node
    - next_node가 sequence list에 이미 존재하는지 확인
        - 있으면 threshold +1
            - threshold가 10을 넘으면 zero_padding 시작
            - 중복이 10번 생기면 zero_padding
        

In [27]:
import time

In [28]:
# Original version: builds one random-walk sequence per anchor and looks up
# each visited node's degree by filtering the user_degree DataFrame.
start = time.time()
walk_length=5
all_path_list = []
for node in tqdm(anchor_nodes, desc='Genertaing random walk sequence...'):
    path_dict = {}
    path_dict[node] = [node]
    wl = 0
    threshold = 0
    # Keep going until the sequence holds walk_length entries
    while wl<walk_length-1:
        if wl==0:
            next_node=find_next_node(social_graph, None, node, 0.0) # first step: no previous node, return probability 0
            path_dict[node].append(next_node)
        elif path_dict[node][-1]==0:
            # The walk already ended in padding; keep padding with 0
            path_dict[node].append(0)
        else:
            next_node=find_next_node(social_graph, path_dict[node][-2], path_dict[node][-1], 0.1) # later steps: return probability 0.1
            if next_node in path_dict[node]:
                threshold+=1
                if threshold>10: # more than 10 repeated nodes in total (the walk is looping): start zero padding
                    path_dict[node].append(0)
                else:
                    continue # still within the patience budget: do not append, draw another node
            else:
                path_dict[node].append(next_node)
        wl+=1

    # # Get each user's degree information from degree table.
    degree_list = []
    for node_list in path_dict.values():
        for node in node_list:
            if node != 0:
                degree = user_degree['degree'].loc[user_degree['user_id'] == node].values[0]
            else:
                # If node is 0 (zero-padded value), returns 0.
                degree = 0
            degree_list.append(degree)

    # Store [sequence, degrees] under the anchor node
    path_dict = {key: [value, degree_list] for key, value in path_dict.items()}
    all_path_list.append(path_dict)

# Flatten the per-anchor dicts into three parallel lists
keys, walks, degrees = [], [], []
for paths in all_path_list:
    for key, value in paths.items():
        keys.append(key)
        walks.append(value[0])
        degrees.append(value[1])

result_df = pd.DataFrame({
    'user_id':keys,
    'random_walk_seq':walks,
    'degree':degrees
})
result_df.sort_values(by=['user_id'], inplace=True)
result_df.reset_index(drop=True, inplace=True)

# Elapsed seconds for the whole cell
print(time.time()-start)

Genertaing random walk sequence...: 100%|██████████| 10000/10000 [00:06<00:00, 1621.97it/s]

6.18858003616333





In [29]:
# Revised version: degrees are read from a plain dict (hash lookup) instead of
# filtering the user_degree DataFrame for every visited node.
# About 5 seconds faster on 10000 sequences.
start = time.time()
walk_length=5
anchor_seq_degree = []
user_degree_dic = dict(zip(user_degree.user_id, user_degree.degree))
for node in tqdm(anchor_nodes, desc='Genertaing random walk sequence...'):
    seqs = [node]
    threshold = 0
    # Grow the sequence until it holds walk_length entries.
    while len(seqs) < walk_length:
        if len(seqs) == 1:
            # First step: no previous node, return probability 0.
            seqs.append(find_next_node(social_graph, None, node, 0.0))
            continue
        if seqs[-1] == 0:
            # The walk already ended in padding; keep padding.
            seqs.append(0)
            continue
        next_node = find_next_node(social_graph, seqs[-2], seqs[-1], 0.1)
        if next_node not in seqs:
            seqs.append(next_node)
            continue
        # Repeated node: spend one unit of patience and redraw, or start
        # zero padding once more than 10 repeats have been seen.
        threshold += 1
        if threshold > 10:
            seqs.append(0)
    degrees = [user_degree_dic[visited] if visited != 0 else 0 for visited in seqs]
    anchor_seq_degree.append([node, seqs, degrees])
random_walk_df = (
    pd.DataFrame(anchor_seq_degree, columns=['user_id','random_walk_seq','degree'])
    .sort_values(by='user_id')
    .reset_index(drop=True)
)
print(time.time()-start)

Genertaing random walk sequence...: 100%|██████████| 10000/10000 [00:01<00:00, 7787.79it/s]

1.2973229885101318





In [30]:
import torch
from ast import literal_eval
# generate_input_sequence_data
#   user_path : location of the user sequence file
#   item_path : location of the user-item interaction file (user-item-rating-degree)
#   spd_path  : location of the file with shortest paths between users
#
# ast.literal_eval: list-valued cells come back as str when a saved frame is
# loaded, so it is used to convert them back to lists.
user_df = random_walk_df
item_df = user_item_df
# Use these when the frames are loaded from disk after saving:
# user_df['random_walk_seq'] = user_df['random_walk_seq'].map(literal_eval)
# user_df['degree'] = user_df['degree'].map(literal_eval)
# item_df['product_id'] = item_df['product_id'].map(literal_eval)
# item_df['rating'] = item_df['rating'].map(literal_eval)
# item_df['product_degree'] = item_df['product_degree'].map(literal_eval)
spd_file = os.path.join(ciao_dir, 'shortest_path_result.npy')
spd_table = torch.from_numpy(np.load(spd_file))
# rating_matrix = np.load(os.path.join(ciao_dir, 'rating_matrix.npy'))
# No-op rebind: the rating matrix already in memory is used as is.
rating_matrix = rating_matrix

In [34]:
# Take the first random-walk row as a worked example.
row = user_df.iloc[0]
current_user, current_sequence, current_degree = (
    row['user_id'],
    row['random_walk_seq'],
    row['degree'],
)

In [42]:
# Cut the rows and then the columns of spd_table at (sequence ids - 1).
# NOTE(review): a padded 0 in the sequence becomes index -1, which selects the
# last row/column of spd_table - confirm this is intended.
spd_table[torch.LongTensor(current_sequence).squeeze()-1,:][:,torch.LongTensor(current_sequence).squeeze()-1]

tensor([[0, 1, 2, 2, 2],
        [1, 0, 2, 2, 2],
        [2, 2, 0, 2, 2],
        [2, 2, 2, 0, 3],
        [2, 2, 2, 3, 0]])

spd matrix의 경우에는 index가 0부터 시작하기 때문에 user index-1이 필요하고

rating matrix는 user index가 1부터 시작하기 때문에 user index 그대로 사용

통일됐으면 좋겠음...

In [167]:
total_df = pd.DataFrame(columns=['user_id', 'user_sequences', 'user_degree', 'item_sequences', 'item_degree', 'item_rating', 'spd_matrix'])

# Per-user lookups built once, so the loop does hash lookups instead of
# scanning item_df twice for every node of every walk. user_id is unique in
# item_df (it comes from a groupby plus the single padding row for id 0).
items_by_user = dict(zip(item_df['user_id'], item_df['product_id']))
degrees_by_user = dict(zip(item_df['user_id'], item_df['product_degree']))

# Loop over the random walk sequences one row at a time
for _,data in tqdm(user_df.iterrows(), total=user_df.shape[0]):
    current_user = data['user_id']
    current_sequence = data['random_walk_seq']
    current_degree = data['degree']

    # Collect the items (and their degrees) rated by every real user on the
    # walk; 0 entries are padding and are skipped.
    item_list, degree_list = [], []
    for index in (int(x) for x in current_sequence):
        if not index:
            continue
        item_list.extend(items_by_user[index])
        degree_list.extend(degrees_by_user[index])

    # De-duplicate the items and look up the degree of each remaining item
    # (the first degree seen for an item is kept).
    item_list_removed_duplicate = list(set(item_list))
    mapping_dict = {}
    for item, degree in zip(item_list, degree_list):
        mapping_dict.setdefault(item, degree)
    degree_list_removed_duplicate = [mapping_dict[item] for item in item_list_removed_duplicate]

    # Pad and slice the item list and the degree list to item_seq_len
    sliced_item_list, num_slices = slice_and_pad_list(item_list_removed_duplicate, slice_length=item_seq_len)
    sliced_degree_list, num_slices = slice_and_pad_list(degree_list_removed_duplicate, slice_length=item_seq_len)

    # n x n block of the shortest path table for the users on this walk
    # (spd_table is 0-indexed, hence the -1).
    seq_index = torch.LongTensor(current_sequence).squeeze() - 1
    spd_matrix = spd_table[seq_index, :][:, seq_index]

    # NOTE: the cell stops here; nothing is appended to total_df yet.

100%|██████████| 10000/10000 [00:12<00:00, 813.29it/s]


# main.py

In [1]:
import os
import torch
import torch.nn as nn
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset
# train data load -> 3m 29s
# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
# NOTE(review): read_pickle can execute code stored in the file - only load pickles you produced yourself.
data = pd.read_pickle('/home/moon/SocialTransformer/dataset/ciao/sequence_data_seed_42_walk_30_itemlen_100_rp_1_train.pkl')

In [3]:
data.head()

Unnamed: 0,user_id,user_sequences,user_degree,item_sequences,item_degree,user_item_sequences,item_rating,spd_matrix
0,1,"[1, 16, 494, 2988, 2120, 1372, 4982, 1250, 141...","[128, 227, 16, 6, 10, 346, 3, 170, 182, 9, 56,...","[24576, 1, 2, 3, 4, 5, 6, 11398, 8, 9, 10, 11,...","[1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","[260, 1, 1, 1, 1, 1, 1, 112, 1, 1, 1, 1, 112, ...","[[tensor(0), tensor(3), tensor(4), tensor(4), ...","[[tensor(0), tensor(1), tensor(2), tensor(3), ..."
1,1,"[1, 16, 494, 2988, 2120, 1372, 4982, 1250, 141...","[128, 227, 16, 6, 10, 346, 3, 170, 182, 9, 56,...","[100, 101, 11416, 103, 104, 105, 57449, 8295, ...","[9, 3, 1, 11, 1, 2, 4, 18, 1, 1, 2, 1, 1, 1, 1...","[1, 1, 112, 1, 1, 1, 1372, 294, 112, 1, 1, 260...","[[tensor(3), tensor(5), tensor(0), tensor(4), ...","[[tensor(0), tensor(1), tensor(2), tensor(3), ..."
2,1,"[1, 16, 494, 2988, 2120, 1372, 4982, 1250, 141...","[128, 227, 16, 6, 10, 346, 3, 170, 182, 9, 56,...","[200, 201, 202, 203, 204, 11438, 206, 207, 208...","[1, 1, 1, 2, 1, 4, 1, 1, 2, 11, 4, 1, 2, 1, 1,...","[1, 1, 1, 1, 1, 112, 1, 1, 1, 260, 1, 1, 260, ...","[[tensor(4), tensor(5), tensor(5), tensor(5), ...","[[tensor(0), tensor(1), tensor(2), tensor(3), ..."
3,1,"[1, 16, 494, 2988, 2120, 1372, 4982, 1250, 141...","[128, 227, 16, 6, 10, 346, 3, 170, 182, 9, 56,...","[304, 24881, 307, 308, 309, 310, 311, 312, 313...","[1, 4, 1, 3, 15, 1, 4, 2, 3, 1, 1, 10, 3, 2, 1...","[1, 764, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...","[[tensor(5), tensor(0), tensor(5), tensor(3), ...","[[tensor(0), tensor(1), tensor(2), tensor(3), ..."
4,1,"[1, 16, 494, 2988, 2120, 1372, 4982, 1250, 141...","[128, 227, 16, 6, 10, 346, 3, 170, 182, 9, 56,...","[414, 415, 416, 417, 418, 419, 421, 422, 423, ...","[1, 2, 3, 2, 3, 1, 1, 3, 1, 2, 1, 1, 1, 8, 2, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 764, 1, 1, 1, 1, 1...","[[tensor(5), tensor(5), tensor(5), tensor(1), ...","[[tensor(0), tensor(1), tensor(2), tensor(3), ..."


In [4]:
# First five rows serve as a toy batch; `batch` collects the tensors built from them.
sample = data.head(5)
batch = {}

In [5]:
# batch key -> source column of `sample`; each column is stacked into one tensor.
batch_columns = [
    ('user_sequence', 'user_sequences'),
    ('user_degree', 'user_degree'),
    ('item_sequence', 'item_sequences'),
    ('item_degree', 'item_degree'),
    ('user_item_seq', 'user_item_sequences'),
    ('item_rating', 'item_rating'),
    ('spd_matrix', 'spd_matrix'),
]
batch.update({
    batch_key: torch.tensor(np.stack(sample[column]))
    for batch_key, column in batch_columns
})

## Encoder
0. user embedding vector 생성
    - user index별 embedding + user degree별 embedding
        - Element wise sum
1. user sequence padding mask & Attention bias 
    - mask
        - seq==0 인 경우 -10000으로 마스킹
            - 근데, embedding table 생성할때 `padding_idx=0`으로 설정한거 같은데 굳이 -10000으로 바꿀 필요가 있나?
    - attn_bias
        - user node간의 최소 거리를 나타내는 distance matrix 중에서 사용되는 user_sequence에 해당되는 distance matrix만을 추려냄
            - [bs=5, n_user=30]이라면, 5x30x30의 형태를 가짐
        - 기준이 되는 user로 부터 distance가 클수록 영향력을 작게 설정하기 위해 제곱의 값에 역수를 취함
            - `attn_bias = torch.where(attn_bias==0, 1.0, (1/(attn_bias)**2).double())`
        - 추후 계산되는 scaled dot product값과의 RMSE(`sqrt(mse_loss)`)를 loss로 계산
2. Multi head Attention
    - Head split
        - 지정된 head의 개수만큼 tensor를 나눔
            - $5 \times 30 \times 64 \rightarrow 5\times 4 \times 30 \times 16$
    - Scaled Dot Product 계산
        - $\frac{QK^{T}}{\sqrt{d_k}}$
            - $Q : 5\times 4 \times 30 \times 16$
            - $K^{T} : 5\times 4 \times 16 \times 30$
            - $\text{output} : 5\times 4 \times 30 \times 30$
        - 계산된 dot product값은 $V$와 연산되어 최종 Attention output이 됨
            - 각 user마다 계산된 weight에 대한 V벡터의 weighted Sum
            - $\text{Scaled Dot Product} : 5\times 4 \times 30 \times 30$
            - $V : 5\times 4 \times 30 \times 16$
            - $\text{output} : 5\times 4 \times 30 \times 16$
3. FFN
    - secondary loss(\w score&attn_bias) layer마다 append
4. encoder output, loss 평균값 return

In [91]:
from config import ciao as ciao_config
ciao_config

{'dataset': {'train': 53393, 'dev': 8489, 'test': 8428},
 'model': {'num_user': 7317,
  'max_degree_user': 804,
  'num_item': 105114,
  'max_degree_item': 915,
  'max_spd_value': 15,
  'd_model': 64,
  'd_ffn': 256,
  'num_heads': 4,
  'dropout': 0.1,
  'num_layers_enc': 2,
  'num_layers_dec': 2},
 'training': {'batch_size': 128,
  'optimizer': 'adamw',
  'learning_rate': 0.0001,
  'warmup': 40,
  'lr_decay': 'linear',
  'weight_decay': 0,
  'num_epochs': 100,
  'patience': 10,
  'alpha': 1,
  'beta': 3,
  'gamma': 3}}

In [92]:
# 1. Build the user input embedding: id embedding + degree embedding (element-wise sum)
n_user = ciao_config['model']['num_user']
d_model = ciao_config['model']['d_model']
# One row per user id plus row 0 for padding.
user_table = nn.Embedding(n_user+1, d_model, padding_idx=0)

max_user_degree = ciao_config['model']['max_degree_user']
# One row per degree value plus row 0 for padding.
user_degree_table = nn.Embedding(max_user_degree+1, d_model, padding_idx=0)

user_emb = user_table(batch['user_sequence'])
# Degrees go through the degree table; sending them through user_table would
# read the embeddings of unrelated user ids.
user_degree_emb = user_degree_table(batch['user_degree'])
user_input_emb = user_emb+user_degree_emb
print(user_input_emb.shape)

torch.Size([5, 30, 64])


In [93]:
# 2. Padding mask for attention: True where the user id is not 0 (not padding)
batch_size, len_seq = batch['user_sequence'].shape

pad_attn_mask = batch['user_sequence'].data.ne(0).unsqueeze(1)  # bs x 1 x len_user_seq
print(pad_attn_mask.shape)
pad_attn_mask = pad_attn_mask.expand(batch_size, len_seq, len_seq)  # bs x len_user_seq x len_user_seq
print(pad_attn_mask.shape)

torch.Size([5, 1, 30])
torch.Size([5, 30, 30])


In [94]:
# 3. Spatial encoding: copy the shortest-path matrix once per attention head
num_heads = ciao_config['model']['num_heads']
# bs x len_user_seq x len_user_seq -> bs x len_user_seq x len_user_seq x num_heads
print(batch['spd_matrix'].shape)
spatial_bias = batch['spd_matrix'].unsqueeze(-1).repeat(1, 1, 1, num_heads)
print(spatial_bias.shape)

torch.Size([5, 30, 30])
torch.Size([5, 30, 30, 4])


In [95]:
import math
import torch.nn.functional as F

# Multi-head attention: project the user input embedding to queries, keys and values

# Three independent d_model -> d_model projections, created in Q, K, V order.
W_Q, W_K, W_V = (nn.Linear(d_model, d_model) for _ in range(3))

Q = W_Q(user_input_emb)
K = W_K(user_input_emb)
V = W_V(user_input_emb)
print("before split :", Q.shape)
# split q,k,v vector by 'num_heads'
def split_tensor(tensor, n_head=4):
    """
    Split the feature dimension of ``tensor`` into ``n_head`` attention heads.

    Args:
        tensor: tensor of shape (bs, length, d_model).
        n_head: number of heads; d_model must be divisible by it.

    Returns:
        Tensor of shape (bs, n_head, length, d_model // n_head) in which head
        ``h`` of position ``t`` holds features ``[h*d : (h+1)*d]`` of the
        input at position ``t``.
    """
    bs, length, d_model = tensor.size()
    d_tensor = d_model // n_head
    # Cut the feature axis into heads first, then move the head axis forward.
    # Viewing straight to (bs, n_head, length, d_tensor) would mix positions
    # and features, and would not be undone by the
    # transpose(1, 2).contiguous().view(...) concat used after attention.
    tensor = tensor.view(bs, length, n_head, d_tensor).transpose(1, 2)
    return tensor

Q, K, V = split_tensor(Q), split_tensor(K), split_tensor(V)
print("after split :", Q.shape) # bs x n_head x len_user_seq x d
mask = pad_attn_mask # bs x len_user_seq x len_user_seq
print("mask shape before :", mask.shape)
# Identity check: `!=` on a tensor is an element-wise operator, not a None test.
if mask is not None:
    # Give every head its own copy of the padding mask.
    mask = mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
    print("mask shape after:",mask.shape)
# bs x len x len x n_head -> bs x n_head x len x len
attn_bias = spatial_bias.permute(0,3,2,1)
print("attn bias shape :", attn_bias.shape)
spd_param = nn.Parameter(torch.randn((30,30), dtype=torch.float), requires_grad=True)

before split : torch.Size([5, 30, 64])
after split : torch.Size([5, 4, 30, 16])
mask shape before : torch.Size([5, 30, 30])
mask shape after: torch.Size([5, 4, 30, 30])
attn bias shape : torch.Size([5, 4, 30, 30])


In [96]:
# Scaled Dot Product
d_tensor = K.size()[-1]
K_T = K.transpose(2,3)
score = torch.matmul(Q, K_T) / math.sqrt(d_tensor)

# mask is False where the user sequence entry is 0
if mask!=None:
    score = score.masked_fill(mask==0, -10000) # replace the dot product with -10000 at masked (seq==0) positions
    # NOTE(review): these -10000 entries are far from the attn_bias values and enter the loss below - is that intended?
    # NOTE(review): would a much smaller fill value work better here?
if attn_bias!=None:
    # distance 0 -> 1.0, otherwise 1 / distance**2
    attn_bias = torch.where(attn_bias==0.0, 1.0, (1/(attn_bias)**2).double())
    loss = torch.sqrt(F.mse_loss(score.float(), attn_bias.float())) # square root of the MSE between the scores and attn_bias

score = torch.softmax(score, dim=-1) # softmax over the last dimension
V = torch.matmul(score,V)
# concat: bring the heads back together along the feature dimension
bs, n_head, length, d_tensor = V.size()
V = V.transpose(1,2).contiguous().view(bs, length, n_head*d_tensor)
W_concat = nn.Linear(d_model,d_model)
encoder_output = W_concat(V)
print(encoder_output.shape)

torch.Size([5, 30, 64])


In [77]:
# # Check that the heads are concatenated correctly (transpose followed by view is the right way)
# t = torch.randn([2,2,3,2])
# t_2 = t.transpose(1,2).contiguous().view(2,3,4)
# print(t_2[0][0])
# t_3 = t.reshape(2,3,4)
# print(t_3[0][0])
# # split per head and concatenate
# t_4 = torch.cat([t[:,0,:], t[:,1,:]],dim=-1)
# print(t_4[0][0])

- score
    - Scaled dot product값
        - 각 sequence에 대한 가중치
- attn_bias
    - shortest distance
        - 전체 sequence 내에서 user별로 타 user간의 최소 거리 행렬을 나타낸 값의 제곱의 역수
        - 거리가 멀수록 값이 작고, 거리가 가까울 수록 값이 큼
- loss
    - score, attn_bias의 mse loss
    - 학습을 통해 attention 결과와 attn_bias를 가깝게 만들어주기 위함
- OK

- Multi head Concat시 torch 사용법
    - 단순히 reshape or view를 하게 되면, 제대로 concatenation이 일어나지 않음
    1. transpose해서 head와 length의 dimension을 바꿈
    2. contiguous하게 만들기
    3. reshape

Multi head Attention 과정
1. Q,K,V 벡터 생성
2. sequence mask & attn bias 생성
    - mask : seq==0인경우 -10000으로 치환
    - attn_bias : score(scaled dot product)와 비교해 loss 계산
3. Scaled Dot Product 계산
4. Scaled Dot product와 attn_bias와의 loss 계산
5. Attention output(V) return

## Decoder

In [99]:
batch.keys() 

dict_keys(['user_sequence', 'user_degree', 'item_sequence', 'item_degree', 'user_item_seq', 'item_rating', 'spd_matrix'])

In [100]:
# Item embedding (ItemNodeEncoder): item id embedding + item degree embedding
num_item = ciao_config['model']['num_item']
max_item_degree = ciao_config['model']['max_degree_item']
# Index 0 is the padding row in both tables.
item_table = nn.Embedding(num_item+1, d_model, padding_idx=0)
item_degree_table = nn.Embedding(max_item_degree+1, d_model, padding_idx=0)
item_id_emb = item_table(batch['item_sequence'])
item_deg_emb = item_degree_table(batch['item_degree'])
item_embedding = item_id_emb + item_deg_emb
print(item_embedding.shape)

torch.Size([5, 100, 64])


In [109]:
# Sequence lengths used by both masks
bs, len_item = batch['item_sequence'].shape
bs, len_user = batch['user_sequence'].shape
# self attention mask: True where the item id is not 0 -> bs x len_item x len_item
dec_self_attn_mask = batch['item_sequence'].ne(0).unsqueeze(1).expand(-1, len_item, -1)
print(dec_self_attn_mask.shape)
# cross attention mask: True where the user id is not 0 -> bs x len_item x len_user
dec_cross_attn_mask = batch['user_sequence'].ne(0).unsqueeze(1).expand(-1, len_item, len_user)
print(dec_cross_attn_mask.shape)

torch.Size([5, 100, 100])
torch.Size([5, 100, 30])


In [110]:
# attention bias: implicit rating, 1 where a rating exists and 0 otherwise
item_rating = batch['item_rating'].ne(0).long()
bs, len_user, len_item = item_rating.size()
# bs x len_user x len_item -> bs x num_heads x len_item x len_user
decoder_attn_bias = item_rating.transpose(1, 2).unsqueeze(1).repeat(1, num_heads, 1, 1)
print(decoder_attn_bias.shape)

torch.Size([5, 4, 100, 30])


### Decoder Layers

In [111]:
# 1. Self attention (decoder-decoder, no attention bias)
W_q, W_k, W_v  = nn.Linear(d_model,d_model), nn.Linear(d_model,d_model), nn.Linear(d_model,d_model)
Q,K,V = W_q(item_embedding), W_k(item_embedding), W_v(item_embedding)
Q,K,V = split_tensor(Q), split_tensor(K), split_tensor(V)
if dec_self_attn_mask is not None:
    print(dec_self_attn_mask.shape)
    # Add a head dimension and expand the mask to every head
    dec_self_attn_mask = dec_self_attn_mask.unsqueeze(1).expand(-1,num_heads,-1,-1)
    print(dec_self_attn_mask.shape)

# scaled dot product
bs, h, len_item, d_tensor = Q.shape
K_T = K.transpose(-2,-1)
# Scale by the per-head dimension (d_k), as the encoder cell does, not by d_model.
score = torch.matmul(Q,K_T) / math.sqrt(d_tensor)
print(score.shape)

if dec_self_attn_mask is not None:
    # Padded item positions get a large negative score before the softmax
    score = score.masked_fill(dec_self_attn_mask==0,-10000)

# Self attention has no attention bias, so it contributes no auxiliary loss
loss = 0
score = torch.softmax(score,dim=-1)
print(score.shape)
V = torch.matmul(score,V)
print(V.shape)

torch.Size([5, 100, 100])
torch.Size([5, 4, 100, 100])
torch.Size([5, 4, 100, 100])
torch.Size([5, 4, 100, 100])
torch.Size([5, 4, 100, 16])


In [112]:
# 2. Cross attention (encoder-decoder, with attention bias)
# Q comes from the decoder (items); K and V come from the encoder output (users)
W_q, W_k, W_v  = nn.Linear(d_model,d_model), nn.Linear(d_model,d_model), nn.Linear(d_model,d_model)
Q,K,V = W_q(item_embedding), W_k(encoder_output), W_v(encoder_output)
Q,K,V = split_tensor(Q), split_tensor(K), split_tensor(V)

if dec_cross_attn_mask is not None:
    print(dec_cross_attn_mask.shape)
    # Add a head dimension and expand the mask to every head
    dec_cross_attn_mask = dec_cross_attn_mask.unsqueeze(1).expand(-1,num_heads,-1,-1)
    print(dec_cross_attn_mask.shape)

# scaled dot product
bs, h, len_item, d_tensor = Q.shape
bs, h, len_user, d_tensor = K.shape
K_T = K.transpose(-2,-1)
# Scale by the per-head dimension (d_k), as the encoder cell does, not by d_model.
score = torch.matmul(Q,K_T) / math.sqrt(d_tensor)
print(score.shape)
# Padded user positions get a large negative score (-10000)
if dec_cross_attn_mask is not None:
    score = score.masked_fill(dec_cross_attn_mask==0, -10000)

if decoder_attn_bias is not None:
    # Map the implicit rating {0, 1} to {-1, +1}. Testing `> 0` instead of
    # `== 0` gives the same result on the first run and leaves an already
    # converted tensor unchanged when the cell is run again.
    decoder_attn_bias = torch.where(decoder_attn_bias > 0, 1, -1)
    # Mean absolute difference between the sign of the score and the sign of the bias
    loss = torch.mean(torch.abs((torch.sign(score.float())-torch.sign(decoder_attn_bias.float()))))
    print(loss)

score = torch.softmax(score,dim=-1)
V = torch.matmul(score,V)
print(V.shape)

torch.Size([5, 100, 30])
torch.Size([5, 4, 100, 30])
torch.Size([5, 4, 100, 30])
tensor(0.9198, grad_fn=<MeanBackward0>)
torch.Size([5, 4, 100, 16])


In [None]:
# attn_bias))))  # stray fragment of an unfinished expression; commented out because it is not valid Python on its own


- attn_bias(Decoder)
    - Decoder에서 계산되는 attention(cross)에 대해서 loss 계산(MAE)
        - cross attention
            - user & item representation에 대한 연산
            - bs x head x n_user x n_item(?)
    - self-attention에 대해서는 attn_bias 없음