In [1]:
import torch
import pandas as pd
import torch.nn as nn
import numpy as np

# Dataset

In [40]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [41]:
# dataset
path = "/data/gyuseok/ml-latest-small/ratings.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [42]:
# userid
users = df.userId.unique()
user_id = {value: idx for idx,value in enumerate(users)}
df["userId"] = df["userId"].apply(lambda x: user_id[x])

In [43]:
# itemid
items = df.movieId.unique()
item_id = {value: idx for idx, value in enumerate(items)}
df["movieId"] = df["movieId"].apply(lambda x: item_id[x])

In [44]:
# num
num_users = df["userId"].nunique()
num_items = df["movieId"].nunique()
print(f"num_users = {num_users}, num_items = {num_items}")

num_users = 610, num_items = 9724


In [45]:
df = df[["userId","movieId"]]
df.columns = ["user", "item"]

In [46]:
from collections import defaultdict
user_item_interact = defaultdict(list)
user_item_interact

defaultdict(list, {})

In [47]:
from collections import defaultdict
user_item_interact = defaultdict(list)
for row in df.itertuples():
    user_item_interact[row.user].append(row.item)

In [48]:
# k-core
k = 10
for user, items in user_item_interact.items():
    if len(items) < k:
        print(user, len(items))
        del user_item_interact[user]

In [49]:
# train & test & valid
test_size = 0.1
train_dict = defaultdict(list)
valid_dict = defaultdict(list)
test_dict = defaultdict(list)

# item split
for user in user_item_interact:
    
    items = user_item_interact[user]
    np.random.shuffle(items)
    num_test_items = int(len(items) * test_size)
    
    test_items = items[:num_test_items]
    valid_items = items[num_test_items:num_test_items*2]
    train_items = items[num_test_items*2:]
    
    # assign
    test_dict[user] = test_items
    valid_dict[user] = valid_items
    train_dict[user] = train_items

In [50]:
# filtering
train_mat_R = defaultdict(list)
for user in train_dict:
    for item in train_dict[user]:
        train_mat_R[item].append(user)

In [51]:
for u in list(valid_dict.keys()):
    for i in list(valid_dict[u]):
        if i not in train_mat_R:
            valid_dict[u].remove(i)
    
    if len(valid_dict[u]) == 0:
        del valid_dict[u]
        del test_dict[u]

In [52]:
for u in list(test_dict.keys()):
    for i in list(test_dict[u]):
        if i not in train_mat_R:
            test_dict[u].remove(i)

    if len(test_dict[u]) == 0:
        del valid_dict[u]
        del test_dict[u]

In [53]:
def create_pair(dic:dict):
    pairs = []
    for u in dic.keys():
        for i in dic[u]:
            pairs.append((u, i))
    return pairs

In [54]:
# train_pair = create_pair(train_dict)
# valid_pair = create_pair(valid_dict)
# test_pair = create_pair(test_dict)

In [55]:
num_items

9724

In [56]:
num_users

610

In [58]:
from torch.utils.data import Dataset

class CF_Train_dataset(Dataset):
    def __init__(self, train_dict, num_items, num_sample):
        super().__init__()
        
        self.train_dict = train_dict
        self.train_arr = []
        
        self.num_sample = num_sample
        self.all_items = set(range(num_items))
        self.length = 0
        
        for u in list(train_dict.keys()):
            self.length += len(train_dict[u])
        
        
    def __len__(self):
        return self.length * self.num_sample
    
    
    def __getitem__(self, idx):
        assert self.train_arr
        user, pos_item, neg_item = self.train_arr[idx]
        return user, pos_item, neg_item
    
    def negative_sampling(self):
        
        for u in list(train_dict.keys()):
            pos_items = set(train_dict[u])
            candidate_items = list(self.all_items - pos_items)
            neg_items = np.random.choice(candidate_items, size = len(pos_items) * self.num_sample)
            
            for idx, pos_i in enumerate(pos_items):
                neg_start = idx * self.num_sample
                neg_end = (idx + 1) * self.num_sample
                
                for neg_i in neg_items[neg_start:neg_end]:
                    self.train_arr.append((u, pos_i, neg_i))
                

In [59]:
class CF_Test_dataset(Dataset):
    def __init__(self, test_dict):
        super().__init__()
        
        self.test_dict = test_dict
        self.test_pairs = [(u, i) for u in list(test_dict.keys()) for i in test_dict[u]]
        self.length = len(self.test_pairs)
        
    def __len__(self):
        return self.length
    
    def __getitem__(self, idx):
        user, pos_item = self.test_pairs[idx]
        return user, pos_item
    

In [156]:
# dataset

num_sample = 2
train_dataset = CF_Train_dataset(train_dict, num_items, num_sample)
valid_dataset = CF_Test_dataset(valid_dict)
test_dataset = CF_Test_dataset(test_dict)


In [157]:
# dataloader
from torch.utils.data import DataLoader

batch_size = 2048
train_loader = DataLoader(train_dataset, batch_size = batch_size,
                          shuffle = True, drop_last = False) # 여기서 바로 tensor 주는듯.


# LightGCN

## Step1: Making the symmetrically normalized matrix (SNM)

In [158]:
# init
R = torch.zeros(num_users,num_items)

# interaction
for row in df.itertuples():
    R[row.user][row.item] = 1

In [159]:
Zero_top = torch.zeros(num_users,num_users)
Zero_under = torch.zeros(num_items, num_items)

In [160]:
upper = torch.cat([Zero_top, R], dim = 1)
lower = torch.cat([R.T, Zero_under], dim = 1)
Adj_mat = torch.cat([upper,lower])

In [161]:
torch.allclose(Adj_mat[:num_users, num_users:], R)
torch.allclose(Adj_mat[num_users:, :num_users], R.T)

True

In [162]:
Adj_mat.shape

torch.Size([10334, 10334])

In [163]:
interactions = torch.cat([torch.sum(R,dim = 1), torch.sum(R,dim = 0)])

In [164]:
D = torch.diag(interactions)
D.shape

torch.Size([10334, 10334])

In [165]:
half_D = torch.sqrt(1/D)
half_D[half_D == float("inf")] = 0
half_D

tensor([[0.0657, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1857, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1601,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]])

In [166]:
# symmetrically normalized matrix
SNM = (half_D @ Adj_mat @ half_D).to(device)

## Step2: Model

In [167]:
class LightGCN(nn.Module):
    
    def __init__(self, num_users, num_items, emb_size, SNM, num_layers, last_only_use = False):
        super().__init__()
        
        self.embs = nn.Embedding(num_users + num_items, emb_size)
        self.SNM = SNM
        self.num_layer = num_layers
        self.num_users = num_users
        self.last_only_use = last_only_use
        
        nn.init.xavier_uniform_(self.embs.weight)
    
    def get_embs(self):
        embs = self.embs.weight
        all_embs = [embs]

        # propagation
        for _ in range(self.num_layer):
            embs = self.SNM @ embs
            all_embs.append(embs)
        
        if self.last_only_use:
            return all_embs[-1]
        else:
            return torch.mean(torch.stack(all_embs), dim = 0)
    
    
    def get_score(self):
        embs = self.get_embs()
        user_emb = embs[:self.num_users]
        item_emb = embs[self.num_users:]
        
        score = user_emb @ item_emb.T     
        return score
        
        
    def forward(self, users, pos_items, neg_items):
        
        embs = self.get_embs()
        
        # user, item embedding for Batch  
        user_emb = embs[users]
        pos_item_emb = embs[pos_items + self.num_users - 1] # adjust the index
        neg_item_emb = embs[neg_items + self.num_users - 1] # adjust the index
        
        # score
        pos_score = (user_emb * pos_item_emb).sum(dim = -1)
        neg_score = (user_emb * neg_item_emb).sum(dim = -1)
        
        return pos_score, neg_score    

In [186]:
# model
emb_size = 64
num_layers = 4
last_only_use = False
model = LightGCN(num_users, num_items, emb_size, SNM, num_layers, last_only_use).to(device)


# Evaluation

In [187]:
k_list = [20]

In [188]:
import copy

def to_np(x):
    return x.detach().cpu().numpy()

def create_metrics(k_list):
    metrics = {}
    for k in k_list:
        for metric in ["Hit","Recall","NDCG"]:
            metrics[f'{metric}_{k}'] = []

    eval_results = {'valid' : copy.deepcopy(metrics), 'test': copy.deepcopy(metrics)}
    return eval_results

def get_eval(train_mat, valid_mat, test_mat, sorted_mat, k_list):
    
    max_k = max(k_list)
    eval_results = create_metrics(k_list)
    
    for test_user in test_mat:

        sorted_list = to_np(sorted_mat[test_user])

        for mode in ["valid",'test']:

            if mode == "valid":
                gt_mat = valid_mat
                already_seen_items = set(train_mat[test_user]) | set(test_mat[test_user])

            elif mode == "test":
                gt_mat = test_mat
                already_seen_items = set(train_mat[test_user]) | set(valid_mat[test_user])

            sorted_list_tmp = []
            for item in sorted_list:
                if item not in already_seen_items:
                    sorted_list_tmp.append(item)

                if len(sorted_list_tmp) > max_k: break

            for k in k_list:
                hit_k = len(set(sorted_list_tmp[:k]) & set(gt_mat[test_user]))

                # Hit & Recall
                eval_results[mode][f"Hit_{k}"].append(hit_k / k)
                eval_results[mode][f"Recall_{k}"].append(hit_k / len(gt_mat[test_user]))

                # NDCG
                denom = np.log2(np.arange(2, k+2))
                dcg_k = np.sum(np.in1d(sorted_list_tmp[:k], gt_mat[test_user]) / denom)
                idcg_k = np.sum((1 / denom)[:min(len(gt_mat[test_user]), k)])
                NDCG_k = dcg_k / idcg_k

                eval_results[mode][f"NDCG_{k}"].append(NDCG_k)

    # average
    for mode in ["valid", "test"]:
        for k in k_list:
            eval_results[mode][f"Hit_{k}"] = round(np.mean(eval_results[mode][f"Hit_{k}"]), 4)
            eval_results[mode][f"Recall_{k}"] = round(np.mean(eval_results[mode][f"Recall_{k}"]), 4)
            eval_results[mode][f"NDCG_{k}"] = round(np.mean(eval_results[mode][f"NDCG_{k}"]), 4)
    return eval_results



In [189]:
results = get_eval(train_dict, valid_dict, test_dict, sorted_mat, k_list)
results

{'valid': {'Hit_20': 0.0887, 'Recall_20': 0.1665, 'NDCG_20': 0.1605},
 'test': {'Hit_20': 0.0875, 'Recall_20': 0.1667, 'NDCG_20': 0.1613}}

# Train & Val & Test

In [190]:
lr = 0.001
epochs = 50
optimizer = torch.optim.Adam(model.parameters(), lr = lr, weight_decay = 1e-4)


In [191]:
# layer = 4, neg_sample = 2, last_only_use = False, lr = 0.001 -> 단순 aggreagation은 overfitting을 심화시킴.

def cpu2gpu(data, device):
    return list(map(lambda x: x.to(device), data))

for epoch in range(epochs):
    print(f"[Epoch:{epoch}]")
    train_loader.dataset.negative_sampling()
    train_loss = 0
    for data in train_loader:
        u, i, j = cpu2gpu(data, device)
        
        # forward
        pos_score, neg_score = model(u,i,j)
        #loss = -(pos_score - neg_score).sigmoid().log().sum()
        
        # to solve nan problem
        loss = (pos_score - neg_score).sigmoid()
        loss = torch.where(loss < 0.9999, loss, 0.9999)
        loss = torch.where(loss > 0.0001, loss, 0.0001)
        loss = -loss.log().sum()
        
        train_loss += loss.item()
        
        
        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    score_mat = model.get_score()
    sorted_mat = torch.argsort(score_mat, dim = 1, descending = True)
    results = get_eval(train_dict, valid_dict, test_dict, sorted_mat, k_list)
    
    train_loss /= len(train_loader)
    print(f"train_loss = {train_loss:.4f}")
    print(results)
    print()
    

[Epoch:0]
train_loss = 1318.4502
{'valid': {'Hit_20': 0.0711, 'Recall_20': 0.1279, 'NDCG_20': 0.1266}, 'test': {'Hit_20': 0.0699, 'Recall_20': 0.1219, 'NDCG_20': 0.1261}}

[Epoch:1]
train_loss = 1011.9600
{'valid': {'Hit_20': 0.0706, 'Recall_20': 0.1262, 'NDCG_20': 0.1261}, 'test': {'Hit_20': 0.0684, 'Recall_20': 0.1221, 'NDCG_20': 0.1242}}

[Epoch:2]
train_loss = 882.7140
{'valid': {'Hit_20': 0.0702, 'Recall_20': 0.1258, 'NDCG_20': 0.1255}, 'test': {'Hit_20': 0.0665, 'Recall_20': 0.1217, 'NDCG_20': 0.1225}}

[Epoch:3]
train_loss = 820.2934
{'valid': {'Hit_20': 0.0686, 'Recall_20': 0.1236, 'NDCG_20': 0.1248}, 'test': {'Hit_20': 0.0654, 'Recall_20': 0.1221, 'NDCG_20': 0.1209}}

[Epoch:4]
train_loss = 776.6343
{'valid': {'Hit_20': 0.0676, 'Recall_20': 0.1213, 'NDCG_20': 0.1235}, 'test': {'Hit_20': 0.0639, 'Recall_20': 0.1208, 'NDCG_20': 0.1196}}

[Epoch:5]
train_loss = 743.0702
{'valid': {'Hit_20': 0.0672, 'Recall_20': 0.12, 'NDCG_20': 0.1226}, 'test': {'Hit_20': 0.0624, 'Recall_20': 0.1

In [185]:
# layer = 4, neg_sample = 2, last_only_use = True, lr = 0.001

def cpu2gpu(data, device):
    return list(map(lambda x: x.to(device), data))

for epoch in range(epochs):
    print(f"[Epoch:{epoch}]")
    train_loader.dataset.negative_sampling()
    train_loss = 0
    for data in train_loader:
        u, i, j = cpu2gpu(data, device)
        
        # forward
        pos_score, neg_score = model(u,i,j)
        #loss = -(pos_score - neg_score).sigmoid().log().sum()
        
        # to solve nan problem
        loss = (pos_score - neg_score).sigmoid()
        loss = torch.where(loss < 0.9999, loss, 0.9999)
        loss = torch.where(loss > 0.0001, loss, 0.0001)
        loss = -loss.log().sum()
        
        train_loss += loss.item()
        
        
        
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    score_mat = model.get_score()
    sorted_mat = torch.argsort(score_mat, dim = 1, descending = True)
    results = get_eval(train_dict, valid_dict, test_dict, sorted_mat, k_list)
    
    train_loss /= len(train_loader)
    print(f"train_loss = {train_loss:.4f}")
    print(results)
    print()
    

[Epoch:0]
train_loss = 1300.0593
{'valid': {'Hit_20': 0.0742, 'Recall_20': 0.122, 'NDCG_20': 0.1285}, 'test': {'Hit_20': 0.0721, 'Recall_20': 0.1193, 'NDCG_20': 0.1268}}

[Epoch:1]
train_loss = 1057.2086
{'valid': {'Hit_20': 0.0734, 'Recall_20': 0.1214, 'NDCG_20': 0.1276}, 'test': {'Hit_20': 0.0721, 'Recall_20': 0.117, 'NDCG_20': 0.1263}}

[Epoch:2]
train_loss = 1032.3338
{'valid': {'Hit_20': 0.0743, 'Recall_20': 0.1264, 'NDCG_20': 0.1282}, 'test': {'Hit_20': 0.0717, 'Recall_20': 0.1183, 'NDCG_20': 0.1262}}

[Epoch:3]
train_loss = 1025.7658
{'valid': {'Hit_20': 0.0727, 'Recall_20': 0.1242, 'NDCG_20': 0.1266}, 'test': {'Hit_20': 0.0716, 'Recall_20': 0.1184, 'NDCG_20': 0.1262}}

[Epoch:4]
train_loss = 1020.3503
{'valid': {'Hit_20': 0.0727, 'Recall_20': 0.1261, 'NDCG_20': 0.1267}, 'test': {'Hit_20': 0.0713, 'Recall_20': 0.121, 'NDCG_20': 0.126}}

[Epoch:5]
train_loss = 1015.5147
{'valid': {'Hit_20': 0.0723, 'Recall_20': 0.1257, 'NDCG_20': 0.1262}, 'test': {'Hit_20': 0.071, 'Recall_20': 0.

# 결론

1) layer의 깊이가 4일때 가장 좋은 성능을 얻음 (i.e., NDCG20이 0.16대 진입) <br>
2) last_only_user가 True일때 가장 좋은 성능을 얻음 (모든 layer에 대한 average를 취하면, overfitting 현상 발생함) <br>
3) neg_sample의 개수가 2일 때가 가장 좋음 (개수를 높이면 overfitting 현상 발생) <br>
4) lr가 0.001일 때가 가장 좋음 (lr을 높이면 overfitting 현상 발생) <br>

결론: [layer = 4, neg_sample = 2, last_only_use = True, lr = 0.001] 에서 가장 좋은 성능을 보임!

