In [12]:
for test in [1,2,4,5]:
    print(test)

1
2
4
5


In [1]:
import argparse
import numpy as np
from data_loader import load_data
import logging
np.random.seed(2022)

In [2]:
args = argparse.Namespace()
args.dataset="naver-toy"
args.dim=16
args.n_hop=2
args.kge_weight=0.01
args.l2_weight=1e-7
args.lr=0.02
args.batch_size=1024
args.n_epoch=1
args.n_memory=32
args.item_update_mode="plus_transform"
args.using_all_hops=True
args.use_cuda = True
args.show_topk =True

In [3]:
data_info = load_data(args)

reading rating file ...
splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


In [4]:
show_loss=False

In [49]:
rating_file = "../data/"+args.dataset+"/ratings_final"
rating_np = np.load(rating_file+".npy")


In [53]:
eval_ratio = 0.2
test_ratio = 0.2
n_ratings = rating_np.shape[0]
eval_indices = np.random.choice(n_ratings, size=int(n_ratings*eval_ratio),replace=False)
eval_indices

array([ 65497,  41347, 138471, ...,   7107, 136362, 109953])

In [54]:
left = set(range(n_ratings))- set(eval_indices)
test_indices = np.random.choice(list(left), size=int(n_ratings*test_ratio), replace=False)
train_indices = list(left - set(test_indices))

In [72]:
user_history_dict = dict()
for i in train_indices[:]:
    user = rating_np[i][0]
    item = rating_np[i][1]
    rating = rating_np[i][2]
    if rating == 1:
        if user not in user_history_dict:
            user_history_dict[user] = []
        user_history_dict[user].append(item)
train_indices2 = [i for i in train_indices if rating_np[i][0] in user_history_dict]


In [73]:
len(train_indices)

95776

In [84]:
train_data = rating_np[train_indices]
eval_data = rating_np[eval_indices]
test_data = rating_np[test_indices]
len(train_data)

95776

In [85]:

len(eval_data)

31925

In [86]:
len(test_data)

31925

In [87]:
len(train_data) + len(eval_data)+len(test_data)

159626

In [82]:
train_user_set = set()
for data in train_data:
    user_idx = data[0]
    train_user_set.add(user_idx)
len(train_user_set)

13341

In [74]:
len(train_indices2)

95056

In [76]:
train_data = rating_np[train_indices2]

In [77]:
len(train_data)

95056

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score


class RippleNet(nn.Module):
    def __init__(self, args, n_entity, n_relation):
        super(RippleNet, self).__init__()

        self._parse_args(args, n_entity, n_relation)

        self.entity_emb = nn.Embedding(self.n_entity, self.dim)
        self.relation_emb = nn.Embedding(self.n_relation, self.dim * self.dim)
        self.transform_matrix = nn.Linear(self.dim, self.dim, bias=False)
        self.criterion = nn.BCELoss()

    def _parse_args(self, args, n_entity, n_relation):
        self.n_entity = n_entity
        self.n_relation = n_relation
        self.dim = args.dim
        self.n_hop = args.n_hop
        self.kge_weight = args.kge_weight
        self.l2_weight = args.l2_weight
        self.lr = args.lr
        self.n_memory = args.n_memory
        self.item_update_mode = args.item_update_mode
        self.using_all_hops = args.using_all_hops

    def forward(
        self,
        items: torch.LongTensor,
        labels: torch.LongTensor,
        memories_h: list,
        memories_r: list,
        memories_t: list,
    ):
        # [batch size, dim]
        item_embeddings = self.entity_emb(items)
        h_emb_list = []
        r_emb_list = []
        t_emb_list = []
        for i in range(self.n_hop):
            # [batch size, n_memory, dim]
            h_emb_list.append(self.entity_emb(memories_h[i]))
            # [batch size, n_memory, dim, dim]
            r_emb_list.append(
                self.relation_emb(memories_r[i]).view(
                    -1, self.n_memory, self.dim, self.dim
                )
            )
            # [batch size, n_memory, dim]
            t_emb_list.append(self.entity_emb(memories_t[i]))

        o_list, item_embeddings = self._key_addressing(
            h_emb_list, r_emb_list, t_emb_list, item_embeddings
        )
        scores = self.predict(item_embeddings, o_list)

        return_dict = self._compute_loss(
            scores, labels, h_emb_list, t_emb_list, r_emb_list
        )
        return_dict["scores"] = scores

        return return_dict

    def _compute_loss(self, scores, labels, h_emb_list, t_emb_list, r_emb_list):
        base_loss = self.criterion(scores, labels.float())

        kge_loss = 0
        for hop in range(self.n_hop):
            # [batch size, n_memory, 1, dim]
            h_expanded = torch.unsqueeze(h_emb_list[hop], dim=2)
            # [batch size, n_memory, dim, 1]
            t_expanded = torch.unsqueeze(t_emb_list[hop], dim=3)
            # [batch size, n_memory, dim, dim]
            hRt = torch.squeeze(
                torch.matmul(torch.matmul(h_expanded, r_emb_list[hop]), t_expanded)
            )
            kge_loss += torch.sigmoid(hRt).mean()
        kge_loss = -self.kge_weight * kge_loss

        l2_loss = 0
        for hop in range(self.n_hop):
            l2_loss += (h_emb_list[hop] * h_emb_list[hop]).sum()
            l2_loss += (t_emb_list[hop] * t_emb_list[hop]).sum()
            l2_loss += (r_emb_list[hop] * r_emb_list[hop]).sum()
        l2_loss = self.l2_weight * l2_loss

        loss = base_loss + kge_loss + l2_loss
        return dict(base_loss=base_loss, kge_loss=kge_loss, l2_loss=l2_loss, loss=loss)

    def _key_addressing(self, h_emb_list, r_emb_list, t_emb_list, item_embeddings):
        o_list = []
        for hop in range(self.n_hop):
            # [batch_size, n_memory, dim, 1]
            h_expanded = torch.unsqueeze(h_emb_list[hop], dim=3)

            # [batch_size, n_memory, dim]
            Rh = torch.squeeze(torch.matmul(r_emb_list[hop], h_expanded))

            # [batch_size, dim, 1]
            v = torch.unsqueeze(item_embeddings, dim=2)

            # [batch_size, n_memory]
            probs = torch.squeeze(torch.matmul(Rh, v))

            # [batch_size, n_memory]
            probs_normalized = F.softmax(probs, dim=1)

            # [batch_size, n_memory, 1]
            probs_expanded = torch.unsqueeze(probs_normalized, dim=2)

            # [batch_size, dim]
            o = (t_emb_list[hop] * probs_expanded).sum(dim=1)

            item_embeddings = self._update_item_embedding(item_embeddings, o)
            o_list.append(o)
        return o_list, item_embeddings

    def _update_item_embedding(self, item_embeddings, o):
        if self.item_update_mode == "replace":
            item_embeddings = o
        elif self.item_update_mode == "plus":
            item_embeddings = item_embeddings + o
        elif self.item_update_mode == "replace_transform":
            item_embeddings = self.transform_matrix(o)
        elif self.item_update_mode == "plus_transform":
            item_embeddings = self.transform_matrix(item_embeddings + o)
        else:
            raise Exception("Unknown item updating mode: " + self.item_update_mode)
        return item_embeddings

    def predict(self, item_embeddings, o_list):
        y = o_list[-1]
        if self.using_all_hops:
            for i in range(self.n_hop - 1):
                y += o_list[i]

        # [batch_size]
        scores = (item_embeddings * y).sum(dim=1)
        return torch.sigmoid(scores)

    def evaluate(self, items, labels, memories_h, memories_r, memories_t):
        return_dict = self.forward(items, labels, memories_h, memories_r, memories_t)
        scores = return_dict["scores"].detach().cpu().numpy()
        labels = labels.cpu().numpy()
        auc = roc_auc_score(y_true=labels, y_score=scores)
        predictions = [1 if i >= 0.5 else 0 for i in scores]
        acc = np.mean(np.equal(predictions, labels))
        return auc, acc


In [6]:

def ctr_eval(args, model, data, ripple_set, batch_size):
    auc_list = []
    f1_list = []
    model.eval()
    start = 0
    while start < data.shape[0]:
        labels = data[start:start + args.batch_size, 2]
        return_dict = model(*get_feed_dict(args, model, data, ripple_set, start, start + batch_size))
        scores = return_dict["scores"]
        scores = scores.detach().cpu().numpy()
        auc = roc_auc_score(y_true=labels, y_score=scores)
        predictions = [1 if i >= 0.5 else 0 for i in scores]
        f1 = f1_score(y_true=labels, y_pred=predictions)
        auc_list.append(auc)
        f1_list.append(f1)
        start += args.batch_size
    model.train()  
    auc = float(np.mean(auc_list))
    f1 = float(np.mean(f1_list))
    return auc, f1


def topk_eval(args, model, train_data, test_data, ripple_set):
    # logging.info('calculating recall ...')
    k_list = [5, 10, 20, 50, 100]
    recall_list = {k: [] for k in k_list}

    item_set = set(train_data[:,1].tolist() + test_data[:,1].tolist())
    train_record = _get_user_record(args, train_data, True)
    test_record = _get_user_record(args, test_data, False)
    user_list = list(set(train_record.keys()) & set(test_record.keys()))
    user_num = 13498
    if len(user_list) > user_num:
        np.random.seed()    
        user_list = np.random.choice(user_list, size=user_num, replace=False)
    data = np.vstack([train_data, test_data])
    print("len(user_list): ", len(user_list))
    # model.eval()
    # for user in user_list:
    #     test_item_list = list(item_set-set(train_record[user]))
    #     item_score_map = dict()
    #     start = 0
    #     while start + args.batch_size <= len(test_item_list):
    #         items = test_item_list[start:start + args.batch_size] 
    #         # input_data = _get_topk_feed_data(user, items)
    #         return_dict = model(*get_feed_dict(args, model, data, ripple_set, start, start + args.batch_size))
    #         scores = return_dict["scores"]
    #         for item, score in zip(items, scores):
    #             item_score_map[item] = score
    #         start += args.batch_size
    #     # padding the last incomplete mini-batch if exists
    #     if start < len(test_item_list):
    #         res_items = test_item_list[start:] + [test_item_list[-1]] * (args.batch_size - len(test_item_list) + start)
    #         # input_data = _get_topk_feed_data(user, res_items)
    #         return_dict = model(*get_feed_dict(args, model, data, ripple_set, start, start + args.batch_size))
    #         scores = return_dict["scores"]
    #         for item, score in zip(res_items, scores):
    #             item_score_map[item] = score
    #     item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x: x[1], reverse=True)
    #     item_sorted = [i[0] for i in item_score_pair_sorted]
    #     for k in k_list:
    #         hit_num = len(set(item_sorted[:k]) & set(test_record[user]))
    #         recall_list[k].append(hit_num / len(set(test_record[user])))
    # model.train()  
    # recall = [np.mean(recall_list[k]) for k in k_list]
    # _show_recall_info(zip(k_list, recall))
    # return recall

    
def _init_model(args, data_info):
    n_entity = data_info[3]
    n_relation = data_info[4]
    model = RippleNet(args, n_entity, n_relation)
    if args.use_cuda:
        model.cuda()
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr = args.lr,
        weight_decay = args.l2_weight,
    )
    loss_func = nn.BCELoss()
    return model, optimizer, loss_func
    
    
def _get_feed_data(args, data, user_triple_set, item_triple_set, start, end):
    # origin item
    items = torch.LongTensor(data[start:end, 1])
    if args.use_cuda:
        items = items.cuda()
    # kg propagation embeddings
    users_triple = _get_triple_tensor(args, data[start:end,0], user_triple_set)
    items_triple = _get_triple_tensor(args, data[start:end,1], item_triple_set)
    return items, users_triple, items_triple


def _get_feed_label(args,labels):
    labels = torch.FloatTensor(labels)
    if args.use_cuda:
        labels = labels.cuda()
    return labels


def _get_triple_tensor(args, objs, triple_set):
    # [h,r,t]  h: [layers, batch_size, triple_set_size]
    h,r,t = [], [], []
    for i in range(args.n_layer):
        h.append(torch.LongTensor([triple_set[obj][i][0] for obj in objs]))
        r.append(torch.LongTensor([triple_set[obj][i][1] for obj in objs]))
        t.append(torch.LongTensor([triple_set[obj][i][2] for obj in objs]))
        if args.use_cuda:
            h = list(map(lambda x: x.cuda(), h))
            r = list(map(lambda x: x.cuda(), r))
            t = list(map(lambda x: x.cuda(), t))
    return [h,r,t]


def _get_user_record(args, data, is_train):
    user_history_dict = dict()
    for rating in data:
        user = rating[0]
        item = rating[1]
        label = rating[2]
        if is_train or label == 1:
            if user not in user_history_dict:
                user_history_dict[user] = set()
            user_history_dict[user].add(item)
    return user_history_dict


def _get_topk_feed_data(user, items):
    res = list()
    for item in items:
        res.append([user,item])
    return np.array(res)


def _show_recall_info(recall_zip):
    res = ""
    for i,j in recall_zip:
        res += "K@%d:%.4f  "%(i,j)
    logging.info(res)

def get_feed_dict(args, model, data, ripple_set, start, end):
    items = torch.LongTensor(data[start:end, 1])
    labels = torch.LongTensor(data[start:end, 2])
    memories_h, memories_r, memories_t = [], [], []
    for i in range(args.n_hop):
        memories_h.append(torch.LongTensor([ripple_set[user][i][0] for user in data[start:end, 0]]))
        memories_r.append(torch.LongTensor([ripple_set[user][i][1] for user in data[start:end, 0]]))
        memories_t.append(torch.LongTensor([ripple_set[user][i][2] for user in data[start:end, 0]]))
    if args.use_cuda:
        items = items.cuda()
        labels = labels.cuda()
        memories_h = list(map(lambda x: x.cuda(), memories_h))
        memories_r = list(map(lambda x: x.cuda(), memories_r))
        memories_t = list(map(lambda x: x.cuda(), memories_t))
    return items, labels, memories_h, memories_r,memories_t





In [78]:
train_data = data_info[0]
eval_data = data_info[1]
test_data = data_info[2]

train_record = _get_user_record(args, train_data, True)
test_record = _get_user_record(args, test_data, False)
eval_record = _get_user_record(args, eval_data,False)

In [79]:
len(train_data)

95044

In [48]:
train_user_set = set()
for data in train_data:
    user_idx = data[0]
    train_user_set.add(user_idx)
len(train_user_set)

12855

In [32]:
user_list = set(train_record.keys()) | set(test_record.keys()) | set(eval_record.keys())
len(user_list)

12855

In [27]:
len(train_record.keys())

12855

In [23]:
with open("../data/naver-toy/ratings_final.txt", "r") as f:
    ratings_final = f.readlines()

ratings_final = [rating.replace("\n","") for rating in ratings_final]
ratings_final = [rating.split("\t") for rating in ratings_final]
ratings_final[:5]

[['0', '9', '1'],
 ['0', '10', '1'],
 ['0', '3', '1'],
 ['0', '12', '1'],
 ['0', '8', '1']]

In [38]:
user_set = set()
for rating in ratings_final:
    user_idx = rating[0]
    user_set.add(int(user_idx))
len(user_set)

13498

In [83]:
len(ratings_final)

159626

In [44]:
user_set.difference(user_list)

{42,
 101,
 265,
 634,
 646,
 727,
 767,
 848,
 851,
 857,
 886,
 894,
 899,
 917,
 921,
 951,
 973,
 987,
 1003,
 1019,
 1045,
 1055,
 1179,
 1204,
 1270,
 1297,
 1306,
 1345,
 1435,
 1438,
 1481,
 1569,
 1591,
 1611,
 1621,
 1625,
 1653,
 1657,
 1663,
 1664,
 1665,
 1887,
 1966,
 1972,
 1978,
 2108,
 2111,
 2127,
 2141,
 2147,
 2153,
 2154,
 2176,
 2184,
 2296,
 2339,
 2468,
 2477,
 2527,
 2529,
 2570,
 2602,
 2631,
 2872,
 2882,
 2886,
 2905,
 3057,
 3094,
 3100,
 3107,
 3109,
 3115,
 3122,
 3154,
 3199,
 3213,
 3218,
 3304,
 3468,
 3552,
 3598,
 3671,
 3735,
 3744,
 3835,
 3848,
 3939,
 4189,
 4199,
 4202,
 4216,
 4248,
 4275,
 4304,
 4419,
 4429,
 4503,
 4504,
 4590,
 4653,
 4663,
 4665,
 4706,
 4853,
 4905,
 4936,
 4946,
 4972,
 4975,
 5097,
 5188,
 5191,
 5217,
 5232,
 5251,
 5265,
 5298,
 5399,
 5400,
 5444,
 5464,
 5499,
 5502,
 5519,
 5524,
 5532,
 5550,
 5555,
 5571,
 5602,
 5676,
 5718,
 5750,
 6029,
 6037,
 6040,
 6071,
 6082,
 6103,
 6277,
 6332,
 6335,
 6338,
 6357,
 637

In [7]:
import numpy as np
import torch
import torch.nn as nn 
from sklearn.metrics import roc_auc_score, f1_score

import logging

logging.basicConfig(format="[%(asctime)s] %(levelname)s: %(message)s", level=logging.INFO)


def train(args, data_info, show_loss):
    # logging.info("================== training CKAN ====================")
    train_data = data_info[0]
    eval_data = data_info[1]
    test_data = data_info[2]
    n_entity = data_info[3]
    n_relation = data_info[4]
    ripple_set = data_info[5]
    # model, optimizer, loss_func = _init_model(args, data_info)
    model = RippleNet(args, n_entity, n_relation)
    if args.use_cuda:
        model.cuda()
    optimizer = torch.optim.Adam(
        filter(lambda p : p.requires_grad, model.parameters()), args.lr
    )



    for step in range(10):
        np.random.shuffle(train_data)
        start = 0
        while start < train_data.shape[0]:
            return_dict = model(*get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))
            loss = return_dict["loss"]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            start += args.batch_size
        eval_auc, eval_f1 = ctr_eval(args, model, eval_data, ripple_set, args.batch_size)
        test_auc, test_f1 = ctr_eval(args, model, test_data, ripple_set, args.batch_size)
        ctr_info = 'epoch %.2d    eval auc: %.4f f1: %.4f    test auc: %.4f f1: %.4f'
        logging.info(ctr_info, step, eval_auc, eval_f1, test_auc, test_f1)
        if args.show_topk:
            topk_eval(args, model, train_data, test_data, ripple_set)
        #     # recall = topk_eval(args, model, train_data, test_data, user_triple_set, item_triple_set)
        #     # print(f"recall : {recall}")



In [8]:
train(args, data_info, show_loss)

[2022-03-30 14:51:36,095] INFO: epoch 00    eval auc: 0.5036 f1: 0.3761    test auc: 0.4946 f1: 0.3677


len(user_list):  8538


[2022-03-30 14:51:39,406] INFO: epoch 01    eval auc: 0.5133 f1: 0.2496    test auc: 0.5089 f1: 0.2384


len(user_list):  8538


[2022-03-30 14:51:42,751] INFO: epoch 02    eval auc: 0.5303 f1: 0.2633    test auc: 0.5254 f1: 0.2551


len(user_list):  8538


[2022-03-30 14:51:46,015] INFO: epoch 03    eval auc: 0.5446 f1: 0.2977    test auc: 0.5401 f1: 0.2923


len(user_list):  8538


[2022-03-30 14:51:49,338] INFO: epoch 04    eval auc: 0.5501 f1: 0.3022    test auc: 0.5480 f1: 0.2933


len(user_list):  8538


[2022-03-30 14:51:52,630] INFO: epoch 05    eval auc: 0.5667 f1: 0.2885    test auc: 0.5632 f1: 0.2849


len(user_list):  8538


[2022-03-30 14:51:56,012] INFO: epoch 06    eval auc: 0.5811 f1: 0.3146    test auc: 0.5752 f1: 0.3068


len(user_list):  8538


[2022-03-30 14:51:59,355] INFO: epoch 07    eval auc: 0.5872 f1: 0.3383    test auc: 0.5806 f1: 0.3313


len(user_list):  8538


[2022-03-30 14:52:02,750] INFO: epoch 08    eval auc: 0.5976 f1: 0.3521    test auc: 0.5898 f1: 0.3434


len(user_list):  8538


[2022-03-30 14:52:06,109] INFO: epoch 09    eval auc: 0.6035 f1: 0.3759    test auc: 0.5967 f1: 0.3696


len(user_list):  8538
