In [1]:
import numpy as np
import torch
import logging
from model import RippleNet
from sklearn.metrics import roc_auc_score, f1_score

import multiprocessing
from time import time

from prettytable import PrettyTable

from data_loader import load_data
from utils import get_feed_dict, _get_topk_feed_data, _get_user_record
import os
import random
seed = 2020
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [2]:
import argparse
args = argparse.Namespace()
args.dataset = "naver-toy"
args.dim=32
args.n_hop=2
args.kge_weight=0.01
args.l2_weight=1e-7
args.lr = 0.001
args.batch_size = 256
args.n_epoch = 100
args.n_memory=32
args.item_update_mode="plus_transform"
args.using_all_hops=True
args.use_cuda = True
args.show_topk =True
args.gpu_id=0
args.Ks=[20,40,60,80,100]
args.test_flag = "part"



In [3]:
Ks = args.Ks
BATCH_SIZE = args.batch_size
batch_test_flag = True

cores = multiprocessing.cpu_count() // 2
device = torch.device("cuda")

In [4]:
data_info = load_data(args)
show_loss = False

reading rating file ...
splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


In [5]:
show_loss = False

# Train Function

In [6]:
n_user, n_item, train_data, eval_data, test_data, n_entity, n_relation, ripple_set= data_info




In [7]:
n_params = {
    "n_users" : n_user,
    "n_items" : n_item,
    "n_entity" : n_entity,
    "n_relation" : n_relation
}

In [8]:
train_user_set = _get_user_record(train_data, False)
test_user_set = _get_user_record(test_data, False)
user_dict = {
    "train_user_set" : train_user_set,
    "test_user_set" : test_user_set
}

In [17]:
from metrics import *
from parser import parse_args

import heapq


def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    item_score = {}
    for i in test_items:
        item_score[i] = rating[i]

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)
    # print(K_max_item_score)
    r = []
    for i in K_max_item_score:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = 0.
    return r, auc

def get_auc(item_score, user_pos_test):
    item_score = sorted(item_score.items(), key=lambda kv: kv[1])
    item_score.reverse()
    item_sort = [x[0] for x in item_score]
    posterior = [x[1] for x in item_score]

    r = []
    for i in item_sort:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = AUC(ground_truth=r, prediction=posterior)
    return auc

def ranklist_by_sorted(user_pos_test, test_items, rating, Ks):
    item_score = {}
    for i in test_items:
        item_score[i] = rating[i]

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

    r = []
    for i in K_max_item_score:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = get_auc(item_score, user_pos_test)
    return r, auc

def get_performance(user_pos_test, r, auc, Ks):
    precision, recall, ndcg, hit_ratio = [], [], [], []

    for K in Ks:
        precision.append(precision_at_k(r, K))
        recall.append(recall_at_k(r, K, len(user_pos_test)))
        ndcg.append(ndcg_at_k(r, K, user_pos_test))
        hit_ratio.append(hit_at_k(r, K))

    return {'recall': np.array(recall), 'precision': np.array(precision),
            'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc}


def test_one_user(x):
    # user u's ratings for user u
    rating = x[0] # rate_batch
    # uid
    u = x[1]      # user_list_batch
    # user u's items in the training set
    try:
        training_items = train_user_set[u]
    except Exception:
        training_items = []
    # user u's items in the test set
    user_pos_test = test_user_set[u]

    all_items = set(range(0, n_items))

    test_items = list(all_items - set(training_items))

    if args.test_flag == 'part':
        r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks)
    else:
        r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

    return get_performance(user_pos_test, r, auc, Ks)

def topk_settings(show_topk, train_data, test_data, n_item):
    if show_topk:
        user_num = 100
        k_list = [1, 2, 5, 10, 20, 50, 100]
        train_record = _get_user_record(train_data, True)
        test_record = _get_user_record(test_data, False)
        user_list = list(set(train_record.keys()) & set(test_record.keys()))
        if len(user_list) > user_num:
            user_list = np.random.choice(user_list, size=user_num, replace=False)
        item_set = set(list(range(n_item)))
        return user_list, train_record, test_record, item_set, k_list
    else:
        return [None] * 5


def _show_info(recall_zip, title):
    res = title+": "
    for i,j in recall_zip:
        res += "K@%d:%.4f  "%(i,j)
    logging.info(res)

In [18]:
model = RippleNet(args, n_entity, n_relation)
if args.use_cuda:
    model.cuda()

optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    args.lr,
)


In [15]:
def topk_eval(args, model, train_data, test_data, n_item, n_user, ripple_set, batch_size):
    u_batch_size = args.batch_size
    i_batch_size = args.batch_size
    user_list, train_record, test_record, item_set, k_list = topk_settings(show_topk=args.show_topk, 
                                                                            train_data=train_data, 
                                                                            test_data=test_data, 
                                                                            n_item=n_item)
                                                                        
    test_users = list(test_user_set.keys())
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1

    count = 0
    # test_item_list  = list(item_set - train_record[user])

    test_users = list(test_record.keys())
    n_user_batchs = n_test_users // u_batch_size + 1
    count = 0
    # for user in test_users:
        # test_item_list = list(item_set - train_record[user])
                                                                            
    precision_list = {k:[] for k in k_list}
    recall_list = {k:[] for k in k_list}
    ndcg_list = {k:[] for k in k_list}

    model.eval()

    item_score_map = dict()
    n_item_batchs = n_item // i_batch_size + 1
    i_count = 0
    for user in test_users:
        for i_batch_id in range(n_item_batchs):
            i_start = i_batch_id * i_batch_size
            i_end = min((i_batch_id+1)*i_batch_size, n_item)

            batch_test_data = test_data[i_start, i_end]
            items, labels, memories_h, memories_r,memories_t = get_feed_dict(args, model, test_data, ripple_set, 0, args.batch_size)
            return_dict = model(items, labels, memories_h, memories_r,memories_t)
            scores = return_dict["scores"]
            
            for item, score in zip(items, scores):
                item_score_map[item] = score

        item_score_pair_sorted = sorted(item_score_map.items(), key=lambda x : x[1], reverse=True)
        item_sorted = [i[0] for i in item_score_pair_sorted]
        for k in k_list:
            hit_num = len(set(item_sorted[:k]) & set(test_record[user]))
            recall_list[k].append(hit_num / len(set(test_record[user])))
            precision_list[k].append(hit_num / k)
    model.train()  
    recall_k = [np.mean(recall_list[k]) for k in k_list]
    precision_k = [np.mean(precision_list[k]) for k in k_list]
    _show_info(zip(k_list, recall_k), "recall")
    _show_info(zip(k_list, precision_k), "precision")


SyntaxError: invalid syntax (534117491.py, line 18)

In [None]:

log_directory = f"./training_log/{args.dataset}/"
if not os.path.exists(log_directory):
    os.makedirs(log_directory)
with open(log_directory+f"RippleNet_{args.lr}.txt","w") as f:
    for step in range(args.n_epoch):
        # training
        np.random.shuffle(train_data)
        start = 0
        train_s_t = time()
        while start < train_data.shape[0]:
            return_dict = model(*get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))
            loss = return_dict["loss"]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            start += args.batch_size
            if show_loss:
                print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss.item()))
        train_e_t = time()
        ret = test(model, user_dict, n_params)
        test_s_t = time()
        
        test_e_t = time()

        test_res = PrettyTable()
        test_res.field_names = ["Epoch", "training time", "tesing time", "Loss", "recall",  "precision", "ndcg"]
        test_res.add_row(
            [step, train_e_t-train_s_t, test_e_t-test_s_t, loss.item(), ret["recall"], ret["precision"], ret["ndcg"]]
        )
        print(test_res)