In [1]:
import numpy as np
import torch
import logging
from model import RippleNet
from sklearn.metrics import roc_auc_score, f1_score

import multiprocessing
from time import time

from prettytable import PrettyTable

from data_loader import load_data

In [2]:
import argparse
args = argparse.Namespace()
args.dataset = "naver-toy"
args.dim=32
args.n_hop=2
args.kge_weight=0.01
args.l2_weight=1e-7
args.lr = 0.02
args.batch_size = 256
args.n_epoch = 10
args.n_memory=32
args.item_update_mode="plus_transform"
args.using_all_hops=True
args.use_cuda = True
args.show_topk =True
args.gpu_id=0
args.Ks=[20,40,60,80,100]
args.test_flag = "part"


In [3]:
Ks = args.Ks
BATCH_SIZE = args.batch_size
batch_test_flag = True

cores = multiprocessing.cpu_count() // 2
device = torch.device("cuda")

In [4]:
data_info = load_data(args)
show_loss = False

reading rating file ...
splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


# Train

In [5]:
from utils import get_feed_dict, _get_user_record, _get_item_record
from helper import early_stopping
import heapq
from metrics import *

In [7]:
train_data = data_info[0]
eval_data = data_info[1]
test_data = data_info[2]
n_entity = data_info[3]
n_relation = data_info[4]
ripple_set = data_info[5]

In [8]:
all_data = np.vstack([train_data, eval_data, train_data])

In [9]:
all_user_set = _get_user_record(all_data, True)
all_item_set = _get_item_record(all_data)

In [10]:
all_data.shape

(221122, 3)

In [11]:
n_items = len(all_user_set.keys())
n_users = len(all_item_set.keys())

In [12]:
def make_cf(data):
    def _get_user_record2(data):
        user_history_dict = dict()
        for rating in data:
            user = rating[0]
            item = rating[1]
            label = rating[2]
            if label == 1:
                if user not in user_history_dict:
                    user_history_dict[user] = set()
                    # user_history_dict[user] = list()
                user_history_dict[user].add(item)
                # user_history_dict[user].append(item)
        return user_history_dict
    train_user_set2 = _get_user_record2(data)
    arr = list()
    for key in train_user_set2.keys():
        for value in train_user_set2[key]:
            arr.append(np.array([key, value]))
    arr = np.array(arr)
    return arr

In [13]:
train_cf = make_cf(train_data)
test_cf = make_cf(test_data)

In [14]:
train_cf

array([[    0,     1],
       [    0,     2],
       [    0,     5],
       ...,
       [13494,  7281],
       [13496,  7283],
       [13497,  7284]], dtype=int32)

In [15]:
from collections import defaultdict
def remap_item(train_data, test_data):
    # global n_users, n_items
    train_user_dict = defaultdict(list)
    test_user_dict = defaultdict(list)
    n_users = max(max(train_data[:, 0]), max(test_data[:, 0])) + 1
    n_items = max(max(train_data[:, 1]), max(test_data[:, 1])) + 1

    for u_id, i_id in train_data:
        train_user_dict[int(u_id)].append(int(i_id))
    for u_id, i_id in test_data:
        test_user_dict[int(u_id)].append(int(i_id))
    return n_users, n_items, train_user_dict, test_user_dict

n_users2, n_items2, train_user_dict, test_user_dict = remap_item(train_cf, test_cf)

In [16]:
n_users

7133

In [17]:
n_users2

13498

In [18]:
n_items2


7285

In [19]:
n_items

12846

In [20]:
n_entity

7348

In [None]:


def ranklist_by_heapq(user_pos_test, test_items, rating, Ks):
    item_score = {}
    for i in test_items:
        print(f"i: {i}, len(rating): {len(rating)}")
        item_score[i] = rating[i]

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

    r = []
    for i in K_max_item_score:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = 0.
    return r, auc

def get_auc(item_score, user_pos_test):
    item_score = sorted(item_score.items(), key=lambda kv: kv[1])
    item_score.reverse()
    item_sort = [x[0] for x in item_score]
    posterior = [x[1] for x in item_score]

    r = []
    for i in item_sort:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = AUC(ground_truth=r, prediction=posterior)
    return auc

def ranklist_by_sorted(user_pos_test, test_items, rating, Ks):
    item_score = {}
    for i in test_items:
        item_score[i] = rating[i]

    K_max = max(Ks)
    K_max_item_score = heapq.nlargest(K_max, item_score, key=item_score.get)

    r = []
    for i in K_max_item_score:
        if i in user_pos_test:
            r.append(1)
        else:
            r.append(0)
    auc = get_auc(item_score, user_pos_test)
    return r, auc

def get_performance(user_pos_test, r, auc, Ks):
    precision, recall, ndcg, hit_ratio = [], [], [], []

    for K in Ks:
        precision.append(precision_at_k(r, K))
        recall.append(recall_at_k(r, K, len(user_pos_test)))
        ndcg.append(ndcg_at_k(r, K, user_pos_test))
        hit_ratio.append(hit_at_k(r, K))

    return {'recall': np.array(recall), 'precision': np.array(precision),
            'ndcg': np.array(ndcg), 'hit_ratio': np.array(hit_ratio), 'auc': auc}


def test_one_user(x):
    # user u's ratings for user u
    rating = x[0] # rate_batch
    # uid
    u = x[1]      # user_list_batch
    # user u's items in the training set
    try:
        training_items = train_user_set[u]
    except Exception:
        training_items = []
    # user u's items in the test set
    user_pos_test = test_user_set[u]

    # all_items = set(range(0, n_entity))
    all_items = set(range(0, n_items))

    test_items = list(all_items - set(training_items))

    if args.test_flag == 'part':
        r, auc = ranklist_by_heapq(user_pos_test, test_items, rating, Ks)
    else:
        r, auc = ranklist_by_sorted(user_pos_test, test_items, rating, Ks)

    return get_performance(user_pos_test, r, auc, Ks)



def test(args, model, data_info):
    global Ks
    Ks = args.Ks
    
    result = {'precision': np.zeros(len(Ks)),
              'recall': np.zeros(len(Ks)),
              'ndcg': np.zeros(len(Ks)),
              'hit_ratio': np.zeros(len(Ks)),
              'auc': 0.}
    # args = parse_args()
    
    device = torch.device("cuda")
    BATCH_SIZE = args.batch_size
    batch_test_flag = args.batch_size

    global n_entity
    train_data = data_info[0]
    eval_data = data_info[1]
    test_data = data_info[2]
    n_entity = data_info[3]
    n_relation = data_info[4]
    ripple_set = data_info[5]

    train_cf = make_cf(train_data)
    test_cf = make_cf(test_data)
    global test_user_set, train_user_set
    n_users, n_items, train_user_set, test_user_set = remap_item(train_cf, test_cf)

    user_dict = {
        "train_user_set":train_user_set,
        "test_user_set":test_user_set
    }

    n_nodes = n_entity + n_users

    
    
    # train_user_set = user_dict['train_user_set']
    # test_user_set = user_dict['test_user_set']

    pool = multiprocessing.Pool(cores)

    u_batch_size = BATCH_SIZE
    i_batch_size = BATCH_SIZE

    test_users = list(test_user_set.keys())
    n_test_users = len(test_users)
    n_user_batchs = n_test_users // u_batch_size + 1

    count = 0
    
        
    
    for u_batch_id in range(n_user_batchs):
        start = u_batch_id * u_batch_size
        end = min((u_batch_id+1)*u_batch_size, n_test_users)

        user_list_batch = test_users[start: end]
        user_batch = torch.LongTensor(np.array(user_list_batch)).to(device)
        user_emb = model.get_user_embeddings(*get_feed_dict(args, model, test_data, ripple_set, start, end))

        if batch_test_flag:
            # batch-item test
            n_item_batchs = n_items // i_batch_size + 1
            rate_batch = np.zeros(shape=(len(user_batch), n_items))

            i_count = 0
            for i_batch_id in range(n_item_batchs):
                i_start = i_batch_id * i_batch_size
                i_end = min((i_batch_id + 1) * i_batch_size, n_items)
                # print(f"i_start: {i_start}, i_end: {i_end}")
                item_batch = torch.LongTensor(np.array(range(i_start, i_end))).view(i_end-i_start).to(device)
                item_emb = model.entity_emb(item_batch)

                i_rate_batch = model.rating(user_emb, item_emb).detach().cpu()
                # print(f"rate_batch.shape: {rate_batch[:, i_start:i_end].shape}")
                # print(f"i_rate_batch.shape: {i_rate_batch.shape}")
                rate_batch[:, i_start: i_end] = i_rate_batch
                # print(f"len(user_batch) : {len(user_batch)}")
                
                i_count += i_rate_batch.shape[1]

            assert i_count == n_items
        else:
            # all-item test
            item_batch = torch.LongTensor(np.array(range(0, n_items))).view(n_items, -1).to(device)
            
            rate_batch = model.rating(user_emb, item_emb).detach().cpu()

        user_batch_rating_uid = zip(rate_batch, user_list_batch)
        batch_result = pool.map(test_one_user, user_batch_rating_uid)
        count += len(batch_result)
        for re in batch_result:
            result['precision'] += re['precision']/n_test_users
            result['recall'] += re['recall']/n_test_users
            result['ndcg'] += re['ndcg']/n_test_users
            result['hit_ratio'] += re['hit_ratio']/n_test_users
            result['auc'] += re['auc']/n_test_users

    assert count == n_test_users
    pool.close()
    return result


In [None]:
train_data = data_info[0]
eval_data = data_info[1]
test_data = data_info[2]
n_entity = data_info[3]
n_relation = data_info[4]
ripple_set = data_info[5]

model = RippleNet(args, n_entity, n_relation)
if args.use_cuda:
    model.cuda()
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()),
    args.lr,
)



cur_best_pre_0 = 0
stopping_step = 0
should_stop = False
with open(f"./training_log/RippleNet_{args.dataset}_{args.lr}.txt","w") as f:
    for step in range(args.n_epoch):
        # training
        np.random.shuffle(train_data)
        start = 0
        train_s_t = time()
        while start < train_data.shape[0]:
            return_dict = model(*get_feed_dict(args, model, train_data, ripple_set, start, start + args.batch_size))
            loss = return_dict["loss"]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            start += args.batch_size
            if show_loss:
                print('%.1f%% %.4f' % (start / train_data.shape[0] * 100, loss.item()))
        train_e_t = time()
        # evaluation

        test_s_t = time()
        ret = test(args, model, data_info)
        test_e_t = time()
        result_table = PrettyTable()
        result_table.field_names = ["Epoch", "training time", "tesing time", "Loss", "recall", "ndcg", "precision", "hit_ratio"]
        result_table.add_row(
            [step, train_e_t - train_s_t, test_e_t - test_s_t, loss.item(), ret['recall'], ret['ndcg'], ret['precision'], ret['hit_ratio']]
        )

        # cur_best_pre_0, stopping_step, should_stop = early_stopping(ret['recall'][0], cur_best_pre_0,
        #                                                                     stopping_step, expected_order='acc',
        #                                                                     flag_step=10)
        if ret['recall'][0] == cur_best_pre_0 and args.save:

            torch.save(model.state_dict(), args.out_dir + 'model_' + args.dataset + '.ckpt')
        print(result_table)
        f.write(str(result_table)+"\n")