In [1]:
import numpy as np
import torch
import logging
from model import RippleNet
from sklearn.metrics import roc_auc_score, f1_score

import multiprocessing
from time import time

from prettytable import PrettyTable

from data_loader import load_data
from utils import get_feed_dict, _get_topk_feed_data, _get_user_record, topk_settings
import os

In [2]:
import argparse

# Run configuration for this notebook. It is built as a Namespace so it can be
# handed to the helpers that take an `args` object (load_data, RippleNet,
# get_feed_dict and the evaluation functions below).
args = argparse.Namespace(
    # data
    dataset="naver-toy",
    # model
    dim=32,
    n_hop=2,
    n_memory=32,
    item_update_mode="plus_transform",
    using_all_hops=True,
    # optimisation
    kge_weight=0.01,
    l2_weight=1e-7,
    lr=0.02,
    batch_size=256,
    n_epoch=2,
    # hardware
    use_cuda=True,
    gpu_id=0,
    # top-K evaluation
    show_topk=True,
    Ks=[20, 40, 60, 80, 100],
    test_flag="part",
)


In [3]:
# Convenience aliases for values that live on the run configuration.
Ks = args.Ks
BATCH_SIZE = args.batch_size
batch_test_flag = True

# Half of the logical CPUs, but never fewer than one (cpu_count() // 2 is 0 on
# a single-CPU host).
cores = max(1, multiprocessing.cpu_count() // 2)

# Derive the device from the configuration instead of hard-coding "cuda", so
# that `args.use_cuda` and `args.gpu_id` are actually honoured.
device = torch.device(f"cuda:{args.gpu_id}" if args.use_cuda else "cpu")

In [4]:
# When True, the training loop prints progress and loss after every batch.
show_loss = False

# Load everything the run needs in one call; the result is unpacked further
# down into the user/item/entity/relation counts, the data splits and the
# ripple set.
data_info = load_data(args)

reading rating file ...
splitting dataset ...
reading KG file ...
constructing knowledge graph ...
constructing ripple set ...


In [5]:
def ctr_eval(args, model, data, ripple_set, batch_size):
    """Compute batch-averaged AUC and F1 of ``model`` on ``data``.

    The data is walked in consecutive slices of ``batch_size`` rows. For each
    slice the model is called on the output of ``get_feed_dict`` and the
    ``"scores"`` entry of its result is compared with the labels taken from
    column 2 of ``data``.

    Args:
        args: run configuration, forwarded to ``get_feed_dict``.
        model: callable model returning a dict with a ``"scores"`` tensor.
        data: 2-D array; column 2 holds the labels.
        ripple_set: forwarded to ``get_feed_dict``.
        batch_size: number of rows per evaluation slice.

    Returns:
        ``(auc, f1)`` as Python floats, each the mean of the per-batch values.

    Raises:
        ValueError: propagated from ``roc_auc_score`` when a batch contains a
            single label class.
    """
    auc_list = []
    f1_list = []
    model.eval()
    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Labels, feed slice and step all use the same `batch_size`; mixing
            # it with `args.batch_size` misaligns labels and scores whenever
            # the two values differ.
            for start in range(0, data.shape[0], batch_size):
                end = start + batch_size
                labels = data[start:end, 2]
                return_dict = model(*get_feed_dict(args, model, data, ripple_set, start, end))
                scores = return_dict["scores"].detach().cpu().numpy()
                auc = roc_auc_score(y_true=labels, y_score=scores)
                predictions = [1 if i >= 0.5 else 0 for i in scores]
                f1 = f1_score(y_true=labels, y_pred=predictions)
                auc_list.append(auc)
                f1_list.append(f1)
    finally:
        # Always hand the model back in training mode, even if a batch failed.
        model.train()
    auc = float(np.mean(auc_list))
    f1 = float(np.mean(f1_list))
    return auc, f1



def evaluation(args, model, data, ripple_set, batch_size):
    """Average the per-batch results of ``model.evaluate`` over ``data``.

    Args:
        args: run configuration, forwarded to ``get_feed_dict``.
        model: model exposing ``evaluate(...)`` that returns an ``(auc, acc)``
            pair for one batch.
        data: 2-D array that is walked in slices of ``batch_size`` rows.
        ripple_set: forwarded to ``get_feed_dict``.
        batch_size: number of rows per evaluation slice.

    Returns:
        ``(mean_auc, mean_acc)`` as Python floats.
    """
    auc_list = []
    acc_list = []
    model.eval()
    try:
        # Evaluation never back-propagates, so skip building autograd graphs.
        with torch.no_grad():
            for start in range(0, data.shape[0], batch_size):
                auc, acc = model.evaluate(
                    *get_feed_dict(args, model, data, ripple_set, start, start + batch_size)
                )
                auc_list.append(auc)
                acc_list.append(acc)
    finally:
        # Restore training mode even when a batch raises.
        model.train()
    return float(np.mean(auc_list)), float(np.mean(acc_list))




def _show_recall_info(recall_zip):
    res = ""
    for i,j in recall_zip:
        res += "K@%d:%.4f  "%(i,j)
    # logging.info(res)
    return res

def topk_eval(args, model, train_data, test_data, n_item, ripple_set, batch_size):
    """Rank unseen items per user and report recall / precision / NDCG at K.

    For every user returned by ``topk_settings`` the candidate items are all
    items in ``item_set`` that are not in the user's training record. The
    candidates are scored in batches of ``batch_size``, sorted by descending
    score, and compared with the user's test record at each cutoff in
    ``k_list``.

    Args:
        args: run configuration, forwarded to ``get_feed_dict``.
        model: callable model returning a dict with a ``"scores"`` tensor.
        train_data: training interactions, forwarded to ``topk_settings``.
        test_data: test interactions, forwarded to ``topk_settings``.
        n_item: number of items, forwarded to ``topk_settings``.
        ripple_set: forwarded to ``get_feed_dict``.
        batch_size: number of candidate items scored per model call.

    Returns:
        ``(recall, precision, ndcg)`` in that order. Each is a list aligned
        with ``k_list`` holding the mean over the evaluated users.
    """
    user_list, train_record, test_record, item_set, k_list = topk_settings(
        show_topk=True,
        train_data=train_data,
        test_data=test_data,
        n_item=n_item,
    )
    precision_list = {k: [] for k in k_list}
    recall_list = {k: [] for k in k_list}
    ndcg_list = {k: [] for k in k_list}

    model.eval()
    try:
        # Ranking never back-propagates, so skip building autograd graphs.
        with torch.no_grad():
            for user in user_list:
                test_item_list = list(item_set - train_record[user])
                test_items = set(test_record[user])
                # Without candidates or test positives the ratios below are
                # undefined (division by zero), so such users are skipped.
                if not test_item_list or not test_items:
                    continue

                item_score_map = dict()
                for start in range(0, len(test_item_list), batch_size):
                    items = test_item_list[start:start + batch_size]
                    # The last batch is padded with the final item so every
                    # model call sees exactly `batch_size` rows; the padding
                    # scores are dropped by the zip() below.
                    padded_items = items + [test_item_list[-1]] * (batch_size - len(items))
                    # Score this user's candidates. Previously `input_data` was
                    # built but the first rows of `test_data` were fed to the
                    # model instead.
                    # NOTE(review): assumes _get_topk_feed_data returns rows in
                    # the same layout get_feed_dict expects for `data` - confirm.
                    input_data = _get_topk_feed_data(user, padded_items)
                    return_dict = model(*get_feed_dict(args, model, input_data, ripple_set, 0, batch_size))
                    scores = return_dict["scores"].detach().cpu().numpy()
                    for item, score in zip(items, scores):
                        # Plain floats make the sort below cheap and device-free.
                        item_score_map[item] = float(score)

                item_sorted = sorted(item_score_map, key=item_score_map.get, reverse=True)

                # 1 where the ranked item is a held-out positive for this user.
                # The old check tested membership in the candidate list itself,
                # which is true for every ranked item.
                ground_truth = [1 if item in test_items else 0 for item in item_sorted]

                for k in k_list:
                    top_k = item_sorted[:k]
                    hit_num = len(set(top_k) & test_items)
                    recall_list[k].append(hit_num / len(test_items))
                    # NOTE(review): argument semantics of ndcg_at_k are not
                    # visible here; the call is kept as it was - confirm
                    # against the metrics module.
                    ndcg_list[k].append(ndcg_at_k(top_k, k, ground_truth=ground_truth))
                    precision_list[k].append(hit_num / len(set(top_k)))
    finally:
        # Restore training mode even when scoring raises.
        model.train()

    recall = [np.mean(recall_list[k]) for k in k_list]
    precision = [np.mean(precision_list[k]) for k in k_list]
    ndcg = [np.mean(ndcg_list[k]) for k in k_list]

    return recall, precision, ndcg

# Train

In [6]:
from utils import get_feed_dict, _get_user_record, _get_item_record
from helper import early_stopping
import heapq
from metrics import *

In [7]:
# Split the loaded bundle into its eight components.
(
    n_user,
    n_item,
    n_entity,
    n_relation,
    train_data,
    eval_data,
    test_data,
    ripple_set,
) = data_info

In [8]:
# Show the item count that came out of data_info as the cell output.
n_item

7285

In [9]:
# Build the model and move it to the GPU when the configuration asks for it.
model = RippleNet(args, n_entity, n_relation)
if args.use_cuda:
    model.cuda()

# Adam over the trainable parameters only, with the configured learning rate.
optimizer = torch.optim.Adam(
    [param for param in model.parameters() if param.requires_grad],
    lr=args.lr,
)

# Users, per-user train/test records, item set and K cutoffs for top-K evaluation.
user_list, train_record, test_record, item_set, k_list = topk_settings(
    show_topk=True,
    train_data=train_data,
    test_data=test_data,
    n_item=n_item,
)

In [10]:
# Per-dataset log directory; exist_ok avoids the check-then-create race.
log_directory = f"./training_log/{args.dataset}/"
os.makedirs(log_directory, exist_ok=True)
log_path = os.path.join(log_directory, f"RippleNet_{args.lr}.txt")

with open(log_path, "w") as f:
    for step in range(args.n_epoch):
        # training
        np.random.shuffle(train_data)
        train_s_t = time()
        for start in range(0, train_data.shape[0], args.batch_size):
            end = start + args.batch_size
            return_dict = model(*get_feed_dict(args, model, train_data, ripple_set, start, end))
            loss = return_dict["loss"]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if show_loss:
                print('%.1f%% %.4f' % (end / train_data.shape[0] * 100, loss.item()))
        train_e_t = time()

        # Metrics on the training split; reported in the table below.
        train_auc, train_acc = evaluation(args, model, train_data, ripple_set, args.batch_size)

        test_s_t = time()
        # topk_eval returns (recall, precision, ndcg) in that order; unpacking
        # it as (precision, recall, ndcg) swapped the two reported columns.
        recall_k, precision_k, ndcg_k = topk_eval(
            args, model, train_data, test_data, n_item, ripple_set, args.batch_size
        )
        test_e_t = time()

        test_res = PrettyTable()
        test_res.field_names = [
            "Epoch", "training time", "tesing time", "Loss",
            "train auc", "train acc", "recall", "precision", "ndcg",
        ]
        # "Loss" is the loss of the last training batch of the epoch.
        test_res.add_row(
            [step, train_e_t-train_s_t, test_e_t-test_s_t, loss.item(),
             train_auc, train_acc, recall_k, precision_k, ndcg_k]
        )
        print(test_res)

        # Persist the epoch summary; the log file used to be opened but was
        # never written to.
        f.write(str(test_res) + "\n")
        f.flush()