In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

import jax
import jax.numpy as jnp
import flax.linen as nn
from tqdm import tqdm
import polars as pl
import pandas as pd
import numpy as np

import mlflow
from dotenv import load_dotenv
load_dotenv("../.env")

import sys
sys.path.append("..")
from herec.utils import *
from herec.loader import *
from herec.reader import *
from herec.trainer import *
from herec.model import *

In [None]:
def restoreDataAndPrediction(datasetName, modelName, seed, top_k=100):
    """Load the test split and the trained model of one run, then rank items per user.

    Args:
        datasetName: One of "Twitch100K", "ML100K_IMPLICIT", "ML1M_IMPLICIT",
            "FourSquare".
        modelName: Model identifier. It is used to build the MLflow experiment
            name ``f"{datasetName}-{modelName}-TEST"`` and is passed to ``getModel``.
        seed: Seed passed to the reader and matched against the run's
            ``params.seed`` value.
        top_k: Number of highest-scoring item indices kept per user. Defaults
            to 100, the value that used to be hard-coded.

    Returns:
        A tuple ``(DATA, topk_indices)`` where ``DATA`` is whatever
        ``reader.get(seed, "test")`` returns and ``topk_indices`` is the index
        output of ``jax.lax.top_k`` applied to the predicted scores.

    Raises:
        ValueError: If ``datasetName`` is not one of the supported datasets.
        LookupError: If no run of the experiment matches ``seed``.
    """

    # Resolve the reader first so that a typo in the dataset name fails with a
    # clear message instead of an UnboundLocalError further down.
    if datasetName == "Twitch100K":
        reader = Twitch100K()
    elif datasetName == "ML100K_IMPLICIT":
        reader = ML100K_IMPLICIT()
    elif datasetName == "ML1M_IMPLICIT":
        reader = ML1M_IMPLICIT()
    elif datasetName == "FourSquare":
        reader = FourSquare()
    else:
        raise ValueError(f"Unknown datasetName: {datasetName!r}")

    experiment_name = f"{datasetName}-{modelName}-TEST"
    print( experiment_name, seed )

    # Read the data
    DATA = reader.get(seed, "test")

    # Get the run ID of the run that was trained with this seed
    run_ids = (
        pl.from_pandas(mlflow.search_runs( experiment_names=[experiment_name] ))
        .filter( pl.col("params.seed").cast(int) == seed )
        .get_column("run_id")
    )
    if len(run_ids) == 0:
        raise LookupError(f"No MLflow run found in experiment {experiment_name!r} with seed {seed}")
    run_id = run_ids[0]

    # Get the model parameters / hyperparameters
    params = restoreModelParams( run_id, -1 )
    hyparams = restoreHyperParams( run_id )

    # Get the model
    model = getModel( modelName, hyparams, DATA )

    # Score the evaluation users and keep only the indices of the best items
    pred_scores = model.apply({"params": params}, DATA["df_EVALUATION"]["user_ids"], method=model.get_all_scores_by_user_ids)
    topk_indices = jax.lax.top_k( pred_scores, top_k )[1]

    return DATA, topk_indices

In [None]:
def calcNovelty(DATA, topk_indices, k):
    """Average self-information (-log2 p) of the top-k recommended items.

    ``p`` of an item is the number of rows it has in ``DATA["df_TRAIN"]``
    divided by the number of distinct ``user_id`` values in that frame.
    Items absent from the training frame get ``p = 0`` and are excluded
    from the average.

    Args:
        DATA: Mapping providing ``"df_TRAIN"`` (frame with ``item_id`` and
            ``user_id`` columns) and ``"item_num"``.
        topk_indices: 2-D array of recommended item indices, one row per
            recommendation list.
        k: Number of leading columns of ``topk_indices`` to evaluate.

    Returns:
        The mean of ``-log2(p)`` over all evaluated items with ``p != 0``,
        converted with ``.tolist()``.
    """
    train = DATA["df_TRAIN"]
    user_count = train.get_column("user_id").n_unique()

    # Per-item interaction ratio; the "count" column is overwritten by the ratio.
    ratio_frame = train.group_by("item_id").count().with_columns(
        pl.col("count") / user_count
    )

    # Unpacking the frame yields its columns, so zip pairs item_id with ratio.
    observed = dict(zip(*ratio_frame))
    ratio_of = {idx: observed.get(idx, 0) for idx in range(DATA["item_num"])}

    rows = []
    for rec_list in topk_indices[:, :k].tolist():
        rows.append([ratio_of[idx] for idx in rec_list])
    ratios = jnp.array(rows)

    # Zero ratios would give an infinite -log2, so they are dropped first.
    seen_ratios = ratios[ratios != 0]
    return (- jnp.log2(seen_ratios)).mean().tolist()

In [None]:
datasetName = "ML100K_IMPLICIT"

# For every model, compute novelty@10 for seeds 0, 1, 2 and print the raw
# values followed by a LaTeX-ready "$mean \pm std$" string.
for modelName in ["MF_BPR", "ProtoMF_BPR", "HE_MF_USER_BPR", "HE_MF_ITEM_BPR", "HE_MF_BPR", "NeuMF_BPR", "HE_NeuMF_BPR"]:

    values = []
    for seed in range(3):
        DATA, topk_indices = restoreDataAndPrediction(datasetName, modelName, seed)
        values.append( calcNovelty(DATA, topk_indices, 10) )
    print(np.array(values))
    # Raw string: "\p" is not a valid escape sequence and raises a
    # SyntaxWarning on Python 3.12+; the printed text is unchanged.
    print("$", np.mean(values).round(3), r" \pm ", np.std(values).round(3), "$", sep="", end="\n\n")

In [None]:
datasetName = "ML1M_IMPLICIT"

# For every model, compute novelty@10 for seeds 0, 1, 2 and print the raw
# values followed by a LaTeX-ready "$mean \pm std$" string.
for modelName in ["MF_BPR", "ProtoMF_BPR", "HE_MF_USER_BPR", "HE_MF_ITEM_BPR", "HE_MF_BPR", "NeuMF_BPR", "HE_NeuMF_BPR"]:

    values = []
    for seed in range(3):
        DATA, topk_indices = restoreDataAndPrediction(datasetName, modelName, seed)
        values.append( calcNovelty(DATA, topk_indices, 10) )
    print(np.array(values))
    # Raw string: "\p" is not a valid escape sequence and raises a
    # SyntaxWarning on Python 3.12+; the printed text is unchanged.
    print("$", np.mean(values).round(3), r" \pm ", np.std(values).round(3), "$", sep="", end="\n\n")

In [None]:
datasetName = "Twitch100K"

# For every model, compute novelty@10 for seeds 0, 1, 2 and print the raw
# values followed by a LaTeX-ready "$mean \pm std$" string.
# The NeuMF variants are not in this dataset's model list.
for modelName in ["MF_BPR", "ProtoMF_BPR", "HE_MF_USER_BPR", "HE_MF_ITEM_BPR", "HE_MF_BPR"]:

    values = []
    for seed in range(3):
        DATA, topk_indices = restoreDataAndPrediction(datasetName, modelName, seed)
        values.append( calcNovelty(DATA, topk_indices, 10) )
    print(np.array(values))
    # Raw string: "\p" is not a valid escape sequence and raises a
    # SyntaxWarning on Python 3.12+; the printed text is unchanged.
    print("$", np.mean(values).round(3), r" \pm ", np.std(values).round(3), "$", sep="", end="\n\n")

In [None]:
datasetName = "FourSquare"

# For every model, compute novelty@10 for seeds 0, 1, 2 and print the raw
# values followed by a LaTeX-ready "$mean \pm std$" string.
for modelName in ["MF_BPR", "ProtoMF_BPR", "HE_MF_USER_BPR", "HE_MF_ITEM_BPR", "HE_MF_BPR", "NeuMF_BPR", "HE_NeuMF_BPR"]:

    values = []
    for seed in range(3):
        DATA, topk_indices = restoreDataAndPrediction(datasetName, modelName, seed)
        values.append( calcNovelty(DATA, topk_indices, 10) )
    print(np.array(values))
    # Raw string: "\p" is not a valid escape sequence and raises a
    # SyntaxWarning on Python 3.12+; the printed text is unchanged.
    print("$", np.mean(values).round(3), r" \pm ", np.std(values).round(3), "$", sep="", end="\n\n")