In [1]:
import os
import shutil
import mlflow
from glob import glob
from tqdm import tqdm
import numpy as np
import pandas as pd
import polars as pl
import re
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import display
from scipy import stats

from dotenv import load_dotenv
load_dotenv("../.env")

import sys
sys.path.append("..")
from herec.utils import resultLoader



## Get Test Results

In [2]:
experiments = {experiment.experiment_id: experiment.name for experiment in mlflow.search_experiments( view_type="ACTIVE_ONLY" ) if experiment.name.endswith("-TEST")}

df_RESULT = pl.from_pandas(mlflow.search_runs( experiment_ids=experiments.keys() ))
df_RESULT = df_RESULT.select(
    pl.col("experiment_id").replace(experiments, default=None).str.split("-").list.get(0).alias("dataset_name"),
    pl.col("experiment_id").replace(experiments, default=None).str.split("-").list.get(1).alias("model_name"),
    pl.col("params.seed").cast(int),
    pl.col("^metrics(.*?)$").exclude("metrics.TRAIN_LOSS/ROUGH"),
).sort("dataset_name", "model_name")
df_RESULT = df_RESULT.with_columns( np.sqrt(pl.col("metrics.VALID_LOSS")) ) # RMSEに変換

df_RESULT

  series = f(lambda out: ufunc(*args, out=out, dtype=dtype_char, **kwargs))


dataset_name,model_name,params.seed,metrics.VALID_LOSS,metrics.MRR_30,metrics.Coverage_10,metrics.Precision_100,metrics.Coverage_30,metrics.Recall_30,metrics.nDCG_10,metrics.HitRate_10,metrics.nDCG_50,metrics.Recall_10,metrics.HitRate_30,metrics.HitRate_50,metrics.Coverage_50,metrics.nDCG_30,metrics.MRR_50,metrics.Precision_10,metrics.Recall_100,metrics.Precision_50,metrics.MRR_10,metrics.Recall_50,metrics.MRR_100,metrics.nDCG_100,metrics.Coverage_100,metrics.HitRate_100,metrics.Precision_30
str,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Ciao""","""FM""",2,0.960782,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""FM""",1,0.942458,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""FM""",0,1.011424,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""HE_FM""",0,0.999538,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""HE_MF""",2,0.962832,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""HE_MF""",1,0.93599,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""HE_MF""",0,0.995068,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""MF""",2,1.426118,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""MF""",1,1.387956,,,,,,,,,,,,,,,,,,,,,,,,
"""Ciao""","""MF""",0,1.39698,,,,,,,,,,,,,,,,,,,,,,,,


## Common

In [3]:
def getTestResults( datasetNameList, modelNameList, seedList, metricMap ):

    df_TARGET = df_RESULT.filter(
        pl.col("dataset_name").is_in(datasetNameList)
        & pl.col("model_name").is_in(modelNameList)
        & pl.col("params.seed").is_in(seedList)
    ).select(
        [pl.all().exclude("^metrics(.*?)$")] + [pl.col(f"metrics.{old_name}").alias(new_name) for old_name, new_name in metricMap.items()]
    )
    
    df_MEAN = df_TARGET.to_pandas().groupby(["dataset_name", "model_name"]).mean().reset_index()
    df_MEAN = df_MEAN.pivot_table( values=metricMap.values(), index="model_name", columns="dataset_name" ).swaplevel(axis=1)
    df_MEAN = df_MEAN.loc[modelNameList, [(datasetName, metricName) for datasetName in datasetNameList for metricName in metricMap.values()]]
    
    df_STD = df_TARGET.to_pandas().groupby(["dataset_name", "model_name"]).std().reset_index().fillna(-np.inf)
    df_STD = df_STD.pivot_table( values=metricMap.values(), index="model_name", columns="dataset_name" ).swaplevel(axis=1)
    df_STD = df_STD.loc[modelNameList, [(datasetName, metricName) for datasetName in datasetNameList for metricName in metricMap.values()]]
    
    df_COUNT = df_TARGET.to_pandas().groupby(["dataset_name", "model_name"]).count().reset_index()
    df_COUNT = df_COUNT.pivot_table( values=metricMap.values(), index="model_name", columns="dataset_name" ).swaplevel(axis=1)
    df_COUNT = df_COUNT.loc[modelNameList, [(datasetName, metricName) for datasetName in datasetNameList for metricName in metricMap.values()]]

    df_STATS = df_TARGET.group_by(["dataset_name", "model_name"]).agg( pl.all() ).drop("params.seed").to_pandas().set_index(["dataset_name", "model_name"])
    df_STATS = df_STATS.loc[[(datasetName, modelName) for datasetName in datasetNameList for modelName in modelNameList]]
    for columnName in metricMap.values():
        for i in reversed(range(df_STATS.shape[0])):
            df_STATS[columnName].iloc[i] = stats.ttest_rel(df_STATS[columnName].iloc[i], df_STATS[columnName].iloc[0]).pvalue
    df_STATS = df_STATS.fillna(np.inf)
    df_STATS = df_STATS.pivot_table( values=metricMap.values(), index="model_name", columns="dataset_name" ).swaplevel(axis=1)
    df_STATS = df_STATS.loc[modelNameList, [(datasetName, metricName) for datasetName in datasetNameList for metricName in metricMap.values()]]
    
    return df_MEAN, df_STD, df_COUNT.astype(int), df_STATS

In [4]:
def k_wise_visualize( df_MEAN, df_COUNT ):

    df_COUNT = df_COUNT.T.groupby(level=0).max().T

    # データセットのユニークリスト
    datasetNameList = df_MEAN.columns.get_level_values(0).unique()
    
    for datasetName in datasetNameList:
    
        # 対象データセットの結果を抽出
        df = df_MEAN[datasetName]
        
        # 評価指標のユニークリスト
        metricNameList = df.columns.str.replace("@\d+", "", regex=True).unique()
    
        # matplotlibの初期化
        fig, axes = plt.subplots(1, len(metricNameList), figsize=(3*len(metricNameList), 3))
        
        for i, metricName in enumerate(metricNameList):
    
            # 対象評価指標データセットの結果を抽出
            df2 = df.loc[ :, df.columns.str.startswith(metricName) ]
            df2.columns = df2.columns.str.replace(f"{metricName}@", "")
            
            for j, (modelName, scores) in enumerate(df2.iterrows()):
                (axes[i] if len(metricNameList) > 1 else axes).plot(scores.index, scores.values, linewidth=0.5, linestyle="dashed", marker=list(matplotlib.lines.Line2D.markers)[j+2], markersize=8, fillstyle="none", label=f"{modelName} ({df_COUNT.loc[modelName, datasetName]})" )
            
            axes[i].set_xlabel("$k$")
            axes[i].set_ylabel(metricName)
            # axes[i].yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
        
        print(datasetName)
        plt.suptitle( datasetName )
        axes[-1].legend( bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0 )
        # plt.tight_layout()
        plt.subplots_adjust(wspace=0.4)
        plt.show()

In [5]:
def bar_visualize( df_MEAN, df_COUNT ):

    for (datasetName, metricName), modelName in df_MEAN.T.iterrows():
        
        fig, ax = plt.subplots( figsize=(4, 3) )
    
        labels = modelName.index[::-1] + " (" + df_COUNT[(datasetName, metricName)].values[::-1].astype(str) + ")"
        values = modelName.values[::-1]
        bar = ax.barh( labels, values, color=np.where(modelName.index.str.contains("HE")[::-1], "black", "gray") )
        ax.bar_label(bar, labels=values.round(3))
        ax.set_xlabel( "Model" )
        ax.set_ylabel( metricName )
        
        plt.suptitle( datasetName )
        plt.show()

In [6]:
def to_latex( df_MEAN, df_STD, ascending=True ):

    df_PREFIX = np.where( df_MEAN.rank(axis=0, ascending=ascending) == 1, "\\textbf{", "{" )
    df_PREFIX = np.where( df_MEAN.rank(axis=0, ascending=ascending) == 2, "\\uline{", df_PREFIX )
    df_PREFIX = np.where( df_MEAN.rank(axis=0, ascending=ascending) == 3, "\\dashuline{", df_PREFIX )
    df_SUFFIX = np.where( df_MEAN.rank(axis=0, ascending=ascending) == 1, "}", "}" )
    df_LATEX = df_PREFIX + df_MEAN.map( lambda s: '{:.3f}'.format(s) ) + " $\pm$ " + df_STD.map( lambda s: '{:.3f}'.format(s) ) + df_SUFFIX
    df_LATEX.index.name = None
    df_LATEX.columns.names = (None, None)

    display(df_LATEX)
    print(df_LATEX.to_latex())

## ExplicitDataset

In [7]:
df_MEAN, df_STD, df_COUNT, df_STATS = getTestResults(

    datasetNameList = ["ML100K", "ML1M", "Ciao", "Ciao_PART"],
    modelNameList = ["MF", "HE_MF"],
    seedList = range(3),
    metricMap = {"VALID_LOSS": "RMSE"},

)

# bar_visualize( df_MEAN, df_COUNT )
to_latex( df_MEAN, df_STD )
display(df_STATS < 0.05)

Unnamed: 0_level_0,ML100K,ML1M,Ciao,Ciao_PART
Unnamed: 0_level_1,RMSE,RMSE,RMSE,RMSE
MF,\uline{0.995 $\pm$ 0.051},\uline{0.889 $\pm$ 0.013},\uline{1.404 $\pm$ 0.020},\uline{1.919 $\pm$ 0.161}
HE_MF,\textbf{0.972 $\pm$ 0.014},\textbf{0.887 $\pm$ 0.021},\textbf{0.965 $\pm$ 0.030},\textbf{0.892 $\pm$ 0.043}


\begin{tabular}{lllll}
\toprule
 & ML100K & ML1M & Ciao & Ciao_PART \\
 & RMSE & RMSE & RMSE & RMSE \\
\midrule
MF & \uline{0.995 $\pm$ 0.051} & \uline{0.889 $\pm$ 0.013} & \uline{1.404 $\pm$ 0.020} & \uline{1.919 $\pm$ 0.161} \\
HE_MF & \textbf{0.972 $\pm$ 0.014} & \textbf{0.887 $\pm$ 0.021} & \textbf{0.965 $\pm$ 0.030} & \textbf{0.892 $\pm$ 0.043} \\
\bottomrule
\end{tabular}



dataset_name,ML100K,ML1M,Ciao,Ciao_PART
Unnamed: 0_level_1,RMSE,RMSE,RMSE,RMSE
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
MF,False,False,True,True
HE_MF,False,False,False,True


In [8]:
df_MEAN, df_STD, df_COUNT, df_STATS = getTestResults(

    datasetNameList = ["ML100K", "Ciao_PART"],
    modelNameList = ["FM", "HE_FM"],
    seedList = range(3),
    metricMap = {"VALID_LOSS": "RMSE"},

)

# bar_visualize( df_MEAN, df_COUNT )
to_latex( df_MEAN, df_STD )
display(df_STATS < 0.05)

Unnamed: 0_level_0,ML100K,Ciao_PART
Unnamed: 0_level_1,RMSE,RMSE
FM,\textbf{0.957 $\pm$ 0.010},\uline{0.934 $\pm$ 0.019}
HE_FM,\uline{0.960 $\pm$ 0.017},\textbf{0.893 $\pm$ 0.032}


\begin{tabular}{lll}
\toprule
 & ML100K & Ciao_PART \\
 & RMSE & RMSE \\
\midrule
FM & \textbf{0.957 $\pm$ 0.010} & \uline{0.934 $\pm$ 0.019} \\
HE_FM & \uline{0.960 $\pm$ 0.017} & \textbf{0.893 $\pm$ 0.032} \\
\bottomrule
\end{tabular}



dataset_name,ML100K,Ciao_PART
Unnamed: 0_level_1,RMSE,RMSE
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2
FM,False,False
HE_FM,False,False


## ImplicitDataset

In [9]:
df_MEAN, df_STD, df_COUNT, df_STATS = getTestResults(

    datasetNameList = ["ML100K_IMPLICIT"],
    modelNameList = ["MF_BPR", "ProtoMF_BPR", "HE_MF_USER_BPR", "HE_MF_ITEM_BPR", "HE_MF_BPR"],
    seedList = range(3),
    metricMap = { f"{metricName}_{k}": f"{metricName}@{k}" for metricName in ["HitRate", "Recall", "nDCG", "Coverage"] for k in [10] },

)

to_latex(df_MEAN, df_STD, False)
display(df_STATS < 0.05)

Unnamed: 0_level_0,ML100K_IMPLICIT,ML100K_IMPLICIT,ML100K_IMPLICIT,ML100K_IMPLICIT
Unnamed: 0_level_1,HitRate@10,Recall@10,nDCG@10,Coverage@10
MF_BPR,\uline{0.215 $\pm$ 0.065},\dashuline{0.027 $\pm$ 0.012},\uline{0.047 $\pm$ 0.021},\textbf{0.072 $\pm$ 0.059}
ProtoMF_BPR,\dashuline{0.210 $\pm$ 0.166},\uline{0.030 $\pm$ 0.025},{0.043 $\pm$ 0.039},\dashuline{0.045 $\pm$ 0.025}
HE_MF_USER_BPR,\textbf{0.233 $\pm$ 0.020},\textbf{0.030 $\pm$ 0.008},\textbf{0.049 $\pm$ 0.012},{0.016 $\pm$ 0.011}
HE_MF_ITEM_BPR,{0.209 $\pm$ 0.053},{0.022 $\pm$ 0.008},\dashuline{0.044 $\pm$ 0.008},\uline{0.048 $\pm$ 0.007}
HE_MF_BPR,{0.197 $\pm$ 0.093},{0.020 $\pm$ 0.013},{0.041 $\pm$ 0.032},{0.018 $\pm$ 0.014}


\begin{tabular}{lllll}
\toprule
 & \multicolumn{4}{r}{ML100K_IMPLICIT} \\
 & HitRate@10 & Recall@10 & nDCG@10 & Coverage@10 \\
\midrule
MF_BPR & \uline{0.215 $\pm$ 0.065} & \dashuline{0.027 $\pm$ 0.012} & \uline{0.047 $\pm$ 0.021} & \textbf{0.072 $\pm$ 0.059} \\
ProtoMF_BPR & \dashuline{0.210 $\pm$ 0.166} & \uline{0.030 $\pm$ 0.025} & {0.043 $\pm$ 0.039} & \dashuline{0.045 $\pm$ 0.025} \\
HE_MF_USER_BPR & \textbf{0.233 $\pm$ 0.020} & \textbf{0.030 $\pm$ 0.008} & \textbf{0.049 $\pm$ 0.012} & {0.016 $\pm$ 0.011} \\
HE_MF_ITEM_BPR & {0.209 $\pm$ 0.053} & {0.022 $\pm$ 0.008} & \dashuline{0.044 $\pm$ 0.008} & \uline{0.048 $\pm$ 0.007} \\
HE_MF_BPR & {0.197 $\pm$ 0.093} & {0.020 $\pm$ 0.013} & {0.041 $\pm$ 0.032} & {0.018 $\pm$ 0.014} \\
\bottomrule
\end{tabular}



dataset_name,ML100K_IMPLICIT,ML100K_IMPLICIT,ML100K_IMPLICIT,ML100K_IMPLICIT
Unnamed: 0_level_1,HitRate@10,Recall@10,nDCG@10,Coverage@10
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
MF_BPR,False,False,False,False
ProtoMF_BPR,False,False,False,False
HE_MF_USER_BPR,False,False,False,False
HE_MF_ITEM_BPR,False,False,False,False
HE_MF_BPR,False,False,False,False


In [10]:
df_MEAN, df_STD, df_COUNT, df_STATS = getTestResults(

    datasetNameList = ["Twitch100K"],
    modelNameList = ["MF_BPR", "HE_MF_BPR"],
    seedList = range(3),
    metricMap = { f"{metricName}_{k}": f"{metricName}@{k}" for metricName in ["HitRate", "Recall", "nDCG", "Coverage"] for k in [10] },

)

to_latex(df_MEAN, df_STD, False)
display(df_STATS < 0.05)

Unnamed: 0_level_0,Twitch100K,Twitch100K,Twitch100K,Twitch100K
Unnamed: 0_level_1,HitRate@10,Recall@10,nDCG@10,Coverage@10
MF_BPR,\textbf{0.258 $\pm$ 0.008},\textbf{0.189 $\pm$ 0.007},\textbf{0.100 $\pm$ 0.008},\textbf{0.272 $\pm$ 0.110}
HE_MF_BPR,\uline{0.219 $\pm$ 0.027},\uline{0.158 $\pm$ 0.019},\uline{0.087 $\pm$ 0.011},\uline{0.184 $\pm$ 0.067}


\begin{tabular}{lllll}
\toprule
 & \multicolumn{4}{r}{Twitch100K} \\
 & HitRate@10 & Recall@10 & nDCG@10 & Coverage@10 \\
\midrule
MF_BPR & \textbf{0.258 $\pm$ 0.008} & \textbf{0.189 $\pm$ 0.007} & \textbf{0.100 $\pm$ 0.008} & \textbf{0.272 $\pm$ 0.110} \\
HE_MF_BPR & \uline{0.219 $\pm$ 0.027} & \uline{0.158 $\pm$ 0.019} & \uline{0.087 $\pm$ 0.011} & \uline{0.184 $\pm$ 0.067} \\
\bottomrule
\end{tabular}



dataset_name,Twitch100K,Twitch100K,Twitch100K,Twitch100K
Unnamed: 0_level_1,HitRate@10,Recall@10,nDCG@10,Coverage@10
model_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
MF_BPR,False,False,False,False
HE_MF_BPR,False,False,False,False
