## RQ2

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss

from sklearn.metrics import ndcg_score

from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading PROJECT_ROOT.
# NOTE(review): presumably PROJECT_ROOT is supplied by a local .env file — confirm.
load_dotenv()
# Raises KeyError when PROJECT_ROOT is not defined in the environment.
project_root = os.environ["PROJECT_ROOT"]
# Make the project's packages importable. The membership check keeps repeated
# executions of this cell from stacking duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import src.modules.result_analysis.loading as result_loading
import src.modules.result_analysis.model_standardization as ms

In [None]:
# Directory <project_root>/latex/figures, stored in `figures_root`.
figures_root = os.path.join(project_root, "latex", "figures")
# Create it (including missing parents); exist_ok=True makes this a no-op when
# the directory is already present.
os.makedirs(figures_root, exist_ok=True)

In [None]:
# Global matplotlib defaults; they affect every figure created after this cell.
plt.rc('font', size=20)
# Render text through LaTeX.
# NOTE(review): presumably requires a LaTeX installation on the machine — confirm.
plt.rc('text', usetex=True)
# Packages added to the LaTeX preamble used for that text rendering.
plt.rc('text.latex', preamble=r'\usepackage{amsmath,amssymb,bm,bbm,lmodern}')

In [None]:
def loglik(df):
    """Return the summed log of the smoothed bin mass at each row's rating bin.

    Reads the columns listed in the module-level ``bins_mass_cols`` plus the
    ``rating`` column of ``df``.

    NOTE(review): the index ``rating * 2 - 1`` presumably assumes ratings on a
    half-step scale starting at 0.5, so that 0.5 -> column 0 — confirm.
    """
    # Add 1e-6 to every bin and divide by 1 + 1e-5, so that no entry is exactly
    # zero before the log. With the ten columns in `bins_mass_cols`, a row that
    # sums to 1 still sums to 1 afterwards.
    smoothed_mass = (df[bins_mass_cols].values + 1e-6) / (1. + 1e-5)
    # One column index per row, shaped (n_rows, 1) as take_along_axis expects.
    rating_bin = (df["rating"] * 2 - 1).astype(int).values[:, None]
    mass_at_rating = np.take_along_axis(smoothed_mass, rating_bin, axis=1)
    return np.log(mass_at_rating).sum()

In [None]:
# Names of the ten per-bin mass columns, "bins_mass_0" through "bins_mass_9".
bins_mass_cols = [f"bins_mass_{bin_index}" for bin_index in range(10)]

In [None]:
# Number of folds; fold indices 0 .. NUM_FOLDS-1 are substituted into the
# "{}" placeholder of each path template below.
NUM_FOLDS = 10

# Every export directory lives under <project_root>/logs/LBD_results.
results_root = os.path.join(project_root, "logs", "LBD_results")
data_path_templates = {
    "MF_128": os.path.join(results_root, "MF_128", "MF_128-{}-0", "export"),
    "CMF_128": os.path.join(results_root, "CMF_128", "CMF_128-{}-0", "export"),
    "OrdRec-UI_512": os.path.join(results_root, "OrdRec-UI_512", "OrdRec-UI_512-{}-0", "export"),
    "LBDS_512_sum_ab": os.path.join(results_root, "LBDS_512_sum_ab", "LBDS_512_sum_ab-{}-0", "export"),
    "LBDA_512_sum_ab": os.path.join(results_root, "LBDA_512_sum_ab", "LBDA_512_sum_ab-{}-0", "export"),
}

# data[model] is a list holding one loaded result per fold, in fold order.
print("Loading data")
data = {}
for model_name, path_template in data_path_templates.items():
    data[model_name] = [
        result_loading.path_to_df(path_template.format(fold))
        for fold in range(NUM_FOLDS)
    ]

# confidence_models mirrors `data`, with each fold's result passed through
# ms.standardise_model together with the model's name.
print("Standardising")
confidence_models = {}
for model_name, fold_dfs in data.items():
    confidence_models[model_name] = [
        ms.standardise_model(model_name, fold_df) for fold_df in fold_dfs
    ]

### Table 2

In [None]:
# RMSE
# Root-mean-squared "err_mean" of every model on every fold, summarised per
# model, then compared pairwise with one-sided Wilcoxon signed-rank tests
# (the paired samples are the per-fold values of the two models).
alternative = "less"
metric = {k: [np.sqrt((df["err_mean"]**2).mean()) for df in dfs] for k, dfs in confidence_models.items()}
keys = list(metric.keys())
# stat_sign[i, j] holds the p-value for "model i is `alternative` than model j".
# No self-comparison is run, so diagonal entries keep their initial 0.0 and
# are not p-values.
stat_sign = np.zeros((len(metric), len(metric)))
for i, k in enumerate(keys):
    for j, k2 in enumerate(keys):
        if i == j:
            continue
        stat_sign[i,j] = ss.wilcoxon(metric[k], metric[k2], alternative=alternative).pvalue
# Per-model mean and standard deviation over the folds, in the same format the
# other metric cells use.
print("RMSE")
for m, v in metric.items():
    print(f"{m}: {np.mean(v)} ({np.std(v)})")
print()
print(f"RMSE: (i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

In [None]:
# MAE
# Mean absolute "err_mean" of every model on every fold, followed by pairwise
# one-sided Wilcoxon signed-rank tests over the folds.
alternative = "less"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [np.mean(np.abs(fold_df["err_mean"])) for fold_df in fold_dfs]
keys = list(metric)

# stat_sign[row, col]: p-value for "model `row` is `alternative` than model
# `col`". Diagonal entries are never tested and stay at their initial 0.0.
stat_sign = np.zeros((len(keys), len(keys)))
for row, lhs in enumerate(keys):
    for col, rhs in enumerate(keys):
        if row != col:
            stat_sign[row, col] = ss.wilcoxon(metric[lhs], metric[rhs], alternative=alternative).pvalue

# Per-model mean and standard deviation across folds, then the p-value table.
print("MAE")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

In [None]:
# Display the per-model, per-fold values of the most recently computed metric.
metric

In [None]:
# Accuracy
# Mean of the "highest_correct" column for every model on every fold, followed
# by pairwise one-sided Wilcoxon signed-rank tests over the folds.
alternative = "greater"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [np.mean(fold_df["highest_correct"]) for fold_df in fold_dfs]
keys = list(metric)

# stat_sign[row, col]: p-value for "model `row` is `alternative` than model
# `col`". Diagonal entries are never tested and stay at their initial 0.0.
stat_sign = np.zeros((len(keys), len(keys)))
for row, lhs in enumerate(keys):
    for col, rhs in enumerate(keys):
        if row != col:
            stat_sign[row, col] = ss.wilcoxon(metric[lhs], metric[rhs], alternative=alternative).pvalue

# Per-model mean and standard deviation across folds, then the p-value table.
print("Accuracy")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

In [None]:
# Loglik
# Value of loglik() for every model on every fold, followed by pairwise
# one-sided Wilcoxon signed-rank tests over the folds.
alternative = "greater"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    metric[model_name] = [loglik(fold_df) for fold_df in fold_dfs]
keys = list(metric)

# stat_sign[row, col]: p-value for "model `row` is `alternative` than model
# `col`". Diagonal entries are never tested and stay at their initial 0.0.
stat_sign = np.zeros((len(keys), len(keys)))
for row, lhs in enumerate(keys):
    for col, rhs in enumerate(keys):
        if row != col:
            stat_sign[row, col] = ss.wilcoxon(metric[lhs], metric[rhs], alternative=alternative).pvalue

# Per-model mean and standard deviation across folds, then the p-value table.
print("Loglik")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

In [None]:
# NDCG@3
def ndcg_fn(x):
    """NDCG at k=3 for one group of rows, scoring "mean" against "rating".

    Groups with a single row skip ndcg_score and count as 1.0.
    """
    if len(x) <= 1:
        return 1.
    # ndcg_score takes 2-D inputs, hence the added leading axis.
    true_relevance = x["rating"].values[None, :]
    predicted_score = x["mean"].values[None, :]
    return ndcg_score(true_relevance, predicted_score, k=3)


# For every model and fold: NDCG@3 per "uid" group, averaged over the groups.
alternative = "greater"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    fold_scores = []
    for fold_df in fold_dfs:
        per_user_ndcg = fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn)
        fold_scores.append(np.mean(per_user_ndcg))
    metric[model_name] = fold_scores
keys = list(metric)

# stat_sign[row, col]: p-value of the one-sided Wilcoxon signed-rank test for
# "model `row` is `alternative` than model `col`" over the folds. Diagonal
# entries are never tested and stay at their initial 0.0.
stat_sign = np.zeros((len(keys), len(keys)))
for row, lhs in enumerate(keys):
    for col, rhs in enumerate(keys):
        if row != col:
            stat_sign[row, col] = ss.wilcoxon(metric[lhs], metric[rhs], alternative=alternative).pvalue

# Per-model mean and standard deviation across folds, then the p-value table.
print("NDCG@3")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))

In [None]:
# NDCG@10
def ndcg_fn(x):
    """NDCG at k=10 for one group of rows, scoring "mean" against "rating".

    Groups with a single row skip ndcg_score and count as 1.0.
    """
    if len(x) <= 1:
        return 1.
    # ndcg_score takes 2-D inputs, hence the added leading axis.
    true_relevance = x["rating"].values[None, :]
    predicted_score = x["mean"].values[None, :]
    return ndcg_score(true_relevance, predicted_score, k=10)


# For every model and fold: NDCG@10 per "uid" group, averaged over the groups.
alternative = "greater"
metric = {}
for model_name, fold_dfs in confidence_models.items():
    fold_scores = []
    for fold_df in fold_dfs:
        per_user_ndcg = fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn)
        fold_scores.append(np.mean(per_user_ndcg))
    metric[model_name] = fold_scores
keys = list(metric)

# stat_sign[row, col]: p-value of the one-sided Wilcoxon signed-rank test for
# "model `row` is `alternative` than model `col`" over the folds. Diagonal
# entries are never tested and stay at their initial 0.0.
stat_sign = np.zeros((len(keys), len(keys)))
for row, lhs in enumerate(keys):
    for col, rhs in enumerate(keys):
        if row != col:
            stat_sign[row, col] = ss.wilcoxon(metric[lhs], metric[rhs], alternative=alternative).pvalue

# Per-model mean and standard deviation across folds, then the p-value table.
print("NDCG@10")
for model_name, fold_values in metric.items():
    print(f"{model_name}: {np.mean(fold_values)} ({np.std(fold_values)})")
print(f"\n(i,j) is p-value for alternative hypothesis that i is {alternative} than j.")
print(pd.DataFrame(stat_sign, index=keys, columns=keys))