## RQ1

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import ndcg_score

from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# PROJECT_ROOT must be set: a missing variable raises KeyError here. It is the
# base directory for the paths built further down.
project_root = os.environ["PROJECT_ROOT"]
# Put the project root on the import path so the `src.*` modules can be imported.
sys.path.append(project_root)

In [None]:
import src.modules.result_analysis.loading as result_loading
import src.modules.result_analysis.model_standardization as ms

In [None]:
# Output directory <project_root>/latex/figures; created up front (including any
# missing parents) and left alone if it already exists.
figures_root = os.path.join(project_root, "latex", "figures")
os.makedirs(figures_root, exist_ok=True)

In [None]:
# Global matplotlib settings: 20pt base font and LaTeX text rendering with the
# math/font packages listed in the preamble.
plt.rcParams.update({
    "font.size": 20,
    "text.usetex": True,
    "text.latex.preamble": r'\usepackage{amsmath,amssymb,bm,bbm,lmodern}',
})

In [None]:
def loglik(df):
    """Return the summed log of the smoothed mass at each row's observed rating.

    Reads the module-level ``bins_mass_cols`` columns of ``df`` as per-row bin
    masses and the ``rating`` column as the observed value. The bin looked up
    for a row is ``int(rating * 2 - 1)``.
    """
    # Additive smoothing so the log never sees an exact zero: 1e-6 is added to
    # every bin and the result is divided by 1 + 1e-5 (ten bins * 1e-6).
    # NOTE(review): this renormalises only if each row of masses sums to 1 — confirm.
    smoothed_mass = (df[bins_mass_cols].values + 1e-6) / (1. + 1e-5)

    # Map each rating to its column position, shaped (rows, 1) for the gather.
    observed_bin = (df["rating"] * 2 - 1).astype(int).values
    picked_mass = np.take_along_axis(smoothed_mass, observed_bin[:, None], axis=1)

    return np.log(picked_mass).sum()

In [None]:
# Names of the ten per-bin mass columns, "bins_mass_0" through "bins_mass_9",
# that loglik selects from each results frame.
bins_mass_cols = list(map("bins_mass_{}".format, range(10)))

In [None]:
NUM_FOLDS = 10

# Models to compare. Each name is used three times: as the result key, as the
# directory under logs/LBD_results, and as the prefix of the per-fold run
# directory. Adding a model is therefore a one-line change here.
MODEL_NAMES = [
    "LBDS_512_sum_no_bias",
    "LBDS_512_sum_mn",
    "LBDS_512_sum_ab",
    "LBDS_512_norm_ab",
    "LBDS_512_dot_ab",
    "LBDS_256_256_ab",
    "LBDA_512_sum_ab",
]

# Template per model:
#   <project_root>/logs/LBD_results/<model>/<model>-{}-0/export
# The "{}" placeholder is filled with the fold index when loading.
data_path_templates = {
    name: os.path.join(project_root, "logs", "LBD_results", name, f"{name}-{{}}-0", "export")
    for name in MODEL_NAMES
}

print("Loading data")
# model name -> list with one loaded result per fold (fold 0 .. NUM_FOLDS - 1).
data = {
    name: [result_loading.path_to_df(template.format(fold)) for fold in range(NUM_FOLDS)]
    for name, template in data_path_templates.items()
}

print("Standardising")
# Same layout as `data`, with each fold passed through the model-specific
# standardisation. NOTE(review): the cells below read columns such as
# "err_mean", "highest_correct", "rating", "mean" and "uid" from these results —
# confirm that standardise_model provides them for every model listed above.
confidence_models = {
    name: [ms.standardise_model(name, fold_df) for fold_df in fold_dfs]
    for name, fold_dfs in data.items()
}

### Table 1

In [None]:
# RMSE
def _rmse_of_fold(fold_df):
    """Root-mean-square of the ``err_mean`` column of one fold's results."""
    squared_errors = fold_df["err_mean"] ** 2
    return np.sqrt(squared_errors.mean())


# One RMSE per fold for every model; the printed dict holds the across-fold mean.
metric = {
    model_name: list(map(_rmse_of_fold, fold_dfs))
    for model_name, fold_dfs in confidence_models.items()
}
print("RMSE")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})

In [None]:
# MAE
def _mae_of_fold(fold_df):
    """Mean absolute value of the ``err_mean`` column of one fold's results."""
    absolute_errors = np.abs(fold_df["err_mean"])
    return np.mean(absolute_errors)


# One MAE per fold for every model; the printed dict holds the across-fold mean.
metric = {
    model_name: list(map(_mae_of_fold, fold_dfs))
    for model_name, fold_dfs in confidence_models.items()
}
print("MAE")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})

In [None]:
# Accuracy
def _accuracy_of_fold(fold_df):
    """Mean of the ``highest_correct`` column of one fold's results."""
    return np.mean(fold_df["highest_correct"])


# One accuracy per fold for every model; the printed dict holds the across-fold mean.
metric = {
    model_name: list(map(_accuracy_of_fold, fold_dfs))
    for model_name, fold_dfs in confidence_models.items()
}
print("Accuracy")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})

In [None]:
# Loglik
# One summed log-likelihood per fold for every model (see loglik); the printed
# dict holds the across-fold mean of those per-fold sums.
metric = {
    model_name: list(map(loglik, fold_dfs))
    for model_name, fold_dfs in confidence_models.items()
}
print("Loglik")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})

In [None]:
# NDCG@3
def ndcg_fn(x):
    """Return NDCG@3 for one user's rows, ranking by ``mean`` against ``rating``.

    ``x`` is a per-user group holding the ``rating`` and ``mean`` columns.
    Groups with fewer than two rows are scored 1.0 without calling
    ``ndcg_score``.
    NOTE(review): recent scikit-learn versions reject single-document input,
    which is presumably why the guard exists. It also means single-rating
    users count as perfect rankings in the average — confirm this is intended.
    """
    if len(x) <= 1:
        return 1.
    # ndcg_score expects 2-D (n_queries, n_documents) arrays: one query per user.
    true_relevance = x["rating"].values[None, :]
    predicted_score = x["mean"].values[None, :]
    return ndcg_score(true_relevance, predicted_score, k=3)


# Per fold: average the per-user NDCG@3. The printed dict holds the across-fold mean.
metric = {
    model_name: [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn))
        for fold_df in fold_dfs
    ]
    for model_name, fold_dfs in confidence_models.items()
}
print("NDCG@3")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})

In [None]:
# NDCG@10
def ndcg_fn(x):
    """Return NDCG@10 for one user's rows, ranking by ``mean`` against ``rating``.

    ``x`` is a per-user group holding the ``rating`` and ``mean`` columns.
    Groups with fewer than two rows are scored 1.0 without calling
    ``ndcg_score``.
    NOTE(review): recent scikit-learn versions reject single-document input,
    which is presumably why the guard exists. It also means single-rating
    users count as perfect rankings in the average — confirm this is intended.
    """
    if len(x) <= 1:
        return 1.
    # ndcg_score expects 2-D (n_queries, n_documents) arrays: one query per user.
    true_relevance = x["rating"].values[None, :]
    predicted_score = x["mean"].values[None, :]
    return ndcg_score(true_relevance, predicted_score, k=10)


# Per fold: average the per-user NDCG@10. The printed dict holds the across-fold mean.
metric = {
    model_name: [
        np.mean(fold_df.groupby("uid")[["rating", "mean"]].apply(ndcg_fn))
        for fold_df in fold_dfs
    ]
    for model_name, fold_dfs in confidence_models.items()
}
print("NDCG@10")
print({
    model_name: np.mean(fold_scores)
    for model_name, fold_scores in metric.items()
})